From c4692054e8111f1627573c8cbd1f266bfa9e516f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 5 May 2026 02:18:31 +0000 Subject: [PATCH 01/18] Initial plan From 79df2238830e5739d6c79ca1c860bdc3da07bd97 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 5 May 2026 02:26:04 +0000 Subject: [PATCH 02/18] feat: implement Deployment Stacks for idempotent destroy + extended state schema - Deploy workflow: use `az stack sub create` with `--action-on-unmanage deleteAll` as the default deployment method, with `az deployment sub create` as fallback - Deploy workflow: add managed resources capture step after deploy that walks deployment operations or stack resources to populate state.managedResources[] - Destroy workflow: use `az stack sub delete` when stackId is present in state, covering multi-RG, sub-scope, and MG-scope resources uniformly - Destroy workflow: add soft-delete purge loop for Key Vault and Cognitive Services - Destroy workflow: add deployment history cleanup step - Destroy workflow: support new terminal statuses: `partially-destroyed` and `retained-soft-deleted` - State schema: extend state.json with stackId, deployMethod, managedResources[], resourceGroups[], subscriptions[], externalReferences[] - Metadata schema: add deployMethod and resourceGroups[] fields - Documentation: update deployment state docs with new schema, statuses, and destroy strategy selection logic - Regenerate workflow documentation pages Agent-Logs-Url: https://github.com/Azure/git-ape/sessions/d2d1da54-9a38-41ef-9254-b5f585eab10e Co-authored-by: arnaudlh <20535201+arnaudlh@users.noreply.github.com> --- .github/workflows/git-ape-deploy.exampleyml | 220 +++++++++-- .github/workflows/git-ape-destroy.exampleyml | 376 ++++++++++++++---- website/docs/deployment/state.md | 95 ++++- website/docs/workflows/git-ape-deploy.md | 222 +++++++++-- 
website/docs/workflows/git-ape-destroy.md | 378 +++++++++++++++---- 5 files changed, 1066 insertions(+), 225 deletions(-) diff --git a/.github/workflows/git-ape-deploy.exampleyml b/.github/workflows/git-ape-deploy.exampleyml index 48c6d71..018c461 100644 --- a/.github/workflows/git-ape-deploy.exampleyml +++ b/.github/workflows/git-ape-deploy.exampleyml @@ -240,18 +240,39 @@ jobs: echo "๐Ÿš€ Starting deployment: ${{ matrix.deployment_id }}" START_TIME=$(date +%s) - DEPLOY_OUTPUT=$(az deployment sub create \ - --name "${{ matrix.deployment_id }}" \ - --location "${{ steps.params.outputs.location }}" \ - --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ - --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ - --output json 2>&1) + DEPLOY_DIR="${{ steps.params.outputs.deploy_dir }}" + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + LOCATION="${{ steps.params.outputs.location }}" + + # Determine deploy method: prefer deployment stacks (idempotent destroy) + # Fall back to az deployment sub create if stacks are unavailable + DEPLOY_METHOD="stack" + + if [[ "$DEPLOY_METHOD" == "stack" ]]; then + DEPLOY_OUTPUT=$(az stack sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --yes \ + --output json 2>&1) + else + DEPLOY_OUTPUT=$(az deployment sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --output json 2>&1) + fi EXIT_CODE=$? 
END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) echo "deploy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" + echo "deploy_method=$DEPLOY_METHOD" >> "$GITHUB_OUTPUT" if [[ $EXIT_CODE -ne 0 ]]; then echo "deploy_status=failed" >> "$GITHUB_OUTPUT" @@ -270,8 +291,18 @@ jobs: echo "deploy_status=succeeded" >> "$GITHUB_OUTPUT" - # Extract outputs - OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.properties.outputs // {}') + # Extract outputs depending on deploy method + if [[ "$DEPLOY_METHOD" == "stack" ]]; then + # For stacks, extract the stack ID + STACK_ID=$(echo "$DEPLOY_OUTPUT" | jq -r '.id // empty') + echo "stack_id=$STACK_ID" >> "$GITHUB_OUTPUT" + + # Extract outputs from the stack's deployment + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.outputs // {}') + else + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.properties.outputs // {}') + fi + echo "deploy_outputs<> "$GITHUB_OUTPUT" echo "$OUTPUTS" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" @@ -280,7 +311,99 @@ jobs: RG_NAME=$(echo "$OUTPUTS" | jq -r '.resourceGroupName.value // empty') echo "resource_group=$RG_NAME" >> "$GITHUB_OUTPUT" - echo "โœ… Deployment succeeded in ${DURATION}s" + echo "โœ… Deployment succeeded in ${DURATION}s (method: $DEPLOY_METHOD)" + + - name: Capture managed resources + id: capture + if: steps.deploy.outputs.deploy_status == 'succeeded' + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + RG_NAME="${{ steps.deploy.outputs.resource_group }}" + STACK_ID="${{ steps.deploy.outputs.stack_id }}" + + # Known soft-deletable resource types + SOFT_DELETABLE_TYPES="Microsoft.KeyVault/vaults Microsoft.CognitiveServices/accounts Microsoft.AppConfiguration/configurationStores Microsoft.ApiManagement/service Microsoft.MachineLearningServices/workspaces Microsoft.RecoveryServices/vaults" + + MANAGED_RESOURCES="[]" + RESOURCE_GROUPS="[]" + + if [[ "$DEPLOY_METHOD" == "stack" && -n "$STACK_ID" ]]; then + # Stacks natively track all 
managed resources + STACK_RESOURCES=$(az stack sub show \ + --name "$DEPLOYMENT_ID" \ + --query "resources[].id" \ + -o json 2>/dev/null || echo "[]") + + # Build managedResources array from stack resources + for RES_ID in $(echo "$STACK_RESOURCES" | jq -r '.[]' 2>/dev/null); do + RES_TYPE=$(echo "$RES_ID" | grep -oP 'providers/\K[^/]+/[^/]+' | tail -1) + RES_SCOPE="resourceGroup" + if echo "$RES_ID" | grep -q "/resourceGroups/"; then + RES_SCOPE="resourceGroup" + else + RES_SCOPE="subscription" + fi + + IS_SOFT_DELETABLE="false" + for SD_TYPE in $SOFT_DELETABLE_TYPES; do + if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then + IS_SOFT_DELETABLE="true" + break + fi + done + + MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + done + + # Extract resource groups from managed resources + RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') + else + # Fallback: walk deployment operations recursively + OPS=$(az deployment operation sub list \ + --name "$DEPLOYMENT_ID" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo "[]") + + for RES_ID in $(echo "$OPS" | jq -r '.[].id // empty' 2>/dev/null); do + RES_TYPE=$(echo "$OPS" | jq -r ".[] | select(.id == \"$RES_ID\") | .resourceType // empty") + RES_SCOPE="resourceGroup" + if echo "$RES_ID" | grep -q "/resourceGroups/"; then + RES_SCOPE="resourceGroup" + else + RES_SCOPE="subscription" + fi + + IS_SOFT_DELETABLE="false" + for SD_TYPE in $SOFT_DELETABLE_TYPES; do + if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then + IS_SOFT_DELETABLE="true" + break + fi + done + + MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ + --arg scope 
"$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + done + + # Collect resource groups + if [[ -n "$RG_NAME" ]]; then + RESOURCE_GROUPS="[\"$RG_NAME\"]" + fi + fi + + echo "managed_resources<> "$GITHUB_OUTPUT" + echo "$MANAGED_RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "resource_groups<> "$GITHUB_OUTPUT" + echo "$RESOURCE_GROUPS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + RESOURCE_COUNT=$(echo "$MANAGED_RESOURCES" | jq 'length') + echo "๐Ÿ“‹ Captured $RESOURCE_COUNT managed resources" - name: Run integration tests id: tests @@ -349,25 +472,60 @@ jobs: DEPLOY_DIR="${{ steps.params.outputs.deploy_dir }}" STATUS="${{ steps.deploy.outputs.deploy_status || 'failed' }}" TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + STACK_ID="${{ steps.deploy.outputs.stack_id }}" + MANAGED_RESOURCES='${{ steps.capture.outputs.managed_resources }}' + RESOURCE_GROUPS='${{ steps.capture.outputs.resource_groups }}' + + # Ensure managed resources and resource groups are valid JSON + if ! echo "$MANAGED_RESOURCES" | jq empty 2>/dev/null; then + MANAGED_RESOURCES="[]" + fi + if ! 
echo "$RESOURCE_GROUPS" | jq empty 2>/dev/null; then + RESOURCE_GROUPS="[]" + fi - # Create/update state.json - cat > "$DEPLOY_DIR/state.json" < "$DEPLOY_DIR/state.json" - name: Commit deployment state if: always() @@ -376,9 +534,13 @@ jobs: STATUS="${{ steps.deploy.outputs.deploy_status }}" STATUS=${STATUS:-failed} - # Update metadata.json status from pending to actual result + # Update metadata.json status from pending to actual result, add deployMethod and resourceGroups if [[ -f "$DEPLOY_DIR/metadata.json" ]]; then - jq --arg status "$STATUS" '.status = $status' \ + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + DEPLOY_METHOD=${DEPLOY_METHOD:-subscription} + RG_NAME="${{ steps.deploy.outputs.resource_group }}" + jq --arg status "$STATUS" --arg method "$DEPLOY_METHOD" --arg rg "$RG_NAME" \ + '.status = $status | .deployMethod = $method | .resourceGroups = (if $rg == "" then [] else [$rg] end)' \ "$DEPLOY_DIR/metadata.json" > "$DEPLOY_DIR/metadata.json.tmp" \ && mv "$DEPLOY_DIR/metadata.json.tmp" "$DEPLOY_DIR/metadata.json" fi diff --git a/.github/workflows/git-ape-destroy.exampleyml b/.github/workflows/git-ape-destroy.exampleyml index 1afc7ae..2f58066 100644 --- a/.github/workflows/git-ape-destroy.exampleyml +++ b/.github/workflows/git-ape-destroy.exampleyml @@ -132,16 +132,34 @@ jobs: fi RG_NAME=$(jq -r '.resourceGroup // empty' "$STATE_FILE") - - if [[ -z "$RG_NAME" ]]; then - echo "::error::No resource group found in state file" + STACK_ID=$(jq -r '.stackId // empty' "$STATE_FILE") + DEPLOY_METHOD=$(jq -r '.deployMethod // "subscription"' "$STATE_FILE") + MANAGED_RESOURCES=$(jq -c '.managedResources // []' "$STATE_FILE") + RESOURCE_GROUPS=$(jq -c '.resourceGroups // []' "$STATE_FILE") + + # Fallback: if no stackId and no resourceGroup, cannot proceed + if [[ -z "$STACK_ID" && -z "$RG_NAME" ]]; then + echo "::error::No stack ID or resource group found in state file" echo "found=false" >> "$GITHUB_OUTPUT" exit 1 fi echo "found=true" >> 
"$GITHUB_OUTPUT" echo "resource_group=$RG_NAME" >> "$GITHUB_OUTPUT" - echo "Will destroy resource group: $RG_NAME" + echo "stack_id=$STACK_ID" >> "$GITHUB_OUTPUT" + echo "deploy_method=$DEPLOY_METHOD" >> "$GITHUB_OUTPUT" + echo "managed_resources<> "$GITHUB_OUTPUT" + echo "$MANAGED_RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "resource_groups<> "$GITHUB_OUTPUT" + echo "$RESOURCE_GROUPS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + if [[ -n "$STACK_ID" ]]; then + echo "Will destroy via deployment stack: $STACK_ID" + else + echo "Will destroy resource group: $RG_NAME (fallback method)" + fi - name: Azure Login (OIDC) if: steps.state.outputs.found == 'true' @@ -157,92 +175,153 @@ jobs: run: | RG="${{ steps.state.outputs.resource_group }}" DEPLOYMENT_ID="${{ matrix.deployment_id }}" + STACK_ID="${{ steps.state.outputs.stack_id }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" - # Check if resource group exists - EXISTS=$(az group exists --name "$RG") - echo "exists=$EXISTS" >> "$GITHUB_OUTPUT" - - if [[ "$EXISTS" != "true" ]]; then - echo "Resource group $RG does not exist (already deleted?)" - echo "resource_count=0" >> "$GITHUB_OUTPUT" - echo "sub_count=0" >> "$GITHUB_OUTPUT" - exit 0 + echo "=== Destroy Plan ===" + echo "Deployment: $DEPLOYMENT_ID" + echo "Method: $DEPLOY_METHOD" + + if [[ -n "$STACK_ID" ]]; then + # Check if stack still exists + STACK_EXISTS=$(az stack sub show --name "$DEPLOYMENT_ID" --query "id" -o tsv 2>/dev/null || echo "") + if [[ -n "$STACK_EXISTS" ]]; then + echo "stack_exists=true" >> "$GITHUB_OUTPUT" + echo "Stack: $STACK_ID (exists)" + + # List resources in the stack + STACK_RESOURCES=$(az stack sub show --name "$DEPLOYMENT_ID" --query "resources[].id" -o json 2>/dev/null || echo "[]") + RESOURCE_COUNT=$(echo "$STACK_RESOURCES" | jq 'length') + echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" + echo "Resources: $RESOURCE_COUNT managed by stack" + else + echo "stack_exists=false" 
>> "$GITHUB_OUTPUT" + echo "Stack not found โ€” will use fallback" + echo "resource_count=0" >> "$GITHUB_OUTPUT" + fi + else + echo "stack_exists=false" >> "$GITHUB_OUTPUT" fi - # Inventory RG resources - RESOURCES=$(az resource list --resource-group "$RG" \ - --query "[].{name:name, type:type, id:id, provisioningState:provisioningState}" \ - --output json 2>/dev/null || echo "[]") - RESOURCE_COUNT=$(echo "$RESOURCES" | jq 'length') + # Check resource group existence (for fallback or soft-delete sweep) + if [[ -n "$RG" ]]; then + EXISTS=$(az group exists --name "$RG") + echo "rg_exists=$EXISTS" >> "$GITHUB_OUTPUT" + echo "RG: $RG (exists=$EXISTS)" + + if [[ "$EXISTS" == "true" ]]; then + RESOURCES=$(az resource list --resource-group "$RG" \ + --query "[].{name:name, type:type, id:id, provisioningState:provisioningState}" \ + --output json 2>/dev/null || echo "[]") + RESOURCE_COUNT=$(echo "$RESOURCES" | jq 'length') + # Only set resource_count if stack_exists is false (avoid overwrite) + if [[ "$STACK_ID" == "" ]]; then + echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" + fi + echo "resources<> "$GITHUB_OUTPUT" + echo "$RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "$RESOURCES" | jq -r '.[] | " - \(.type)/\(.name) (\(.provisioningState))"' + fi + else + echo "rg_exists=false" >> "$GITHUB_OUTPUT" + fi - echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" - echo "resources<> "$GITHUB_OUTPUT" - echo "$RESOURCES" >> "$GITHUB_OUTPUT" + # Identify soft-deletable resources from state + MANAGED_RESOURCES='${{ steps.state.outputs.managed_resources }}' + SOFT_DELETABLE=$(echo "$MANAGED_RESOURCES" | jq -c '[.[] | select(.softDeletable == true)]' 2>/dev/null || echo "[]") + SOFT_COUNT=$(echo "$SOFT_DELETABLE" | jq 'length') + echo "soft_deletable<> "$GITHUB_OUTPUT" + echo "$SOFT_DELETABLE" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" + echo "soft_count=$SOFT_COUNT" >> "$GITHUB_OUTPUT" - echo "Resource group $RG has $RESOURCE_COUNT 
resources" - echo "$RESOURCES" | jq -r '.[] | " - \(.type)/\(.name) (\(.provisioningState))"' + if [[ "$SOFT_COUNT" -gt 0 ]]; then + echo "Soft-deletable: $SOFT_COUNT resource(s) โ€” will attempt purge after deletion" + echo "$SOFT_DELETABLE" | jq -r '.[] | " - \(.type): \(.id)"' + fi - # Query deployment operations to find subscription-scoped resources - # These are NOT deleted by az group delete (e.g. role assignments, policy assignments) + # Query subscription-scoped resources (for fallback only) SUB_RESOURCES="[]" - - OPS=$(az deployment operation sub list \ - --name "$DEPLOYMENT_ID" \ - --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ - -o json 2>/dev/null || echo "[]") - - if [[ "$OPS" != "[]" ]]; then - # Find subscription-scoped authorization/policy resources (role assignments, etc.) - # These live outside the RG and survive az group delete - SUB_RESOURCES=$(echo "$OPS" | jq -c '[ - .[] | select( - (.resourceType // "" | test("Microsoft.Authorization|Microsoft.Policy")) and - (.id // "" | test("/resourceGroups/") | not) - ) - ]') - - # Check nested deployments for RG-scoped role assignments too - NESTED_NAMES=$(echo "$OPS" | jq -r '[ - .[] | select(.resourceType == "Microsoft.Resources/deployments") - ] | .[].resourceName // empty') - - for NESTED_NAME in $NESTED_NAMES; do - NESTED_OPS=$(az deployment operation group list \ - --resource-group "$RG" --name "$NESTED_NAME" \ - --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ - -o json 2>/dev/null || echo "[]") - - # Role assignments scoped to resources within the RG - NESTED_AUTH=$(echo "$NESTED_OPS" | jq -c '[ + if [[ -z "$STACK_ID" ]]; then + OPS=$(az deployment operation sub list \ + --name "$DEPLOYMENT_ID" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo 
"[]") + + if [[ "$OPS" != "[]" ]]; then + SUB_RESOURCES=$(echo "$OPS" | jq -c '[ .[] | select( - (.resourceType // "" | test("Microsoft.Authorization")) + (.resourceType // "" | test("Microsoft.Authorization|Microsoft.Policy")) and + (.id // "" | test("/resourceGroups/") | not) ) ]') - SUB_RESOURCES=$(jq -n --argjson a "$SUB_RESOURCES" --argjson b "$NESTED_AUTH" '$a + $b') - done + NESTED_NAMES=$(echo "$OPS" | jq -r '[ + .[] | select(.resourceType == "Microsoft.Resources/deployments") + ] | .[].resourceName // empty') + + for NESTED_NAME in $NESTED_NAMES; do + NESTED_OPS=$(az deployment operation group list \ + --resource-group "$RG" --name "$NESTED_NAME" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo "[]") + + NESTED_AUTH=$(echo "$NESTED_OPS" | jq -c '[ + .[] | select( + (.resourceType // "" | test("Microsoft.Authorization")) + ) + ]') + + SUB_RESOURCES=$(jq -n --argjson a "$SUB_RESOURCES" --argjson b "$NESTED_AUTH" '$a + $b') + done + fi fi SUB_COUNT=$(echo "$SUB_RESOURCES" | jq 'length') - echo "sub_count=$SUB_COUNT" >> "$GITHUB_OUTPUT" echo "sub_resources<> "$GITHUB_OUTPUT" echo "$SUB_RESOURCES" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - echo "" - echo "=== Destroy Plan ===" - echo "Resource group: $RG ($RESOURCE_COUNT resources)" - echo "Subscription-scoped resources: $SUB_COUNT" if [[ "$SUB_COUNT" -gt 0 ]]; then + echo "Sub-scoped: $SUB_COUNT resource(s)" echo "$SUB_RESOURCES" | jq -r '.[] | " - \(.resourceType): \(.resourceName) (\(.id))"' fi echo "===================" - - name: Delete subscription-scoped resources + - name: Destroy via deployment stack + id: destroy_stack + if: steps.state.outputs.found == 'true' && steps.check.outputs.stack_exists == 'true' + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + echo "๐Ÿ—‘๏ธ Deleting deployment stack: $DEPLOYMENT_ID" + echo "This deletes the stack and ALL managed resources 
(deleteAll)..." + + START_TIME=$(date +%s) + + az stack sub delete \ + --name "$DEPLOYMENT_ID" \ + --action-on-unmanage deleteAll \ + --yes 2>&1 || { + echo "destroy_status=failed" >> "$GITHUB_OUTPUT" + echo "::error::Failed to delete deployment stack $DEPLOYMENT_ID" + exit 1 + } + + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + echo "destroy_status=succeeded" >> "$GITHUB_OUTPUT" + echo "destroy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" + echo "โœ… Deployment stack deleted in ${DURATION}s" + + - name: Delete subscription-scoped resources (fallback) id: destroy_sub - if: steps.check.outputs.exists == 'true' && steps.check.outputs.sub_count != '0' + if: | + steps.state.outputs.found == 'true' && + steps.check.outputs.stack_exists != 'true' && + steps.check.outputs.rg_exists == 'true' && + steps.check.outputs.sub_count != '0' run: | echo "๐Ÿ—‘๏ธ Deleting subscription-scoped resources first..." FAILED=0 @@ -259,9 +338,12 @@ jobs: echo "::warning::$FAILED subscription-scoped resource(s) failed to delete" fi - - name: Delete resource group - id: destroy - if: steps.check.outputs.exists == 'true' + - name: Delete resource group (fallback) + id: destroy_rg + if: | + steps.state.outputs.found == 'true' && + steps.check.outputs.stack_exists != 'true' && + steps.check.outputs.rg_exists == 'true' run: | RG="${{ steps.state.outputs.resource_group }}" echo "๐Ÿ—‘๏ธ Deleting resource group: $RG" @@ -281,6 +363,91 @@ jobs: echo "destroy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" echo "โœ… Resource group deleted in ${DURATION}s: $RG" + - name: Purge soft-deleted resources + id: purge + if: | + always() && + steps.state.outputs.found == 'true' && + steps.check.outputs.soft_count != '0' && + (steps.destroy_stack.outputs.destroy_status == 'succeeded' || steps.destroy_rg.outputs.destroy_status == 'succeeded') + run: | + echo "๐Ÿงน Checking for soft-deleted resources to purge..." 
+ SOFT_DELETABLE='${{ steps.check.outputs.soft_deletable }}' + PURGE_RESULTS="[]" + RETAINED_COUNT=0 + + for ROW in $(echo "$SOFT_DELETABLE" | jq -r '.[] | @base64'); do + DECODED=$(echo "$ROW" | base64 -d) + RES_TYPE=$(echo "$DECODED" | jq -r '.type') + RES_ID=$(echo "$DECODED" | jq -r '.id') + PURGE_PROTECTED=$(echo "$DECODED" | jq -r '.purgeProtected') + + # Extract resource name from ID + RES_NAME=$(echo "$RES_ID" | grep -oP '[^/]+$') + + case "$RES_TYPE" in + "Microsoft.KeyVault/vaults") + # Check if vault is in soft-deleted state + DELETED_VAULT=$(az keyvault list-deleted --query "[?name=='$RES_NAME']" -o json 2>/dev/null || echo "[]") + if [[ $(echo "$DELETED_VAULT" | jq 'length') -gt 0 ]]; then + if [[ "$PURGE_PROTECTED" == "true" ]]; then + echo " โš ๏ธ $RES_NAME: soft-deleted but purge-protected โ€” cannot purge" + RETAINED_COUNT=$((RETAINED_COUNT + 1)) + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. + [{"name": $name, "type": $type, "action": "retained-soft-deleted", "reason": "purge-protected"}]') + else + echo " ๐Ÿ—‘๏ธ Purging soft-deleted vault: $RES_NAME" + if az keyvault purge --name "$RES_NAME" 2>/dev/null; then + echo " โœ… Purged: $RES_NAME" + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. + [{"name": $name, "type": $type, "action": "purged"}]') + else + echo " โš ๏ธ Failed to purge: $RES_NAME" + RETAINED_COUNT=$((RETAINED_COUNT + 1)) + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. 
+ [{"name": $name, "type": $type, "action": "purge-failed"}]') + fi + fi + else + echo " โœ… $RES_NAME: not in soft-deleted state (already gone)" + fi + ;; + "Microsoft.CognitiveServices/accounts") + # Cognitive Services soft-delete purge + if [[ "$PURGE_PROTECTED" != "true" ]]; then + LOCATION=$(echo "$RES_ID" | grep -oP '(?<=locations/)[^/]+' || echo "") + if [[ -n "$LOCATION" ]]; then + az cognitiveservices account purge --name "$RES_NAME" --location "$LOCATION" \ + --resource-group "" 2>/dev/null || true + fi + fi + ;; + *) + echo " โ„น๏ธ $RES_TYPE: no purge implementation (soft-delete will expire naturally)" + ;; + esac + done + + echo "retained_count=$RETAINED_COUNT" >> "$GITHUB_OUTPUT" + echo "purge_results<> "$GITHUB_OUTPUT" + echo "$PURGE_RESULTS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + if [[ "$RETAINED_COUNT" -gt 0 ]]; then + echo "โš ๏ธ $RETAINED_COUNT resource(s) retained in soft-deleted state (purge-protected)" + fi + + - name: Clean deployment history + if: | + always() && + steps.state.outputs.found == 'true' && + (steps.destroy_stack.outputs.destroy_status == 'succeeded' || steps.destroy_rg.outputs.destroy_status == 'succeeded') + continue-on-error: true + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + echo "๐Ÿงน Cleaning subscription deployment history entry: $DEPLOYMENT_ID" + az deployment sub delete --name "$DEPLOYMENT_ID" 2>/dev/null || true + - name: Update deployment state if: always() && steps.state.outputs.found == 'true' run: | @@ -289,19 +456,40 @@ jobs: STATE_FILE="$DEPLOY_DIR/state.json" TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) - if [[ "${{ steps.check.outputs.exists }}" == "false" ]]; then + # Determine final status based on which destroy path ran + STACK_EXISTS="${{ steps.check.outputs.stack_exists }}" + RG_EXISTS="${{ steps.check.outputs.rg_exists }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + RETAINED_COUNT="${{ 
steps.purge.outputs.retained_count }}" + + if [[ "$STACK_EXISTS" != "true" && "$RG_EXISTS" != "true" ]]; then STATUS="already-destroyed" - elif [[ "${{ steps.destroy.outputs.destroy_status }}" == "succeeded" ]]; then - STATUS="destroyed" + elif [[ "$STACK_STATUS" == "succeeded" || "$RG_STATUS" == "succeeded" ]]; then + if [[ "${RETAINED_COUNT:-0}" -gt 0 ]]; then + STATUS="retained-soft-deleted" + else + STATUS="destroyed" + fi + elif [[ "$STACK_STATUS" == "failed" || "$RG_STATUS" == "failed" ]]; then + STATUS="partially-destroyed" else STATUS="destroy-failed" fi + # Determine duration from whichever path ran + DURATION="${{ steps.destroy_stack.outputs.destroy_duration }}" + if [[ -z "$DURATION" ]]; then + DURATION="${{ steps.destroy_rg.outputs.destroy_duration }}" + fi + # Update state file if [[ -f "$STATE_FILE" ]]; then jq --arg status "$STATUS" --arg ts "$TIMESTAMP" --arg actor "${{ github.actor }}" \ - --arg duration "${{ steps.destroy.outputs.destroy_duration }}" \ - '. + {status: $status, destroyedAt: $ts, destroyedBy: $actor, destroyDuration: $duration}' \ + --arg duration "$DURATION" \ + --arg purgeResults '${{ steps.purge.outputs.purge_results }}' \ + '. + {status: $status, destroyedAt: $ts, destroyedBy: $actor, destroyDuration: $duration} | + if ($purgeResults | length) > 0 then . + {purgeResults: ($purgeResults | fromjson? // [])} else . 
end' \ "$STATE_FILE" > "${STATE_FILE}.tmp" && mv "${STATE_FILE}.tmp" "$STATE_FILE" fi @@ -323,26 +511,48 @@ jobs: run: | DEPLOY_ID="${{ matrix.deployment_id }}" RG="${{ steps.state.outputs.resource_group }}" - STATUS="${{ steps.destroy.outputs.destroy_status }}" - DURATION="${{ steps.destroy.outputs.destroy_duration }}" + STACK_EXISTS="${{ steps.check.outputs.stack_exists }}" + RG_EXISTS="${{ steps.check.outputs.rg_exists }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + STACK_DURATION="${{ steps.destroy_stack.outputs.destroy_duration }}" + RG_DURATION="${{ steps.destroy_rg.outputs.destroy_duration }}" RESOURCE_COUNT="${{ steps.check.outputs.resource_count }}" SUB_COUNT="${{ steps.check.outputs.sub_count }}" - EXISTS="${{ steps.check.outputs.exists }}" + SOFT_COUNT="${{ steps.check.outputs.soft_count }}" + RETAINED_COUNT="${{ steps.purge.outputs.retained_count }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" echo "============================================" echo "Git-Ape Destroy Summary" echo "============================================" echo "Deployment: $DEPLOY_ID" + echo "Method: $DEPLOY_METHOD" echo "Resource Group: $RG" - if [[ "$EXISTS" == "false" ]]; then + + if [[ "$STACK_EXISTS" == "true" ]]; then + if [[ "$STACK_STATUS" == "succeeded" ]]; then + echo "Result: โœ… Stack destroyed ($RESOURCE_COUNT resources via deleteAll)" + echo "Duration: $STACK_DURATION" + else + echo "Result: โŒ Stack delete failed" + fi + elif [[ "$RG_EXISTS" != "true" && "$STACK_EXISTS" != "true" ]]; then echo "Result: Already destroyed" - elif [[ "$STATUS" == "succeeded" ]]; then + elif [[ "$RG_STATUS" == "succeeded" ]]; then echo "Result: โœ… Destroyed ($RESOURCE_COUNT RG resources + $SUB_COUNT subscription-scoped)" - echo "Duration: $DURATION" + echo "Duration: $RG_DURATION" else echo 
"Result: โŒ Failed" fi + + if [[ "${RETAINED_COUNT:-0}" -gt 0 ]]; then + echo "Soft-deleted: โš ๏ธ $RETAINED_COUNT resource(s) retained (purge-protected)" + elif [[ "${SOFT_COUNT:-0}" -gt 0 ]]; then + echo "Soft-deleted: โœ… All soft-deleted resources purged" + fi + echo "Run: $RUN_URL" echo "============================================" @@ -356,15 +566,17 @@ jobs: DEPLOY_ID="${{ matrix.deployment_id }}" RG="${{ steps.state.outputs.resource_group }}" - STATUS="${{ steps.destroy.outputs.destroy_status }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - if [[ "$STATUS" == "succeeded" ]]; then + if [[ "$STACK_STATUS" == "succeeded" || "$RG_STATUS" == "succeeded" ]]; then EMOJI="๐Ÿ—‘๏ธ" - MSG="Resource group *$RG* ($DEPLOY_ID) destroyed" + MSG="Deployment *$DEPLOY_ID* destroyed (method: $DEPLOY_METHOD)" else EMOJI="โŒ" - MSG="Destroy failed for *$RG* ($DEPLOY_ID)" + MSG="Destroy failed for *$DEPLOY_ID* (method: $DEPLOY_METHOD)" fi curl -sf -X POST "$SLACK_WEBHOOK_URL" \ diff --git a/website/docs/deployment/state.md b/website/docs/deployment/state.md index 5437a4f..7a24263 100644 --- a/website/docs/deployment/state.md +++ b/website/docs/deployment/state.md @@ -26,7 +26,7 @@ Each deployment directory contains: ## Deployment Lifecycle -A deployment moves through a defined set of states tracked in `metadata.json`. Valid `status` values are `initialized`, `gathering-requirements`, `generating-template`, `awaiting-confirmation`, `deploying`, `testing`, `succeeded`, `failed`, `rolled-back`, `destroy-requested`, and `destroyed`. Terminal states (`succeeded`, `failed`, `rolled-back`, `destroyed`) are persisted in git for audit. +A deployment moves through a defined set of states tracked in `metadata.json`. 
Valid `status` values are `initialized`, `gathering-requirements`, `generating-template`, `awaiting-confirmation`, `deploying`, `testing`, `succeeded`, `failed`, `rolled-back`, `destroy-requested`, `destroyed`, `already-destroyed`, `partially-destroyed`, and `retained-soft-deleted`. Terminal states (`succeeded`, `failed`, `rolled-back`, `destroyed`, `already-destroyed`, `partially-destroyed`, `retained-soft-deleted`) are persisted in git for audit. ```mermaid %%{init: {'theme':'base','themeVariables':{'fontSize':'13px','lineColor':'#64748b','textColor':'#1e293b','primaryTextColor':'#0f172a','edgeLabelBackground':'#f8fafc','tertiaryColor':'#f1f5f9'}}}%% @@ -51,14 +51,23 @@ stateDiagram-v2 failed --> rolledBack: rollback initiated succeeded --> destroyRequested: PR sets metadata destroyRequested --> destroyed: git-ape-destroy.yml + destroyRequested --> partiallyDestroyed: partial failure + destroyRequested --> retainedSoftDeleted: purge-protected resources remain succeeded --> [*] rolledBack --> [*] destroyed --> [*] + partiallyDestroyed --> [*] + retainedSoftDeleted --> [*] + + state "partially-destroyed" as partiallyDestroyed + state "retained-soft-deleted" as retainedSoftDeleted classDef terminal fill:#dcfce7,stroke:#15803d,color:#14532d classDef error fill:#fecaca,stroke:#b91c1c,color:#7f1d1d + classDef warning fill:#fef9c3,stroke:#a16207,color:#713f12 class succeeded,destroyed terminal class failed,rolledBack error + class partiallyDestroyed,retainedSoftDeleted warning ``` ## Directory Structure @@ -113,7 +122,9 @@ Contains deployment tracking information. "region": "eastus", "project": "api", "environment": "dev", + "deployMethod": "stack", "resourceGroup": "rg-api-dev-eastus", + "resourceGroups": ["rg-api-dev-eastus"], "resources": [ { "type": "Microsoft.Web/sites", @@ -127,6 +138,11 @@ Contains deployment tracking information. 
} ``` +**Fields:** +- `deployMethod` - Deployment method used: `stack` (Azure Deployment Stacks, default for new deployments) or `subscription` (legacy `az deployment sub create`) +- `resourceGroup` - Primary resource group name (kept for backward compatibility) +- `resourceGroups` - Array of all resource groups managed by this deployment (supports multi-RG templates) + **Status values:** - `initialized` - Deployment directory created - `gathering-requirements` - Collecting user input @@ -140,6 +156,83 @@ Contains deployment tracking information. - `destroyed` - Resources torn down - `already-destroyed` - Resources were already deleted - `destroy-requested` - Teardown has been requested +- `partially-destroyed` - Some resources deleted but others remain (e.g., locks blocking deletion, transient errors) +- `retained-soft-deleted` - Destroy completed but purge-protected resources remain soft-deleted until retention expires + +### state.json + +Contains runtime deployment state populated after `az deployment` or `az stack` completes. Used by the destroy workflow to determine teardown strategy. 
+ +**Example (Deployment Stacks):** + +```json +{ + "deploymentId": "deploy-20260218-143022", + "timestamp": "2026-02-18T14:30:22Z", + "status": "succeeded", + "duration": "210s", + "subscription": "00000000-0000-0000-0000-000000000000", + "location": "eastus", + "project": "api", + "environment": "dev", + "resourceGroup": "rg-api-dev-eastus", + "triggeredBy": "octocat", + "triggerEvent": "push", + "runId": "12345678", + "runUrl": "https://github.com/org/repo/actions/runs/12345678", + "stackId": "/subscriptions/00000000-.../providers/Microsoft.Resources/deploymentStacks/deploy-20260218-143022", + "deployMethod": "stack", + "managedResources": [ + { + "id": "/subscriptions/.../resourceGroups/rg-api-dev-eastus/providers/Microsoft.KeyVault/vaults/kv-api-dev-eus", + "type": "Microsoft.KeyVault/vaults", + "scope": "resourceGroup", + "apiVersion": "2024-04-01", + "softDeletable": true, + "purgeProtected": true + }, + { + "id": "/subscriptions/.../resourceGroups/rg-api-dev-eastus/providers/Microsoft.Storage/storageAccounts/stapidev8k3m", + "type": "Microsoft.Storage/storageAccounts", + "scope": "resourceGroup", + "apiVersion": "2023-05-01", + "softDeletable": false, + "purgeProtected": false + } + ], + "resourceGroups": ["rg-api-dev-eastus"], + "subscriptions": ["00000000-0000-0000-0000-000000000000"], + "externalReferences": [ + { + "kind": "privateEndpointConnection", + "targetResourceId": "/subscriptions/.../providers/Microsoft.Network/privateEndpoints/pe-kv-api" + } + ] +} +``` + +**Fields:** + +| Field | Type | Description | +|-------|------|-------------| +| `stackId` | `string \| null` | Azure Deployment Stack resource ID. When present, destroy uses `az stack sub delete` for complete cleanup. | +| `deployMethod` | `"stack" \| "subscription"` | Deployment method used. `stack` = Deployment Stacks (default); `subscription` = legacy `az deployment sub create`. 
| +| `managedResources` | `array` | Flat list of all resources managed by this deployment, regardless of scope. Populated by walking deployment operations recursively. | +| `managedResources[].id` | `string` | Full ARM resource ID. | +| `managedResources[].type` | `string` | ARM resource type (e.g., `Microsoft.KeyVault/vaults`). | +| `managedResources[].scope` | `string` | Scope level: `resourceGroup`, `subscription`, or `managementGroup`. | +| `managedResources[].apiVersion` | `string` | API version used for the resource. | +| `managedResources[].softDeletable` | `boolean` | Whether the resource type supports soft-delete (Key Vault, Cognitive Services, etc.). | +| `managedResources[].purgeProtected` | `boolean` | Whether the resource has purge protection enabled (cannot be permanently deleted until retention expires). | +| `resourceGroups` | `array` | All resource groups created/managed by this deployment. | +| `subscriptions` | `array` | All subscriptions involved in this deployment. | +| `externalReferences` | `array` | Cross-deployment references (private endpoint connections, VNet peerings, DNS records in shared zones). | + +**Destroy strategy selection:** + +1. If `stackId` is present → `az stack sub delete --name <deployment-id> --action-on-unmanage deleteAll` +2. If `stackId` is null → fall back to state-driven delete using `managedResources[]` and `resourceGroups[]` +3. 
If neither field is populated (legacy state) → fall back to a single `az group delete` on `resourceGroup` ### requirements.json diff --git a/website/docs/workflows/git-ape-deploy.md b/website/docs/workflows/git-ape-deploy.md index 6e23309..437c080 100644 --- a/website/docs/workflows/git-ape-deploy.md +++ b/website/docs/workflows/git-ape-deploy.md @@ -53,7 +53,7 @@ description: "GitHub Actions workflow: Git-Ape: Deploy" | **Runs On** | `ubuntu-latest` | | **Environment** | `azure-deploy` | | **Depends On** | `detect-deployments`, `check-comment-trigger` | -| **Steps** | 13 | +| **Steps** | 14 | @@ -305,18 +305,39 @@ jobs: echo "🚀 Starting deployment: ${{ matrix.deployment_id }}" START_TIME=$(date +%s) - DEPLOY_OUTPUT=$(az deployment sub create \ - --name "${{ matrix.deployment_id }}" \ - --location "${{ steps.params.outputs.location }}" \ - --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ - --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ - --output json 2>&1) + DEPLOY_DIR="${{ steps.params.outputs.deploy_dir }}" + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + LOCATION="${{ steps.params.outputs.location }}" + + # Determine deploy method: prefer deployment stacks (idempotent destroy) + # Fall back to az deployment sub create if stacks are unavailable + DEPLOY_METHOD="stack" + + if [[ "$DEPLOY_METHOD" == "stack" ]]; then + DEPLOY_OUTPUT=$(az stack sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --yes \ + --output json 2>&1) + else + DEPLOY_OUTPUT=$(az deployment sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --output json 2>&1) + fi EXIT_CODE=$? 
END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) echo "deploy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" + echo "deploy_method=$DEPLOY_METHOD" >> "$GITHUB_OUTPUT" if [[ $EXIT_CODE -ne 0 ]]; then echo "deploy_status=failed" >> "$GITHUB_OUTPUT" @@ -335,8 +356,18 @@ jobs: echo "deploy_status=succeeded" >> "$GITHUB_OUTPUT" - # Extract outputs - OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.properties.outputs // {}') + # Extract outputs depending on deploy method + if [[ "$DEPLOY_METHOD" == "stack" ]]; then + # For stacks, extract the stack ID + STACK_ID=$(echo "$DEPLOY_OUTPUT" | jq -r '.id // empty') + echo "stack_id=$STACK_ID" >> "$GITHUB_OUTPUT" + + # Extract outputs from the stack's deployment + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.outputs // {}') + else + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.properties.outputs // {}') + fi + echo "deploy_outputs<> "$GITHUB_OUTPUT" echo "$OUTPUTS" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" @@ -345,7 +376,99 @@ jobs: RG_NAME=$(echo "$OUTPUTS" | jq -r '.resourceGroupName.value // empty') echo "resource_group=$RG_NAME" >> "$GITHUB_OUTPUT" - echo "โœ… Deployment succeeded in ${DURATION}s" + echo "โœ… Deployment succeeded in ${DURATION}s (method: $DEPLOY_METHOD)" + + - name: Capture managed resources + id: capture + if: steps.deploy.outputs.deploy_status == 'succeeded' + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + RG_NAME="${{ steps.deploy.outputs.resource_group }}" + STACK_ID="${{ steps.deploy.outputs.stack_id }}" + + # Known soft-deletable resource types + SOFT_DELETABLE_TYPES="Microsoft.KeyVault/vaults Microsoft.CognitiveServices/accounts Microsoft.AppConfiguration/configurationStores Microsoft.ApiManagement/service Microsoft.MachineLearningServices/workspaces Microsoft.RecoveryServices/vaults" + + MANAGED_RESOURCES="[]" + RESOURCE_GROUPS="[]" + + if [[ "$DEPLOY_METHOD" == "stack" && -n "$STACK_ID" ]]; then + # Stacks natively track all 
managed resources + STACK_RESOURCES=$(az stack sub show \ + --name "$DEPLOYMENT_ID" \ + --query "resources[].id" \ + -o json 2>/dev/null || echo "[]") + + # Build managedResources array from stack resources + for RES_ID in $(echo "$STACK_RESOURCES" | jq -r '.[]' 2>/dev/null); do + RES_TYPE=$(echo "$RES_ID" | grep -oP 'providers/\K[^/]+/[^/]+' | tail -1) + RES_SCOPE="resourceGroup" + if echo "$RES_ID" | grep -q "/resourceGroups/"; then + RES_SCOPE="resourceGroup" + else + RES_SCOPE="subscription" + fi + + IS_SOFT_DELETABLE="false" + for SD_TYPE in $SOFT_DELETABLE_TYPES; do + if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then + IS_SOFT_DELETABLE="true" + break + fi + done + + MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + done + + # Extract resource groups from managed resources + RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') + else + # Fallback: walk deployment operations recursively + OPS=$(az deployment operation sub list \ + --name "$DEPLOYMENT_ID" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo "[]") + + for RES_ID in $(echo "$OPS" | jq -r '.[].id // empty' 2>/dev/null); do + RES_TYPE=$(echo "$OPS" | jq -r ".[] | select(.id == \"$RES_ID\") | .resourceType // empty") + RES_SCOPE="resourceGroup" + if echo "$RES_ID" | grep -q "/resourceGroups/"; then + RES_SCOPE="resourceGroup" + else + RES_SCOPE="subscription" + fi + + IS_SOFT_DELETABLE="false" + for SD_TYPE in $SOFT_DELETABLE_TYPES; do + if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then + IS_SOFT_DELETABLE="true" + break + fi + done + + MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ + --arg scope 
"$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + done + + # Collect resource groups + if [[ -n "$RG_NAME" ]]; then + RESOURCE_GROUPS="[\"$RG_NAME\"]" + fi + fi + + echo "managed_resources<> "$GITHUB_OUTPUT" + echo "$MANAGED_RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "resource_groups<> "$GITHUB_OUTPUT" + echo "$RESOURCE_GROUPS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + RESOURCE_COUNT=$(echo "$MANAGED_RESOURCES" | jq 'length') + echo "๐Ÿ“‹ Captured $RESOURCE_COUNT managed resources" - name: Run integration tests id: tests @@ -414,25 +537,60 @@ jobs: DEPLOY_DIR="${{ steps.params.outputs.deploy_dir }}" STATUS="${{ steps.deploy.outputs.deploy_status || 'failed' }}" TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + STACK_ID="${{ steps.deploy.outputs.stack_id }}" + MANAGED_RESOURCES='${{ steps.capture.outputs.managed_resources }}' + RESOURCE_GROUPS='${{ steps.capture.outputs.resource_groups }}' + + # Ensure managed resources and resource groups are valid JSON + if ! echo "$MANAGED_RESOURCES" | jq empty 2>/dev/null; then + MANAGED_RESOURCES="[]" + fi + if ! 
echo "$RESOURCE_GROUPS" | jq empty 2>/dev/null; then + RESOURCE_GROUPS="[]" + fi - # Create/update state.json - cat > "$DEPLOY_DIR/state.json" < "$DEPLOY_DIR/state.json" - name: Commit deployment state if: always() @@ -441,9 +599,13 @@ jobs: STATUS="${{ steps.deploy.outputs.deploy_status }}" STATUS=${STATUS:-failed} - # Update metadata.json status from pending to actual result + # Update metadata.json status from pending to actual result, add deployMethod and resourceGroups if [[ -f "$DEPLOY_DIR/metadata.json" ]]; then - jq --arg status "$STATUS" '.status = $status' \ + DEPLOY_METHOD="${{ steps.deploy.outputs.deploy_method }}" + DEPLOY_METHOD=${DEPLOY_METHOD:-subscription} + RG_NAME="${{ steps.deploy.outputs.resource_group }}" + jq --arg status "$STATUS" --arg method "$DEPLOY_METHOD" --arg rg "$RG_NAME" \ + '.status = $status | .deployMethod = $method | .resourceGroups = (if $rg == "" then [] else [$rg] end)' \ "$DEPLOY_DIR/metadata.json" > "$DEPLOY_DIR/metadata.json.tmp" \ && mv "$DEPLOY_DIR/metadata.json.tmp" "$DEPLOY_DIR/metadata.json" fi diff --git a/website/docs/workflows/git-ape-destroy.md b/website/docs/workflows/git-ape-destroy.md index ee249dd..e821172 100644 --- a/website/docs/workflows/git-ape-destroy.md +++ b/website/docs/workflows/git-ape-destroy.md @@ -42,7 +42,7 @@ description: "GitHub Actions workflow: Git-Ape: Destroy" | **Runs On** | `ubuntu-latest` | | **Environment** | `azure-destroy` | | **Depends On** | `detect-destroys` | -| **Steps** | 9 | +| **Steps** | 12 | @@ -186,16 +186,34 @@ jobs: fi RG_NAME=$(jq -r '.resourceGroup // empty' "$STATE_FILE") - - if [[ -z "$RG_NAME" ]]; then - echo "::error::No resource group found in state file" + STACK_ID=$(jq -r '.stackId // empty' "$STATE_FILE") + DEPLOY_METHOD=$(jq -r '.deployMethod // "subscription"' "$STATE_FILE") + MANAGED_RESOURCES=$(jq -c '.managedResources // []' "$STATE_FILE") + RESOURCE_GROUPS=$(jq -c '.resourceGroups // []' "$STATE_FILE") + + # Fallback: if no stackId and no resourceGroup, 
cannot proceed + if [[ -z "$STACK_ID" && -z "$RG_NAME" ]]; then + echo "::error::No stack ID or resource group found in state file" echo "found=false" >> "$GITHUB_OUTPUT" exit 1 fi echo "found=true" >> "$GITHUB_OUTPUT" echo "resource_group=$RG_NAME" >> "$GITHUB_OUTPUT" - echo "Will destroy resource group: $RG_NAME" + echo "stack_id=$STACK_ID" >> "$GITHUB_OUTPUT" + echo "deploy_method=$DEPLOY_METHOD" >> "$GITHUB_OUTPUT" + echo "managed_resources<> "$GITHUB_OUTPUT" + echo "$MANAGED_RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "resource_groups<> "$GITHUB_OUTPUT" + echo "$RESOURCE_GROUPS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + if [[ -n "$STACK_ID" ]]; then + echo "Will destroy via deployment stack: $STACK_ID" + else + echo "Will destroy resource group: $RG_NAME (fallback method)" + fi - name: Azure Login (OIDC) if: steps.state.outputs.found == 'true' @@ -211,92 +229,153 @@ jobs: run: | RG="${{ steps.state.outputs.resource_group }}" DEPLOYMENT_ID="${{ matrix.deployment_id }}" + STACK_ID="${{ steps.state.outputs.stack_id }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" - # Check if resource group exists - EXISTS=$(az group exists --name "$RG") - echo "exists=$EXISTS" >> "$GITHUB_OUTPUT" - - if [[ "$EXISTS" != "true" ]]; then - echo "Resource group $RG does not exist (already deleted?)" - echo "resource_count=0" >> "$GITHUB_OUTPUT" - echo "sub_count=0" >> "$GITHUB_OUTPUT" - exit 0 + echo "=== Destroy Plan ===" + echo "Deployment: $DEPLOYMENT_ID" + echo "Method: $DEPLOY_METHOD" + + if [[ -n "$STACK_ID" ]]; then + # Check if stack still exists + STACK_EXISTS=$(az stack sub show --name "$DEPLOYMENT_ID" --query "id" -o tsv 2>/dev/null || echo "") + if [[ -n "$STACK_EXISTS" ]]; then + echo "stack_exists=true" >> "$GITHUB_OUTPUT" + echo "Stack: $STACK_ID (exists)" + + # List resources in the stack + STACK_RESOURCES=$(az stack sub show --name "$DEPLOYMENT_ID" --query "resources[].id" -o json 2>/dev/null || echo "[]") + 
RESOURCE_COUNT=$(echo "$STACK_RESOURCES" | jq 'length') + echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" + echo "Resources: $RESOURCE_COUNT managed by stack" + else + echo "stack_exists=false" >> "$GITHUB_OUTPUT" + echo "Stack not found โ€” will use fallback" + echo "resource_count=0" >> "$GITHUB_OUTPUT" + fi + else + echo "stack_exists=false" >> "$GITHUB_OUTPUT" fi - # Inventory RG resources - RESOURCES=$(az resource list --resource-group "$RG" \ - --query "[].{name:name, type:type, id:id, provisioningState:provisioningState}" \ - --output json 2>/dev/null || echo "[]") - RESOURCE_COUNT=$(echo "$RESOURCES" | jq 'length') + # Check resource group existence (for fallback or soft-delete sweep) + if [[ -n "$RG" ]]; then + EXISTS=$(az group exists --name "$RG") + echo "rg_exists=$EXISTS" >> "$GITHUB_OUTPUT" + echo "RG: $RG (exists=$EXISTS)" + + if [[ "$EXISTS" == "true" ]]; then + RESOURCES=$(az resource list --resource-group "$RG" \ + --query "[].{name:name, type:type, id:id, provisioningState:provisioningState}" \ + --output json 2>/dev/null || echo "[]") + RESOURCE_COUNT=$(echo "$RESOURCES" | jq 'length') + # Only set resource_count if stack_exists is false (avoid overwrite) + if [[ "$STACK_ID" == "" ]]; then + echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" + fi + echo "resources<> "$GITHUB_OUTPUT" + echo "$RESOURCES" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + echo "$RESOURCES" | jq -r '.[] | " - \(.type)/\(.name) (\(.provisioningState))"' + fi + else + echo "rg_exists=false" >> "$GITHUB_OUTPUT" + fi - echo "resource_count=$RESOURCE_COUNT" >> "$GITHUB_OUTPUT" - echo "resources<> "$GITHUB_OUTPUT" - echo "$RESOURCES" >> "$GITHUB_OUTPUT" + # Identify soft-deletable resources from state + MANAGED_RESOURCES='${{ steps.state.outputs.managed_resources }}' + SOFT_DELETABLE=$(echo "$MANAGED_RESOURCES" | jq -c '[.[] | select(.softDeletable == true)]' 2>/dev/null || echo "[]") + SOFT_COUNT=$(echo "$SOFT_DELETABLE" | jq 'length') + echo 
"soft_deletable<> "$GITHUB_OUTPUT" + echo "$SOFT_DELETABLE" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" + echo "soft_count=$SOFT_COUNT" >> "$GITHUB_OUTPUT" - echo "Resource group $RG has $RESOURCE_COUNT resources" - echo "$RESOURCES" | jq -r '.[] | " - \(.type)/\(.name) (\(.provisioningState))"' + if [[ "$SOFT_COUNT" -gt 0 ]]; then + echo "Soft-deletable: $SOFT_COUNT resource(s) โ€” will attempt purge after deletion" + echo "$SOFT_DELETABLE" | jq -r '.[] | " - \(.type): \(.id)"' + fi - # Query deployment operations to find subscription-scoped resources - # These are NOT deleted by az group delete (e.g. role assignments, policy assignments) + # Query subscription-scoped resources (for fallback only) SUB_RESOURCES="[]" - - OPS=$(az deployment operation sub list \ - --name "$DEPLOYMENT_ID" \ - --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ - -o json 2>/dev/null || echo "[]") - - if [[ "$OPS" != "[]" ]]; then - # Find subscription-scoped authorization/policy resources (role assignments, etc.) 
- # These live outside the RG and survive az group delete - SUB_RESOURCES=$(echo "$OPS" | jq -c '[ - .[] | select( - (.resourceType // "" | test("Microsoft.Authorization|Microsoft.Policy")) and - (.id // "" | test("/resourceGroups/") | not) - ) - ]') - - # Check nested deployments for RG-scoped role assignments too - NESTED_NAMES=$(echo "$OPS" | jq -r '[ - .[] | select(.resourceType == "Microsoft.Resources/deployments") - ] | .[].resourceName // empty') - - for NESTED_NAME in $NESTED_NAMES; do - NESTED_OPS=$(az deployment operation group list \ - --resource-group "$RG" --name "$NESTED_NAME" \ - --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ - -o json 2>/dev/null || echo "[]") - - # Role assignments scoped to resources within the RG - NESTED_AUTH=$(echo "$NESTED_OPS" | jq -c '[ + if [[ -z "$STACK_ID" ]]; then + OPS=$(az deployment operation sub list \ + --name "$DEPLOYMENT_ID" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo "[]") + + if [[ "$OPS" != "[]" ]]; then + SUB_RESOURCES=$(echo "$OPS" | jq -c '[ .[] | select( - (.resourceType // "" | test("Microsoft.Authorization")) + (.resourceType // "" | test("Microsoft.Authorization|Microsoft.Policy")) and + (.id // "" | test("/resourceGroups/") | not) ) ]') - SUB_RESOURCES=$(jq -n --argjson a "$SUB_RESOURCES" --argjson b "$NESTED_AUTH" '$a + $b') - done + NESTED_NAMES=$(echo "$OPS" | jq -r '[ + .[] | select(.resourceType == "Microsoft.Resources/deployments") + ] | .[].resourceName // empty') + + for NESTED_NAME in $NESTED_NAMES; do + NESTED_OPS=$(az deployment operation group list \ + --resource-group "$RG" --name "$NESTED_NAME" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource" \ + -o json 2>/dev/null || echo "[]") + + NESTED_AUTH=$(echo "$NESTED_OPS" | jq -c '[ + 
.[] | select( + (.resourceType // "" | test("Microsoft.Authorization")) + ) + ]') + + SUB_RESOURCES=$(jq -n --argjson a "$SUB_RESOURCES" --argjson b "$NESTED_AUTH" '$a + $b') + done + fi fi SUB_COUNT=$(echo "$SUB_RESOURCES" | jq 'length') - echo "sub_count=$SUB_COUNT" >> "$GITHUB_OUTPUT" echo "sub_resources<> "$GITHUB_OUTPUT" echo "$SUB_RESOURCES" >> "$GITHUB_OUTPUT" echo "EOF" >> "$GITHUB_OUTPUT" - echo "" - echo "=== Destroy Plan ===" - echo "Resource group: $RG ($RESOURCE_COUNT resources)" - echo "Subscription-scoped resources: $SUB_COUNT" if [[ "$SUB_COUNT" -gt 0 ]]; then + echo "Sub-scoped: $SUB_COUNT resource(s)" echo "$SUB_RESOURCES" | jq -r '.[] | " - \(.resourceType): \(.resourceName) (\(.id))"' fi echo "===================" - - name: Delete subscription-scoped resources + - name: Destroy via deployment stack + id: destroy_stack + if: steps.state.outputs.found == 'true' && steps.check.outputs.stack_exists == 'true' + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + echo "๐Ÿ—‘๏ธ Deleting deployment stack: $DEPLOYMENT_ID" + echo "This deletes the stack and ALL managed resources (deleteAll)..." 
+ + START_TIME=$(date +%s) + + az stack sub delete \ + --name "$DEPLOYMENT_ID" \ + --action-on-unmanage deleteAll \ + --yes 2>&1 || { + echo "destroy_status=failed" >> "$GITHUB_OUTPUT" + echo "::error::Failed to delete deployment stack $DEPLOYMENT_ID" + exit 1 + } + + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + echo "destroy_status=succeeded" >> "$GITHUB_OUTPUT" + echo "destroy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" + echo "โœ… Deployment stack deleted in ${DURATION}s" + + - name: Delete subscription-scoped resources (fallback) id: destroy_sub - if: steps.check.outputs.exists == 'true' && steps.check.outputs.sub_count != '0' + if: | + steps.state.outputs.found == 'true' && + steps.check.outputs.stack_exists != 'true' && + steps.check.outputs.rg_exists == 'true' && + steps.check.outputs.sub_count != '0' run: | echo "๐Ÿ—‘๏ธ Deleting subscription-scoped resources first..." FAILED=0 @@ -313,9 +392,12 @@ jobs: echo "::warning::$FAILED subscription-scoped resource(s) failed to delete" fi - - name: Delete resource group - id: destroy - if: steps.check.outputs.exists == 'true' + - name: Delete resource group (fallback) + id: destroy_rg + if: | + steps.state.outputs.found == 'true' && + steps.check.outputs.stack_exists != 'true' && + steps.check.outputs.rg_exists == 'true' run: | RG="${{ steps.state.outputs.resource_group }}" echo "๐Ÿ—‘๏ธ Deleting resource group: $RG" @@ -335,6 +417,91 @@ jobs: echo "destroy_duration=${DURATION}s" >> "$GITHUB_OUTPUT" echo "โœ… Resource group deleted in ${DURATION}s: $RG" + - name: Purge soft-deleted resources + id: purge + if: | + always() && + steps.state.outputs.found == 'true' && + steps.check.outputs.soft_count != '0' && + (steps.destroy_stack.outputs.destroy_status == 'succeeded' || steps.destroy_rg.outputs.destroy_status == 'succeeded') + run: | + echo "๐Ÿงน Checking for soft-deleted resources to purge..." 
+ SOFT_DELETABLE='${{ steps.check.outputs.soft_deletable }}' + PURGE_RESULTS="[]" + RETAINED_COUNT=0 + + for ROW in $(echo "$SOFT_DELETABLE" | jq -r '.[] | @base64'); do + DECODED=$(echo "$ROW" | base64 -d) + RES_TYPE=$(echo "$DECODED" | jq -r '.type') + RES_ID=$(echo "$DECODED" | jq -r '.id') + PURGE_PROTECTED=$(echo "$DECODED" | jq -r '.purgeProtected') + + # Extract resource name from ID + RES_NAME=$(echo "$RES_ID" | grep -oP '[^/]+$') + + case "$RES_TYPE" in + "Microsoft.KeyVault/vaults") + # Check if vault is in soft-deleted state + DELETED_VAULT=$(az keyvault list-deleted --query "[?name=='$RES_NAME']" -o json 2>/dev/null || echo "[]") + if [[ $(echo "$DELETED_VAULT" | jq 'length') -gt 0 ]]; then + if [[ "$PURGE_PROTECTED" == "true" ]]; then + echo " โš ๏ธ $RES_NAME: soft-deleted but purge-protected โ€” cannot purge" + RETAINED_COUNT=$((RETAINED_COUNT + 1)) + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. + [{"name": $name, "type": $type, "action": "retained-soft-deleted", "reason": "purge-protected"}]') + else + echo " ๐Ÿ—‘๏ธ Purging soft-deleted vault: $RES_NAME" + if az keyvault purge --name "$RES_NAME" 2>/dev/null; then + echo " โœ… Purged: $RES_NAME" + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. + [{"name": $name, "type": $type, "action": "purged"}]') + else + echo " โš ๏ธ Failed to purge: $RES_NAME" + RETAINED_COUNT=$((RETAINED_COUNT + 1)) + PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg name "$RES_NAME" --arg type "$RES_TYPE" \ + '. 
+ [{"name": $name, "type": $type, "action": "purge-failed"}]') + fi + fi + else + echo " โœ… $RES_NAME: not in soft-deleted state (already gone)" + fi + ;; + "Microsoft.CognitiveServices/accounts") + # Cognitive Services soft-delete purge + if [[ "$PURGE_PROTECTED" != "true" ]]; then + LOCATION=$(echo "$RES_ID" | grep -oP '(?<=locations/)[^/]+' || echo "") + if [[ -n "$LOCATION" ]]; then + az cognitiveservices account purge --name "$RES_NAME" --location "$LOCATION" \ + --resource-group "" 2>/dev/null || true + fi + fi + ;; + *) + echo " โ„น๏ธ $RES_TYPE: no purge implementation (soft-delete will expire naturally)" + ;; + esac + done + + echo "retained_count=$RETAINED_COUNT" >> "$GITHUB_OUTPUT" + echo "purge_results<> "$GITHUB_OUTPUT" + echo "$PURGE_RESULTS" >> "$GITHUB_OUTPUT" + echo "EOF" >> "$GITHUB_OUTPUT" + + if [[ "$RETAINED_COUNT" -gt 0 ]]; then + echo "โš ๏ธ $RETAINED_COUNT resource(s) retained in soft-deleted state (purge-protected)" + fi + + - name: Clean deployment history + if: | + always() && + steps.state.outputs.found == 'true' && + (steps.destroy_stack.outputs.destroy_status == 'succeeded' || steps.destroy_rg.outputs.destroy_status == 'succeeded') + continue-on-error: true + run: | + DEPLOYMENT_ID="${{ matrix.deployment_id }}" + echo "๐Ÿงน Cleaning subscription deployment history entry: $DEPLOYMENT_ID" + az deployment sub delete --name "$DEPLOYMENT_ID" 2>/dev/null || true + - name: Update deployment state if: always() && steps.state.outputs.found == 'true' run: | @@ -343,19 +510,40 @@ jobs: STATE_FILE="$DEPLOY_DIR/state.json" TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) - if [[ "${{ steps.check.outputs.exists }}" == "false" ]]; then + # Determine final status based on which destroy path ran + STACK_EXISTS="${{ steps.check.outputs.stack_exists }}" + RG_EXISTS="${{ steps.check.outputs.rg_exists }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + RETAINED_COUNT="${{ 
steps.purge.outputs.retained_count }}" + + if [[ "$STACK_EXISTS" != "true" && "$RG_EXISTS" != "true" ]]; then STATUS="already-destroyed" - elif [[ "${{ steps.destroy.outputs.destroy_status }}" == "succeeded" ]]; then - STATUS="destroyed" + elif [[ "$STACK_STATUS" == "succeeded" || "$RG_STATUS" == "succeeded" ]]; then + if [[ "${RETAINED_COUNT:-0}" -gt 0 ]]; then + STATUS="retained-soft-deleted" + else + STATUS="destroyed" + fi + elif [[ "$STACK_STATUS" == "failed" || "$RG_STATUS" == "failed" ]]; then + STATUS="partially-destroyed" else STATUS="destroy-failed" fi + # Determine duration from whichever path ran + DURATION="${{ steps.destroy_stack.outputs.destroy_duration }}" + if [[ -z "$DURATION" ]]; then + DURATION="${{ steps.destroy_rg.outputs.destroy_duration }}" + fi + # Update state file if [[ -f "$STATE_FILE" ]]; then jq --arg status "$STATUS" --arg ts "$TIMESTAMP" --arg actor "${{ github.actor }}" \ - --arg duration "${{ steps.destroy.outputs.destroy_duration }}" \ - '. + {status: $status, destroyedAt: $ts, destroyedBy: $actor, destroyDuration: $duration}' \ + --arg duration "$DURATION" \ + --arg purgeResults '${{ steps.purge.outputs.purge_results }}' \ + '. + {status: $status, destroyedAt: $ts, destroyedBy: $actor, destroyDuration: $duration} | + if ($purgeResults | length) > 0 then . + {purgeResults: ($purgeResults | fromjson? // [])} else . 
end' \ "$STATE_FILE" > "${STATE_FILE}.tmp" && mv "${STATE_FILE}.tmp" "$STATE_FILE" fi @@ -377,26 +565,48 @@ jobs: run: | DEPLOY_ID="${{ matrix.deployment_id }}" RG="${{ steps.state.outputs.resource_group }}" - STATUS="${{ steps.destroy.outputs.destroy_status }}" - DURATION="${{ steps.destroy.outputs.destroy_duration }}" + STACK_EXISTS="${{ steps.check.outputs.stack_exists }}" + RG_EXISTS="${{ steps.check.outputs.rg_exists }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + STACK_DURATION="${{ steps.destroy_stack.outputs.destroy_duration }}" + RG_DURATION="${{ steps.destroy_rg.outputs.destroy_duration }}" RESOURCE_COUNT="${{ steps.check.outputs.resource_count }}" SUB_COUNT="${{ steps.check.outputs.sub_count }}" - EXISTS="${{ steps.check.outputs.exists }}" + SOFT_COUNT="${{ steps.check.outputs.soft_count }}" + RETAINED_COUNT="${{ steps.purge.outputs.retained_count }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" echo "============================================" echo "Git-Ape Destroy Summary" echo "============================================" echo "Deployment: $DEPLOY_ID" + echo "Method: $DEPLOY_METHOD" echo "Resource Group: $RG" - if [[ "$EXISTS" == "false" ]]; then + + if [[ "$STACK_EXISTS" == "true" ]]; then + if [[ "$STACK_STATUS" == "succeeded" ]]; then + echo "Result: โœ… Stack destroyed ($RESOURCE_COUNT resources via deleteAll)" + echo "Duration: $STACK_DURATION" + else + echo "Result: โŒ Stack delete failed" + fi + elif [[ "$RG_EXISTS" != "true" && "$STACK_EXISTS" != "true" ]]; then echo "Result: Already destroyed" - elif [[ "$STATUS" == "succeeded" ]]; then + elif [[ "$RG_STATUS" == "succeeded" ]]; then echo "Result: โœ… Destroyed ($RESOURCE_COUNT RG resources + $SUB_COUNT subscription-scoped)" - echo "Duration: $DURATION" + echo "Duration: $RG_DURATION" else echo 
"Result: โŒ Failed" fi + + if [[ "${RETAINED_COUNT:-0}" -gt 0 ]]; then + echo "Soft-deleted: โš ๏ธ $RETAINED_COUNT resource(s) retained (purge-protected)" + elif [[ "${SOFT_COUNT:-0}" -gt 0 ]]; then + echo "Soft-deleted: โœ… All soft-deleted resources purged" + fi + echo "Run: $RUN_URL" echo "============================================" @@ -410,15 +620,17 @@ jobs: DEPLOY_ID="${{ matrix.deployment_id }}" RG="${{ steps.state.outputs.resource_group }}" - STATUS="${{ steps.destroy.outputs.destroy_status }}" + STACK_STATUS="${{ steps.destroy_stack.outputs.destroy_status }}" + RG_STATUS="${{ steps.destroy_rg.outputs.destroy_status }}" + DEPLOY_METHOD="${{ steps.state.outputs.deploy_method }}" RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - if [[ "$STATUS" == "succeeded" ]]; then + if [[ "$STACK_STATUS" == "succeeded" || "$RG_STATUS" == "succeeded" ]]; then EMOJI="๐Ÿ—‘๏ธ" - MSG="Resource group *$RG* ($DEPLOY_ID) destroyed" + MSG="Deployment *$DEPLOY_ID* destroyed (method: $DEPLOY_METHOD)" else EMOJI="โŒ" - MSG="Destroy failed for *$RG* ($DEPLOY_ID)" + MSG="Destroy failed for *$DEPLOY_ID* (method: $DEPLOY_METHOD)" fi curl -sf -X POST "$SLACK_WEBHOOK_URL" \ From 69113e4122547400996bfa1d9d33798682320f73 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Wed, 6 May 2026 14:18:14 +0700 Subject: [PATCH 03/18] feat: add azure-stack-* skills with fast async destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - introduce azure-stack-deploy and azure-stack-destroy skills (bash + pwsh) - destroy: fast async mode (default) polls resource groups, --wait for sync - align workflows + agents + docs with new skills - bump plugin to 0.1.0 ๐Ÿš€ - Generated by Copilot --- .../agents/azure-resource-deployer.agent.md | 140 +++++-- .../agents/azure-template-generator.agent.md | 44 ++- .github/agents/git-ape.agent.md | 16 +- .github/copilot-instructions.md | 28 +- 
.github/scripts/deployment-manager.sh | 29 +- .github/skills/azure-stack-deploy/SKILL.md | 143 +++++++ .../scripts/deploy-stack.ps1 | 314 ++++++++++++++++ .../scripts/deploy-stack.sh | 282 ++++++++++++++ .github/skills/azure-stack-destroy/SKILL.md | 153 ++++++++ .../scripts/destroy-stack.ps1 | 348 +++++++++++++++++ .../scripts/destroy-stack.sh | 352 ++++++++++++++++++ .github/workflows/git-ape-deploy.exampleyml | 37 +- .github/workflows/git-ape-destroy.exampleyml | 1 + plugin.json | 2 +- .../docs/agents/azure-resource-deployer.md | 140 +++++-- .../docs/agents/azure-template-generator.md | 44 ++- website/docs/agents/git-ape.md | 16 +- website/docs/deployment/state.md | 4 +- website/docs/skills/azure-stack-deploy.md | 161 ++++++++ website/docs/skills/azure-stack-destroy.md | 149 ++++++++ website/docs/skills/overview.md | 7 + website/docs/workflows/git-ape-deploy.md | 26 +- website/docs/workflows/git-ape-destroy.md | 1 + 23 files changed, 2333 insertions(+), 104 deletions(-) create mode 100644 .github/skills/azure-stack-deploy/SKILL.md create mode 100644 .github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 create mode 100755 .github/skills/azure-stack-deploy/scripts/deploy-stack.sh create mode 100644 .github/skills/azure-stack-destroy/SKILL.md create mode 100644 .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 create mode 100755 .github/skills/azure-stack-destroy/scripts/destroy-stack.sh create mode 100644 website/docs/skills/azure-stack-deploy.md create mode 100644 website/docs/skills/azure-stack-destroy.md diff --git a/.github/agents/azure-resource-deployer.agent.md b/.github/agents/azure-resource-deployer.agent.md index dadde28..ecfa2ca 100644 --- a/.github/agents/azure-resource-deployer.agent.md +++ b/.github/agents/azure-resource-deployer.agent.md @@ -96,33 +96,47 @@ Before deploying, verify: ### 2. 
Execute Deployment -Use Azure MCP `deploy` service or Azure CLI: +**Always deploy as a subscription-scoped Deployment Stack.** Stacks track every managed resource (across resource groups and subscription scope) and make destroy idempotent โ€” a single `az stack sub delete --action-on-unmanage deleteAll` removes everything the stack owns, regardless of resource scope. -**Option A: Azure MCP (Preferred)** -``` -Use mcp_azure_mcp_search with "deploy" intent to execute template deployment -- Set deployment name: "git-ape-{timestamp}" -- Set mode: "Incremental" (default) or "Complete" (if user specified) -- Monitor deployment with progress updates -``` +> **Single source of truth:** the deploy command, fallback handling, state.json writer, soft-delete classification, and Key Vault purge-protection detection all live in the [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) skill. Both bash and PowerShell implementations are provided. -**Option B: Azure CLI (Fallback)** +**Pre-flight: validate the stack before deploying** -**Always use subscription-level deployment** โ€” the ARM template includes resource group creation, so we deploy at subscription scope: +Use `az stack sub validate` (not `az deployment sub validate`) so the validation also checks the stack-specific flags (`--action-on-unmanage`, `--deny-settings-mode`) โ€” not just the template: ```bash -# Subscription-level deployment (creates RG + all resources atomically) -az deployment sub create \ +az stack sub validate \ --name "{deployment-id}" \ --location {location} \ --template-file {template.json} \ --parameters @{parameters.json} \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ --output json ``` -**DO NOT use `az deployment group create`** โ€” our templates always include the resource group as a resource. Subscription-level deployment handles everything in one command. +**Invoke the deploy skill** -Capture the deployment operation ID for tracking. 
+```bash +# Bash +.github/skills/azure-stack-deploy/scripts/deploy-stack.sh \ + --deployment-id "{deployment-id}" + +# PowerShell +.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 ` + -DeploymentId "{deployment-id}" +``` + +The skill: +- Calls `az stack sub create --action-on-unmanage deleteAll --deny-settings-mode none --description "Git-Ape deployment {id}" --tags managedBy=git-ape deploymentId={id} --yes --verbose` +- Falls back to `az deployment sub create` only if the stack call fails (warns the user — fallback path does NOT solve soft-delete / multi-RG / sub-scope idempotency) +- On any failure, dumps the per-operation failure list inline so the root cause is immediately visible +- On success, captures the `stackId`, classifies every managed resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` (schemaVersion 1.0) +- Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` + +Pass `--no-fallback` (bash) / `-NoFallback` (pwsh) when the user explicitly wants to fail loudly instead of accepting the legacy path. + +**DO NOT use `az deployment group create`** — our templates always include the resource group as a resource. Subscription scope handles everything in one command. ### 3. 
Monitor Progress @@ -150,15 +164,27 @@ Status updates: **Monitoring Commands:** ```bash -# Check deployment status (subscription-level) +# Stack path โ€” check stack provisioning state +az stack sub show \ + --name {deployment-id} \ + --query "provisioningState" \ + --output tsv + +# Stack path โ€” list managed resources (post-deploy or in-progress) +az stack sub show \ + --name {deployment-id} \ + --query "resources[].{Id:id, Status:status}" \ + --output table + +# Fallback path โ€” subscription deployment az deployment sub show \ - --name {deployment-name} \ + --name {deployment-id} \ --query "properties.provisioningState" \ --output tsv -# Get deployment operations (detailed resource status) +# Fallback path โ€” deployment operations (detailed resource status) az deployment operation sub list \ - --name {deployment-name} \ + --name {deployment-id} \ --query "[].{Resource:properties.targetResource.resourceName, Type:properties.targetResource.resourceType, Status:properties.provisioningState}" \ --output table ``` @@ -194,13 +220,18 @@ Use mcp_azure_mcp_search to query deployed resources and verify: ### 5. Capture Deployment Outputs -Extract and report deployment outputs (defined in ARM template `outputs` section): +Extract and report deployment outputs: ```bash -# Get deployment outputs -az deployment group show \ - --name {deployment-name} \ - --resource-group {rg-name} \ +# Stack path โ€” outputs are on the stack itself +az stack sub show \ + --name {deployment-id} \ + --query "outputs" \ + --output json + +# Fallback path โ€” subscription deployment outputs +az deployment sub show \ + --name {deployment-id} \ --query "properties.outputs" \ --output json ``` @@ -212,7 +243,25 @@ Common outputs to capture: - Managed identity principal IDs - Dashboard/monitoring URLs -### 6. Report Deployment Results +### 6. 
Verify `state.json` was written + +The [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) skill writes `state.json` (schemaVersion 1.0) and updates `metadata.json` with `deployMethod` and `resourceGroups[]` as part of step 2. The agent's job here is to confirm the write succeeded and surface its contents for the user. + +```bash +DEPLOYMENT_ID="{deployment-id}" +DEPLOY_DIR=".azure/deployments/$DEPLOYMENT_ID" +[[ -f "$DEPLOY_DIR/state.json" ]] || { echo "state.json missing โ€” deploy skill did not complete"; exit 1; } + +# Sanity-check the schema and the lifecycle owner +jq '{schemaVersion, deploymentId, deployMethod, stackId, resourceGroups, managedResourceCount: (.managedResources | length)}' \ + "$DEPLOY_DIR/state.json" +``` + +If `deployMethod == "stack"` and `stackId` is empty, the deploy fell back silently โ€” re-run the skill with `--no-fallback` to surface why stacks were rejected. + +The destroy skill ([`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md)) consumes this file as its sole source of truth. + +### 7. Report Deployment Results Provide a comprehensive summary: @@ -245,7 +294,9 @@ Provide a comprehensive summary: To destroy this deployment and delete all its resources: > `@git-ape destroy deployment {deployment-id}` > -> Or via GitHub: create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval +> Locally this invokes the [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skill, which uses `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` (single command, idempotent across resource groups and subscription scope) and purges any soft-deletable resources that are not purge-protected. +> +> Or via GitHub: create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval. 
**Deployment Logs:** {Link to deployment logs if available} ``` @@ -254,7 +305,17 @@ To destroy this deployment and delete all its resources: ### Deployment Failure -If deployment fails, provide detailed diagnostics: +If deployment fails, **always dump the underlying failed operations before presenting options to the user**. The stack/deployment top-level error is usually just a summary; the real root cause is in the per-resource operations list. + +```bash +# Inline failure diagnostics โ€” run BEFORE asking the user what to do +echo "โ”€โ”€ Underlying failed operations โ”€โ”€" +az deployment operation sub list --name "{deployment-id}" --output json 2>/dev/null \ + | jq -r '.[] | select(.properties.provisioningState == "Failed") | + "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\nResource : \(.properties.targetResource.resourceName // "n/a") (\(.properties.targetResource.resourceType // "n/a"))\nStatus : \(.properties.statusCode // "n/a")\nMessage : \(.properties.statusMessage.error.message // .properties.statusMessage // "n/a")"' +``` + +Then surface the diagnostics in the user-facing message: ```markdown โŒ **Deployment Failed** @@ -267,6 +328,9 @@ If deployment fails, provide detailed diagnostics: - {Likely cause 1 based on error} - {Likely cause 2} +**Per-Resource Failures:** +{Output of `az deployment operation sub list` filtered to Failed entries} + **Diagnostic Details:** {Full error from Azure} @@ -326,24 +390,26 @@ Type A, B, C, or D: # Option A: Full Rollback if [[ "$USER_CHOICE" == "A" ]]; then # Confirm first - echo "โš ๏ธ This will DELETE all resources. Type 'confirm rollback' to proceed." + echo "โš ๏ธ This will DELETE all managed resources. Type 'confirm rollback' to proceed." 
read CONFIRMATION - + if [[ "$CONFIRMATION" == "confirm rollback" ]]; then - # Delete resources - az resource delete --ids {resource-id-1} {resource-id-2} - - # If RG was created new, delete it - if [[ "$RG_NEW" == "true" ]]; then - az group delete --name {rg-name} --yes --no-wait - fi - + # Single source of truth: the destroy skill handles stack delete, + # fallback RG delete, soft-delete purge sweep, and state.json updates. + .github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id {deployment-id} \ + --yes + # PowerShell equivalent: + # .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId {deployment-id} -Yes + # Log rollback - echo "Rollback completed" >> .azure/deployments/{deployment-id}/deployment.log + echo "Rollback completed via azure-stack-destroy skill" >> .azure/deployments/{deployment-id}/deployment.log fi fi ``` +> **Important:** Never mix individual `az resource delete` calls when a `stackId` is present in `state.json`. The stack path is canonical — always invoke the [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skill, which encapsulates the stack delete, fallback RG delete, and soft-delete purge sweep (Key Vault, Cognitive Services, etc.) for any resources that are not purge-protected. + **Step 4: Update deployment state:** ```json // .azure/deployments/{deployment-id}/metadata.json diff --git a/.github/agents/azure-template-generator.agent.md b/.github/agents/azure-template-generator.agent.md index 69e4807..22466b2 100644 --- a/.github/agents/azure-template-generator.agent.md +++ b/.github/agents/azure-template-generator.agent.md @@ -135,7 +135,7 @@ see [git-ape.agent.md](git-ape.agent.md). 
- Resource Group is a `Microsoft.Resources/resourceGroups` resource inside the template - Other resources go inside a nested `Microsoft.Resources/deployments` with `"resourceGroup"` property - Use `subscriptionResourceId()` for RG references, regular `resourceId()` inside nested -- Deploy with `az deployment sub create` (not `az deployment group create`) +- Deploy with `az stack sub create --action-on-unmanage deleteAll` (preferred) or `az deployment sub create` as a fallback (not `az deployment group create`) - `uniqueString()` uses `subscription().subscriptionId` instead of `resourceGroup().id` **Nested Template Requirements:** @@ -691,7 +691,30 @@ After showing the preview, provide the complete ARM template: ## Deployment Commands -**Azure CLI (Subscription-level deployment):** +The canonical deploy and destroy paths live in the [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) and [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skills. The commands below are reference recipes โ€” prefer invoking the skills so local CLI / VS Code and CI pipelines stay in sync. 
+ +**Azure CLI (Subscription-scoped Deployment Stack โ€” preferred):** +```bash +az stack sub create \ + --name {deployment-id} \ + --location {location} \ + --template-file template.json \ + --parameters @parameters.json \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment {deployment-id}" \ + --tags "managedBy=git-ape" "deploymentId={deployment-id}" \ + --yes \ + --verbose +``` + +The stack tracks every managed resource (across resource groups and subscription scope), so destroy is a single idempotent command: + +```bash +az stack sub delete --name {deployment-id} --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true --yes +``` + +**Azure CLI (Subscription-level deployment โ€” fallback only):** ```bash az deployment sub create \ --name {deployment-id} \ @@ -700,7 +723,20 @@ az deployment sub create \ --parameters @parameters.json ``` -**PowerShell:** +Use the fallback only when Deployment Stacks are unavailable in the target subscription/region. The fallback does NOT solve the soft-delete / multi-RG / sub-scope idempotency problem. + +**PowerShell (Deployment Stack โ€” preferred):** +```powershell +New-AzSubscriptionDeploymentStack ` + -Name {deployment-id} ` + -Location {location} ` + -TemplateFile template.json ` + -TemplateParameterFile parameters.json ` + -ActionOnUnmanage DeleteAll ` + -DenySettingsMode None +``` + +**PowerShell (subscription deployment โ€” fallback):** ```powershell New-AzSubscriptionDeployment ` -Name {deployment-id} ` @@ -709,7 +745,7 @@ New-AzSubscriptionDeployment ` -TemplateParameterFile parameters.json ``` -**Note:** We use subscription-level deployments so the resource group is created as part of the template. No need to create the RG separately. +**Note:** We use subscription scope so the resource group is created as part of the template. No need to create the RG separately. 
```` ## Constraints diff --git a/.github/agents/git-ape.agent.md b/.github/agents/git-ape.agent.md index d206482..f40449d 100644 --- a/.github/agents/git-ape.agent.md +++ b/.github/agents/git-ape.agent.md @@ -97,7 +97,7 @@ Git-Ape can run in two modes. Detect which mode is active and adapt behavior acc | Validation | Run locally | `git-ape-plan.yml` runs on PR, posts what-if as comment | | Confirmation | Ask user interactively | PR approval = confirmation | | Deployment | Execute immediately | `git-ape-deploy.yml` runs on merge or `/deploy` comment | -| Destroy | Execute after confirmation | PR sets `metadata.json` status to `destroy-requested` → merge triggers `git-ape-destroy.yml` | +| Destroy | Execute via `az stack sub delete --action-on-unmanage deleteAll` after confirmation, then purge soft-deletables | PR sets `metadata.json` status to `destroy-requested` → merge triggers `git-ape-destroy.yml` (same stack-based flow + soft-delete purge) | | Results | Display in chat | Posted as PR/issue comment + state committed to repo | ## Your Role @@ -354,12 +354,13 @@ The deployment plan MUST start with a clear "Target Environment" table: **Delegate to:** `azure-resource-deployer` The deployer will: -- Execute the ARM template as a **subscription-level deployment** (`az deployment sub create`) +- Execute the ARM template as a **subscription-scoped Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`) so destroy is idempotent across resource groups and subscription scope. The CLI fallback (`az deployment sub create`) is used only if stacks are unavailable. - The ARM template includes resource group creation — everything deploys atomically - Monitor deployment progress in real-time - Handle any deployment failures - Verify resource creation via Azure Resource Graph - Capture deployment outputs (resource IDs, endpoints, etc.) 
+- Capture the **stack ID** plus every managed resource into `state.json` (extended schema: `stackId`, `deployMethod`, `managedResources[]`, `resourceGroups[]`, `subscriptions[]`, `externalReferences[]`) so the destroy path can find them later โ€” including soft-deletable types (Key Vault, Cognitive Services, App Configuration, API Management, ML Workspaces, Recovery Services Vaults). **Deployment Monitoring:** Always poll deployment state every **30 seconds** using `sleep 30` between checks. No exponential backoff โ€” use a fixed 30-second interval for all resources regardless of type or expected duration. Check both the top-level deployment and nested deployment statuses on every poll. @@ -386,7 +387,16 @@ Run post-deployment validation: ``` To destroy this deployment and delete all its resources, use Git-Ape: > @git-ape destroy deployment {deployment-id} - + + Locally, this invokes the `azure-stack-destroy` skill: + > .github/skills/azure-stack-destroy/scripts/destroy-stack.sh --deployment-id {deployment-id} + > # or PowerShell: + > .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId {deployment-id} + + Which uses `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` + (single command, idempotent across resource groups and subscription scope) and + purges any soft-deletable resources that are not purge-protected. + Or via GitHub (if using CI/CD): > Create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval ``` diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 2c29d37..64e1d61 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -152,13 +152,15 @@ Always include these tags on all resources: ## Deployment Workflow -### Interactive Mode (VS Code) +### Interactive Mode (VS Code / local CLI) 1. **Requirements Gathering:** Collect all necessary parameters before generating templates 2. 
**Template Validation:** Always validate ARM templates before deployment 3. **User Confirmation:** Echo deployment intent and wait for explicit approval -4. **Deployment Execution:** Monitor progress and capture deployment logs -5. **Integration Testing:** Run health checks on deployed resources +4. **Deployment Execution:** Invoke the **[`azure-stack-deploy`](.github/skills/azure-stack-deploy/SKILL.md) skill**, which deploys as a subscription-scoped Azure Deployment Stack (`az stack sub create --action-on-unmanage deleteAll`). This is the same primitive used by the CI workflows so local and pipeline deployments produce identical state. The skill captures the stack ID, managed resources, soft-deletable resources, and resource groups into `state.json` (schemaVersion 1.0). It falls back to `az deployment sub create` only if Deployment Stacks are unavailable in the target subscription/region. Both bash (`scripts/deploy-stack.sh`) and PowerShell (`scripts/deploy-stack.ps1`) implementations are provided. +5. **State Persistence:** The deploy skill writes `state.json` and updates `metadata.json` with `deployMethod` (`stack` or `subscription`) and `resourceGroups[]`. Schema reference: [website/docs/deployment/state.md](website/docs/deployment/state.md). +6. **Integration Testing:** Run health checks on deployed resources +7. **Destroy:** Invoke the **[`azure-stack-destroy`](.github/skills/azure-stack-destroy/SKILL.md) skill** (or `@git-ape destroy deployment {deployment-id}`). The skill mirrors the CI workflow exactly: `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` (single command, idempotent across resource groups and subscription scope), purges any soft-deletable resources that are not purge-protected (Key Vault, Cognitive Services, etc.), then cleans the subscription deployment history entry to stay under the 800/scope limit. 
Both bash (`scripts/destroy-stack.sh`) and PowerShell (`scripts/destroy-stack.ps1`) implementations are provided. ### Pipeline Mode (GitHub Actions) @@ -193,10 +195,11 @@ Git-Ape provides three GitHub Actions workflows under `.github/workflows/`: 1. Detects deployment directories to execute 2. Logs into Azure via OIDC 3. Validates the template one more time -4. Runs `az deployment sub create` to deploy -5. Runs integration tests (lists deployed resources, tests HTTP endpoints) -6. Commits `state.json` with deployment result back to the repo -7. Posts deployment result as a PR comment (on `/deploy` trigger) +4. Runs `az stack sub create --action-on-unmanage deleteAll` to deploy (falls back to `az deployment sub create` if stacks are unavailable) +5. Captures the **stack ID**, managed resources, soft-deletable resources, and resource groups into `state.json` +6. Runs integration tests (lists deployed resources, tests HTTP endpoints) +7. Commits `state.json` (extended schema) and `metadata.json` (`deployMethod`, `resourceGroups[]`) back to the repo +8. 
Posts deployment result as a PR comment (on `/deploy` trigger) **Requires:** GitHub environment `azure-deploy` (for environment protection rules) @@ -413,10 +416,17 @@ jobs: subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - name: Deploy run: | - az deployment sub create \ + az stack sub create \ + --name ${{ env.DEPLOYMENT_ID }} \ --location ${{ env.LOCATION }} \ --template-file .azure/deployments/${{ env.DEPLOYMENT_ID }}/template.json \ - --parameters @.azure/deployments/${{ env.DEPLOYMENT_ID }}/parameters.json + --parameters @.azure/deployments/${{ env.DEPLOYMENT_ID }}/parameters.json \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment ${{ env.DEPLOYMENT_ID }}" \ + --tags "managedBy=git-ape" "deploymentId=${{ env.DEPLOYMENT_ID }}" \ + --yes \ + --verbose ``` **Transitioning from Service Principal secrets to OIDC:** diff --git a/.github/scripts/deployment-manager.sh b/.github/scripts/deployment-manager.sh index 815738f..7f0f173 100755 --- a/.github/scripts/deployment-manager.sh +++ b/.github/scripts/deployment-manager.sh @@ -1,6 +1,16 @@ #!/bin/bash # Azure Deployment State Manager -# Utility script for managing deployment artifacts and state persistence +# Utility script for managing deployment artifact metadata. +# +# Deploy / destroy logic lives in the dedicated skills: +# .github/skills/azure-stack-deploy/scripts/deploy-stack.sh (or .ps1) +# .github/skills/azure-stack-destroy/scripts/destroy-stack.sh (or .ps1) +# These mirror .github/workflows/git-ape-deploy.exampleyml and +# .github/workflows/git-ape-destroy.exampleyml so local CLI / VS Code +# operations produce identical state.json (schemaVersion 1.0). +# +# This script handles only inventory tasks: list / show / clean / init / +# validate / export. 
set -euo pipefail @@ -318,6 +328,19 @@ main() { fi validate_deployment "$2" ;; + deploy|destroy) + cat < + PowerShell: .github/skills/azure-stack-${COMMAND}/scripts/${COMMAND}-stack.ps1 -DeploymentId + Agent: /azure-stack-${COMMAND} + +See .github/skills/azure-stack-${COMMAND}/SKILL.md for full options. +EOF + exit 1 + ;; *) echo "Azure Deployment State Manager" echo "" @@ -331,6 +354,10 @@ main() { echo " init [id] Initialize new deployment directory" echo " validate Validate deployment state files" echo "" + echo "Deploy / destroy moved to dedicated skills:" + echo " Deploy: .github/skills/azure-stack-deploy/scripts/deploy-stack.{sh,ps1}" + echo " Destroy: .github/skills/azure-stack-destroy/scripts/destroy-stack.{sh,ps1}" + echo "" echo "Examples:" echo " $0 list" echo " $0 show deploy-20260218-143022" diff --git a/.github/skills/azure-stack-deploy/SKILL.md b/.github/skills/azure-stack-deploy/SKILL.md new file mode 100644 index 0000000..b404dfc --- /dev/null +++ b/.github/skills/azure-stack-deploy/SKILL.md @@ -0,0 +1,143 @@ +--- +name: azure-stack-deploy +description: "Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow." +argument-hint: "Deployment ID (folder under .azure/deployments/) โ€” optional --location override" +user-invocable: true +--- + +# Azure Stack Deploy + +Deploy a Git-Ape deployment artifact as a subscription-scoped **Azure Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`). The stack is the lifecycle owner of every resource the template creates โ€” across resource groups and subscription scope โ€” which makes destroy idempotent in a single call (see [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)). 
+ +This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.exampleyml`, so local deployments and pipeline deployments are interchangeable. + +## When to Use + +- Local deployment from VS Code or terminal (the `git-ape` agent invokes this in Stage 3) +- Re-deploying an existing deployment ID after template edits — stacks are stateful, so this is an in-place update +- Any time you would otherwise run `az deployment sub create` against a Git-Ape `template.json` + +## Prerequisites + +| Tool | Why | +|------|-----| +| `az` (Azure CLI ≥ 2.59) | `az stack sub` requires CLI ≥ 2.50; 2.59 has the latest stack flags | +| `jq` | State capture and JSON extraction | +| `bash` ≥ 4 OR PowerShell 7+ | Either runner works | +| Active `az login` | Skill exits early if no subscription is selected | +| Existing `template.json` (and optional `parameters.json`) under `.azure/deployments/<deployment-id>/` | Source artifacts | + +## Procedure + +### 1. Locate deployment artifacts + +```bash +DEPLOYMENT_ID="deploy-20260506-001" +DEPLOYMENT_PATH=".azure/deployments/$DEPLOYMENT_ID" + +[[ -f "$DEPLOYMENT_PATH/template.json" ]] || { echo "template.json missing"; exit 1; } +``` + +If `parameters.json` is present, `location`, `project` (or `projectName`), and `environment` are read from it. Defaults: `eastus` / `unknown` / `dev`. + +### 2. Run the script + +```bash +.github/skills/azure-stack-deploy/scripts/deploy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" +``` + +PowerShell equivalent: + +```powershell +.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 ` + -DeploymentId "$DEPLOYMENT_ID" +``` + +The script: + +1. Resolves `location`, `project`, `environment` from `parameters.json` (or defaults) +2. Validates Azure CLI session (`az account show`) +3. 
Calls `az stack sub create` with the canonical Git-Ape flag set: + - `--action-on-unmanage deleteAll` + - `--deny-settings-mode none` + - `--description "Git-Ape deployment "` + - `--tags managedBy=git-ape deploymentId=` + - `--yes --verbose` +4. **On stack failure**, falls back to `az deployment sub create` (warns the user โ€” no soft-delete / multi-RG idempotency on the fallback path) +5. **On any deployment failure**, dumps the per-operation failure list (`az deployment operation sub list`) inline so the root cause is visible without clicking into the Portal +6. **On success**, queries `az stack sub show --query "resources[].id"` for the live managed-resource list, classifies each resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` +7. Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` + +### 3. Inspect output + +```text +โœ… Deployment succeeded in 142s (method: stack) +State written to: .azure/deployments/deploy-20260506-001/state.json +Stack ID: /subscriptions//providers/Microsoft.Resources/deploymentStacks/deploy-20260506-001 + +To destroy this deployment: + /azure-stack-destroy deploy-20260506-001 +``` + +## Arguments + +| Flag (bash) | Param (pwsh) | Required | Description | +|-------------|--------------|----------|-------------| +| `--deployment-id ` | `-DeploymentId ` | yes | Folder name under `.azure/deployments/` | +| `--location ` | `-Location ` | no | Override the location from `parameters.json` | +| `--no-fallback` | `-NoFallback` | no | Fail loudly if the stack call fails instead of falling back to `az deployment sub create` | + +## state.json schema (v1.0) + +```json +{ + "schemaVersion": "1.0", + "deploymentId": "deploy-20260506-001", + "timestamp": "2026-05-06T12:00:00Z", + "status": "succeeded", + "duration": "142s", + "subscription": "", + "location": "eastus", + "project": "myapp", + "environment": "dev", + "resourceGroup": "rg-myapp-dev-eastus", + 
"deployMethod": "stack", + "stackId": "/subscriptions//providers/Microsoft.Resources/deploymentStacks/deploy-20260506-001", + "managedResources": [ + { + "id": "/subscriptions//resourceGroups/rg-myapp-dev-eastus/providers/Microsoft.KeyVault/vaults/kv-myapp-dev-eus", + "type": "Microsoft.KeyVault/vaults", + "scope": "resourceGroup", + "softDeletable": true, + "purgeProtected": true + } + ], + "resourceGroups": ["rg-myapp-dev-eastus"], + "subscriptions": [""], + "externalReferences": [] +} +``` + +See [website/docs/deployment/state.md](../../../website/docs/deployment/state.md) for the full schema reference. + +## Soft-deletable resource types tracked + +`Microsoft.KeyVault/vaults`, `Microsoft.CognitiveServices/accounts`, `Microsoft.AppConfiguration/configurationStores`, `Microsoft.ApiManagement/service`, `Microsoft.MachineLearningServices/workspaces`, `Microsoft.RecoveryServices/vaults`. + +The destroy skill ([`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)) consumes the `softDeletable` and `purgeProtected` fields to drive its purge sweep. 
+ +## Failure modes + +| Symptom | Likely cause | Recovery | +|---------|--------------|----------| +| `Not logged in to Azure` | `az login` missing | Run `az login` then retry | +| `template.json missing` | Wrong deployment ID | Check `.azure/deployments/` contents | +| Stack create fails immediately | Region/policy blocks Deployment Stacks | Re-run without `--no-fallback`, accept the legacy path, or pick a supported region | +| Stack succeeds but `state.json` missing managed resources | `az stack sub show` race condition | Re-run โ€” the script is idempotent (stacks de-duplicate on `--name`) | + +## Related + +- [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md) โ€” the matching destroy skill (single source of truth: `stackId`) +- [`azure-deployment-preflight`](../azure-deployment-preflight/SKILL.md) โ€” what-if and permission checks BEFORE deploy +- [`azure-security-analyzer`](../azure-security-analyzer/SKILL.md) โ€” security gate (BLOCKING) before deploy confirmation diff --git a/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 b/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 new file mode 100644 index 0000000..57210be --- /dev/null +++ b/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 @@ -0,0 +1,314 @@ +<# +.SYNOPSIS + Deploy a Git-Ape deployment artifact as a subscription-scoped Azure Deployment Stack. + +.DESCRIPTION + PowerShell port of deploy-stack.sh. Mirrors the logic of + .github/workflows/git-ape-deploy.exampleyml so local CLI / VS Code + deployments produce identical state.json (schemaVersion 1.0). + +.PARAMETER DeploymentId + Folder name under .azure/deployments/. Required. + +.PARAMETER Location + Override the location from parameters.json. Optional. + +.PARAMETER NoFallback + Fail loudly if the stack call fails instead of falling back to az deployment sub create. 
+ +.EXAMPLE + ./deploy-stack.ps1 -DeploymentId deploy-20260506-001 + +.EXAMPLE + ./deploy-stack.ps1 -DeploymentId deploy-20260506-001 -Location westus2 -NoFallback + +.NOTES + Requires: PowerShell 7+, az CLI ≥ 2.59, jq, active az login session. +#> +[CmdletBinding()] +param( + [string]$DeploymentId, + + [string]$Location, + + [switch]$NoFallback, + + [switch]$Help +) + +$ErrorActionPreference = 'Stop' + +function Show-Usage { + @' +Azure Stack Deploy — deploy as subscription-scoped Deployment Stack + +Usage: deploy-stack.ps1 -DeploymentId [OPTIONS] + +Required: + -DeploymentId Folder name under .azure/deployments/ + +Options: + -Location Override location from parameters.json + -NoFallback Fail loudly if stack create fails (no fallback to az deployment sub create) + -Help Show this help + +Examples: + ./deploy-stack.ps1 -DeploymentId deploy-20260506-001 + ./deploy-stack.ps1 -DeploymentId deploy-20260506-001 -Location westus2 + ./deploy-stack.ps1 -DeploymentId deploy-20260506-001 -NoFallback +'@ | Write-Host +} + +if ($Help -or [string]::IsNullOrWhiteSpace($DeploymentId)) { + Show-Usage + exit 1 +} + +$ScriptDir = Split-Path -Parent $MyInvocation.MyCommand.Path +$WorkspaceRoot = (Resolve-Path (Join-Path $ScriptDir '../../../..')).Path +$DeploymentsDir = '.azure/deployments' +$DeploymentPath = Join-Path $WorkspaceRoot (Join-Path $DeploymentsDir $DeploymentId) + +# Soft-deletable resource types (must match the CI workflow list) +$SoftDeletableTypes = @( + 'Microsoft.KeyVault/vaults' + 'Microsoft.CognitiveServices/accounts' + 'Microsoft.AppConfiguration/configurationStores' + 'Microsoft.ApiManagement/service' + 'Microsoft.MachineLearningServices/workspaces' + 'Microsoft.RecoveryServices/vaults' +) + +function Write-Color { + param([string]$Text, [string]$Color = 'White') + Write-Host $Text -ForegroundColor $Color +} + +if (-not (Test-Path -PathType Container $DeploymentPath)) { + Write-Color "Deployment not found: $DeploymentId" Red + exit 1 +} +$TemplateFile = 
# Classify one ARM resource ID for state.json.
#
# Parameters:
#   ResourceId - full ARM resource ID.
# Returns:
#   [pscustomobject] with id, type, scope, softDeletable, purgeProtected.
#   type is the first "<namespace>/<type>" pair after "providers/" ($null when
#   the ID has no providers segment); scope is 'resourceGroup' when the ID
#   contains /resourceGroups/, otherwise 'subscription'.
# Reads the script-level $SoftDeletableTypes list.
function Get-ResourceClassification {
    param([string]$ResourceId)

    $type = $null
    if ($ResourceId -match 'providers/([^/]+/[^/]+)') {
        $type = $matches[1]
    }
    $scope = if ($ResourceId -match '/resourceGroups/') { 'resourceGroup' } else { 'subscription' }
    $isSoft = $SoftDeletableTypes -contains $type

    $purgeProtected = $false
    if ($type -eq 'Microsoft.KeyVault/vaults') {
        # --query is JMESPath, whose "default when null/false" operator is `||`.
        # The jq-style `//` is a JMESPath syntax error: az rejects the query,
        # stderr is discarded, and every vault would be recorded as NOT
        # purge-protected.
        $pp = az resource show --ids $ResourceId --query 'properties.enablePurgeProtection || `false`' -o tsv 2>$null
        $purgeProtected = ($pp -eq 'true')
    }

    [pscustomobject]@{
        id             = $ResourceId
        type           = $type
        scope          = $scope
        softDeletable  = $isSoft
        purgeProtected = $purgeProtected
    }
}

# Build the managedResources[] array from a list of resource IDs, skipping
# blank entries. The leading comma on the return keeps the result an array
# even when it holds zero or one element.
function Build-ManagedResources {
    param([string[]]$ResourceIds)
    $list = @()
    foreach ($id in $ResourceIds) {
        if ([string]::IsNullOrWhiteSpace($id)) { continue }
        $list += Get-ResourceClassification -ResourceId $id
    }
    , $list
}
$Location) { $ResolvedLoc = $Location } + +$Subscription = az account show --query id -o tsv 2>$null +if ([string]::IsNullOrWhiteSpace($Subscription)) { + Write-Color "Not logged in to Azure. Run 'az login' first." Red + exit 1 +} + +Write-Color "๐Ÿš€ Deploying $DeploymentId" Blue +Write-Host " Subscription: $Subscription" +Write-Host " Location: $ResolvedLoc" +Write-Host ' Method: stack (az stack sub create --action-on-unmanage deleteAll)' + +# Deploy --------------------------------------------------------------------- + +$StartTime = Get-Date +$DeployMethod = 'stack' +$StackId = $null +$DeployOutput = $null +$ExitCode = 0 + +$stackArgs = @( + 'stack', 'sub', 'create', + '--name', $DeploymentId, + '--location', $ResolvedLoc, + '--template-file', $TemplateFile +) + $ParamsArg + @( + '--action-on-unmanage', 'deleteAll', + '--deny-settings-mode', 'none', + '--description', "Git-Ape deployment $DeploymentId", + '--tags', 'managedBy=git-ape', "deploymentId=$DeploymentId", + '--yes', '--verbose', '--output', 'json' +) + +# Capture stdout (JSON) and stderr (verbose log) separately so the JSON we hand +# to ConvertFrom-Json downstream stays clean. +$VerboseLog = New-TemporaryFile +try { + $DeployOutput = & az @stackArgs 2>$VerboseLog + if ($LASTEXITCODE -ne 0) { + if ($NoFallback) { + Write-Color 'โŒ Stack deploy failed and -NoFallback was set' Red + Write-Host $DeployOutput + Get-Content $VerboseLog | Write-Host + $ExitCode = 1 + } else { + Write-Color 'โš  Stack deploy failed; check whether Deployment Stacks are available in this subscription/region.' Yellow + Write-Host $DeployOutput + Get-Content $VerboseLog | Write-Host + Write-Color 'Falling back to az deployment sub create (NOT idempotent for soft-delete / multi-RG).' 
Yellow + $DeployMethod = 'subscription' + $fallbackArgs = @( + 'deployment', 'sub', 'create', + '--name', $DeploymentId, + '--location', $ResolvedLoc, + '--template-file', $TemplateFile + ) + $ParamsArg + @('--output', 'json') + $DeployOutput = & az @fallbackArgs 2>$VerboseLog + if ($LASTEXITCODE -ne 0) { + Get-Content $VerboseLog | Write-Host + $ExitCode = 1 + } + } + } +} finally { + Remove-Item -Force -ErrorAction SilentlyContinue $VerboseLog +} + +$EndTime = Get-Date +$Duration = [int]($EndTime - $StartTime).TotalSeconds + +if ($ExitCode -ne 0) { + Write-Color 'โŒ Deployment failed' Red + Write-Host $DeployOutput + Write-Host '' + Write-Color 'โ”€โ”€ Underlying failed operations โ”€โ”€' Yellow + $opsJson = az deployment operation sub list --name $DeploymentId --output json 2>$null + if ($opsJson) { + $ops = $opsJson | ConvertFrom-Json + $failed = $ops | Where-Object { $_.properties.provisioningState -eq 'Failed' } + if ($failed.Count -eq 0) { + Write-Host '(no failed operations reported)' + } else { + foreach ($op in $failed) { + Write-Host 'โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€' + Write-Host ("Resource : {0} ({1})" -f ($op.properties.targetResource.resourceName ?? 'n/a'), ($op.properties.targetResource.resourceType ?? 'n/a')) + Write-Host ("Status : {0}" -f ($op.properties.statusCode ?? 
'n/a')) + $msg = if ($op.properties.statusMessage.error.message) { $op.properties.statusMessage.error.message } else { $op.properties.statusMessage } + Write-Host ("Message : {0}" -f $msg) + } + } + } else { + Write-Host '(no per-operation details available โ€” deployment may not have reached Azure)' + } + exit 1 +} + +# Capture state -------------------------------------------------------------- + +$DeployJson = $DeployOutput | ConvertFrom-Json +if ($DeployMethod -eq 'stack') { + $StackId = $DeployJson.id + $Outputs = $DeployJson.outputs +} else { + $Outputs = $DeployJson.properties.outputs +} +$RgName = if ($Outputs -and $Outputs.resourceGroupName) { $Outputs.resourceGroupName.value } else { '' } + +Write-Color "โœ… Deployment succeeded in ${Duration}s (method: $DeployMethod)" Green + +if ($DeployMethod -eq 'stack' -and $StackId) { + $stackResources = az stack sub show --name $DeploymentId --query 'resources[].id' -o json 2>$null + if ($stackResources) { + $resourceIds = $stackResources | ConvertFrom-Json + } else { $resourceIds = @() } +} else { + $opsTsv = az deployment operation sub list --name $DeploymentId ` + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource.id" ` + -o tsv 2>$null + $resourceIds = if ($opsTsv) { $opsTsv -split "`n" | Where-Object { $_ } } else { @() } +} + +$ManagedResources = Build-ManagedResources -ResourceIds $resourceIds +$ResourceGroups = @($ManagedResources | ForEach-Object { + if ($_.id -match '/resourceGroups/([^/]+)') { $matches[1] } +} | Sort-Object -Unique) +if ($ResourceGroups.Count -eq 0 -and $RgName) { $ResourceGroups = @($RgName) } + +$StateFile = Join-Path $DeploymentPath 'state.json' +$Timestamp = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ') + +$state = [ordered]@{ + schemaVersion = '1.0' + deploymentId = $DeploymentId + timestamp = $Timestamp + status = 'succeeded' + duration = "${Duration}s" + subscription = $Subscription + location = 
$ResolvedLoc + project = $Project + environment = $Environment + resourceGroup = $RgName + deployMethod = $DeployMethod + stackId = $(if ([string]::IsNullOrWhiteSpace($StackId)) { $null } else { $StackId }) + managedResources = $ManagedResources + resourceGroups = $ResourceGroups + subscriptions = @($Subscription) + externalReferences = @() +} +$state | ConvertTo-Json -Depth 10 | Set-Content -Path $StateFile -Encoding utf8 + +$MetadataFile = Join-Path $DeploymentPath 'metadata.json' +if (Test-Path $MetadataFile) { + $metadata = Get-Content $MetadataFile -Raw | ConvertFrom-Json + $metadata | Add-Member -MemberType NoteProperty -Name status -Value 'succeeded' -Force + $metadata | Add-Member -MemberType NoteProperty -Name deployMethod -Value $DeployMethod -Force + $metadata | Add-Member -MemberType NoteProperty -Name resourceGroups -Value $ResourceGroups -Force + $metadata | ConvertTo-Json -Depth 10 | Set-Content -Path $MetadataFile -Encoding utf8 +} + +Write-Color "State written to: $StateFile" Green +if ($StackId) { Write-Host "Stack ID: $StackId" } +Write-Host '' +Write-Host 'To destroy this deployment:' +Write-Host " /azure-stack-destroy $DeploymentId" diff --git a/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh b/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh new file mode 100755 index 0000000..c1bbf19 --- /dev/null +++ b/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh @@ -0,0 +1,282 @@ +#!/bin/bash +# azure-stack-deploy / deploy-stack.sh +# +# Deploy a Git-Ape deployment artifact as a subscription-scoped +# Azure Deployment Stack. Mirrors the logic of +# .github/workflows/git-ape-deploy.exampleyml so local CLI / VS Code +# deployments produce identical state.json (schemaVersion 1.0). + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +WORKSPACE_ROOT="$(cd "$SCRIPT_DIR/../../../.." 
#######################################
# Print command-line help and terminate the script.
# Globals:   none ($0 is interpolated into the text)
# Arguments: none
# Outputs:   help text on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  # Unquoted delimiter on purpose: $0 must expand to the invoked script name.
  cat <<EOF
Azure Stack Deploy — deploy as subscription-scoped Deployment Stack

Usage: $0 --deployment-id <id> [OPTIONS]

Required:
  --deployment-id <id>   Folder name under .azure/deployments/

Options:
  --location <region>    Override location from parameters.json
  --no-fallback          Fail loudly if stack create fails (no fallback to az deployment sub create)
  -h, --help             Show this help

Examples:
  $0 --deployment-id deploy-20260506-001
  $0 --deployment-id deploy-20260506-001 --location westus2
  $0 --deployment-id deploy-20260506-001 --no-fallback
EOF
  exit 1
}
#######################################
# Classify a resource ID into the object stored in state.json.
# Globals:   SOFT_DELETABLE_TYPES (read) - space-separated resource types
# Arguments: $1 - full ARM resource ID
# Outputs:   JSON {id, type, scope, softDeletable, purgeProtected} on stdout
# Returns:   exit status of jq
#######################################
_classify_resource() {
  local RES_ID="$1"

  # First "<namespace>/<type>" pair after "providers/"; empty when the ID has
  # no providers segment. A builtin regex match avoids a grep|head|sed
  # pipeline that returns non-zero on "no match" under pipefail.
  local RES_TYPE=""
  if [[ "$RES_ID" =~ providers/([^/]+/[^/]+) ]]; then
    RES_TYPE="${BASH_REMATCH[1]}"
  fi

  local RES_SCOPE="subscription"
  if [[ "$RES_ID" == */resourceGroups/* ]]; then
    RES_SCOPE="resourceGroup"
  fi

  local IS_SOFT="false"
  local SD_TYPE
  # Intentionally unquoted: SOFT_DELETABLE_TYPES is a space-separated list.
  for SD_TYPE in $SOFT_DELETABLE_TYPES; do
    if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then
      IS_SOFT="true"
      break
    fi
  done

  local PURGE_PROTECTED="false"
  if [[ "$RES_TYPE" == "Microsoft.KeyVault/vaults" ]]; then
    # --query is JMESPath, whose "default when null/false" operator is `||`.
    # The jq-style `//` is a JMESPath syntax error: az fails, the error is
    # hidden by 2>/dev/null, and the fallback reports every vault as
    # unprotected.
    PURGE_PROTECTED=$(az resource show --ids "$RES_ID" \
      --query "properties.enablePurgeProtection || \`false\`" -o tsv 2>/dev/null || echo "false")
    # --argjson below needs a JSON literal; anything but "true" becomes false.
    [[ "$PURGE_PROTECTED" == "true" ]] || PURGE_PROTECTED="false"
  fi

  jq -n \
    --arg id "$RES_ID" --arg type "$RES_TYPE" --arg scope "$RES_SCOPE" \
    --argjson sd "$IS_SOFT" --argjson pp "$PURGE_PROTECTED" \
    '{id:$id, type:$type, scope:$scope, softDeletable:$sd, purgeProtected:$pp}'
}

#######################################
# Build the managedResources[] array from resource IDs.
# Arguments: none
# Inputs:    resource IDs on stdin, one per line; blank lines are skipped
# Outputs:   JSON array of classified resources on stdout ("[]" when empty)
#######################################
_build_managed_resources() {
  local OUT="[]"
  local RES_ID CLASSIFIED
  # `|| [[ -n ... ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r RES_ID || [[ -n "$RES_ID" ]]; do
    [[ -z "$RES_ID" ]] && continue
    CLASSIFIED=$(_classify_resource "$RES_ID")
    OUT=$(echo "$OUT" | jq --argjson r "$CLASSIFIED" '. + [$r]')
  done
  echo "$OUT"
}
DEPLOY_OUTPUT=$(az stack sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOYMENT_PATH/template.json" \ + "${PARAMS_ARG[@]}" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment $DEPLOYMENT_ID" \ + --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ + --yes \ + --verbose \ + --output json 2>"$VERBOSE_LOG"); then + + if [[ "$NO_FALLBACK" == "true" ]]; then + echo -e "${RED}โŒ Stack deploy failed and --no-fallback was set${NC}" + echo "$DEPLOY_OUTPUT" + cat "$VERBOSE_LOG" >&2 + EXIT_CODE=1 + else + echo -e "${YELLOW}โš  Stack deploy failed; check whether Deployment Stacks are available in this subscription/region.${NC}" + echo "$DEPLOY_OUTPUT" + cat "$VERBOSE_LOG" >&2 + echo -e "${YELLOW}Falling back to az deployment sub create (NOT idempotent for soft-delete / multi-RG).${NC}" + DEPLOY_METHOD="subscription" + if ! DEPLOY_OUTPUT=$(az deployment sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOYMENT_PATH/template.json" \ + "${PARAMS_ARG[@]}" \ + --output json 2>"$VERBOSE_LOG"); then + cat "$VERBOSE_LOG" >&2 + EXIT_CODE=1 + fi + fi +fi + +END_TIME=$(date +%s) +DURATION=$((END_TIME - START_TIME)) + +if [[ "$EXIT_CODE" -ne 0 ]]; then + echo -e "${RED}โŒ Deployment failed${NC}" + echo "$DEPLOY_OUTPUT" + # Surface underlying failed operations โ€” the stack/deployment top-level + # error is usually a summary; the real root cause lives in the per-resource + # operations list. 
+ echo "" + echo -e "${YELLOW}โ”€โ”€ Underlying failed operations โ”€โ”€${NC}" + az deployment operation sub list --name "$DEPLOYMENT_ID" --output json 2>/dev/null \ + | jq -r '.[] | select(.properties.provisioningState == "Failed") | + "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\nResource : \(.properties.targetResource.resourceName // "n/a") (\(.properties.targetResource.resourceType // "n/a"))\nStatus : \(.properties.statusCode // "n/a")\nMessage : \(.properties.statusMessage.error.message // .properties.statusMessage // "n/a")"' \ + 2>/dev/null || echo "(no per-operation details available โ€” deployment may not have reached Azure)" + exit 1 +fi + +# Capture state --------------------------------------------------------------- + +if [[ "$DEPLOY_METHOD" == "stack" ]]; then + STACK_ID=$(echo "$DEPLOY_OUTPUT" | jq -r '.id // empty') + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.outputs // {}') +else + OUTPUTS=$(echo "$DEPLOY_OUTPUT" | jq -r '.properties.outputs // {}') +fi +RG_NAME=$(echo "$OUTPUTS" | jq -r '.resourceGroupName.value // empty') + +echo -e "${GREEN}โœ… Deployment succeeded in ${DURATION}s (method: $DEPLOY_METHOD)${NC}" + +if [[ "$DEPLOY_METHOD" == "stack" && -n "$STACK_ID" ]]; then + STACK_RESOURCES=$(az stack sub show --name "$DEPLOYMENT_ID" --query "resources[].id" -o json 2>/dev/null || echo "[]") + MANAGED_RESOURCES=$(echo "$STACK_RESOURCES" | jq -r '.[]' | _build_managed_resources) +else + OPS=$(az deployment operation sub list --name "$DEPLOYMENT_ID" \ + --query "[?properties.provisioningState=='Succeeded' && properties.targetResource.id != null].properties.targetResource.id" \ + -o tsv 2>/dev/null || echo "") + MANAGED_RESOURCES=$(echo "$OPS" | _build_managed_resources) +fi + +RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') +[[ "$(echo "$RESOURCE_GROUPS" | jq 'length')" == "0" && -n "$RG_NAME" ]] && RESOURCE_GROUPS="[\"$RG_NAME\"]" + +STATE_FILE="$DEPLOYMENT_PATH/state.json" 
+TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ) +jq -n \ + --arg schemaVersion "1.0" \ + --arg deploymentId "$DEPLOYMENT_ID" \ + --arg timestamp "$TIMESTAMP" \ + --arg status "succeeded" \ + --arg duration "${DURATION}s" \ + --arg subscription "$SUBSCRIPTION" \ + --arg location "$LOCATION" \ + --arg project "$PROJECT" \ + --arg environment "$ENVIRONMENT" \ + --arg resourceGroup "$RG_NAME" \ + --arg deployMethod "$DEPLOY_METHOD" \ + --arg stackId "$STACK_ID" \ + --argjson managedResources "$MANAGED_RESOURCES" \ + --argjson resourceGroups "$RESOURCE_GROUPS" \ + '{ + schemaVersion: $schemaVersion, + deploymentId: $deploymentId, + timestamp: $timestamp, + status: $status, + duration: $duration, + subscription: $subscription, + location: $location, + project: $project, + environment: $environment, + resourceGroup: $resourceGroup, + deployMethod: $deployMethod, + stackId: (if $stackId == "" then null else $stackId end), + managedResources: $managedResources, + resourceGroups: $resourceGroups, + subscriptions: [$subscription], + externalReferences: [] + }' > "$STATE_FILE" + +if [[ -f "$DEPLOYMENT_PATH/metadata.json" ]]; then + jq --arg status "succeeded" --arg method "$DEPLOY_METHOD" --argjson rgs "$RESOURCE_GROUPS" \ + '.status = $status | .deployMethod = $method | .resourceGroups = $rgs' \ + "$DEPLOYMENT_PATH/metadata.json" > "$DEPLOYMENT_PATH/metadata.json.tmp" \ + && mv "$DEPLOYMENT_PATH/metadata.json.tmp" "$DEPLOYMENT_PATH/metadata.json" +fi + +echo -e "${GREEN}State written to: $STATE_FILE${NC}" +[[ -n "$STACK_ID" ]] && echo "Stack ID: $STACK_ID" +echo "" +echo "To destroy this deployment:" +echo " /azure-stack-destroy $DEPLOYMENT_ID" diff --git a/.github/skills/azure-stack-destroy/SKILL.md b/.github/skills/azure-stack-destroy/SKILL.md new file mode 100644 index 0000000..7b87d63 --- /dev/null +++ b/.github/skills/azure-stack-destroy/SKILL.md @@ -0,0 +1,153 @@ +--- +name: azure-stack-destroy +description: "Destroy a Git-Ape deployment by deleting its Azure Deployment Stack 
with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow." +argument-hint: "Deployment ID — add --yes to skip the typed confirmation" +user-invocable: true +--- + +# Azure Stack Destroy + +Destroy a Git-Ape deployment by deleting its subscription-scoped **Azure Deployment Stack** in a single idempotent call (`az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true`). The stack owns every resource the matching deploy created — across resource groups and subscription scope — so one delete cleans up everything. + +After the stack is gone, this skill performs a **soft-delete purge sweep** for resource types that linger after deletion (Key Vault, Cognitive Services, App Configuration, API Management, ML workspaces, Recovery Services vaults). Resources flagged `purgeProtected: true` in `state.json` are intentionally retained. + +This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destroys and CI destroys are interchangeable.
+ +## When to Use + +- User says: "destroy this deployment", "tear down deploy-XXX", "clean up the stack" +- Pair with the matching [`azure-stack-deploy`](../azure-stack-deploy/SKILL.md) — same stack, same `state.json` key (`stackId`) +- Any time you would otherwise run `az group delete` against a Git-Ape deployment (don't — you'll miss soft-delete cleanup and multi-RG resources) + +## Prerequisites + +| Tool | Why | +|------|-----| +| `az` (Azure CLI ≥ 2.59) | `az stack sub delete --bypass-stack-out-of-sync-error` requires a recent CLI | +| `jq` | Read state.json | +| `bash` ≥ 4 OR PowerShell 7+ | Either runner works | +| Active `az login` | Must be the same subscription where the stack lives | +| Existing `state.json` under `.azure/deployments/<deployment-id>/` | Source of truth for `stackId`, `managedResources`, `softDeletable`, `purgeProtected` | + +The skill **refuses to run** without `state.json`. Re-deploy first or hand-write a minimal state file (not recommended). + +## Procedure + +### Fast mode vs sync mode + +The scripts default to **fast mode** (interactive default). The CI workflow keeps **sync mode** (deterministic). + +| | How | Wait time (small VNet stack) | When to use | +|--|--|--|--| +| Fast (default) | Background the `az stack sub delete` call, then poll managed RGs with `az group exists` | ~2 min | Local CLI / VS Code use; user wants quick feedback | +| Sync (`--wait` / `-Wait`) | `az stack sub delete ... --yes` (blocks until stack metadata is fully cleaned) | ~5 min | CI pipelines (default in `git-ape-destroy.exampleyml`); when you need every Azure-side cleanup completed before the script exits | + +The Azure CLI does not expose `--no-wait` on `az stack sub delete`, so the fast path runs the same command as a detached background process. In fast mode the stack-metadata cleanup continues asynchronously in Azure after the script returns.
The next destroy of the same `deploymentId` is idempotent: if the stack is still finalizing, `az stack sub show` will return it and the script will simply pick up where Azure left off. + +### 1. Identify deployment + +```bash +DEPLOYMENT_ID="deploy-20260506-001" +DEPLOYMENT_PATH=".azure/deployments/$DEPLOYMENT_ID" +[[ -f "$DEPLOYMENT_PATH/state.json" ]] || { echo "state.json missing — cannot destroy"; exit 1; } +``` + +### 2. Run the script + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" +``` + +Skip the confirmation prompt (use only in automation): + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" \ + --yes +``` + +Force CI-equivalent sync wait (default for the CI workflow; opt-in for the script): + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" \ + --yes --wait +``` + +PowerShell equivalents: + +```powershell +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" -Yes +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" -Yes -Wait +``` + +### 3. What the script does + +1. Reads `state.json` and extracts `stackId`, `deployMethod`, `resourceGroup`, and `managedResources[]` (including each entry's `softDeletable` and `purgeProtected` flags) +2. Prints a **destroy plan** — stack ID, resource group, count of soft-deletables (with purge-protection flagged) +3. Prompts for typed `destroy` confirmation (unless `--yes`) +4. **Stack delete path** (`stackId` present): + - `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true --yes` + - The bypass flag is safe in destroy because it's a one-shot operation — we don't need the stale-manifest safety check that protects iterative updates +5.
**Fallback path** (no `stackId`, only `resourceGroup`): `az group delete --name <resourceGroup> --yes` +6. **Purge sweep** for each `softDeletable` resource not marked `purgeProtected`: + - Key Vaults: `az keyvault list-deleted` + `az keyvault purge` + - Cognitive Services: `az cognitiveservices account purge` + - Other types: skipped (soft-delete expires naturally) +7. Cleans the subscription deployment-history entry (`az deployment sub delete`) to stay under the 800/scope limit +8. Updates `state.json` and `metadata.json` with terminal status: + +| Status | Meaning | +|--------|---------| +| `destroyed` | Stack/RG gone and all soft-deletables purged or absent | +| `retained-soft-deleted` | Stack gone but at least one soft-deletable retained (purge-protected or purge failed) | +| `partially-destroyed` | Stack delete partially failed | +| `destroy-failed` | Stack/RG delete failed entirely | +| `already-destroyed` | Stack and RG were already gone before this call | + +### 4. Inspect the result + +```text +=== Destroy Summary === +Status: destroyed +Duration: 87s +======================= +``` + +Or, when something is intentionally retained: + +```text +=== Destroy Summary === +Status: retained-soft-deleted +Duration: 92s +Retained: 1 soft-deleted resource(s) (purge-protected) +======================= +``` + +`state.json` gains `destroyedAt`, `destroyedBy`, `destroyDuration`, and a `purgeResults[]` array describing each soft-deletable's outcome. + +## Arguments + +| Flag (bash) | Param (pwsh) | Required | Description | +|-------------|--------------|----------|-------------| +| `--deployment-id <id>` | `-DeploymentId <id>` | yes | Folder name under `.azure/deployments/` | +| `--yes` | `-Yes` | no | Skip the typed `destroy` confirmation prompt (CI-only) | +| `--wait` | `-Wait` | no | Sync mode: block until Azure has cleaned up stack metadata. Matches the CI workflow. Slower (~3-4×) but fully deterministic.
| +| `--poll-timeout <seconds>` | `-PollTimeout <seconds>` | no | Fast-mode timeout per managed RG poll (default 600s) | + +## Failure modes + +| Symptom | Likely cause | Recovery | +|---------|--------------|----------| +| `state.json missing` | Deployment never reached the state-write phase, or was hand-edited | Re-deploy (idempotent on stack name) then destroy, OR delete the `.azure/deployments/<deployment-id>/` folder if Azure has nothing | +| `Stack out of sync` despite `--bypass-stack-out-of-sync-error` | Old CLI version | Upgrade `az` to ≥ 2.59 | +| Key Vault purge fails | Vault is purge-protected (`purgeProtected: true`) | Expected — wait for the soft-delete retention window (7-90 days) to expire; purge protection cannot be disabled once it has been enabled | +| `Cannot delete resource group …`/`InUseSubnetCannotBeDeleted` | A resource outside the stack references one inside (e.g. external subnet peered to a deleted VNet) | Inspect `externalReferences[]` in `state.json`; remove the reference and rerun | + +## Related + +- [`azure-stack-deploy`](../azure-stack-deploy/SKILL.md) — the matching deploy skill (writes the `state.json` this skill consumes) +- [`azure-drift-detector`](../azure-drift-detector/SKILL.md) — check for unmanaged drift BEFORE destroy +- [`azure-resource-visualizer`](../azure-resource-visualizer/SKILL.md) — visualize what's in the stack before tearing it down diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 new file mode 100644 index 0000000..3f2b5b9 --- /dev/null +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 @@ -0,0 +1,348 @@ +<# +.SYNOPSIS + Destroy a Git-Ape deployment by deleting its Azure Deployment Stack. + +.DESCRIPTION + PowerShell port of destroy-stack.sh. Mirrors the logic of + .github/workflows/git-ape-destroy.exampleyml so local destroys produce + identical state.json transitions. + +.PARAMETER DeploymentId + Folder name under .azure/deployments/. Required.
# Print the command-line help for destroy-stack.ps1 to the host.
# The text is a literal (single-quoted) here-string, so nothing in it is
# expanded. Every parameter declared in the script's param() block is listed,
# including -PollInterval (default 10).
function Show-Usage {
    @'
Azure Stack Destroy — destroy a Deployment Stack and purge soft-deletables

Usage: destroy-stack.ps1 -DeploymentId <id> [OPTIONS]

Required:
  -DeploymentId <id>       Folder name under .azure/deployments/

Options:
  -Yes                     Skip the typed 'destroy' confirmation prompt
  -Wait                    Sync mode (matches CI): block on 'az stack sub delete'
                           until Azure has cleaned up stack metadata. Slower but
                           fully deterministic. Default is fast mode (run the
                           same command in the background, then poll managed
                           resource groups until they are gone, ~2-3x faster).
  -PollTimeout <seconds>   Fast-mode timeout per managed RG poll (default: 600)
  -PollInterval <seconds>  Fast-mode delay between polls (default: 10)
  -Help                    Show this help

Examples:
  ./destroy-stack.ps1 -DeploymentId deploy-20260506-001         # fast (default)
  ./destroy-stack.ps1 -DeploymentId deploy-20260506-001 -Yes    # fast, no prompt
  ./destroy-stack.ps1 -DeploymentId deploy-20260506-001 -Wait   # CI-equivalent sync
'@ | Write-Host
}
Red + exit 1 +} + +# Plan ----------------------------------------------------------------------- + +Write-Color '=== Destroy Plan ===' Yellow +Write-Host "Deployment: $DeploymentId" +Write-Host "Method: $DeployMethod" +if ($StackId) { Write-Host "Stack ID: $StackId" } +if ($RgName) { Write-Host "Resource RG: $RgName" } + +$SoftCount = $SoftDeletable.Count +if ($SoftCount -gt 0) { + Write-Host "Soft-deletable: $SoftCount resource(s) โ€” will purge non-protected after delete" + foreach ($r in $SoftDeletable) { + $suffix = if ($r.purgeProtected) { ' (purge-protected)' } else { '' } + Write-Host (" - {0}: {1}{2}" -f $r.type, $r.id, $suffix) + } +} +Write-Color '====================' Yellow + +if (-not $Yes) { + $confirm = Read-Host "Proceed with destroy? Type 'destroy' to confirm" + if ($confirm -ne 'destroy') { + Write-Host 'Cancelled' + exit 0 + } +} + +# Execute -------------------------------------------------------------------- + +$StackDeleted = $false +$RgDeleted = $false +$AlreadyGone = $true +$StartTime = Get-Date + +if ($StackId) { + $stackExists = az stack sub show --name $DeploymentId --query 'id' -o tsv 2>$null + if ($stackExists) { + $AlreadyGone = $false + if ($Wait) { + Write-Color "๐Ÿ—‘๏ธ Deleting deployment stack (sync wait): $DeploymentId" Blue + # --bypass-stack-out-of-sync-error: a destroy run is one-shot; we + # don't need the safety check that protects against stale manifests + # during iterative updates. 
+ az stack sub delete ` + --name $DeploymentId ` + --action-on-unmanage deleteAll ` + --bypass-stack-out-of-sync-error true ` + --yes + if ($LASTEXITCODE -eq 0) { $StackDeleted = $true } + else { Write-Color 'โŒ Stack delete failed' Red } + } elseif ($ManagedRgs.Count -eq 0) { + Write-Color 'โš ๏ธ No resourceGroups[] in state.json โ€” falling back to sync wait' Yellow + az stack sub delete ` + --name $DeploymentId ` + --action-on-unmanage deleteAll ` + --bypass-stack-out-of-sync-error true ` + --yes + if ($LASTEXITCODE -eq 0) { $StackDeleted = $true } + else { Write-Color 'โŒ Stack delete failed' Red } + } else { + Write-Color "๐Ÿ—‘๏ธ Submitting stack delete (fast mode): $DeploymentId" Blue + $stackLog = New-TemporaryFile + $stackErr = New-TemporaryFile + # Spawn the blocking stack delete in a detached process; we exit + # as soon as the managed RGs are gone, leaving Azure to finish + # stack-metadata cleanup asynchronously. Azure CLI does not expose + # --no-wait on `az stack sub delete`, so backgrounding the call + # is the only way to get fast interactive return. + $bg = Start-Process -FilePath az ` + -ArgumentList @( + 'stack', 'sub', 'delete', + '--name', $DeploymentId, + '--action-on-unmanage', 'deleteAll', + '--bypass-stack-out-of-sync-error', 'true', + '--yes' + ) ` + -RedirectStandardOutput $stackLog.FullName ` + -RedirectStandardError $stackErr.FullName ` + -PassThru -NoNewWindow + + Write-Color ("โณ Polling {0} managed resource group(s) (timeout: {1}s)..." 
-f $ManagedRgs.Count, $PollTimeout) Blue + $pollStart = Get-Date + $pollFailed = $false + foreach ($rg in $ManagedRgs) { + while ($true) { + $elapsed = [int]((Get-Date) - $pollStart).TotalSeconds + if ($elapsed -ge $PollTimeout) { + Write-Color (" โš ๏ธ Timeout ({0}s) polling {1}" -f $elapsed, $rg) Red + $logBody = (Get-Content $stackLog.FullName -Raw -ErrorAction SilentlyContinue) + + (Get-Content $stackErr.FullName -Raw -ErrorAction SilentlyContinue) + if ($logBody) { + Write-Color ' Background stack-delete output:' Yellow + $logBody.TrimEnd() -split "`n" | ForEach-Object { Write-Host " $_" } + } + Write-Color ' Rerun with -Wait for synchronous diagnostics' Yellow + $pollFailed = $true + break + } + if ($bg.HasExited -and $bg.ExitCode -ne 0) { + $existsCheck = az group exists --name $rg 2>$null + if ($existsCheck -eq 'true') { + Write-Color (" โŒ Background stack-delete exited (code {0}) before {1} was removed" -f $bg.ExitCode, $rg) Red + $logBody = (Get-Content $stackLog.FullName -Raw -ErrorAction SilentlyContinue) + + (Get-Content $stackErr.FullName -Raw -ErrorAction SilentlyContinue) + if ($logBody) { + $logBody.TrimEnd() -split "`n" | ForEach-Object { Write-Host " $_" } + } + $pollFailed = $true + break + } + } + $exists = az group exists --name $rg 2>$null + if ($exists -ne 'true') { + Write-Color (" โœ“ {0} gone ({1}s)" -f $rg, $elapsed) Green + break + } + Start-Sleep -Seconds $PollInterval + } + if ($pollFailed) { break } + } + Remove-Item $stackLog.FullName -Force -ErrorAction SilentlyContinue + Remove-Item $stackErr.FullName -Force -ErrorAction SilentlyContinue + if ($pollFailed) { + $StackDeleted = $false + } else { + $StackDeleted = $true + Write-Color 'โ„น๏ธ Azure is finishing stack-metadata cleanup asynchronously' Blue + } + } + } else { + Write-Color 'Stack already gone โ€” skipping stack delete' Yellow + $StackDeleted = $true + } +} + +if (-not $StackId -and $RgName) { + $rgExists = az group exists --name $RgName 2>$null + if ($rgExists -eq 
'true') { + $AlreadyGone = $false + Write-Color "๐Ÿ—‘๏ธ Deleting resource group: $RgName" Blue + az group delete --name $RgName --yes + if ($LASTEXITCODE -eq 0) { $RgDeleted = $true } + else { Write-Color 'โŒ Resource group delete failed' Red } + } else { + Write-Color 'Resource group already gone โ€” skipping' Yellow + $RgDeleted = $true + } +} + +# Soft-delete purge sweep +$PurgeResults = @() +$RetainedCount = 0 +if ($SoftCount -gt 0 -and ($StackDeleted -or $RgDeleted)) { + Write-Color '๐Ÿงน Purging soft-deleted resources...' Blue + foreach ($r in $SoftDeletable) { + $resType = $r.type + $resId = $r.id + $resName = ($resId -split '/')[-1] + $protected = [bool]$r.purgeProtected + + switch ($resType) { + 'Microsoft.KeyVault/vaults' { + $deletedVaultJson = az keyvault list-deleted --query "[?name=='$resName']" -o json 2>$null + $deletedVault = if ($deletedVaultJson) { $deletedVaultJson | ConvertFrom-Json } else { @() } + if ($deletedVault.Count -gt 0) { + if ($protected) { + Write-Host " โš ๏ธ ${resName}: soft-deleted but purge-protected โ€” retained" + $RetainedCount++ + $PurgeResults += [pscustomobject]@{ name=$resName; type=$resType; action='retained-soft-deleted'; reason='purge-protected' } + } else { + Write-Host " ๐Ÿ—‘๏ธ Purging vault: $resName" + az keyvault purge --name $resName 2>$null + if ($LASTEXITCODE -eq 0) { + $PurgeResults += [pscustomobject]@{ name=$resName; type=$resType; action='purged' } + } else { + Write-Host " โš ๏ธ Failed to purge vault: $resName" + $RetainedCount++ + $PurgeResults += [pscustomobject]@{ name=$resName; type=$resType; action='purge-failed' } + } + } + } else { + Write-Host " โœ“ ${resName}: not in soft-deleted state" + } + } + 'Microsoft.CognitiveServices/accounts' { + if (-not $protected) { + $loc = '' + if ($resId -match 'locations/([^/]+)') { $loc = $matches[1] } + if ($loc) { + az cognitiveservices account purge --name $resName --location $loc --resource-group '' 2>$null | Out-Null + } + } + } + default { + Write-Host 
" โ„น๏ธ ${resType}: no purge implementation (soft-delete will expire naturally)" + } + } + } +} + +# Clean subscription deployment history entry to stay under the 800/scope limit +az deployment sub delete --name $DeploymentId 2>$null | Out-Null + +$EndTime = Get-Date +$Duration = [int]($EndTime - $StartTime).TotalSeconds + +# Determine final status +$Status = if ($AlreadyGone) { + 'already-destroyed' +} elseif ($StackDeleted -or $RgDeleted) { + if ($RetainedCount -gt 0) { 'retained-soft-deleted' } else { 'destroyed' } +} else { + 'destroy-failed' +} + +# Update state.json + metadata.json +$Timestamp = (Get-Date).ToUniversalTime().ToString('yyyy-MM-ddTHH:mm:ssZ') +$Actor = az account show --query user.name -o tsv 2>$null +if (-not $Actor) { $Actor = 'unknown' } + +$state | Add-Member -MemberType NoteProperty -Name status -Value $Status -Force +$state | Add-Member -MemberType NoteProperty -Name destroyedAt -Value $Timestamp -Force +$state | Add-Member -MemberType NoteProperty -Name destroyedBy -Value $Actor -Force +$state | Add-Member -MemberType NoteProperty -Name destroyDuration -Value "${Duration}s" -Force +$state | Add-Member -MemberType NoteProperty -Name purgeResults -Value $PurgeResults -Force +$state | ConvertTo-Json -Depth 10 | Set-Content -Path $StateFile -Encoding utf8 + +$MetadataFile = Join-Path $DeploymentPath 'metadata.json' +if (Test-Path $MetadataFile) { + $metadata = Get-Content $MetadataFile -Raw | ConvertFrom-Json + $metadata | Add-Member -MemberType NoteProperty -Name status -Value $Status -Force + $metadata | ConvertTo-Json -Depth 10 | Set-Content -Path $MetadataFile -Encoding utf8 +} + +Write-Host '' +Write-Color '=== Destroy Summary ===' Green +Write-Host "Status: $Status" +Write-Host "Duration: ${Duration}s" +if ($RetainedCount -gt 0) { + Write-Color "Retained: $RetainedCount soft-deleted resource(s) (purge-protected)" Yellow +} +Write-Color '=======================' Green diff --git 
#!/bin/bash
# azure-stack-destroy / destroy-stack.sh
#
# Destroy a Git-Ape deployment via az stack sub delete (preferred) or
# az group delete (fallback), then purge soft-deleted resources that are
# not purge-protected. Mirrors .github/workflows/git-ape-destroy.exampleyml
# so local destroys produce identical state.json transitions.
#
# Inputs : .azure/deployments/<deployment-id>/state.json (stackId,
#          deployMethod, resourceGroup, resourceGroups[], managedResources[])
# Outputs: state.json gains status / destroyedAt / destroyedBy /
#          destroyDuration / purgeResults; metadata.json (if present) gets
#          the same status.
# Requires: az, jq, base64 on PATH.
#
# NOTE(review): the script exits 0 even when the final status is
# "destroy-failed" (same as the PowerShell sibling); callers must read the
# status from state.json or the summary rather than rely on the exit code.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKSPACE_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
DEPLOYMENTS_DIR=".azure/deployments"

# Color codes
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

DEPLOYMENT_ID=""
YES_FLAG="false"
WAIT_FLAG="false"   # default: fast mode (submit + poll RGs)
POLL_TIMEOUT=600    # max seconds to wait for managed RGs to disappear in fast mode
                    # (measured once from the start of polling, across all RGs)
POLL_INTERVAL=10    # seconds between RG-existence checks

# Temp log of the backgrounded stack delete; removed on every exit path.
STACK_DELETE_LOG=""
cleanup() {
  [[ -z "$STACK_DELETE_LOG" ]] || rm -f -- "$STACK_DELETE_LOG"
}
trap cleanup EXIT

# Print help and exit 1 (also used for argument errors).
usage() {
  cat <<EOF
Usage: $0 --deployment-id <id> [OPTIONS]

Required:
  --deployment-id <id>   Folder name under .azure/deployments/

Options:
  --yes                  Skip the typed 'destroy' confirmation prompt
  --wait                 Sync mode (matches CI): block on 'az stack sub delete'
                         until Azure has cleaned up stack metadata. Slower but
                         fully deterministic. Default is fast mode (run the
                         same command in the background, then poll managed
                         resource groups until they are gone, ~2-3× faster).
  --poll-timeout <sec>   Fast-mode timeout per managed RG poll (default: 600)
  -h, --help             Show this help

Examples:
  $0 --deployment-id deploy-20260506-001          # fast (interactive default)
  $0 --deployment-id deploy-20260506-001 --yes    # fast, no prompt
  $0 --deployment-id deploy-20260506-001 --wait   # CI-equivalent sync wait
EOF
  exit 1
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --deployment-id)
      # Guard against a missing value: "$2" would trip `set -u` with a
      # cryptic "unbound variable" instead of the usage text.
      [[ $# -ge 2 ]] || { echo "Missing value for $1" >&2; usage; }
      DEPLOYMENT_ID="$2"; shift 2 ;;
    --yes) YES_FLAG="true"; shift ;;
    --wait) WAIT_FLAG="true"; shift ;;
    --poll-timeout)
      [[ $# -ge 2 ]] || { echo "Missing value for $1" >&2; usage; }
      POLL_TIMEOUT="$2"; shift 2 ;;
    -h|--help) usage ;;
    *) echo "Unknown argument: $1"; usage ;;
  esac
done

[[ -n "$DEPLOYMENT_ID" ]] || usage

# The timeout is used in an arithmetic comparison; reject non-integers early.
if ! [[ "$POLL_TIMEOUT" =~ ^[0-9]+$ ]]; then
  echo "--poll-timeout must be a non-negative integer (got: $POLL_TIMEOUT)" >&2
  usage
fi

for TOOL in az jq base64; do
  if ! command -v "$TOOL" >/dev/null 2>&1; then
    echo -e "${RED}Required tool not found on PATH: $TOOL${NC}" >&2
    exit 1
  fi
done

DEPLOYMENT_PATH="$WORKSPACE_ROOT/$DEPLOYMENTS_DIR/$DEPLOYMENT_ID"
STATE_FILE="$DEPLOYMENT_PATH/state.json"

if [[ ! -d "$DEPLOYMENT_PATH" ]]; then
  echo -e "${RED}Deployment not found: $DEPLOYMENT_ID${NC}"
  exit 1
fi
if [[ ! -f "$STATE_FILE" ]]; then
  echo -e "${RED}state.json not found: $STATE_FILE${NC}"
  echo "Cannot destroy without deployment state."
  exit 1
fi

STACK_ID=$(jq -r '.stackId // empty' "$STATE_FILE")
DEPLOY_METHOD=$(jq -r '.deployMethod // "subscription"' "$STATE_FILE")
RG_NAME=$(jq -r '.resourceGroup // empty' "$STATE_FILE")
MANAGED_RGS_JSON=$(jq -c '.resourceGroups // []' "$STATE_FILE")
MANAGED_RESOURCES=$(jq -c '.managedResources // []' "$STATE_FILE")
SOFT_DELETABLE=$(echo "$MANAGED_RESOURCES" | jq -c '[.[] | select(.softDeletable == true)]')

if [[ -z "$STACK_ID" && -z "$RG_NAME" ]]; then
  echo -e "${RED}No stackId or resourceGroup in state.json — cannot destroy.${NC}"
  exit 1
fi

# One definition of the stack-delete invocation so the sync, fallback-sync
# and background paths can never drift apart.
# --bypass-stack-out-of-sync-error: a destroy run is one-shot; we
# don't need the safety check that protects against stale manifests
# during iterative updates.
STACK_DELETE_ARGS=(
  stack sub delete
  --name "$DEPLOYMENT_ID"
  --action-on-unmanage deleteAll
  --bypass-stack-out-of-sync-error true
  --yes
)

# Plan -----------------------------------------------------------------------

echo -e "${YELLOW}=== Destroy Plan ===${NC}"
echo "Deployment: $DEPLOYMENT_ID"
echo "Method: $DEPLOY_METHOD"
if [[ -n "$STACK_ID" ]]; then echo "Stack ID: $STACK_ID"; fi
if [[ -n "$RG_NAME" ]]; then echo "Resource RG: $RG_NAME"; fi

SOFT_COUNT=$(echo "$SOFT_DELETABLE" | jq 'length')
if [[ "$SOFT_COUNT" -gt 0 ]]; then
  echo "Soft-deletable: $SOFT_COUNT resource(s) — will purge non-protected after delete"
  echo "$SOFT_DELETABLE" | jq -r '.[] | " - \(.type): \(.id)" + (if .purgeProtected then " (purge-protected)" else "" end)'
fi
echo -e "${YELLOW}====================${NC}"

if [[ "$YES_FLAG" != "true" ]]; then
  echo -n "Proceed with destroy? Type 'destroy' to confirm: "
  # EOF on stdin (non-interactive run without --yes) counts as "not confirmed"
  # instead of aborting silently under `set -e`.
  read -r CONFIRM || CONFIRM=""
  if [[ "$CONFIRM" != "destroy" ]]; then
    echo "Cancelled"
    exit 0
  fi
fi

# Execute --------------------------------------------------------------------

STACK_DELETED="false"
RG_DELETED="false"
ALREADY_GONE="true"
START_TIME=$(date +%s)

# Primary path: stack delete
#
# Two modes:
#   --wait (sync, matches CI):   az stack sub delete --yes (blocks until
#                                Azure has finished both resource deletion
#                                and stack-metadata cleanup; ~5 min for a
#                                small stack)
#   default (fast, interactive): start the same command in the background,
#                                poll each managed RG with `az group exists`
#                                until it returns false (~90s for the same
#                                small stack), then return. Azure CLI does
#                                not expose --no-wait on `az stack sub
#                                delete`, so the slow stack-metadata cleanup
#                                finishes asynchronously after the script
#                                exits.
if [[ -n "$STACK_ID" ]]; then
  STACK_EXISTS=$(az stack sub show --name "$DEPLOYMENT_ID" --query "id" -o tsv 2>/dev/null || echo "")
  if [[ -n "$STACK_EXISTS" ]]; then
    ALREADY_GONE="false"
    MANAGED_RG_COUNT=$(echo "$MANAGED_RGS_JSON" | jq 'length')
    if [[ "$WAIT_FLAG" == "true" || "$MANAGED_RG_COUNT" -eq 0 ]]; then
      if [[ "$WAIT_FLAG" == "true" ]]; then
        echo -e "${BLUE}🗑️ Deleting deployment stack (sync wait): $DEPLOYMENT_ID${NC}"
      else
        # Fast mode has nothing to poll without resourceGroups[].
        echo -e "${YELLOW}⚠️ No resourceGroups[] in state.json — falling back to sync wait${NC}"
      fi
      if az "${STACK_DELETE_ARGS[@]}" 2>&1; then
        STACK_DELETED="true"
      else
        echo -e "${RED}❌ Stack delete failed${NC}"
      fi
    else
      echo -e "${BLUE}🗑️ Submitting stack delete (fast mode): $DEPLOYMENT_ID${NC}"
      STACK_DELETE_LOG=$(mktemp)
      # Background the blocking stack delete; we exit as soon as the
      # managed RGs are gone, leaving Azure to finish stack-metadata
      # cleanup asynchronously. nohup keeps it alive past a terminal
      # hangup. The job is deliberately NOT disowned: a disowned job can
      # no longer be `wait`ed on, so its exit status would be lost.
      nohup az "${STACK_DELETE_ARGS[@]}" > "$STACK_DELETE_LOG" 2>&1 &
      STACK_BG_PID=$!
      BG_REAPED="false"   # set once the background job has been waited on
      BG_EXIT=0           # its exit status, valid when BG_REAPED == true

      echo -e "${BLUE}⏳ Polling $MANAGED_RG_COUNT managed resource group(s) (timeout: ${POLL_TIMEOUT}s)...${NC}"
      POLL_START=$(date +%s)
      POLL_FAILED="false"
      for RG in $(echo "$MANAGED_RGS_JSON" | jq -r '.[]'); do
        while true; do
          ELAPSED=$(($(date +%s) - POLL_START))
          if [[ $ELAPSED -ge $POLL_TIMEOUT ]]; then
            echo -e "${RED} ⚠️ Timeout (${ELAPSED}s) polling $RG${NC}"
            if [[ -s "$STACK_DELETE_LOG" ]]; then
              echo -e "${YELLOW} Background stack-delete output:${NC}"
              sed 's/^/ /' "$STACK_DELETE_LOG"
            fi
            echo -e "${YELLOW} Rerun with --wait for synchronous diagnostics${NC}"
            POLL_FAILED="true"
            break
          fi
          # If the bg process already failed, surface it early. `wait` only
          # reports a job's status once, so reap it a single time and cache
          # the result. (`wait … || true; BG_EXIT=$?` always recorded 0.)
          if [[ "$BG_REAPED" != "true" ]] && ! kill -0 "$STACK_BG_PID" 2>/dev/null; then
            BG_EXIT=0
            wait "$STACK_BG_PID" 2>/dev/null || BG_EXIT=$?
            BG_REAPED="true"
          fi
          if [[ "$BG_REAPED" == "true" && $BG_EXIT -ne 0 ]]; then
            EXISTS=$(az group exists --name "$RG" 2>/dev/null || echo "true")
            if [[ "$EXISTS" == "true" ]]; then
              echo -e "${RED} ❌ Background stack-delete exited (code $BG_EXIT) before $RG was removed${NC}"
              if [[ -s "$STACK_DELETE_LOG" ]]; then
                sed 's/^/ /' "$STACK_DELETE_LOG"
              fi
              POLL_FAILED="true"
              break
            fi
          fi
          # A failed CLI call (network blip, expired token) is NOT evidence
          # that the RG is gone: treat it as "still there" and keep polling
          # until the timeout, rather than reporting a false success.
          EXISTS=$(az group exists --name "$RG" 2>/dev/null || echo "true")
          if [[ "$EXISTS" != "true" ]]; then
            echo -e "${GREEN} ✓ $RG gone (${ELAPSED}s)${NC}"
            break
          fi
          sleep "$POLL_INTERVAL"
        done
        if [[ "$POLL_FAILED" == "true" ]]; then break; fi
      done
      rm -f -- "$STACK_DELETE_LOG"
      if [[ "$POLL_FAILED" == "true" ]]; then
        STACK_DELETED="false"
      else
        STACK_DELETED="true"
        echo -e "${BLUE}ℹ️ Azure is finishing stack-metadata cleanup asynchronously${NC}"
      fi
    fi
  else
    echo -e "${YELLOW}Stack already gone — skipping stack delete${NC}"
    STACK_DELETED="true"
  fi
fi

# Fallback path: resource group delete (only when no stack was used)
if [[ -z "$STACK_ID" && -n "$RG_NAME" ]]; then
  RG_EXISTS=$(az group exists --name "$RG_NAME" 2>/dev/null || echo "false")
  if [[ "$RG_EXISTS" == "true" ]]; then
    ALREADY_GONE="false"
    echo -e "${BLUE}🗑️ Deleting resource group: $RG_NAME${NC}"
    if az group delete --name "$RG_NAME" --yes 2>&1; then
      RG_DELETED="true"
    else
      echo -e "${RED}❌ Resource group delete failed${NC}"
    fi
  else
    echo -e "${YELLOW}Resource group already gone — skipping${NC}"
    RG_DELETED="true"
  fi
fi

# Soft-delete purge sweep
PURGE_RESULTS="[]"
RETAINED_COUNT=0
# Patterns kept in variables so the regex is never subject to shell quoting.
LOCATION_RE='locations/([^/]+)'
RESOURCE_GROUP_RE='/[Rr]esource[Gg]roups/([^/]+)'
if [[ "$SOFT_COUNT" -gt 0 ]] && [[ "$STACK_DELETED" == "true" || "$RG_DELETED" == "true" ]]; then
  echo -e "${BLUE}🧹 Purging soft-deleted resources...${NC}"
  # Rows are base64-encoded so each JSON object survives word splitting.
  for ROW in $(echo "$SOFT_DELETABLE" | jq -r '.[] | @base64'); do
    DECODED=$(echo "$ROW" | base64 -d)
    RES_TYPE=$(echo "$DECODED" | jq -r '.type')
    RES_ID=$(echo "$DECODED" | jq -r '.id')
    PURGE_PROTECTED=$(echo "$DECODED" | jq -r '.purgeProtected')
    RES_NAME="${RES_ID##*/}"   # last path segment of the resource ID

    case "$RES_TYPE" in
      "Microsoft.KeyVault/vaults")
        DELETED_VAULT=$(az keyvault list-deleted --query "[?name=='$RES_NAME']" -o json 2>/dev/null || echo "[]")
        if [[ "$(echo "$DELETED_VAULT" | jq 'length')" -gt 0 ]]; then
          if [[ "$PURGE_PROTECTED" == "true" ]]; then
            echo " ⚠️ $RES_NAME: soft-deleted but purge-protected — retained"
            RETAINED_COUNT=$((RETAINED_COUNT + 1))
            PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg n "$RES_NAME" --arg t "$RES_TYPE" \
              '. + [{name:$n, type:$t, action:"retained-soft-deleted", reason:"purge-protected"}]')
          else
            echo " 🗑️ Purging vault: $RES_NAME"
            if az keyvault purge --name "$RES_NAME" 2>/dev/null; then
              PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg n "$RES_NAME" --arg t "$RES_TYPE" \
                '. + [{name:$n, type:$t, action:"purged"}]')
            else
              echo " ⚠️ Failed to purge vault: $RES_NAME"
              RETAINED_COUNT=$((RETAINED_COUNT + 1))
              PURGE_RESULTS=$(echo "$PURGE_RESULTS" | jq --arg n "$RES_NAME" --arg t "$RES_TYPE" \
                '. + [{name:$n, type:$t, action:"purge-failed"}]')
            fi
          fi
        else
          echo " ✓ $RES_NAME: not in soft-deleted state"
        fi
        ;;
      "Microsoft.CognitiveServices/accounts")
        if [[ "$PURGE_PROTECTED" != "true" ]]; then
          # Capture-group match, same as the PowerShell sibling. The previous
          # `grep -oE '(?<=locations/)…'` used a PCRE lookbehind that ERE does
          # not support, so the location was never found.
          LOC=""
          if [[ "$RES_ID" =~ $LOCATION_RE ]]; then
            LOC="${BASH_REMATCH[1]}"
          fi
          # The purge call needs the account's original resource group; take
          # it from the resource ID when present (empty otherwise, as before).
          RES_RG=""
          if [[ "$RES_ID" =~ $RESOURCE_GROUP_RE ]]; then
            RES_RG="${BASH_REMATCH[1]}"
          fi
          if [[ -n "$LOC" ]]; then
            # Best effort: a failed purge only means the soft-deleted account
            # expires on its own schedule.
            az cognitiveservices account purge --name "$RES_NAME" --location "$LOC" \
              --resource-group "$RES_RG" 2>/dev/null || true
          fi
        fi
        ;;
      *)
        echo " ℹ️ $RES_TYPE: no purge implementation (soft-delete will expire naturally)"
        ;;
    esac
  done
fi

# Clean subscription deployment history entry to stay under the 800/scope limit
az deployment sub delete --name "$DEPLOYMENT_ID" 2>/dev/null || true

END_TIME=$(date +%s)
DURATION=$((END_TIME - START_TIME))

# Determine final status
if [[ "$ALREADY_GONE" == "true" ]]; then
  STATUS="already-destroyed"
elif [[ "$STACK_DELETED" == "true" || "$RG_DELETED" == "true" ]]; then
  if [[ "$RETAINED_COUNT" -gt 0 ]]; then
    STATUS="retained-soft-deleted"
  else
    STATUS="destroyed"
  fi
else
  STATUS="destroy-failed"
fi

# Update state.json + metadata.json (write to a temp file, then rename, so a
# jq failure never truncates the original)
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%SZ)
ACTOR=$(az account show --query user.name -o tsv 2>/dev/null || echo unknown)
jq --arg status "$STATUS" --arg ts "$TIMESTAMP" \
  --arg actor "$ACTOR" \
  --arg duration "${DURATION}s" \
  --argjson purgeResults "$PURGE_RESULTS" \
  '. + {status:$status, destroyedAt:$ts, destroyedBy:$actor, destroyDuration:$duration, purgeResults:$purgeResults}' \
  "$STATE_FILE" > "${STATE_FILE}.tmp" && mv "${STATE_FILE}.tmp" "$STATE_FILE"

if [[ -f "$DEPLOYMENT_PATH/metadata.json" ]]; then
  jq --arg status "$STATUS" '.status = $status' \
    "$DEPLOYMENT_PATH/metadata.json" > "$DEPLOYMENT_PATH/metadata.json.tmp" \
    && mv "$DEPLOYMENT_PATH/metadata.json.tmp" "$DEPLOYMENT_PATH/metadata.json"
fi

echo ""
echo -e "${GREEN}=== Destroy Summary ===${NC}"
echo "Status: $STATUS"
echo "Duration: ${DURATION}s"
if [[ "$RETAINED_COUNT" -gt 0 ]]; then
  echo -e "${YELLOW}Retained: $RETAINED_COUNT soft-deleted resource(s) (purge-protected)${NC}"
fi
echo -e "${GREEN}=======================${NC}"
+ VERBOSE_LOG=$(mktemp) + trap 'rm -f "$VERBOSE_LOG"' EXIT if [[ "$DEPLOY_METHOD" == "stack" ]]; then DEPLOY_OUTPUT=$(az stack sub create \ @@ -256,18 +265,24 @@ jobs: --parameters @"$DEPLOY_DIR/parameters.json" \ --action-on-unmanage deleteAll \ --deny-settings-mode none \ + --description "Git-Ape deployment $DEPLOYMENT_ID" \ + --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ --yes \ - --output json 2>&1) + --verbose \ + --output json 2>"$VERBOSE_LOG") else DEPLOY_OUTPUT=$(az deployment sub create \ --name "$DEPLOYMENT_ID" \ --location "$LOCATION" \ --template-file "$DEPLOY_DIR/template.json" \ --parameters @"$DEPLOY_DIR/parameters.json" \ - --output json 2>&1) + --output json 2>"$VERBOSE_LOG") fi EXIT_CODE=$? + if [[ $EXIT_CODE -ne 0 ]]; then + cat "$VERBOSE_LOG" >&2 + fi END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) @@ -285,6 +300,20 @@ jobs: echo "==========================================" echo "$DEPLOY_OUTPUT" echo "==========================================" + + # Surface underlying failed operations โ€” the stack/deployment top-level + # error is usually a summary; the real root cause lives in the per-resource + # operations list. + echo "::group::Underlying failed operations" + az deployment sub show --name "$DEPLOYMENT_ID" --output json 2>/dev/null \ + | jq -r '.properties // {}' \ + || echo "No subscription-scope deployment details available." + az deployment operation sub list --name "$DEPLOYMENT_ID" --output json 2>/dev/null \ + | jq -r '.[] | select(.properties.provisioningState == "Failed") | + "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\nResource : \(.properties.targetResource.resourceName // "n/a") (\(.properties.targetResource.resourceType // "n/a"))\nStatus : \(.properties.statusCode // "n/a")\nMessage : \(.properties.statusMessage.error.message // .properties.statusMessage // "n/a")"' \ + || echo "No per-operation details available (deployment may not have reached Azure)." 
+ echo "::endgroup::" + echo "::error::Deployment failed โ€” see output above for details" exit 1 fi @@ -487,6 +516,7 @@ jobs: # Create/update state.json with extended schema jq -n \ + --arg schemaVersion "1.0" \ --arg deploymentId "${{ matrix.deployment_id }}" \ --arg timestamp "$TIMESTAMP" \ --arg status "$STATUS" \ @@ -506,6 +536,7 @@ jobs: --argjson resourceGroups "$RESOURCE_GROUPS" \ --argjson subscriptions "[\"${{ secrets.AZURE_SUBSCRIPTION_ID }}\"]" \ '{ + schemaVersion: $schemaVersion, deploymentId: $deploymentId, timestamp: $timestamp, status: $status, diff --git a/.github/workflows/git-ape-destroy.exampleyml b/.github/workflows/git-ape-destroy.exampleyml index 2f58066..75eea1f 100644 --- a/.github/workflows/git-ape-destroy.exampleyml +++ b/.github/workflows/git-ape-destroy.exampleyml @@ -303,6 +303,7 @@ jobs: az stack sub delete \ --name "$DEPLOYMENT_ID" \ --action-on-unmanage deleteAll \ + --bypass-stack-out-of-sync-error true \ --yes 2>&1 || { echo "destroy_status=failed" >> "$GITHUB_OUTPUT" echo "::error::Failed to delete deployment stack $DEPLOYMENT_ID" diff --git a/plugin.json b/plugin.json index 7288d52..ed5ee6e 100644 --- a/plugin.json +++ b/plugin.json @@ -1,7 +1,7 @@ { "name": "git-ape", "description": "Intelligent Azure deployment agent system for GitHub Copilot. Provides guided, safe, and validated Azure resource deployments using ARM templates, with built-in security analysis, cost estimation, and CI/CD pipeline integration.", - "version": "0.0.1", + "version": "0.1.0", "author": { "name": "Microsoft", "url": "https://github.com/Azure/git-ape" diff --git a/website/docs/agents/azure-resource-deployer.md b/website/docs/agents/azure-resource-deployer.md index c0cbcc6..c8b8b4b 100644 --- a/website/docs/agents/azure-resource-deployer.md +++ b/website/docs/agents/azure-resource-deployer.md @@ -121,33 +121,47 @@ Before deploying, verify: ### 2. 
Execute Deployment -Use Azure MCP `deploy` service or Azure CLI: +**Always deploy as a subscription-scoped Deployment Stack.** Stacks track every managed resource (across resource groups and subscription scope) and make destroy idempotent โ€” a single `az stack sub delete --action-on-unmanage deleteAll` removes everything the stack owns, regardless of resource scope. -**Option A: Azure MCP (Preferred)** -``` -Use mcp_azure_mcp_search with "deploy" intent to execute template deployment -- Set deployment name: "git-ape-{timestamp}" -- Set mode: "Incremental" (default) or "Complete" (if user specified) -- Monitor deployment with progress updates -``` +> **Single source of truth:** the deploy command, fallback handling, state.json writer, soft-delete classification, and Key Vault purge-protection detection all live in the [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) skill. Both bash and PowerShell implementations are provided. -**Option B: Azure CLI (Fallback)** +**Pre-flight: validate the stack before deploying** -**Always use subscription-level deployment** โ€” the ARM template includes resource group creation, so we deploy at subscription scope: +Use `az stack sub validate` (not `az deployment sub validate`) so the validation also checks the stack-specific flags (`--action-on-unmanage`, `--deny-settings-mode`) โ€” not just the template: ```bash -# Subscription-level deployment (creates RG + all resources atomically) -az deployment sub create \ +az stack sub validate \ --name "{deployment-id}" \ --location {location} \ --template-file {template.json} \ --parameters @{parameters.json} \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ --output json ``` -**DO NOT use `az deployment group create`** โ€” our templates always include the resource group as a resource. Subscription-level deployment handles everything in one command. +**Invoke the deploy skill** -Capture the deployment operation ID for tracking. 
+```bash +# Bash +.github/skills/azure-stack-deploy/scripts/deploy-stack.sh \ + --deployment-id "{deployment-id}" + +# PowerShell +.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 ` + -DeploymentId "{deployment-id}" +``` + +The skill: +- Calls `az stack sub create --action-on-unmanage deleteAll --deny-settings-mode none --description "Git-Ape deployment {id}" --tags managedBy=git-ape deploymentId={id} --yes --verbose` +- Falls back to `az deployment sub create` only if the stack call fails (warns the user — fallback path does NOT solve soft-delete / multi-RG / sub-scope idempotency) +- On any failure, dumps the per-operation failure list inline so the root cause is immediately visible +- On success, captures the `stackId`, classifies every managed resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` (schemaVersion 1.0) +- Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` + +Pass `--no-fallback` (bash) / `-NoFallback` (pwsh) when the user explicitly wants to fail loudly instead of accepting the legacy path. + +**DO NOT use `az deployment group create`** — our templates always include the resource group as a resource. Subscription scope handles everything in one command. ### 3.
Monitor Progress @@ -175,15 +189,27 @@ Status updates: **Monitoring Commands:** ```bash -# Check deployment status (subscription-level) +# Stack path โ€” check stack provisioning state +az stack sub show \ + --name {deployment-id} \ + --query "provisioningState" \ + --output tsv + +# Stack path โ€” list managed resources (post-deploy or in-progress) +az stack sub show \ + --name {deployment-id} \ + --query "resources[].{Id:id, Status:status}" \ + --output table + +# Fallback path โ€” subscription deployment az deployment sub show \ - --name {deployment-name} \ + --name {deployment-id} \ --query "properties.provisioningState" \ --output tsv -# Get deployment operations (detailed resource status) +# Fallback path โ€” deployment operations (detailed resource status) az deployment operation sub list \ - --name {deployment-name} \ + --name {deployment-id} \ --query "[].{Resource:properties.targetResource.resourceName, Type:properties.targetResource.resourceType, Status:properties.provisioningState}" \ --output table ``` @@ -219,13 +245,18 @@ Use mcp_azure_mcp_search to query deployed resources and verify: ### 5. Capture Deployment Outputs -Extract and report deployment outputs (defined in ARM template `outputs` section): +Extract and report deployment outputs: ```bash -# Get deployment outputs -az deployment group show \ - --name {deployment-name} \ - --resource-group {rg-name} \ +# Stack path โ€” outputs are on the stack itself +az stack sub show \ + --name {deployment-id} \ + --query "outputs" \ + --output json + +# Fallback path โ€” subscription deployment outputs +az deployment sub show \ + --name {deployment-id} \ --query "properties.outputs" \ --output json ``` @@ -237,7 +268,25 @@ Common outputs to capture: - Managed identity principal IDs - Dashboard/monitoring URLs -### 6. Report Deployment Results +### 6. 
Verify `state.json` was written + +The [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) skill writes `state.json` (schemaVersion 1.0) and updates `metadata.json` with `deployMethod` and `resourceGroups[]` as part of step 2. The agent's job here is to confirm the write succeeded and surface its contents for the user. + +```bash +DEPLOYMENT_ID="{deployment-id}" +DEPLOY_DIR=".azure/deployments/$DEPLOYMENT_ID" +[[ -f "$DEPLOY_DIR/state.json" ]] || { echo "state.json missing — deploy skill did not complete"; exit 1; } + +# Sanity-check the schema and the lifecycle owner +jq '{schemaVersion, deploymentId, deployMethod, stackId, resourceGroups, managedResourceCount: (.managedResources | length)}' \ + "$DEPLOY_DIR/state.json" +``` + +If `deployMethod == "stack"` and `stackId` is empty, the deploy fell back silently — re-run the skill with `--no-fallback` to surface why stacks were rejected. + +The destroy skill ([`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md)) consumes this file as its sole source of truth. + +### 7. Report Deployment Results Provide a comprehensive summary: @@ -270,7 +319,9 @@ Provide a comprehensive summary: To destroy this deployment and delete all its resources: > `@git-ape destroy deployment {deployment-id}` > -> Or via GitHub: create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval +> Locally this invokes the [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skill, which uses `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` (single command, idempotent across resource groups and subscription scope) and purges any soft-deletable resources that are not purge-protected. +> +> Or via GitHub: create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval.
**Deployment Logs:** {Link to deployment logs if available} ``` @@ -279,7 +330,17 @@ To destroy this deployment and delete all its resources: ### Deployment Failure -If deployment fails, provide detailed diagnostics: +If deployment fails, **always dump the underlying failed operations before presenting options to the user**. The stack/deployment top-level error is usually just a summary; the real root cause is in the per-resource operations list. + +```bash +# Inline failure diagnostics โ€” run BEFORE asking the user what to do +echo "โ”€โ”€ Underlying failed operations โ”€โ”€" +az deployment operation sub list --name "{deployment-id}" --output json 2>/dev/null \ + | jq -r '.[] | select(.properties.provisioningState == "Failed") | + "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\nResource : \(.properties.targetResource.resourceName // "n/a") (\(.properties.targetResource.resourceType // "n/a"))\nStatus : \(.properties.statusCode // "n/a")\nMessage : \(.properties.statusMessage.error.message // .properties.statusMessage // "n/a")"' +``` + +Then surface the diagnostics in the user-facing message: ```markdown โŒ **Deployment Failed** @@ -292,6 +353,9 @@ If deployment fails, provide detailed diagnostics: - {Likely cause 1 based on error} - {Likely cause 2} +**Per-Resource Failures:** +{Output of `az deployment operation sub list` filtered to Failed entries} + **Diagnostic Details:** {Full error from Azure} @@ -351,24 +415,26 @@ Type A, B, C, or D: # Option A: Full Rollback if [[ "$USER_CHOICE" == "A" ]]; then # Confirm first - echo "โš ๏ธ This will DELETE all resources. Type 'confirm rollback' to proceed." + echo "โš ๏ธ This will DELETE all managed resources. Type 'confirm rollback' to proceed." 
read CONFIRMATION - + if [[ "$CONFIRMATION" == "confirm rollback" ]]; then - # Delete resources - az resource delete --ids {resource-id-1} {resource-id-2} - - # If RG was created new, delete it - if [[ "$RG_NEW" == "true" ]]; then - az group delete --name {rg-name} --yes --no-wait - fi - + # Single source of truth: the destroy skill handles stack delete, + # fallback RG delete, soft-delete purge sweep, and state.json updates. + .github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id {deployment-id} \ + --yes + # PowerShell equivalent: + # .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId {deployment-id} -Yes + # Log rollback - echo "Rollback completed" >> .azure/deployments/{deployment-id}/deployment.log + echo "Rollback completed via azure-stack-destroy skill" >> .azure/deployments/{deployment-id}/deployment.log fi fi ``` +> **Important:** Never mix individual `az resource delete` calls when a `stackId` is present in `state.json`. The stack path is canonical โ€” always invoke the [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skill, which encapsulates the stack delete, fallback RG delete, and soft-delete purge sweep (Key Vault, Cognitive Services, etc.) for any resources that are not purge-protected. + **Step 4: Update deployment state:** ```json // .azure/deployments/{deployment-id}/metadata.json diff --git a/website/docs/agents/azure-template-generator.md b/website/docs/agents/azure-template-generator.md index 3a1f811..f6a28eb 100644 --- a/website/docs/agents/azure-template-generator.md +++ b/website/docs/agents/azure-template-generator.md @@ -160,7 +160,7 @@ see [git-ape.agent.md](git-ape). 
- Resource Group is a `Microsoft.Resources/resourceGroups` resource inside the template - Other resources go inside a nested `Microsoft.Resources/deployments` with `"resourceGroup"` property - Use `subscriptionResourceId()` for RG references, regular `resourceId()` inside nested -- Deploy with `az deployment sub create` (not `az deployment group create`) +- Deploy with `az stack sub create --action-on-unmanage deleteAll` (preferred) or `az deployment sub create` as a fallback (not `az deployment group create`) - `uniqueString()` uses `subscription().subscriptionId` instead of `resourceGroup().id` **Nested Template Requirements:** @@ -716,7 +716,30 @@ After showing the preview, provide the complete ARM template: ## Deployment Commands -**Azure CLI (Subscription-level deployment):** +The canonical deploy and destroy paths live in the [`azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) and [`azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) skills. The commands below are reference recipes โ€” prefer invoking the skills so local CLI / VS Code and CI pipelines stay in sync. 
+ +**Azure CLI (Subscription-scoped Deployment Stack โ€” preferred):** +```bash +az stack sub create \ + --name {deployment-id} \ + --location {location} \ + --template-file template.json \ + --parameters @parameters.json \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment {deployment-id}" \ + --tags "managedBy=git-ape" "deploymentId={deployment-id}" \ + --yes \ + --verbose +``` + +The stack tracks every managed resource (across resource groups and subscription scope), so destroy is a single idempotent command: + +```bash +az stack sub delete --name {deployment-id} --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true --yes +``` + +**Azure CLI (Subscription-level deployment โ€” fallback only):** ```bash az deployment sub create \ --name {deployment-id} \ @@ -725,7 +748,20 @@ az deployment sub create \ --parameters @parameters.json ``` -**PowerShell:** +Use the fallback only when Deployment Stacks are unavailable in the target subscription/region. The fallback does NOT solve the soft-delete / multi-RG / sub-scope idempotency problem. + +**PowerShell (Deployment Stack โ€” preferred):** +```powershell +New-AzSubscriptionDeploymentStack ` + -Name {deployment-id} ` + -Location {location} ` + -TemplateFile template.json ` + -TemplateParameterFile parameters.json ` + -ActionOnUnmanage DeleteAll ` + -DenySettingsMode None +``` + +**PowerShell (subscription deployment โ€” fallback):** ```powershell New-AzSubscriptionDeployment ` -Name {deployment-id} ` @@ -734,7 +770,7 @@ New-AzSubscriptionDeployment ` -TemplateParameterFile parameters.json ``` -**Note:** We use subscription-level deployments so the resource group is created as part of the template. No need to create the RG separately. +**Note:** We use subscription scope so the resource group is created as part of the template. No need to create the RG separately. 
```` ## Constraints diff --git a/website/docs/agents/git-ape.md b/website/docs/agents/git-ape.md index 102e292..185344f 100644 --- a/website/docs/agents/git-ape.md +++ b/website/docs/agents/git-ape.md @@ -137,7 +137,7 @@ Git-Ape can run in two modes. Detect which mode is active and adapt behavior acc | Validation | Run locally | `git-ape-plan.yml` runs on PR, posts what-if as comment | | Confirmation | Ask user interactively | PR approval = confirmation | | Deployment | Execute immediately | `git-ape-deploy.yml` runs on merge or `/deploy` comment | -| Destroy | Execute after confirmation | PR sets `metadata.json` status to `destroy-requested` โ†’ merge triggers `git-ape-destroy.yml` | +| Destroy | Execute via `az stack sub delete --action-on-unmanage deleteAll` after confirmation, then purge soft-deletables | PR sets `metadata.json` status to `destroy-requested` โ†’ merge triggers `git-ape-destroy.yml` (same stack-based flow + soft-delete purge) | | Results | Display in chat | Posted as PR/issue comment + state committed to repo | ## Your Role @@ -394,12 +394,13 @@ The deployment plan MUST start with a clear "Target Environment" table: **Delegate to:** `azure-resource-deployer` The deployer will: -- Execute the ARM template as a **subscription-level deployment** (`az deployment sub create`) +- Execute the ARM template as a **subscription-scoped Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`) so destroy is idempotent across resource groups and subscription scope. The CLI fallback (`az deployment sub create`) is used only if stacks are unavailable. - The ARM template includes resource group creation โ€” everything deploys atomically - Monitor deployment progress in real-time - Handle any deployment failures - Verify resource creation via Azure Resource Graph - Capture deployment outputs (resource IDs, endpoints, etc.) 
+- Capture the **stack ID** plus every managed resource into `state.json` (extended schema: `stackId`, `deployMethod`, `managedResources[]`, `resourceGroups[]`, `subscriptions[]`, `externalReferences[]`) so the destroy path can find them later โ€” including soft-deletable types (Key Vault, Cognitive Services, App Configuration, API Management, ML Workspaces, Recovery Services Vaults). **Deployment Monitoring:** Always poll deployment state every **30 seconds** using `sleep 30` between checks. No exponential backoff โ€” use a fixed 30-second interval for all resources regardless of type or expected duration. Check both the top-level deployment and nested deployment statuses on every poll. @@ -426,7 +427,16 @@ Run post-deployment validation: ``` To destroy this deployment and delete all its resources, use Git-Ape: > @git-ape destroy deployment {deployment-id} - + + Locally, this invokes the `azure-stack-destroy` skill: + > .github/skills/azure-stack-destroy/scripts/destroy-stack.sh --deployment-id {deployment-id} + > # or PowerShell: + > .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId {deployment-id} + + Which uses `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` + (single command, idempotent across resource groups and subscription scope) and + purges any soft-deletable resources that are not purge-protected. 
+ Or via GitHub (if using CI/CD): > Create a PR that sets `metadata.json` status to `destroy-requested`, then merge after approval ``` diff --git a/website/docs/deployment/state.md b/website/docs/deployment/state.md index 7a24263..df430a5 100644 --- a/website/docs/deployment/state.md +++ b/website/docs/deployment/state.md @@ -167,6 +167,7 @@ Contains runtime deployment state populated after `az deployment` or `az stack` ```json { + "schemaVersion": "1.0", "deploymentId": "deploy-20260218-143022", "timestamp": "2026-02-18T14:30:22Z", "status": "succeeded", @@ -215,6 +216,7 @@ Contains runtime deployment state populated after `az deployment` or `az stack` | Field | Type | Description | |-------|------|-------------| +| `schemaVersion` | `string` | State schema version. `"1.0"` is the current Deployment Stacks edition. Tools that consume `state.json` should branch on this when newer schemas ship. | | `stackId` | `string \| null` | Azure Deployment Stack resource ID. When present, destroy uses `az stack sub delete` for complete cleanup. | | `deployMethod` | `"stack" \| "subscription"` | Deployment method used. `stack` = Deployment Stacks (default); `subscription` = legacy `az deployment sub create`. | | `managedResources` | `array` | Flat list of all resources managed by this deployment, regardless of scope. Populated by walking deployment operations recursively. | @@ -230,7 +232,7 @@ Contains runtime deployment state populated after `az deployment` or `az stack` **Destroy strategy selection:** -1. If `stackId` is present โ†’ `az stack sub delete --name --action-on-unmanage deleteAll` +1. If `stackId` is present โ†’ `az stack sub delete --name --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` 2. If `stackId` is null โ†’ fallback to state-driven delete using `managedResources[]` and `resourceGroups[]` 3. 
If neither field is populated (legacy state) โ†’ fall back to single `az group delete` on `resourceGroup` diff --git a/website/docs/skills/azure-stack-deploy.md b/website/docs/skills/azure-stack-deploy.md new file mode 100644 index 0000000..ef2496d --- /dev/null +++ b/website/docs/skills/azure-stack-deploy.md @@ -0,0 +1,161 @@ +--- +title: "Azure Stack Deploy" +sidebar_label: "Azure Stack Deploy" +description: "Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow." +--- + + + + +# Azure Stack Deploy + +> Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow. + +## Details + +| Property | Value | +|----------|-------| +| **Skill Directory** | `.github/skills/azure-stack-deploy/` | +| **Phase** | General | +| **User Invocable** | โœ… Yes | +| **Usage** | `/azure-stack-deploy Deployment ID (folder under .azure/deployments/) โ€” optional --location override` | + + +## Documentation + +# Azure Stack Deploy + +Deploy a Git-Ape deployment artifact as a subscription-scoped **Azure Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`). The stack is the lifecycle owner of every resource the template creates โ€” across resource groups and subscription scope โ€” which makes destroy idempotent in a single call (see [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)). 
+ +This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.exampleyml`, so local deployments and pipeline deployments are interchangeable. + +## When to Use + +- Local deployment from VS Code or terminal (the `git-ape` agent invokes this in Stage 3) +- Re-deploying an existing deployment ID after template edits โ€” stacks are stateful, so this is an in-place update +- Any time you would otherwise run `az deployment sub create` against a Git-Ape `template.json` + +## Prerequisites + +| Tool | Why | +|------|-----| +| `az` (Azure CLI โ‰ฅ 2.59) | `az stack sub` requires CLI โ‰ฅ 2.50; 2.59 has the latest stack flags | +| `jq` | State capture and JSON extraction | +| `bash` โ‰ฅ 4 OR PowerShell 7+ | Either runner works | +| Active `az login` | Skill exits early if no subscription is selected | +| Existing `template.json` (and optional `parameters.json`) under `.azure/deployments//` | Source artifacts | + +## Procedure + +### 1. Locate deployment artifacts + +```bash +DEPLOYMENT_ID="deploy-20260506-001" +DEPLOYMENT_PATH=".azure/deployments/$DEPLOYMENT_ID" + +[[ -f "$DEPLOYMENT_PATH/template.json" ]] || { echo "template.json missing"; exit 1; } +``` + +If `parameters.json` is present, `location`, `project` (or `projectName`), and `environment` are read from it. Defaults: `eastus` / `unknown` / `dev`. + +### 2. Run the script + +```bash +.github/skills/azure-stack-deploy/scripts/deploy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" +``` + +PowerShell equivalent: + +```powershell +.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 ` + -DeploymentId "$DEPLOYMENT_ID" +``` + +The script: + +1. Resolves `location`, `project`, `environment` from `parameters.json` (or defaults) +2. Validates Azure CLI session (`az account show`) +3. 
Calls `az stack sub create` with the canonical Git-Ape flag set: + - `--action-on-unmanage deleteAll` + - `--deny-settings-mode none` + - `--description "Git-Ape deployment "` + - `--tags managedBy=git-ape deploymentId=` + - `--yes --verbose` +4. **On stack failure**, falls back to `az deployment sub create` (warns the user โ€” no soft-delete / multi-RG idempotency on the fallback path) +5. **On any deployment failure**, dumps the per-operation failure list (`az deployment operation sub list`) inline so the root cause is visible without clicking into the Portal +6. **On success**, queries `az stack sub show --query "resources[].id"` for the live managed-resource list, classifies each resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` +7. Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` + +### 3. Inspect output + +```text +โœ… Deployment succeeded in 142s (method: stack) +State written to: .azure/deployments/deploy-20260506-001/state.json +Stack ID: /subscriptions//providers/Microsoft.Resources/deploymentStacks/deploy-20260506-001 + +To destroy this deployment: + /azure-stack-destroy deploy-20260506-001 +``` + +## Arguments + +| Flag (bash) | Param (pwsh) | Required | Description | +|-------------|--------------|----------|-------------| +| `--deployment-id ` | `-DeploymentId ` | yes | Folder name under `.azure/deployments/` | +| `--location ` | `-Location ` | no | Override the location from `parameters.json` | +| `--no-fallback` | `-NoFallback` | no | Fail loudly if the stack call fails instead of falling back to `az deployment sub create` | + +## state.json schema (v1.0) + +```json +{ + "schemaVersion": "1.0", + "deploymentId": "deploy-20260506-001", + "timestamp": "2026-05-06T12:00:00Z", + "status": "succeeded", + "duration": "142s", + "subscription": "", + "location": "eastus", + "project": "myapp", + "environment": "dev", + "resourceGroup": "rg-myapp-dev-eastus", + 
"deployMethod": "stack", + "stackId": "/subscriptions//providers/Microsoft.Resources/deploymentStacks/deploy-20260506-001", + "managedResources": [ + { + "id": "/subscriptions//resourceGroups/rg-myapp-dev-eastus/providers/Microsoft.KeyVault/vaults/kv-myapp-dev-eus", + "type": "Microsoft.KeyVault/vaults", + "scope": "resourceGroup", + "softDeletable": true, + "purgeProtected": true + } + ], + "resourceGroups": ["rg-myapp-dev-eastus"], + "subscriptions": [""], + "externalReferences": [] +} +``` + +See [website/docs/deployment/state.md](../../../website/docs/deployment/state.md) for the full schema reference. + +## Soft-deletable resource types tracked + +`Microsoft.KeyVault/vaults`, `Microsoft.CognitiveServices/accounts`, `Microsoft.AppConfiguration/configurationStores`, `Microsoft.ApiManagement/service`, `Microsoft.MachineLearningServices/workspaces`, `Microsoft.RecoveryServices/vaults`. + +The destroy skill ([`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)) consumes the `softDeletable` and `purgeProtected` fields to drive its purge sweep. 
+ +## Failure modes + +| Symptom | Likely cause | Recovery | +|---------|--------------|----------| +| `Not logged in to Azure` | `az login` missing | Run `az login` then retry | +| `template.json missing` | Wrong deployment ID | Check `.azure/deployments/` contents | +| Stack create fails immediately | Region/policy blocks Deployment Stacks | Re-run without `--no-fallback`, accept the legacy path, or pick a supported region | +| Stack succeeds but `state.json` missing managed resources | `az stack sub show` race condition | Re-run โ€” the script is idempotent (stacks de-duplicate on `--name`) | + +## Related + +- [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md) โ€” the matching destroy skill (single source of truth: `stackId`) +- [`azure-deployment-preflight`](../azure-deployment-preflight/SKILL.md) โ€” what-if and permission checks BEFORE deploy +- [`azure-security-analyzer`](../azure-security-analyzer/SKILL.md) โ€” security gate (BLOCKING) before deploy confirmation diff --git a/website/docs/skills/azure-stack-destroy.md b/website/docs/skills/azure-stack-destroy.md new file mode 100644 index 0000000..99029b6 --- /dev/null +++ b/website/docs/skills/azure-stack-destroy.md @@ -0,0 +1,149 @@ +--- +title: "Azure Stack Destroy" +sidebar_label: "Azure Stack Destroy" +description: "Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow." +--- + + + + +# Azure Stack Destroy + +> Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. 
Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow. + +## Details + +| Property | Value | +|----------|-------| +| **Skill Directory** | `.github/skills/azure-stack-destroy/` | +| **Phase** | General | +| **User Invocable** | ✅ Yes | +| **Usage** | `/azure-stack-destroy Deployment ID — add --yes to skip the typed confirmation` | + + +## Documentation + +# Azure Stack Destroy + +Destroy a Git-Ape deployment by deleting its subscription-scoped **Azure Deployment Stack** in a single idempotent call (`az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true`). The stack owns every resource the matching deploy created — across resource groups and subscription scope — so one delete cleans up everything. + +After the stack is gone, this skill performs a **soft-delete purge sweep**: soft-deleted Key Vaults and Cognitive Services accounts are purged, while the other tracked soft-deletable types (App Configuration, API Management, ML workspaces, Recovery Services vaults) are skipped and left to expire naturally. Resources flagged `purgeProtected: true` in `state.json` are intentionally retained. + +This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destroys and CI destroys are interchangeable. 
+ +## When to Use + +- User says: "destroy this deployment", "tear down deploy-XXX", "clean up the stack" +- Pair with the matching [`azure-stack-deploy`](../azure-stack-deploy/SKILL.md) — same stack, same `state.json` key (`stackId`) +- Any time you would otherwise run `az group delete` against a Git-Ape deployment (don't — you'll miss soft-delete cleanup and multi-RG resources) + +## Prerequisites + +| Tool | Why | +|------|-----| +| `az` (Azure CLI ≥ 2.59) | `az stack sub delete --bypass-stack-out-of-sync-error` requires a recent CLI | +| `jq` | Read state.json | +| `bash` ≥ 4 OR PowerShell 7+ | Either runner works | +| Active `az login` | Must be the same subscription where the stack lives | +| Existing `state.json` under `.azure/deployments/{deployment-id}/` | Source of truth for `stackId`, `managedResources`, `softDeletable`, `purgeProtected` | + +The skill **refuses to run** without `state.json`. Re-deploy first or hand-write a minimal state file (not recommended). + +## Procedure + +### 1. Identify deployment + +```bash +DEPLOYMENT_ID="deploy-20260506-001" +DEPLOYMENT_PATH=".azure/deployments/$DEPLOYMENT_ID" +[[ -f "$DEPLOYMENT_PATH/state.json" ]] || { echo "state.json missing — cannot destroy"; exit 1; } +``` + +### 2. Run the script + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" +``` + +Skip the confirmation prompt (use only in automation): + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" \ + --yes +``` + +PowerShell equivalents: + +```powershell +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" -Yes +``` + +### 3. What the script does + +1. Reads `state.json` and extracts `stackId`, `deployMethod`, `resourceGroup`, and `managedResources[]` (including each entry's `softDeletable` and `purgeProtected` flags) +2. 
Prints a **destroy plan** โ€” stack ID, resource group, count of soft-deletables (with purge-protection flagged) +3. Prompts for typed `destroy` confirmation (unless `--yes`) +4. **Stack delete path** (`stackId` present): + - `az stack sub delete --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true --yes` + - The bypass flag is safe in destroy because it's a one-shot operation โ€” we don't need the stale-manifest safety check that protects iterative updates +5. **Fallback path** (no `stackId`, only `resourceGroup`): `az group delete --name --yes` +6. **Purge sweep** for each `softDeletable` resource not marked `purgeProtected`: + - Key Vaults: `az keyvault list-deleted` + `az keyvault purge` + - Cognitive Services: `az cognitiveservices account purge` + - Other types: skipped (soft-delete expires naturally) +7. Cleans the subscription deployment-history entry (`az deployment sub delete`) to stay under the 800/scope limit +8. Updates `state.json` and `metadata.json` with terminal status: + +| Status | Meaning | +|--------|---------| +| `destroyed` | Stack/RG gone and all soft-deletables purged or absent | +| `retained-soft-deleted` | Stack gone but at least one soft-deletable retained (purge-protected or purge failed) | +| `partially-destroyed` | Stack delete partially failed | +| `destroy-failed` | Stack/RG delete failed entirely | +| `already-destroyed` | Stack and RG were already gone before this call | + +### 4. Inspect the result + +```text +=== Destroy Summary === +Status: destroyed +Duration: 87s +======================= +``` + +Or, when something is intentionally retained: + +```text +=== Destroy Summary === +Status: retained-soft-deleted +Duration: 92s +Retained: 1 soft-deleted resource(s) (purge-protected) +======================= +``` + +`state.json` gains `destroyedAt`, `destroyedBy`, `destroyDuration`, and a `purgeResults[]` array describing each soft-deletable's outcome. 
+ +## Arguments + +| Flag (bash) | Param (pwsh) | Required | Description | +|-------------|--------------|----------|-------------| +| `--deployment-id {deployment-id}` | `-DeploymentId {deployment-id}` | yes | Folder name under `.azure/deployments/` | +| `--yes` | `-Yes` | no | Skip the typed `destroy` confirmation prompt (CI-only) | + +## Failure modes + +| Symptom | Likely cause | Recovery | +|---------|--------------|----------| +| `state.json missing` | Deployment never reached the state-write phase, or was hand-edited | Re-deploy (idempotent on stack name) then destroy, OR delete the `.azure/deployments/{deployment-id}/` folder if Azure has nothing | +| `Stack out of sync` despite `--bypass-stack-out-of-sync-error` | Old CLI version | Upgrade `az` to ≥ 2.59 | +| Key Vault purge fails | Vault is purge-protected (`purgeProtected: true`) | Expected — purge protection cannot be disabled once it is enabled, so the vault cannot be purged manually; wait for the soft-delete retention period (7-90 days) to expire, after which Azure removes it permanently | +| `Cannot delete resource group …`/`InUseSubnetCannotBeDeleted` | A resource outside the stack references one inside (e.g. external subnet peered to a deleted VNet) | Inspect `externalReferences[]` in `state.json`; remove the reference and rerun | + +## Related + +- [`azure-stack-deploy`](../azure-stack-deploy/SKILL.md) — the matching deploy skill (writes the `state.json` this skill consumes) +- [`azure-drift-detector`](../azure-drift-detector/SKILL.md) — check for unmanaged drift BEFORE destroy +- [`azure-resource-visualizer`](../azure-resource-visualizer/SKILL.md) — visualize what's in the stack before tearing it down diff --git a/website/docs/skills/overview.md b/website/docs/skills/overview.md index ab26a81..a71485a 100644 --- a/website/docs/skills/overview.md +++ b/website/docs/skills/overview.md @@ -40,6 +40,13 @@ Skills are focused capabilities invoked by agents at specific stages of the depl | [Azure Rest Api Reference](./azure-rest-api-reference) | Look up Azure REST API and ARM template reference documentation for any resource type. 
Returns exact property schemas, required fields, valid values, and latest stable API versions. Use BEFORE generating or modifying ARM templates to ensure correctness. No Azure connection required. | โœ… | | [Git Ape Onboarding](./git-ape-onboarding) | Onboard a repository, Azure subscription(s), and user identity for Git-Ape CI/CD using a skill-driven CLI playbook. Use for first-time setup of OIDC, federated credentials, RBAC, GitHub environments, and required secrets. | โœ… | +## General Skills + +| Skill | Description | Invocable | +|-------|-------------|:---------:| +| [Azure Stack Deploy](./azure-stack-deploy) | Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow. | โœ… | +| [Azure Stack Destroy](./azure-stack-destroy) | Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow. | โœ… | + ## Skill Invocation in Deployment Flow ```mermaid diff --git a/website/docs/workflows/git-ape-deploy.md b/website/docs/workflows/git-ape-deploy.md index 437c080..87705dd 100644 --- a/website/docs/workflows/git-ape-deploy.md +++ b/website/docs/workflows/git-ape-deploy.md @@ -262,10 +262,15 @@ jobs: - name: Validate before deploy run: | - az deployment sub validate \ + # Stack-aware validation โ€” checks both the template and the + # stack-specific flags (--action-on-unmanage, --deny-settings-mode). 
+ az stack sub validate \ + --name "${{ matrix.deployment_id }}" \ --location "${{ steps.params.outputs.location }}" \ --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ --output json - name: Run Microsoft Defender for DevOps template analyzer @@ -321,7 +326,10 @@ jobs: --parameters @"$DEPLOY_DIR/parameters.json" \ --action-on-unmanage deleteAll \ --deny-settings-mode none \ + --description "Git-Ape deployment $DEPLOYMENT_ID" \ + --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ --yes \ + --verbose \ --output json 2>&1) else DEPLOY_OUTPUT=$(az deployment sub create \ @@ -350,6 +358,20 @@ jobs: echo "==========================================" echo "$DEPLOY_OUTPUT" echo "==========================================" + + # Surface underlying failed operations โ€” the stack/deployment top-level + # error is usually a summary; the real root cause lives in the per-resource + # operations list. + echo "::group::Underlying failed operations" + az deployment sub show --name "$DEPLOYMENT_ID" --output json 2>/dev/null \ + | jq -r '.properties // {}' \ + || echo "No subscription-scope deployment details available." + az deployment operation sub list --name "$DEPLOYMENT_ID" --output json 2>/dev/null \ + | jq -r '.[] | select(.properties.provisioningState == "Failed") | + "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€\nResource : \(.properties.targetResource.resourceName // "n/a") (\(.properties.targetResource.resourceType // "n/a"))\nStatus : \(.properties.statusCode // "n/a")\nMessage : \(.properties.statusMessage.error.message // .properties.statusMessage // "n/a")"' \ + || echo "No per-operation details available (deployment may not have reached Azure)." 
+ echo "::endgroup::" + echo "::error::Deployment failed โ€” see output above for details" exit 1 fi @@ -552,6 +574,7 @@ jobs: # Create/update state.json with extended schema jq -n \ + --arg schemaVersion "1.0" \ --arg deploymentId "${{ matrix.deployment_id }}" \ --arg timestamp "$TIMESTAMP" \ --arg status "$STATUS" \ @@ -571,6 +594,7 @@ jobs: --argjson resourceGroups "$RESOURCE_GROUPS" \ --argjson subscriptions "[\"${{ secrets.AZURE_SUBSCRIPTION_ID }}\"]" \ '{ + schemaVersion: $schemaVersion, deploymentId: $deploymentId, timestamp: $timestamp, status: $status, diff --git a/website/docs/workflows/git-ape-destroy.md b/website/docs/workflows/git-ape-destroy.md index e821172..6d7f043 100644 --- a/website/docs/workflows/git-ape-destroy.md +++ b/website/docs/workflows/git-ape-destroy.md @@ -357,6 +357,7 @@ jobs: az stack sub delete \ --name "$DEPLOYMENT_ID" \ --action-on-unmanage deleteAll \ + --bypass-stack-out-of-sync-error true \ --yes 2>&1 || { echo "destroy_status=failed" >> "$GITHUB_OUTPUT" echo "::error::Failed to delete deployment stack $DEPLOYMENT_ID" From 6e017ab845e16b5248a7461d1aa312e52408abbf Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Wed, 6 May 2026 14:32:40 +0700 Subject: [PATCH 04/18] fix(plugin): bump marketplace.json to 0.1.0 to match plugin.json MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ๐Ÿ”ง - Generated by Copilot --- .github/plugin/marketplace.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/plugin/marketplace.json b/.github/plugin/marketplace.json index 9b5979a..4123937 100644 --- a/.github/plugin/marketplace.json +++ b/.github/plugin/marketplace.json @@ -6,13 +6,13 @@ }, "metadata": { "description": "Git-Ape โ€” Intelligent Azure deployment agent and skill suite for GitHub Copilot. 
Onboard any repository with guided ARM template generation, security analysis, cost estimation, drift detection, and automated CI/CD pipelines.", - "version": "0.0.1" + "version": "0.1.0" }, "plugins": [ { "name": "git-ape", "description": "Intelligent Azure deployment agent system for GitHub Copilot. Provides guided, safe, and validated Azure resource deployments using ARM templates, with built-in security analysis, cost estimation, drift detection, and CI/CD pipeline integration.", - "version": "0.0.1", + "version": "0.1.0", "source": "." } ] From 6a9eebe877a07ee0ab531bf21c39b8a869f5ae21 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:46:42 +0200 Subject: [PATCH 05/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .github/workflows/git-ape-deploy.exampleyml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/git-ape-deploy.exampleyml b/.github/workflows/git-ape-deploy.exampleyml index 88e6f87..7ab1465 100644 --- a/.github/workflows/git-ape-deploy.exampleyml +++ b/.github/workflows/git-ape-deploy.exampleyml @@ -388,7 +388,7 @@ jobs: done # Extract resource groups from managed resources - RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') + RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | select(test("/resourceGroups/")) | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') else # Fallback: walk deployment operations recursively OPS=$(az deployment operation sub list \ From 20eff30d1f532619da09b97486d0e4a07d7599ae Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:47:09 +0200 Subject: [PATCH 06/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .github/skills/azure-stack-deploy/scripts/deploy-stack.sh | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh b/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh index c1bbf19..43d42bd 100755 --- a/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh +++ b/.github/skills/azure-stack-deploy/scripts/deploy-stack.sh @@ -77,7 +77,7 @@ fi _classify_resource() { local RES_ID="$1" local RES_TYPE - RES_TYPE=$(echo "$RES_ID" | grep -oE 'providers/[^/]+/[^/]+' | head -1 | sed 's|providers/||') + RES_TYPE=$(echo "$RES_ID" | grep -oE 'providers/[^/]+/[^/]+' | tail -1 | sed 's|providers/||') local RES_SCOPE="resourceGroup" echo "$RES_ID" | grep -q "/resourceGroups/" || RES_SCOPE="subscription" From f29135b2cc32044343de44c9ae495ef36f8fb359 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:47:31 +0200 Subject: [PATCH 07/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- website/docs/deployment/state.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/deployment/state.md b/website/docs/deployment/state.md index df430a5..fa8e050 100644 --- a/website/docs/deployment/state.md +++ b/website/docs/deployment/state.md @@ -232,7 +232,9 @@ Contains runtime deployment state populated after `az deployment` or `az stack` **Destroy strategy selection:** -1. If `stackId` is present → `az stack sub delete --name --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` +1. If `stackId` is present → treat the deployment as stack-managed and delete by stack name: `az stack sub delete --name <deploymentId> --action-on-unmanage deleteAll --bypass-stack-out-of-sync-error true` + - `deploymentId` is the Deployment Stack name. + - `stackId` is the full ARM resource ID for the stack and should only be used with an ID-based form such as `--ids <stackId>`, not with `--name`. 2. 
If `stackId` is null → fallback to state-driven delete using `managedResources[]` and `resourceGroups[]` 3. If neither field is populated (legacy state) → fall back to single `az group delete` on `resourceGroup` From 0d440ee45554c6cb017902a74245e910c1396460 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:47:41 +0200 Subject: [PATCH 08/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- website/docs/deployment/state.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/deployment/state.md b/website/docs/deployment/state.md index fa8e050..f5dc08f 100644 --- a/website/docs/deployment/state.md +++ b/website/docs/deployment/state.md @@ -223,7 +223,7 @@ Contains runtime deployment state populated after `az deployment` or `az stack` | `managedResources[].id` | `string` | Full ARM resource ID. | | `managedResources[].type` | `string` | ARM resource type (e.g., `Microsoft.KeyVault/vaults`). | | `managedResources[].scope` | `string` | Scope level: `resourceGroup`, `subscription`, or `managementGroup`. | -| `managedResources[].apiVersion` | `string` | API version used for the resource. | +| `managedResources[].apiVersion?` | `string` | Optional API version used for the resource, when captured by the workflow/skill that wrote the state. | | `managedResources[].softDeletable` | `boolean` | Whether the resource type supports soft-delete (Key Vault, Cognitive Services, etc.). | | `managedResources[].purgeProtected` | `boolean` | Whether the resource has purge protection enabled (cannot be permanently deleted until retention expires). | | `resourceGroups` | `array` | All resource groups created/managed by this deployment. 
| From 30087c13266385717c9b7cb2450f2672170696ed Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:48:00 +0200 Subject: [PATCH 09/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../skills/azure-stack-destroy/scripts/destroy-stack.ps1 | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 index 3f2b5b9..a365c39 100644 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 @@ -234,8 +234,13 @@ if ($StackId) { } } } else { - Write-Color 'Stack already gone — skipping stack delete' Yellow - $StackDeleted = $true + if ($RgName) { + Write-Color 'Stack already gone — falling back to resource group delete from state.json' Yellow + $StackId = $null + } else { + Write-Color 'Stack already gone — skipping stack delete' Yellow + $StackDeleted = $true + } } } From 8c847865a9776a074f5ae9292d6eebe7113dfc46 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Mon, 25 May 2026 04:48:12 +0200 Subject: [PATCH 10/18] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .github/skills/azure-stack-destroy/scripts/destroy-stack.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh index 48d8f09..557324e 100755 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh @@ -232,8 +232,9 @@ if [[ -n "$STACK_ID" ]]; then fi fi else - echo -e "${YELLOW}Stack already gone — skipping stack delete${NC}" - STACK_DELETED="true" + echo -e "${YELLOW}Stack not found for 
stackId in state.json โ€” falling back to RG/state-driven delete${NC}" + STACK_DELETED="false" + STACK_ID="" fi fi From a1d5633b05714fc22ab3fa9921b09d75938a7185 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Tue, 26 May 2026 16:16:26 +0800 Subject: [PATCH 11/18] feat(agents): delegate to skills in deployer and template-generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - thin out azure-resource-deployer: delegate preflight, verify, rollback to existing skills - thin out azure-template-generator: add Step 0 lookups (rest-api-reference + naming-research) - replace inline RBAC GUIDs and per-resource checklists with skill invocations - collapse hardening checklist into 5 non-negotiable identity patterns + analyzer deferral ๐Ÿ”ง - Generated by Copilot --- .../agents/azure-resource-deployer.agent.md | 82 +++++++------- .../agents/azure-template-generator.agent.md | 100 ++++++++++-------- 2 files changed, 101 insertions(+), 81 deletions(-) diff --git a/.github/agents/azure-resource-deployer.agent.md b/.github/agents/azure-resource-deployer.agent.md index ecfa2ca..a79c04e 100644 --- a/.github/agents/azure-resource-deployer.agent.md +++ b/.github/agents/azure-resource-deployer.agent.md @@ -14,7 +14,19 @@ You are the **Azure Resource Deployer**, a specialist at executing ARM template ## Your Role -Execute ARM template deployments to Azure subscriptions, monitor real-time progress, handle failures gracefully, and verify successful resource creation. +Execute ARM template deployments to Azure subscriptions, monitor real-time progress, handle failures gracefully, and verify successful resource creation. **Delegate to skills wherever a skill already owns the work** โ€” your job is orchestration, not re-implementation. + +## Skills Used + +This agent is a thin orchestrator over the following skills. Do not duplicate their logic inline. 
+ +| Stage | Skill | Why | +|-------|-------|-----| +| Pre-flight | [`/prereq-check`](../skills/prereq-check/SKILL.md) | Verify `az`, `jq`, `gh`, `git` are installed and `az login` is active | +| Pre-flight | [`/azure-deployment-preflight`](../skills/azure-deployment-preflight/SKILL.md) | What-if analysis, permission checks, change preview (CREATE/MODIFY/DELETE) | +| Deploy | [`/azure-stack-deploy`](../skills/azure-stack-deploy/SKILL.md) | The canonical `az stack sub create` runner โ€” writes `state.json` (schemaVersion 1.0), classifies soft-deletable + purge-protected resources | +| Verify | [`/azure-integration-tester`](../skills/azure-integration-tester/SKILL.md) | Post-deployment health checks and endpoint tests | +| Rollback | [`/azure-stack-destroy`](../skills/azure-stack-destroy/SKILL.md) | `az stack sub delete --action-on-unmanage deleteAll` + soft-delete purge sweep | ## Output Styling @@ -27,8 +39,10 @@ Use the shared progress bar and status line patterns for polling updates and sum Detect the auth context and configure accordingly. Never hardcode credentials. +> **Tool + session check:** Invoke [`/prereq-check`](../skills/prereq-check/SKILL.md) once at the very start of Stage 3 to confirm `az`, `jq`, and `gh` are installed at minimum versions AND that `az account show` returns an active subscription. The skill prints platform-specific install commands for anything missing. + ### Interactive (VS Code / local) -The user is already authenticated via `az login`. Verify with: +The user is already authenticated via `az login`. The `prereq-check` skill above verifies this. If you need the subscription details directly: ```bash az account show --output json ``` @@ -85,15 +99,22 @@ If invoked without user confirmation, **STOP** and report: "Deployment requires ### 1. 
Pre-Deployment Validation -Before deploying, verify: +**Delegate to:** [`/azure-deployment-preflight`](../skills/azure-deployment-preflight/SKILL.md) + +Do not run ad-hoc `az deployment sub validate` or `az stack sub validate` yourself โ€” the preflight skill already owns this and produces a structured report (`preflight-report.md`) with what-if categorization, permission checks, and a CREATE/MODIFY/DELETE summary. + +Invoke the skill with the deployment ID and confirm the report shows: ```markdown -โœ“ ARM template is valid JSON -โœ“ Target resource group exists (or will be created) -โœ“ Azure credentials are configured -โœ“ User has confirmed deployment +โœ“ Template JSON is syntactically valid +โœ“ Stack-specific flags (`--action-on-unmanage`, `--deny-settings-mode`) accepted +โœ“ What-if completed without blocking errors +โœ“ Caller has required RBAC on target scope +โœ“ User has confirmed deployment intent (orchestrator-level checkpoint, not the skill) ``` +If the preflight report flags any blocking issue, **STOP** and surface the issue to the user with the skill's recommended fix. Do not proceed to Step 2. + ### 2. Execute Deployment **Always deploy as a subscription-scoped Deployment Stack.** Stacks track every managed resource (across resource groups and subscription scope) and make destroy idempotent โ€” a single `az stack sub delete --action-on-unmanage deleteAll` removes everything the stack owns, regardless of resource scope. @@ -191,32 +212,22 @@ az deployment operation sub list \ ### 4. Verify Resource Creation -After deployment completes, verify resources exist using Azure Resource Graph: +**Delegate to:** [`/azure-integration-tester`](../skills/azure-integration-tester/SKILL.md) -**Verification Commands:** +The integration tester is the single source of truth for post-deployment verification. 
It reads `state.json` (written by `azure-stack-deploy` in Step 2) to know what to check, then runs health probes per resource type โ€” Function App HTTP probe, Storage Account `az storage account show`, App Service health endpoint, Database connection check, etc. -```bash -# Query all resources in the resource group -az resource list \ - --resource-group {rg-name} \ - --query "[].{Name:name, Type:type, Location:location, Status:provisioningState}" \ - --output table +Invoke the skill with the deployment ID and consume its structured verdict: -# Get specific resource details -az resource show \ - --resource-group {rg-name} \ - --name {resource-name} \ - --resource-type {resource-type} \ - --query "{Name:name, ID:id, Location:location, Status:properties.provisioningState}" +```bash +.github/skills/azure-integration-tester/scripts/run-tests.sh \ + --deployment-id "{deployment-id}" +# PowerShell: +# .github/skills/azure-integration-tester/scripts/run-tests.ps1 -DeploymentId "{deployment-id}" ``` -Or use Azure MCP tools: -``` -Use mcp_azure_mcp_search to query deployed resources and verify: -- Resource exists -- Provisioning state is "Succeeded" -- Configuration matches template -``` +The skill writes `tests.json` to `.azure/deployments/{id}/` with per-resource pass/fail. Surface the summary in the deployment report (Step 7). + +Do NOT re-implement ad-hoc `az resource list` / `az resource show` polling here โ€” the skill already covers the resource inventory query AND the per-type health probe in one pass. ### 5. Capture Deployment Outputs @@ -394,15 +405,12 @@ if [[ "$USER_CHOICE" == "A" ]]; then read CONFIRMATION if [[ "$CONFIRMATION" == "confirm rollback" ]]; then - # Single source of truth: the destroy skill handles stack delete, - # fallback RG delete, soft-delete purge sweep, and state.json updates. 
- .github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ - --deployment-id {deployment-id} \ - --yes - # PowerShell equivalent: - # .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId {deployment-id} -Yes - - # Log rollback + # Delegate to the destroy skill โ€” single source of truth for stack + # delete, fallback RG delete, soft-delete purge sweep, and state.json + # updates. The skill picks the right runner (bash or PowerShell) and + # handles all edge cases. + /azure-stack-destroy {deployment-id} + echo "Rollback completed via azure-stack-destroy skill" >> .azure/deployments/{deployment-id}/deployment.log fi fi diff --git a/.github/agents/azure-template-generator.agent.md b/.github/agents/azure-template-generator.agent.md index 22466b2..0661368 100644 --- a/.github/agents/azure-template-generator.agent.md +++ b/.github/agents/azure-template-generator.agent.md @@ -15,7 +15,22 @@ You are the **Azure Template Generator**, a specialist at creating production-re ## Your Role -Transform deployment requirements into validated, secure ARM templates. Show users exactly what will be deployed BEFORE execution happens. +Transform deployment requirements into validated, secure ARM templates. Show users exactly what will be deployed BEFORE execution happens. **Delegate to skills wherever a skill already owns the work** โ€” your job is template assembly + orchestration, not re-implementation of naming rules, schema lookups, or security assessment logic. + +## Skills Used + +This agent is a thin orchestrator over the following skills. Do not duplicate their logic inline. + +| Stage | Skill | Why | +|-------|-------|-----| +| Step 0 (lookup) | [`/azure-rest-api-reference`](../skills/azure-rest-api-reference/SKILL.md) | Get exact property schemas, required fields, valid enum values, latest stable API version per resource type. 
**Mandatory before writing any resource.** | +| Step 0 (lookup) | [`/azure-naming-research`](../skills/azure-naming-research/SKILL.md) | CAF abbreviation, length / charset constraints, uniqueness scope. **Mandatory before naming any resource.** | +| Step 1 (write) | [`/azure-role-selector`](../skills/azure-role-selector/SKILL.md) | Least-privilege RBAC role lookup โ€” returns the GUIDs for `Storage Blob Data Owner`, `Storage Account Contributor`, etc. Do NOT hardcode GUIDs in the agent. | +| Step 2 (assess) | [`/azure-security-analyzer`](../skills/azure-security-analyzer/SKILL.md) | Per-resource security best practices assessment + the BLOCKING security gate | +| Step 2 (assess) | [`/azure-policy-advisor`](../skills/azure-policy-advisor/SKILL.md) | Azure Policy compliance check against CIS / NIST / org framework (advisory) | +| Step 2 (assess) | [`/azure-resource-availability`](../skills/azure-resource-availability/SKILL.md) | Validate SKU + API version availability in target region + subscription quota (BLOCKING) | +| Step 2 (assess) | [`/azure-deployment-preflight`](../skills/azure-deployment-preflight/SKILL.md) | What-if analysis showing what will Create / Modify / Delete | +| Step 2 (assess) | [`/azure-cost-estimator`](../skills/azure-cost-estimator/SKILL.md) | Real pricing from Azure Retail Prices API | ## Output Styling @@ -24,6 +39,27 @@ see [git-ape.agent.md](git-ape.agent.md). ## Approach +### 0. Look Up Specs Before Writing Anything + +**Two skill invocations are mandatory before you write a single resource block.** Skipping either step is the #1 cause of preventable deployment failures (wrong property names, expired API versions, invalid characters, length overruns). + +**0a. Property and API version lookup** โ€” Invoke [`/azure-rest-api-reference`](../skills/azure-rest-api-reference/SKILL.md) for every resource type in the deployment. 
The skill returns: +- Latest stable (non-preview) API version +- Required vs optional properties +- Valid enum values per property +- Common gotchas (e.g. `kind` discriminator on `Microsoft.Web/sites`) + +Never rely on memorized schemas. Re-invoke whenever you change the API version of an existing resource. + +**0b. Naming research** โ€” Invoke [`/azure-naming-research`](../skills/azure-naming-research/SKILL.md) for every resource type. The skill returns: +- CAF abbreviation (e.g. `func`, `st`, `kv`, `cae`) +- Length min / max +- Valid character set (alphanumeric, hyphens, lowercase-only, etc.) +- Uniqueness scope (global, resource group, subscription) +- Whether `uniqueString()` is recommended + +Use the skill's output to derive ARM `variables()` expressions, e.g. `[concat('func-', parameters('projectName'), '-', parameters('environment'), '-', parameters('location'))]`. Do not hand-craft naming rules from memory. + ### 1. Generate ARM Template Structure **IMPORTANT:** Always generate **subscription-level** ARM templates that include resource group creation as a resource. This keeps all infrastructure in a single atomic template. @@ -183,8 +219,10 @@ Many Azure subscriptions enforce `allowSharedKeyAccess: false` via Azure Policy. ``` **Required RBAC Roles for Function App โ†’ Storage:** -- `Storage Blob Data Owner` (b7e6dc6d-f1e8-4753-8033-0f276bb0955b) โ€” blob access -- `Storage Account Contributor` (17d1049b-9a84-46fb-8f53-869881c3d3ab) โ€” file share creation + +Do NOT hardcode role definition GUIDs in this agent. Invoke [`/azure-role-selector`](../skills/azure-role-selector/SKILL.md) with the resource pair (e.g. "Function App needs blob + file share access on Storage Account") and use the GUIDs the skill returns. 
The skill encodes least-privilege โ€” it will recommend `Storage Blob Data Owner` (`b7e6dc6d-f1e8-4753-8033-0f276bb0955b`) + `Storage Account Contributor` (`17d1049b-9a84-46fb-8f53-869881c3d3ab`) for this specific pair, or narrower roles (`Storage Blob Data Contributor`, `Storage File Data SMB Share Contributor`) when full ownership is not needed. + +The GUIDs above appear in the example block only so you can verify the skill output matches โ€” do not copy them into new templates without running the skill first. **Pattern: App Service โ†’ SQL Database (Managed Identity)** ```json @@ -207,53 +245,27 @@ Many Azure subscriptions enforce `allowSharedKeyAccess: false` via Azure Policy. #### General Best Practices +These are **write-time guardrails** โ€” apply them while assembling resource blocks so the template starts in a known-good state. The full assessment runs in Step 3 via [`/azure-security-analyzer`](../skills/azure-security-analyzer/SKILL.md), which has the complete severity-tagged checklist per resource type. Do not duplicate that checklist here. 
+ For **ALL resources**: -- โœ“ Use latest **stable** API versions โ€” invoke `/azure-resource-availability` to query the latest non-preview API version for each resource type; never hardcode -- โœ“ Validate that all resource properties used in the template exist in the chosen API version's schema +- โœ“ Use latest **stable** API versions โ€” returned by [`/azure-rest-api-reference`](../skills/azure-rest-api-reference/SKILL.md) in Step 0a; never hardcode +- โœ“ Use names returned by [`/azure-naming-research`](../skills/azure-naming-research/SKILL.md) in Step 0b - โœ“ Enable diagnostic settings and logging - โœ“ Apply resource tags from workspace standards - โœ“ Use `dependsOn` for proper ordering - โœ“ Output resource IDs and endpoints - โœ“ **Use managed identity for all inter-resource access** (no keys/secrets) -- โœ“ **Include RBAC role assignments** when resources need to access each other - -For **Function Apps**: -- โœ“ Use managed identity (system-assigned) -- โœ“ **Use `AzureWebJobsStorage__accountName` instead of connection string** โ€” never use `listKeys()` -- โœ“ **Add RBAC role assignments** for storage access (Storage Blob Data Owner + Storage Account Contributor) -- โœ“ HTTPS only enforcement -- โœ“ TLS 1.2 minimum -- โœ“ FTP disabled (`ftpsState: Disabled`) -- โœ“ Remote debugging disabled -- โœ“ HTTP/2 enabled -- โœ“ Enable Application Insights integration -- โœ“ Configure CORS appropriately -- โœ“ Set runtime version explicitly - -For **Storage Accounts**: -- โœ“ Enable secure transfer (HTTPS only) -- โœ“ Minimum TLS version 1.2 -- โœ“ Enable blob soft delete -- โœ“ Disable public blob access (unless explicitly needed) -- โœ“ **Set `allowSharedKeyAccess: false`** when all consumers use managed identity -- โœ“ Enable encryption at rest (default) -- โœ“ Configure firewall rules for network security - -For **Databases**: -- โœ“ Enable Transparent Data Encryption -- โœ“ **Use AAD-only authentication** (`azureADOnlyAuthentication: true`) -- โœ“ 
Configure firewall rules (no 0.0.0.0/0 in prod) -- โœ“ Enable auditing and threat detection -- โœ“ Automated backups configured - -For **App Services**: -- โœ“ HTTPS only -- โœ“ **Use managed identity** for all backend connections -- โœ“ FTP disabled -- โœ“ Always On enabled for production -- โœ“ Enable health check endpoint monitoring -- โœ“ Configure auto-scaling rules (for Standard+ tiers) -- โœ“ Enable app service logs +- โœ“ **Include RBAC role assignments** with GUIDs from [`/azure-role-selector`](../skills/azure-role-selector/SKILL.md), not from memory + +**Non-negotiable identity patterns** โ€” these are write-time, not assessment-time, because once a template ships with shared keys / connection strings it is hard to retrofit: + +- **Function Apps**: System-assigned identity + `AzureWebJobsStorage__accountName` (NEVER `AzureWebJobsStorage` connection string, NEVER `listKeys()`) +- **Storage Accounts**: `allowSharedKeyAccess: false` when all consumers use managed identity +- **Databases**: AAD-only authentication (`azureADOnlyAuthentication: true`); no SQL auth +- **App Services**: Managed identity for all backend connections; HTTPS only; FTP disabled (`ftpsState: Disabled`); TLS 1.2 minimum +- **Key Vault**: Use Key Vault references in app settings (`@Microsoft.KeyVault(SecretUri=...)`), not raw secrets + +All other per-resource hardening (TLS versions, blob soft delete, threat detection, health probes, auto-scaling, etc.) is owned by the security analyzer in Step 3 and the policy advisor in Step 4 โ€” they will flag anything missing with severity tags, and Critical / High findings are auto-applied or BLOCK the security gate. ### 3. 
Analyze Security Best Practices (Per Resource) From 904a7dd7b205b0879c7fade09ce93adf7d3d9aa6 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Tue, 26 May 2026 16:16:34 +0800 Subject: [PATCH 12/18] feat(skills): refine azure-stack-deploy and azure-stack-destroy guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - destroy: front-load WHEN, add USE FOR / DO NOT USE FOR, harden state.json prerequisite - destroy: list extra soft-deletable resource types in purgeResults note - deploy: clarify stack-create flags and state.json schema references ๐Ÿ”ง - Generated by Copilot --- .github/skills/azure-stack-deploy/SKILL.md | 22 ++++++++++-- .github/skills/azure-stack-destroy/SKILL.md | 37 ++++++++++++++++++--- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/.github/skills/azure-stack-deploy/SKILL.md b/.github/skills/azure-stack-deploy/SKILL.md index b404dfc..115ab54 100644 --- a/.github/skills/azure-stack-deploy/SKILL.md +++ b/.github/skills/azure-stack-deploy/SKILL.md @@ -1,6 +1,6 @@ --- name: azure-stack-deploy -description: "Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow." +description: "Run an Azure Deployment Stack create (subscription scope) for a prepared Git-Ape deployment artifact and write state.json (schemaVersion 1.0). Use locally so the result matches the CI deploy workflow." argument-hint: "Deployment ID (folder under .azure/deployments/) โ€” optional --location override" user-invocable: true --- @@ -9,7 +9,7 @@ user-invocable: true Deploy a Git-Ape deployment artifact as a subscription-scoped **Azure Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`). 
The stack is the lifecycle owner of every resource the template creates โ€” across resource groups and subscription scope โ€” which makes destroy idempotent in a single call (see [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)). -This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.exampleyml`, so local deployments and pipeline deployments are interchangeable. +This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.yml`, so local deployments and pipeline deployments are interchangeable. ## When to Use @@ -17,6 +17,13 @@ This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as - Re-deploying an existing deployment ID after template edits โ€” stacks are stateful, so this is an in-place update - Any time you would otherwise run `az deployment sub create` against a Git-Ape `template.json` +## Do NOT use for + +- **Tearing down / destroying** an existing deployment โ€” use [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md) instead +- **What-if preview / preflight validation** without deploying โ€” use [`azure-deployment-preflight`](../azure-deployment-preflight/SKILL.md) instead +- **Off-topic** (non-Azure, non-deployment) requests +- Generating or editing ARM templates โ€” use `azure-prepare` or another IaC authoring skill + ## Prerequisites | Tool | Why | @@ -64,7 +71,7 @@ The script: - `--description "Git-Ape deployment "` - `--tags managedBy=git-ape deploymentId=` - `--yes --verbose` -4. **On stack failure**, falls back to `az deployment sub create` (warns the user โ€” no soft-delete / multi-RG idempotency on the fallback path) +4. **On stack failure**, falls back to `az deployment sub create` and prints `โš ๏ธ FALLBACK: no multi-RG idempotency, no soft-delete tracking` so the trade-off is unambiguous 5. 
**On any deployment failure**, dumps the per-operation failure list (`az deployment operation sub list`) inline so the root cause is visible without clicking into the Portal 6. **On success**, queries `az stack sub show --query "resources[].id"` for the live managed-resource list, classifies each resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` 7. Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` @@ -80,6 +87,15 @@ To destroy this deployment: /azure-stack-destroy deploy-20260506-001 ``` +## What to tell the user after running + +After the script returns, your reply MUST mention: + +1. The primitive used: `az stack sub create --action-on-unmanage deleteAll` (or fallback `az deployment sub create`) +2. The stack ID (from `state.json.stackId`) โ€” this is the single handle for destroy +3. That `state.json` (schemaVersion 1.0) was written under the deployment folder +4. The next-step destroy command: `/azure-stack-destroy ` + ## Arguments | Flag (bash) | Param (pwsh) | Required | Description | diff --git a/.github/skills/azure-stack-destroy/SKILL.md b/.github/skills/azure-stack-destroy/SKILL.md index 7b87d63..889d3bf 100644 --- a/.github/skills/azure-stack-destroy/SKILL.md +++ b/.github/skills/azure-stack-destroy/SKILL.md @@ -1,6 +1,6 @@ --- name: azure-stack-destroy -description: "Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow." +description: "Tear down a Git-Ape deployment by ID. Reads `state.json` under `.azure/deployments//` to delete the Azure Deployment Stack and purge soft-deleted Key Vault / Cognitive Services. Refuses to run without `state.json`. 
Use for any local CLI or VS Code Git-Ape teardown so the result matches the CI destroy workflow." argument-hint: "Deployment ID โ€” add --yes to skip the typed confirmation" user-invocable: true --- @@ -11,7 +11,34 @@ Destroy a Git-Ape deployment by deleting its subscription-scoped **Azure Deploym After the stack is gone, this skill performs a **soft-delete purge sweep** for resource types that linger after deletion (Key Vault, Cognitive Services, App Configuration, API Management, ML workspaces, Recovery Services vaults). Resources flagged `purgeProtected: true` in `state.json` are intentionally retained. -This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destroys and CI destroys are interchangeable. +This skill mirrors `.github/workflows/git-ape-destroy.yml` so local destroys and CI destroys are interchangeable. + +## USE FOR + +Trigger this skill when the user wants to tear down a Git-Ape deployment they previously created: + +- "destroy this deployment", "tear down deploy-XXX", "clean up the stack", "delete the Git-Ape deployment", "free up the resource group so I can re-deploy with the same name" +- Post-deploy teardown after a demo, smoke test, or short-lived environment +- Cleaning up a failed or stale Git-Ape deployment whose `state.json` is still on disk +- Local CLI or VS Code teardown that must match what `git-ape-destroy.yml` does in CI + +### Prefer this over raw `az group delete` + +For any deployment Git-Ape created, this skill is the correct tool โ€” do **not** suggest `az group delete` on its own. Reasons: + +1. **Multi-RG / subscription-scope coverage.** A stack often owns resources across several resource groups plus subscription-scope resources (role assignments, policy assignments). One `az group delete` cleans only one RG. +2. **Soft-delete purge.** Key Vault and Cognitive Services soft-delete on RG deletion and silently hold the name (and quota) for 7โ€“90 days. 
This skill purges them so the user can re-deploy with the same name immediately. +3. **State consistency.** Updates `state.json` and `metadata.json` to terminal status (`destroyed`, `retained-soft-deleted`, etc.) so the next operation sees an accurate view. + +## DO NOT USE FOR + +Refuse to invoke this skill in any of these cases: + +- **No `state.json` on disk.** Hard prerequisite โ€” see below. Without it, recommend re-running deploy or aborting. +- **Resource groups not created by Git-Ape** (e.g. ones the user made by hand with `az group create`). Suggest `az group delete --name --yes` directly instead. +- **Deploying or updating a stack.** Use `azure-stack-deploy` for those. +- **Deleting an individual resource inside a stack.** This skill always destroys the whole stack โ€” there is no "surgical" mode. +- **Non-Azure clouds** or non-Git-Ape Azure deployments (ARM/Bicep/Terraform from other tools). ## When to Use @@ -29,7 +56,7 @@ This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destr | Active `az login` | Must be the same subscription where the stack lives | | Existing `state.json` under `.azure/deployments//` | Source of truth for `stackId`, `managedResources`, `softDeletable`, `purgeProtected` | -The skill **refuses to run** without `state.json`. Re-deploy first or hand-write a minimal state file (not recommended). +> **Hard prerequisite: `state.json` under `.azure/deployments//`.** Without it this skill **aborts** โ€” it has no idea which stack, resource groups, or soft-deletables to clean up. Do NOT hand-write `state.json`; re-run the matching `azure-stack-deploy` for that deployment ID first, or use `az group delete` directly on a known resource group (a non-Git-Ape teardown, outside this skill's scope). ## Procedure @@ -40,7 +67,7 @@ The scripts default to **fast mode** (interactive default). 
The CI workflow keep | | How | Wait time (small VNet stack) | When to use | |--|--|--|--| | Fast (default) | Background the `az stack sub delete` call, then poll managed RGs with `az group exists` | ~2 min | Local CLI / VS Code use; user wants quick feedback | -| Sync (`--wait` / `-Wait`) | `az stack sub delete ... --yes` (blocks until stack metadata is fully cleaned) | ~5 min | CI pipelines (default in `git-ape-destroy.exampleyml`); when you need every Azure-side cleanup completed before the script exits | +| Sync (`--wait` / `-Wait`) | `az stack sub delete ... --yes` (blocks until stack metadata is fully cleaned) | ~5 min | CI pipelines (default in `git-ape-destroy.yml`); when you need every Azure-side cleanup completed before the script exits | The Azure CLI does not expose `--no-wait` on `az stack sub delete`, so the fast path runs the same command as a detached background process. In fast mode the stack-metadata cleanup continues asynchronously in Azure after the script returns. The next destroy of the same `deploymentId` is idempotent: if the stack is still finalizing, `az stack sub show` will return it and the script will simply pick up where Azure left off. @@ -95,7 +122,7 @@ PowerShell equivalents: 6. **Purge sweep** for each `softDeletable` resource not marked `purgeProtected`: - Key Vaults: `az keyvault list-deleted` + `az keyvault purge` - Cognitive Services: `az cognitiveservices account purge` - - Other types: skipped (soft-delete expires naturally) + - Other types (App Configuration, API Management, ML workspaces, Recovery Services vaults): not auto-purged — they expire from soft-delete naturally and are tracked in `purgeResults[]` with `status: skipped-natural-expiry` 7. Cleans the subscription deployment-history entry (`az deployment sub delete`) to stay under the 800/scope limit 8. 
Updates `state.json` and `metadata.json` with terminal status: From ed078a05d767b63588a9d9c78d430410949fb59e Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Tue, 26 May 2026 16:16:40 +0800 Subject: [PATCH 13/18] test(evals): add evals for azure-stack-deploy and azure-stack-destroy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - register both skills in evals manifest at expanded tier - add 5-task eval for azure-stack-destroy (positive-local, positive-stack, negative-deploy, negative-non-gitape, negative-off-topic) - add eval suite for azure-stack-deploy 🧪 - Generated by Copilot --- .github/evals/azure-stack-deploy/eval.yaml | 48 +++++++++++++++++ .../tasks/negative-destroy.yaml | 15 ++++++ .../tasks/negative-off-topic.yaml | 15 ++++++ .../tasks/negative-whatif-preview.yaml | 15 ++++++ .../tasks/positive-local-deploy.yaml | 47 ++++++++++++++++ .../tasks/positive-redeploy-after-edit.yaml | 43 +++++++++++++++ .github/evals/azure-stack-destroy/eval.yaml | 48 +++++++++++++++++ .../tasks/negative-deploy.yaml | 15 ++++++ .../tasks/negative-non-gitape-rg-delete.yaml | 15 ++++++ .../tasks/negative-off-topic.yaml | 15 ++++++ .../tasks/positive-clean-up-stack.yaml | 52 ++++++++++++++++++ .../tasks/positive-local-destroy.yaml | 54 +++++++++++++++++++ .github/evals/manifest.yaml | 6 ++- 13 files changed, 387 insertions(+), 1 deletion(-) create mode 100644 .github/evals/azure-stack-deploy/eval.yaml create mode 100644 .github/evals/azure-stack-deploy/tasks/negative-destroy.yaml create mode 100644 .github/evals/azure-stack-deploy/tasks/negative-off-topic.yaml create mode 100644 .github/evals/azure-stack-deploy/tasks/negative-whatif-preview.yaml create mode 100644 .github/evals/azure-stack-deploy/tasks/positive-local-deploy.yaml create mode 100644 .github/evals/azure-stack-deploy/tasks/positive-redeploy-after-edit.yaml create mode 100644 .github/evals/azure-stack-destroy/eval.yaml create mode 100644 
.github/evals/azure-stack-destroy/tasks/negative-deploy.yaml create mode 100644 .github/evals/azure-stack-destroy/tasks/negative-non-gitape-rg-delete.yaml create mode 100644 .github/evals/azure-stack-destroy/tasks/negative-off-topic.yaml create mode 100644 .github/evals/azure-stack-destroy/tasks/positive-clean-up-stack.yaml create mode 100644 .github/evals/azure-stack-destroy/tasks/positive-local-destroy.yaml diff --git a/.github/evals/azure-stack-deploy/eval.yaml b/.github/evals/azure-stack-deploy/eval.yaml new file mode 100644 index 0000000..19d2f56 --- /dev/null +++ b/.github/evals/azure-stack-deploy/eval.yaml @@ -0,0 +1,48 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json + +# Expanded-tier evaluation suite for the azure-stack-deploy skill. +# Validates trigger precision via the heuristic `trigger` grader plus +# per-positive-task answer_quality LLM judge. +# +# Run: waza run .github/evals/azure-stack-deploy/eval.yaml + +name: azure-stack-deploy-eval +description: Trigger precision + answer quality for azure-stack-deploy (Azure Deployment Stacks). +skill: azure-stack-deploy +version: "0.1" + +config: + # 2 trials catches obvious LLM nondeterminism flakes (single trial = no + # flake signal). Pilot tier bumps to 3 via /skill-promote. + trials_per_task: 2 + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + +metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: Skill should activate on Deployment Stack deploy prompts and stay quiet on teardown / preview / unrelated prompts. + +graders: + # Budget grader: azure-stack-deploy is a guided deploy workflow; flag any + # leg that explodes in tool calls or runs unreasonably long. + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + + # answer_quality (LLM-as-judge) is scoped per-task on positive tasks + # only (see tasks/positive-*.yaml). 
Keeps judge-model errors from + # zeroing out the negative-task trigger check in the same leg. + # + # Do NOT add `skill_invocation` with `required_skills:` here — eval-level + # prompt graders fire on EVERY task (including negatives) and produce + # deterministic 0.0 noise across all models (removed in commit 2f699c79 + # from git-ape-onboarding for this reason). + +tasks: + - "tasks/*.yaml" diff --git a/.github/evals/azure-stack-deploy/tasks/negative-destroy.yaml b/.github/evals/azure-stack-deploy/tasks/negative-destroy.yaml new file mode 100644 index 0000000..c21af7a --- /dev/null +++ b/.github/evals/azure-stack-deploy/tasks/negative-destroy.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-destroy +name: Negative — Destroying / tearing down an existing deployment +description: Destroy/teardown prompts belong to azure-stack-destroy, not azure-stack-deploy. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Tear down the Azure resources I deployed under deploy-20260506-001 — delete the stack and the resource group cleanly." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-deploy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-deploy/tasks/negative-off-topic.yaml b/.github/evals/azure-stack-deploy/tasks/negative-off-topic.yaml new file mode 100644 index 0000000..7d6fb65 --- /dev/null +++ b/.github/evals/azure-stack-deploy/tasks/negative-off-topic.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-off-topic +name: Negative — Off-topic prompt (Linux kernel scheduling) +description: Off-topic prompt clearly outside Azure Deployment Stacks should not trigger this skill. 
+tags: [trigger, negative, off-topic, mutable-by-skill] +inputs: + prompt: "Explain how the Linux Completely Fair Scheduler (CFS) picks the next task to run, and how vruntime is recomputed when a task wakes from sleep." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-deploy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-deploy/tasks/negative-whatif-preview.yaml b/.github/evals/azure-stack-deploy/tasks/negative-whatif-preview.yaml new file mode 100644 index 0000000..372ca88 --- /dev/null +++ b/.github/evals/azure-stack-deploy/tasks/negative-whatif-preview.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-whatif-preview +name: Negative — What-if preview / preflight validation +description: What-if preview belongs to azure-deployment-preflight, not azure-stack-deploy. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Before I deploy, show me a what-if preview of the changes the template at .azure/deployments/deploy-20260506-001/template.json would make — don't actually deploy anything yet." 
+graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-deploy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-deploy/tasks/positive-local-deploy.yaml b/.github/evals/azure-stack-deploy/tasks/positive-local-deploy.yaml new file mode 100644 index 0000000..d10d898 --- /dev/null +++ b/.github/evals/azure-stack-deploy/tasks/positive-local-deploy.yaml @@ -0,0 +1,47 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-local-deploy +name: Positive — Local deploy of an existing deployment artifact +description: Skill should be invoked when the user wants to deploy a Git-Ape template.json locally as a Deployment Stack. +# `mutable-by-skill` — score reflects SKILL.md (trigger + answer_quality +# graders read from .github/skills/azure-stack-deploy/SKILL.md). +tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "I have an ARM template at .azure/deployments/deploy-20260506-001/template.json — deploy it to my Azure subscription the same way the CI workflow would, so destroy stays a single command later." +graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/azure-stack-deploy/SKILL.md + mode: positive + threshold: 0.5 + + # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky + # judge call only zeroes out this task, not the whole leg. + # IMPORTANT: `continue_session: true` is mandatory — without it the judge + # has zero access to the agent's response and scores oscillate. + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user asked to deploy an existing ARM template + (`.azure/deployments/deploy-20260506-001/template.json`) the same + way the Git-Ape CI workflow would, so destroy stays a single + command later. 
+ + PASS criteria — the response must contain ALL of: + 1. Names `az stack sub create` (NOT `az deployment sub create`) + as the deployment primitive. + 2. Includes the `--action-on-unmanage deleteAll` flag (this is + what makes destroy idempotent and matches the CI workflow). + 3. References the helper script + `.github/skills/azure-stack-deploy/scripts/deploy-stack.sh` + OR `deploy-stack.ps1` instead of asking the user to assemble + the `az` command from scratch. + 4. Mentions that `state.json` (schemaVersion 1.0) will be + written to capture the stack ID and managed resources. + + If ALL four criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. diff --git a/.github/evals/azure-stack-deploy/tasks/positive-redeploy-after-edit.yaml b/.github/evals/azure-stack-deploy/tasks/positive-redeploy-after-edit.yaml new file mode 100644 index 0000000..f9d2a33 --- /dev/null +++ b/.github/evals/azure-stack-deploy/tasks/positive-redeploy-after-edit.yaml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-redeploy-after-edit +name: Positive — Re-deploy after template edit +description: Skill should be invoked when re-deploying an existing deployment ID after template.json was edited (in-place stack update). +# See positive-local-deploy.yaml for `mutable-by-*` tag semantics. +tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "I already deployed deploy-20260506-001 last week and just edited its template.json to add a tag. Push the change to the same deployment without creating a duplicate stack — what command do I run?" 
+graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/azure-stack-deploy/SKILL.md + mode: positive + threshold: 0.5 + + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user has an existing deployment (`deploy-20260506-001`) and + edited its `template.json`. They want to push the change + in-place — same stack, no duplicate. + + PASS criteria — the response must contain ALL of: + 1. Calls out that Azure Deployment Stacks are stateful and that + re-running `az stack sub create` against the SAME stack name + updates the existing stack in place (NOT create-only). + 2. Names `az stack sub create` (or the equivalent + `deploy-stack.sh` / `deploy-stack.ps1` script) as the + command to run again. + 3. Reuses the same deployment ID / stack name + (`deploy-20260506-001`) — does NOT instruct the user to + pick a new name or create a fresh deployment folder. + 4. Reaches a concrete next step — either the exact command to + run OR a clear instruction to invoke the + `azure-stack-deploy` script with the existing deployment ID. + + If ALL four criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. diff --git a/.github/evals/azure-stack-destroy/eval.yaml b/.github/evals/azure-stack-destroy/eval.yaml new file mode 100644 index 0000000..8c21487 --- /dev/null +++ b/.github/evals/azure-stack-destroy/eval.yaml @@ -0,0 +1,48 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/eval.schema.json + +# Expanded-tier evaluation suite for the azure-stack-destroy skill. +# Validates trigger precision via the heuristic `trigger` grader plus +# per-positive-task answer_quality LLM judge. 
+# +# Run: waza run .github/evals/azure-stack-destroy/eval.yaml + +name: azure-stack-destroy-eval +description: Trigger precision + answer quality for azure-stack-destroy (Azure Deployment Stack teardown). +skill: azure-stack-destroy +version: "0.1" + +config: + # 2 trials catches obvious LLM nondeterminism flakes (single trial = no + # flake signal). Pilot tier bumps to 3 via /skill-promote. + trials_per_task: 2 + timeout_seconds: 60 + parallel: false + executor: copilot-sdk + model: claude-sonnet-4.6 + +metrics: + - name: trigger_precision + weight: 1.0 + threshold: 0.6 + description: Skill should activate on Deployment Stack destroy / teardown prompts and stay quiet on deploy / non-Git-Ape / unrelated prompts. + +graders: + # Budget grader: azure-stack-destroy is a guided teardown workflow; flag any + # leg that explodes in tool calls or runs unreasonably long. + - type: behavior + name: budget + config: + max_tool_calls: 30 + max_duration_ms: 240000 + + # answer_quality (LLM-as-judge) is scoped per-task on positive tasks + # only (see tasks/positive-*.yaml). Keeps judge-model errors from + # zeroing out the negative-task trigger check in the same leg. + # + # Do NOT add `skill_invocation` with `required_skills:` here — eval-level + # prompt graders fire on EVERY task (including negatives) and produce + # deterministic 0.0 noise across all models (removed in commit 2f699c79 + # from git-ape-onboarding for this reason). 
+ +tasks: + - "tasks/*.yaml" diff --git a/.github/evals/azure-stack-destroy/tasks/negative-deploy.yaml b/.github/evals/azure-stack-destroy/tasks/negative-deploy.yaml new file mode 100644 index 0000000..afa13ad --- /dev/null +++ b/.github/evals/azure-stack-destroy/tasks/negative-deploy.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-deploy +name: Negative — Deploying a new stack (opposite operation) +description: Deploy prompts belong to azure-stack-deploy, not azure-stack-destroy. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Deploy this ARM template to a new subscription-scoped Azure Deployment Stack named deploy-20260526-001 in East US." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-destroy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-destroy/tasks/negative-non-gitape-rg-delete.yaml b/.github/evals/azure-stack-destroy/tasks/negative-non-gitape-rg-delete.yaml new file mode 100644 index 0000000..4018809 --- /dev/null +++ b/.github/evals/azure-stack-destroy/tasks/negative-non-gitape-rg-delete.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-non-gitape-rg-delete +name: Negative — Deleting a non-Git-Ape resource group +description: Deleting a plain resource group with no state.json is outside this skill's scope — use az group delete directly. +tags: [trigger, negative, mutable-by-skill] +inputs: + prompt: "Delete the resource group `rg-myproject-prod-eastus` and everything inside it. It wasn't created by Git-Ape — I made it manually with `az group create`." 
+graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-destroy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-destroy/tasks/negative-off-topic.yaml b/.github/evals/azure-stack-destroy/tasks/negative-off-topic.yaml new file mode 100644 index 0000000..7d271b5 --- /dev/null +++ b/.github/evals/azure-stack-destroy/tasks/negative-off-topic.yaml @@ -0,0 +1,15 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: negative-off-topic +name: Negative — Off-topic prompt (Linux kernel scheduling) +description: Off-topic prompt clearly outside Azure deployment teardown should not trigger this skill. +tags: [trigger, negative, off-topic, mutable-by-skill] +inputs: + prompt: "Explain how the Linux Completely Fair Scheduler (CFS) picks the next task to run, and how vruntime is recomputed when a task wakes from sleep." +graders: + - name: trigger_relevance_negative + type: trigger + config: + skill_path: .github/skills/azure-stack-destroy/SKILL.md + mode: negative + threshold: 0.5 diff --git a/.github/evals/azure-stack-destroy/tasks/positive-clean-up-stack.yaml b/.github/evals/azure-stack-destroy/tasks/positive-clean-up-stack.yaml new file mode 100644 index 0000000..a67ff42 --- /dev/null +++ b/.github/evals/azure-stack-destroy/tasks/positive-clean-up-stack.yaml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-clean-up-stack +name: Positive — Clean up the deployment stack +description: User asks to clean up a Git-Ape deployment stack and free the resource group; skill should activate. +tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "Clean up the Azure deployment stack for deploy-20260524-test. 
I want to free up the resource group and any soft-deletable resources so I can re-deploy with the same name." 
+graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/azure-stack-destroy/SKILL.md + mode: positive + threshold: 0.5 + + # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky + # judge call only zeroes out this task, not the whole leg. See eval.yaml. + # + # IMPORTANT: waza prompt graders are binary (set_waza_grade_pass = 1.0, + # set_waza_grade_fail = 0.0). The judge has NO access to the agent's + # response unless continue_session: true is set. + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user asked to clean up a Git-Ape deployment stack + (deploy-20260524-test), free the resource group, and handle any + soft-deletable resources so the deployment can be re-created with + the same name. + + PASS criteria — the response must contain ALL of: + 1. Recommends running the `azure-stack-destroy` skill OR its + scripts (`destroy-stack.sh` / `destroy-stack.ps1`) rather than + a raw `az group delete` — explicitly because raw `az group + delete` misses soft-delete cleanup and any multi-RG resources. + 2. References the requirement for `state.json` under + `.azure/deployments/deploy-20260524-test/` (skill refuses to + run without it). + 3. Mentions deleting the deployment stack itself — + `az stack sub delete` with `--action-on-unmanage deleteAll` + (or equivalent semantics: one delete cleans every resource + the stack owns). + 4. Either covers the soft-delete purge sweep behavior (Key + Vault, Cognitive Services purged after stack delete) OR + notes that resources flagged `purgeProtected: true` in + `state.json` are intentionally retained. + + If ALL four PASS criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. 
diff --git a/.github/evals/azure-stack-destroy/tasks/positive-local-destroy.yaml b/.github/evals/azure-stack-destroy/tasks/positive-local-destroy.yaml new file mode 100644 index 0000000..48537ad --- /dev/null +++ b/.github/evals/azure-stack-destroy/tasks/positive-local-destroy.yaml @@ -0,0 +1,54 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/microsoft/waza/main/schemas/task.schema.json + +id: positive-local-destroy +name: Positive — Local destroy of a Git-Ape deployment +description: User asks to tear down a specific deploy-XXX deployment with soft-delete cleanup; skill should activate. +tags: [trigger, positive, mutable-by-skill] +inputs: + prompt: "I'm done with deploy-20260506-001. Tear down the deployment stack — delete the resources cleanly and purge any soft-deleted Key Vaults so I can re-use the name." +graders: + - name: trigger_relevance_positive + type: trigger + config: + skill_path: .github/skills/azure-stack-destroy/SKILL.md + mode: positive + threshold: 0.5 + + # answer_quality (LLM-as-judge): scoped per-task on positives so a flaky + # judge call only zeroes out this task, not the whole leg. See eval.yaml. + # + # IMPORTANT: waza prompt graders are binary (set_waza_grade_pass = 1.0, + # set_waza_grade_fail = 0.0). They are NOT 1–5 rubrics. The judge has NO + # access to the agent's response unless continue_session: true is set — it + # resumes the agent's own session so it can read the response. + - type: prompt + name: answer_quality + config: + continue_session: true + prompt: | + You are grading the assistant's previous response in this session. + The user asked to tear down a specific Git-Ape deployment + (deploy-20260506-001), delete the resources, and purge soft-deleted + Key Vaults so the name can be re-used. + + PASS criteria — the response must contain ALL of: + 1. 
Recommends the `azure-stack-destroy` skill OR invokes the + `destroy-stack.sh` / `destroy-stack.ps1` script under + `.github/skills/azure-stack-destroy/scripts/` (NOT a raw + `az group delete`). + 2. References `state.json` under + `.azure/deployments/deploy-20260506-001/` as the source of + truth for what to destroy (stackId, managedResources, + softDeletable, purgeProtected). + 3. Names the actual stack-delete command or its semantics — + `az stack sub delete --action-on-unmanage deleteAll` + (single idempotent call that owns all resources across + resource groups). + 4. Addresses the soft-delete purge sweep explicitly — mentions + `az keyvault purge` (or `az keyvault list-deleted` + purge), + OR explains that the skill's purge sweep deletes + non-purge-protected soft-deleted Key Vaults so the name is + immediately reusable. + + If ALL four PASS criteria are met, call `set_waza_grade_pass`. + Otherwise, call `set_waza_grade_fail` and list which criteria are missing. diff --git a/.github/evals/manifest.yaml b/.github/evals/manifest.yaml index f71f4b3..6f903b6 100644 --- a/.github/evals/manifest.yaml +++ b/.github/evals/manifest.yaml @@ -27,7 +27,11 @@ skills: # Pilot tier: full multi-model fan-out (most-trusted skills). - name: prereq-check tier: pilot - + # Expanded tier: 2-model fan-out for skills still maturing toward pilot. + - name: azure-stack-deploy + tier: expanded + - name: azure-stack-destroy + tier: expanded # Per-tier model fan-out. The matrix runs each selected skill against every # model in its tier. To compare additional models, add them here. 
# From 9190c667c3d8c209548235f1302d9821582506e3 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Sun, 31 May 2026 22:56:27 +0800 Subject: [PATCH 14/18] fix(workflows): address PR review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add stack-to-subscription fallback when az stack sub create fails - query actual purgeProtected status for soft-deletable resources - fix disown/wait bug that swallowed background process exit codes 🐛 - Generated by Copilot --- .../scripts/destroy-stack.sh | 7 ++- .github/workflows/git-ape-deploy.exampleyml | 58 ++++++++++++------- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh index 557324e..a839377 100755 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh @@ -179,7 +179,8 @@ if [[ -n "$STACK_ID" ]]; then --bypass-stack-out-of-sync-error true \ --yes > "$STACK_DELETE_LOG" 2>&1 & STACK_BG_PID=$! - disown "$STACK_BG_PID" 2>/dev/null || true + # Do NOT disown — we need `wait` to retrieve the exit code. + # nohup already insulates against HUP signals. echo -e "${BLUE}⏳ Polling $MANAGED_RG_COUNT managed resource group(s) (timeout: ${POLL_TIMEOUT}s)...${NC}" POLL_START=$(date +%s) @@ -199,8 +200,8 @@ if [[ -n "$STACK_ID" ]]; then fi # If the bg process already failed, surface it early if ! kill -0 "$STACK_BG_PID" 2>/dev/null; then - wait "$STACK_BG_PID" 2>/dev/null || true - BG_EXIT=$? + BG_EXIT=0 + wait "$STACK_BG_PID" 2>/dev/null || BG_EXIT=$? 
if [[ $BG_EXIT -ne 0 ]]; then EXISTS=$(az group exists --name "$RG" 2>/dev/null || echo "true") if [[ "$EXISTS" == "true" ]]; then diff --git a/.github/workflows/git-ape-deploy.exampleyml b/.github/workflows/git-ape-deploy.exampleyml index 7ab1465..bb5b605 100644 --- a/.github/workflows/git-ape-deploy.exampleyml +++ b/.github/workflows/git-ape-deploy.exampleyml @@ -257,29 +257,35 @@ jobs: VERBOSE_LOG=$(mktemp) trap 'rm -f "$VERBOSE_LOG"' EXIT - if [[ "$DEPLOY_METHOD" == "stack" ]]; then - DEPLOY_OUTPUT=$(az stack sub create \ - --name "$DEPLOYMENT_ID" \ - --location "$LOCATION" \ - --template-file "$DEPLOY_DIR/template.json" \ - --parameters @"$DEPLOY_DIR/parameters.json" \ - --action-on-unmanage deleteAll \ - --deny-settings-mode none \ - --description "Git-Ape deployment $DEPLOYMENT_ID" \ - --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ - --yes \ - --verbose \ - --output json 2>"$VERBOSE_LOG") + EXIT_CODE=0 + if DEPLOY_OUTPUT=$(az stack sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment $DEPLOYMENT_ID" \ + --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ + --yes \ + --verbose \ + --output json 2>"$VERBOSE_LOG"); then + echo "Stack deploy succeeded" else - DEPLOY_OUTPUT=$(az deployment sub create \ + echo "::warning::Stack deploy failed — falling back to az deployment sub create (NOT idempotent for soft-delete / multi-RG)" + cat "$VERBOSE_LOG" >&2 + DEPLOY_METHOD="subscription" + > "$VERBOSE_LOG" + if ! DEPLOY_OUTPUT=$(az deployment sub create \ --name "$DEPLOYMENT_ID" \ --location "$LOCATION" \ --template-file "$DEPLOY_DIR/template.json" \ --parameters @"$DEPLOY_DIR/parameters.json" \ - --output json 2>"$VERBOSE_LOG") + --output json 2>"$VERBOSE_LOG"); then + cat "$VERBOSE_LOG" >&2 + EXIT_CODE=1 + fi fi - - EXIT_CODE=$? 
if [[ $EXIT_CODE -ne 0 ]]; then cat "$VERBOSE_LOG" >&2 fi @@ -375,16 +381,21 @@ jobs: fi IS_SOFT_DELETABLE="false" + IS_PURGE_PROTECTED="false" for SD_TYPE in $SOFT_DELETABLE_TYPES; do if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then IS_SOFT_DELETABLE="true" + # Query actual purge protection status for soft-deletable resources + IS_PURGE_PROTECTED=$(az resource show --ids "$RES_ID" \ + --query "properties.enablePurgeProtection" -o tsv 2>/dev/null || echo "false") + [[ "$IS_PURGE_PROTECTED" == "true" ]] || IS_PURGE_PROTECTED="false" break fi done MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ - --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ - '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" --argjson pp "$IS_PURGE_PROTECTED" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": $pp}]') done # Extract resource groups from managed resources @@ -406,16 +417,21 @@ jobs: fi IS_SOFT_DELETABLE="false" + IS_PURGE_PROTECTED="false" for SD_TYPE in $SOFT_DELETABLE_TYPES; do if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then IS_SOFT_DELETABLE="true" + # Query actual purge protection status for soft-deletable resources + IS_PURGE_PROTECTED=$(az resource show --ids "$RES_ID" \ + --query "properties.enablePurgeProtection" -o tsv 2>/dev/null || echo "false") + [[ "$IS_PURGE_PROTECTED" == "true" ]] || IS_PURGE_PROTECTED="false" break fi done MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ - --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ - '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" --argjson pp "$IS_PURGE_PROTECTED" \ + '. 
+ [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": $pp}]') done # Collect resource groups From 9f14080839801db3a24d7374edf5b7f288acb27c Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Sun, 31 May 2026 23:36:06 +0800 Subject: [PATCH 15/18] fix(skills): resolve PR review findings for deployment stacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - resolve Cognitive Services purge location via list-deleted, not ID parsing - add subscription-validate fallback when stack validate is unavailable - classify nested/extension resources by last providers segment in deploy-stack.ps1 - regenerate workflow/skill docs to match fixed sources 🛠️ - Generated by Copilot --- .../scripts/deploy-stack.ps1 | 5 +- .../scripts/destroy-stack.ps1 | 10 ++- .../scripts/destroy-stack.sh | 10 ++- .github/workflows/git-ape-deploy.exampleyml | 15 +++- .github/workflows/git-ape-destroy.exampleyml | 11 ++- website/docs/skills/azure-stack-deploy.md | 24 +++++- website/docs/skills/azure-stack-destroy.md | 59 +++++++++++-- website/docs/workflows/git-ape-deploy.md | 84 +++++++++++++------ website/docs/workflows/git-ape-destroy.md | 11 ++- 9 files changed, 181 insertions(+), 48 deletions(-) diff --git a/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 b/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 index 57210be..0e4c4af 100644 --- a/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 +++ b/.github/skills/azure-stack-deploy/scripts/deploy-stack.ps1 @@ -100,7 +100,10 @@ function Get-ResourceClassification { param([string]$ResourceId) $type = $null - if ($ResourceId -match 'providers/([^/]+/[^/]+)') { + # Use the LAST providers// segment so extension/nested + # resources (e.g. a role assignment scoped to a Key Vault) are classified by + # their own type rather than the parent resource's type. 
+ if ($ResourceId -match '.*providers/([^/]+/[^/]+)') { $type = $matches[1] } $scope = if ($ResourceId -match '/resourceGroups/') { 'resourceGroup' } else { 'subscription' } diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 index a365c39..124df95 100644 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 @@ -295,10 +295,14 @@ if ($SoftCount -gt 0 -and ($StackDeleted -or $RgDeleted)) { } 'Microsoft.CognitiveServices/accounts' { if (-not $protected) { - $loc = '' - if ($resId -match 'locations/([^/]+)') { $loc = $matches[1] } + # Account IDs are resource-group scoped (no /locations/ + # segment); resolve the region from the soft-deleted account + # list and the resource group from the original resource ID. + $loc = az cognitiveservices account list-deleted --query "[?name=='$resName'] | [0].location" -o tsv 2>$null + $resRg = '' + if ($resId -match '/resourceGroups/([^/]+)') { $resRg = $matches[1] } if ($loc) { - az cognitiveservices account purge --name $resName --location $loc --resource-group '' 2>$null | Out-Null + az cognitiveservices account purge --name $resName --location $loc --resource-group $resRg 2>$null | Out-Null } } } diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh index a839377..d08cc50 100755 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh @@ -295,10 +295,16 @@ if [[ "$SOFT_COUNT" -gt 0 ]] && [[ "$STACK_DELETED" == "true" || "$RG_DELETED" = ;; "Microsoft.CognitiveServices/accounts") if [[ "$PURGE_PROTECTED" != "true" ]]; then - LOC=$(echo "$RES_ID" | grep -oE '(?<=locations/)[^/]+' || echo "") + # Cognitive Services account IDs are resource-group scoped and + # contain no /locations/ segment, so the region must be + 
# resolved from the soft-deleted account list. The resource + # group comes from the original resource ID. + LOC=$(az cognitiveservices account list-deleted \ + --query "[?name=='$RES_NAME'] | [0].location" -o tsv 2>/dev/null || echo "") + RES_RG=$(echo "$RES_ID" | sed -n 's#.*/resourceGroups/\([^/]*\)/.*#\1#p') if [[ -n "$LOC" ]]; then az cognitiveservices account purge --name "$RES_NAME" --location "$LOC" \ - --resource-group "" 2>/dev/null || true + --resource-group "$RES_RG" 2>/dev/null || true fi fi ;; diff --git a/.github/workflows/git-ape-deploy.exampleyml b/.github/workflows/git-ape-deploy.exampleyml index bb5b605..0b1ba55 100644 --- a/.github/workflows/git-ape-deploy.exampleyml +++ b/.github/workflows/git-ape-deploy.exampleyml @@ -199,14 +199,25 @@ jobs: run: | # Stack-aware validation โ€” checks both the template and the # stack-specific flags (--action-on-unmanage, --deny-settings-mode). - az stack sub validate \ + # If Deployment Stacks are unavailable/blocked in the target + # subscription, fall back to plain subscription validation so the + # deploy step's own legacy fallback path can still run. + if ! 
az stack sub validate \ --name "${{ matrix.deployment_id }}" \ --location "${{ steps.params.outputs.location }}" \ --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ --action-on-unmanage deleteAll \ --deny-settings-mode none \ - --output json + --output json; then + echo "::warning::Stack validation unavailable or failed โ€” falling back to az deployment sub validate" + az deployment sub validate \ + --name "${{ matrix.deployment_id }}" \ + --location "${{ steps.params.outputs.location }}" \ + --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ + --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ + --output json + fi - name: Run Microsoft Defender for DevOps template analyzer id: security_scan diff --git a/.github/workflows/git-ape-destroy.exampleyml b/.github/workflows/git-ape-destroy.exampleyml index 75eea1f..45df805 100644 --- a/.github/workflows/git-ape-destroy.exampleyml +++ b/.github/workflows/git-ape-destroy.exampleyml @@ -414,12 +414,17 @@ jobs: fi ;; "Microsoft.CognitiveServices/accounts") - # Cognitive Services soft-delete purge + # Cognitive Services soft-delete purge. + # Account IDs are resource-group scoped (no /locations/ + # segment), so resolve the region from the soft-deleted account + # list and the resource group from the original resource ID. 
if [[ "$PURGE_PROTECTED" != "true" ]]; then - LOCATION=$(echo "$RES_ID" | grep -oP '(?<=locations/)[^/]+' || echo "") + LOCATION=$(az cognitiveservices account list-deleted \ + --query "[?name=='$RES_NAME'] | [0].location" -o tsv 2>/dev/null || echo "") + RES_RG=$(echo "$RES_ID" | sed -n 's#.*/resourceGroups/\([^/]*\)/.*#\1#p') if [[ -n "$LOCATION" ]]; then az cognitiveservices account purge --name "$RES_NAME" --location "$LOCATION" \ - --resource-group "" 2>/dev/null || true + --resource-group "$RES_RG" 2>/dev/null || true fi fi ;; diff --git a/website/docs/skills/azure-stack-deploy.md b/website/docs/skills/azure-stack-deploy.md index ef2496d..5edef24 100644 --- a/website/docs/skills/azure-stack-deploy.md +++ b/website/docs/skills/azure-stack-deploy.md @@ -1,7 +1,7 @@ --- title: "Azure Stack Deploy" sidebar_label: "Azure Stack Deploy" -description: "Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow." +description: "Run an Azure Deployment Stack create (subscription scope) for a prepared Git-Ape deployment artifact and write state.json (schemaVersion 1.0). Use locally so the result matches the CI deploy workflow." --- @@ -9,7 +9,7 @@ description: "Deploy an ARM template as a subscription-scoped Azure Deployment S # Azure Stack Deploy -> Deploy an ARM template as a subscription-scoped Azure Deployment Stack (idempotent across resource groups and sub-scope). Captures managed resources, classifies soft-deletable types, detects Key Vault purge protection, and writes extended state.json (schemaVersion 1.0). Use for any local CLI / VS Code Git-Ape deployment so the result matches the CI workflow. 
+> Run an Azure Deployment Stack create (subscription scope) for a prepared Git-Ape deployment artifact and write state.json (schemaVersion 1.0). Use locally so the result matches the CI deploy workflow. ## Details @@ -27,7 +27,7 @@ description: "Deploy an ARM template as a subscription-scoped Azure Deployment S Deploy a Git-Ape deployment artifact as a subscription-scoped **Azure Deployment Stack** (`az stack sub create --action-on-unmanage deleteAll`). The stack is the lifecycle owner of every resource the template creates — across resource groups and subscription scope — which makes destroy idempotent in a single call (see [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md)). -This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.exampleyml`, so local deployments and pipeline deployments are interchangeable. +This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as the CI workflow at `.github/workflows/git-ape-deploy.yml`, so local deployments and pipeline deployments are interchangeable. 
## When to Use @@ -35,6 +35,13 @@ This skill produces the **same `state.json`** schema (`schemaVersion: "1.0"`) as - Re-deploying an existing deployment ID after template edits — stacks are stateful, so this is an in-place update - Any time you would otherwise run `az deployment sub create` against a Git-Ape `template.json` +## Do NOT use for + +- **Tearing down / destroying** an existing deployment — use [`azure-stack-destroy`](../azure-stack-destroy/SKILL.md) instead +- **What-if preview / preflight validation** without deploying — use [`azure-deployment-preflight`](../azure-deployment-preflight/SKILL.md) instead +- **Off-topic** (non-Azure, non-deployment) requests +- Generating or editing ARM templates — use `azure-prepare` or another IaC authoring skill + ## Prerequisites | Tool | Why | @@ -82,7 +89,7 @@ The script: - `--description "Git-Ape deployment "` - `--tags managedBy=git-ape deploymentId=` - `--yes --verbose` -4. **On stack failure**, falls back to `az deployment sub create` (warns the user — no soft-delete / multi-RG idempotency on the fallback path) +4. **On stack failure**, falls back to `az deployment sub create` and prints `⚠️ FALLBACK: no multi-RG idempotency, no soft-delete tracking` so the trade-off is unambiguous 5. **On any deployment failure**, dumps the per-operation failure list (`az deployment operation sub list`) inline so the root cause is visible without clicking into the Portal 6. **On success**, queries `az stack sub show --query "resources[].id"` for the live managed-resource list, classifies each resource (type, scope, soft-deletable, purge-protected), and writes the extended `state.json` 7. Updates `metadata.json` with `status: "succeeded"`, `deployMethod`, and `resourceGroups[]` @@ -98,6 +105,15 @@ To destroy this deployment: /azure-stack-destroy deploy-20260506-001 ``` +## What to tell the user after running + +After the script returns, your reply MUST mention: + +1. 
The primitive used: `az stack sub create --action-on-unmanage deleteAll` (or fallback `az deployment sub create`) +2. The stack ID (from `state.json.stackId`) — this is the single handle for destroy +3. That `state.json` (schemaVersion 1.0) was written under the deployment folder +4. The next-step destroy command: `/azure-stack-destroy ` + ## Arguments | Flag (bash) | Param (pwsh) | Required | Description | diff --git a/website/docs/skills/azure-stack-destroy.md b/website/docs/skills/azure-stack-destroy.md index 99029b6..c415e1c 100644 --- a/website/docs/skills/azure-stack-destroy.md +++ b/website/docs/skills/azure-stack-destroy.md @@ -1,7 +1,7 @@ --- title: "Azure Stack Destroy" sidebar_label: "Azure Stack Destroy" -description: "Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow." +description: "Tear down a Git-Ape deployment by ID. Reads `state.json` under `.azure/deployments//` to delete the Azure Deployment Stack and purge soft-deleted Key Vault / Cognitive Services. Refuses to run without `state.json`. Use for any local CLI or VS Code Git-Ape teardown so the result matches the CI destroy workflow." --- @@ -9,7 +9,7 @@ description: "Destroy a Git-Ape deployment by deleting its Azure Deployment Stac # Azure Stack Destroy -> Destroy a Git-Ape deployment by deleting its Azure Deployment Stack with --action-on-unmanage deleteAll, then purging soft-deleted resources (Key Vault, Cognitive Services) that are not purge-protected. Reads state.json (schemaVersion 1.0) to know exactly what to clean up. Use for any local CLI / VS Code Git-Ape teardown so the result matches the CI workflow. +> Tear down a Git-Ape deployment by ID. 
Reads `state.json` under `.azure/deployments//` to delete the Azure Deployment Stack and purge soft-deleted Key Vault / Cognitive Services. Refuses to run without `state.json`. Use for any local CLI or VS Code Git-Ape teardown so the result matches the CI destroy workflow. ## Details @@ -29,7 +29,34 @@ Destroy a Git-Ape deployment by deleting its subscription-scoped **Azure Deploym After the stack is gone, this skill performs a **soft-delete purge sweep** for resource types that linger after deletion (Key Vault, Cognitive Services, App Configuration, API Management, ML workspaces, Recovery Services vaults). Resources flagged `purgeProtected: true` in `state.json` are intentionally retained. -This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destroys and CI destroys are interchangeable. +This skill mirrors `.github/workflows/git-ape-destroy.yml` so local destroys and CI destroys are interchangeable. + +## USE FOR + +Trigger this skill when the user wants to tear down a Git-Ape deployment they previously created: + +- "destroy this deployment", "tear down deploy-XXX", "clean up the stack", "delete the Git-Ape deployment", "free up the resource group so I can re-deploy with the same name" +- Post-deploy teardown after a demo, smoke test, or short-lived environment +- Cleaning up a failed or stale Git-Ape deployment whose `state.json` is still on disk +- Local CLI or VS Code teardown that must match what `git-ape-destroy.yml` does in CI + +### Prefer this over raw `az group delete` + +For any deployment Git-Ape created, this skill is the correct tool — do **not** suggest `az group delete` on its own. Reasons: + +1. **Multi-RG / subscription-scope coverage.** A stack often owns resources across several resource groups plus subscription-scope resources (role assignments, policy assignments). One `az group delete` cleans only one RG. +2. 
**Soft-delete purge.** Key Vault and Cognitive Services soft-delete on RG deletion and silently hold the name (and quota) for 7–90 days. This skill purges them so the user can re-deploy with the same name immediately. +3. **State consistency.** Updates `state.json` and `metadata.json` to terminal status (`destroyed`, `retained-soft-deleted`, etc.) so the next operation sees an accurate view. + +## DO NOT USE FOR + +Refuse to invoke this skill in any of these cases: + +- **No `state.json` on disk.** Hard prerequisite — see below. Without it, recommend re-running deploy or aborting. +- **Resource groups not created by Git-Ape** (e.g. ones the user made by hand with `az group create`). Suggest `az group delete --name --yes` directly instead. +- **Deploying or updating a stack.** Use `azure-stack-deploy` for those. +- **Deleting an individual resource inside a stack.** This skill always destroys the whole stack — there is no "surgical" mode. +- **Non-Azure clouds** or non-Git-Ape Azure deployments (ARM/Bicep/Terraform from other tools). ## When to Use @@ -47,10 +74,21 @@ This skill mirrors `.github/workflows/git-ape-destroy.exampleyml` so local destr | Active `az login` | Must be the same subscription where the stack lives | | Existing `state.json` under `.azure/deployments//` | Source of truth for `stackId`, `managedResources`, `softDeletable`, `purgeProtected` | -The skill **refuses to run** without `state.json`. Re-deploy first or hand-write a minimal state file (not recommended). +> **Hard prerequisite: `state.json` under `.azure/deployments//`.** Without it this skill **aborts** — it has no idea which stack, resource groups, or soft-deletables to clean up. Do NOT hand-write `state.json`; re-run the matching `azure-stack-deploy` for that deployment ID first, or use `az group delete` directly on a known resource group (a non-Git-Ape teardown, outside this skill's scope). 
## Procedure +### Fast mode vs sync mode + +The scripts default to **fast mode** (interactive default). The CI workflow keeps **sync mode** (deterministic). + +| | How | Wait time (small VNet stack) | When to use | +|--|--|--|--| +| Fast (default) | Background the `az stack sub delete` call, then poll managed RGs with `az group exists` | ~2 min | Local CLI / VS Code use; user wants quick feedback | +| Sync (`--wait` / `-Wait`) | `az stack sub delete ... --yes` (blocks until stack metadata is fully cleaned) | ~5 min | CI pipelines (default in `git-ape-destroy.yml`); when you need every Azure-side cleanup completed before the script exits | + +The Azure CLI does not expose `--no-wait` on `az stack sub delete`, so the fast path runs the same command as a detached background process. In fast mode the stack-metadata cleanup continues asynchronously in Azure after the script returns. The next destroy of the same `deploymentId` is idempotent: if the stack is still finalizing, `az stack sub show` will return it and the script will simply pick up where Azure left off. + ### 1. Identify deployment ```bash @@ -74,11 +112,20 @@ Skip the confirmation prompt (use only in automation): --yes ``` +Force CI-equivalent sync wait (default for the CI workflow; opt-in for the script): + +```bash +.github/skills/azure-stack-destroy/scripts/destroy-stack.sh \ + --deployment-id "$DEPLOYMENT_ID" \ + --yes --wait +``` + PowerShell equivalents: ```powershell .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" .github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" -Yes +.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 -DeploymentId "$DEPLOYMENT_ID" -Yes -Wait ``` ### 3. What the script does @@ -93,7 +140,7 @@ PowerShell equivalents: 6. 
**Purge sweep** for each `softDeletable` resource not marked `purgeProtected`: - Key Vaults: `az keyvault list-deleted` + `az keyvault purge` - Cognitive Services: `az cognitiveservices account purge` - - Other types: skipped (soft-delete expires naturally) + - Other types (App Configuration, API Management, ML workspaces, Recovery Services vaults): not auto-purged โ€” they expire from soft-delete naturally and are tracked in `purgeResults[]` with `status: skipped-natural-expiry` 7. Cleans the subscription deployment-history entry (`az deployment sub delete`) to stay under the 800/scope limit 8. Updates `state.json` and `metadata.json` with terminal status: @@ -132,6 +179,8 @@ Retained: 1 soft-deleted resource(s) (purge-protected) |-------------|--------------|----------|-------------| | `--deployment-id ` | `-DeploymentId ` | yes | Folder name under `.azure/deployments/` | | `--yes` | `-Yes` | no | Skip the typed `destroy` confirmation prompt (CI-only) | +| `--wait` | `-Wait` | no | Sync mode: block until Azure has cleaned up stack metadata. Matches the CI workflow. Slower (~3-4ร—) but fully deterministic. | +| `--poll-timeout ` | `-PollTimeout ` | no | Fast-mode timeout per managed RG poll (default 600s) | ## Failure modes diff --git a/website/docs/workflows/git-ape-deploy.md b/website/docs/workflows/git-ape-deploy.md index 2042e1c..e844571 100644 --- a/website/docs/workflows/git-ape-deploy.md +++ b/website/docs/workflows/git-ape-deploy.md @@ -268,14 +268,25 @@ jobs: run: | # Stack-aware validation โ€” checks both the template and the # stack-specific flags (--action-on-unmanage, --deny-settings-mode). - az stack sub validate \ + # If Deployment Stacks are unavailable/blocked in the target + # subscription, fall back to plain subscription validation so the + # deploy step's own legacy fallback path can still run. + if ! 
az stack sub validate \ --name "${{ matrix.deployment_id }}" \ --location "${{ steps.params.outputs.location }}" \ --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ --action-on-unmanage deleteAll \ --deny-settings-mode none \ - --output json + --output json; then + echo "::warning::Stack validation unavailable or failed โ€” falling back to az deployment sub validate" + az deployment sub validate \ + --name "${{ matrix.deployment_id }}" \ + --location "${{ steps.params.outputs.location }}" \ + --template-file "${{ steps.params.outputs.deploy_dir }}/template.json" \ + --parameters @"${{ steps.params.outputs.deploy_dir }}/parameters.json" \ + --output json + fi - name: Run Microsoft Defender for DevOps template analyzer id: security_scan @@ -321,30 +332,43 @@ jobs: # Determine deploy method: prefer deployment stacks (idempotent destroy) # Fall back to az deployment sub create if stacks are unavailable DEPLOY_METHOD="stack" - - if [[ "$DEPLOY_METHOD" == "stack" ]]; then - DEPLOY_OUTPUT=$(az stack sub create \ - --name "$DEPLOYMENT_ID" \ - --location "$LOCATION" \ - --template-file "$DEPLOY_DIR/template.json" \ - --parameters @"$DEPLOY_DIR/parameters.json" \ - --action-on-unmanage deleteAll \ - --deny-settings-mode none \ - --description "Git-Ape deployment $DEPLOYMENT_ID" \ - --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ - --yes \ - --verbose \ - --output json 2>&1) + # Verbose output goes to a temp file so it does not contaminate the + # JSON that downstream jq calls need to parse. 
+ VERBOSE_LOG=$(mktemp) + trap 'rm -f "$VERBOSE_LOG"' EXIT + + EXIT_CODE=0 + if DEPLOY_OUTPUT=$(az stack sub create \ + --name "$DEPLOYMENT_ID" \ + --location "$LOCATION" \ + --template-file "$DEPLOY_DIR/template.json" \ + --parameters @"$DEPLOY_DIR/parameters.json" \ + --action-on-unmanage deleteAll \ + --deny-settings-mode none \ + --description "Git-Ape deployment $DEPLOYMENT_ID" \ + --tags "managedBy=git-ape" "deploymentId=$DEPLOYMENT_ID" \ + --yes \ + --verbose \ + --output json 2>"$VERBOSE_LOG"); then + echo "Stack deploy succeeded" else - DEPLOY_OUTPUT=$(az deployment sub create \ + echo "::warning::Stack deploy failed โ€” falling back to az deployment sub create (NOT idempotent for soft-delete / multi-RG)" + cat "$VERBOSE_LOG" >&2 + DEPLOY_METHOD="subscription" + > "$VERBOSE_LOG" + if ! DEPLOY_OUTPUT=$(az deployment sub create \ --name "$DEPLOYMENT_ID" \ --location "$LOCATION" \ --template-file "$DEPLOY_DIR/template.json" \ --parameters @"$DEPLOY_DIR/parameters.json" \ - --output json 2>&1) + --output json 2>"$VERBOSE_LOG"); then + cat "$VERBOSE_LOG" >&2 + EXIT_CODE=1 + fi + fi + if [[ $EXIT_CODE -ne 0 ]]; then + cat "$VERBOSE_LOG" >&2 fi - - EXIT_CODE=$? END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) @@ -437,20 +461,25 @@ jobs: fi IS_SOFT_DELETABLE="false" + IS_PURGE_PROTECTED="false" for SD_TYPE in $SOFT_DELETABLE_TYPES; do if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then IS_SOFT_DELETABLE="true" + # Query actual purge protection status for soft-deletable resources + IS_PURGE_PROTECTED=$(az resource show --ids "$RES_ID" \ + --query "properties.enablePurgeProtection" -o tsv 2>/dev/null || echo "false") + [[ "$IS_PURGE_PROTECTED" == "true" ]] || IS_PURGE_PROTECTED="false" break fi done MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ - --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ - '. 
+ [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" --argjson pp "$IS_PURGE_PROTECTED" \ + '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": $pp}]') done # Extract resource groups from managed resources - RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') + RESOURCE_GROUPS=$(echo "$MANAGED_RESOURCES" | jq -c '[.[].id | select(test("/resourceGroups/")) | capture("/resourceGroups/(?[^/]+)") | .rg] | unique') else # Fallback: walk deployment operations recursively OPS=$(az deployment operation sub list \ @@ -468,16 +497,21 @@ jobs: fi IS_SOFT_DELETABLE="false" + IS_PURGE_PROTECTED="false" for SD_TYPE in $SOFT_DELETABLE_TYPES; do if [[ "$RES_TYPE" == "$SD_TYPE" ]]; then IS_SOFT_DELETABLE="true" + # Query actual purge protection status for soft-deletable resources + IS_PURGE_PROTECTED=$(az resource show --ids "$RES_ID" \ + --query "properties.enablePurgeProtection" -o tsv 2>/dev/null || echo "false") + [[ "$IS_PURGE_PROTECTED" == "true" ]] || IS_PURGE_PROTECTED="false" break fi done MANAGED_RESOURCES=$(echo "$MANAGED_RESOURCES" | jq --arg id "$RES_ID" --arg type "$RES_TYPE" \ - --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" \ - '. + [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": false}]') + --arg scope "$RES_SCOPE" --argjson sd "$IS_SOFT_DELETABLE" --argjson pp "$IS_PURGE_PROTECTED" \ + '. 
+ [{"id": $id, "type": $type, "scope": $scope, "softDeletable": $sd, "purgeProtected": $pp}]') done # Collect resource groups diff --git a/website/docs/workflows/git-ape-destroy.md b/website/docs/workflows/git-ape-destroy.md index 5392405..66e9093 100644 --- a/website/docs/workflows/git-ape-destroy.md +++ b/website/docs/workflows/git-ape-destroy.md @@ -472,12 +472,17 @@ jobs: fi ;; "Microsoft.CognitiveServices/accounts") - # Cognitive Services soft-delete purge + # Cognitive Services soft-delete purge. + # Account IDs are resource-group scoped (no /locations/ + # segment), so resolve the region from the soft-deleted account + # list and the resource group from the original resource ID. if [[ "$PURGE_PROTECTED" != "true" ]]; then - LOCATION=$(echo "$RES_ID" | grep -oP '(?<=locations/)[^/]+' || echo "") + LOCATION=$(az cognitiveservices account list-deleted \ + --query "[?name=='$RES_NAME'] | [0].location" -o tsv 2>/dev/null || echo "") + RES_RG=$(echo "$RES_ID" | sed -n 's#.*/resourceGroups/\([^/]*\)/.*#\1#p') if [[ -n "$LOCATION" ]]; then az cognitiveservices account purge --name "$RES_NAME" --location "$LOCATION" \ - --resource-group "" 2>/dev/null || true + --resource-group "$RES_RG" 2>/dev/null || true fi fi ;; From b2325f3d39f96cdc6aa218bfa6c8198d7aa0f906 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Thu, 4 Jun 2026 09:57:44 +0800 Subject: [PATCH 16/18] fix(workflows): prevent shell and JS injection in deploy/destroy - pass workflow_dispatch inputs via env and validate deployment_id charset - build destroy ID JSON array with jq instead of string concat - read github-script values from process.env instead of interpolation --- .github/workflows/git-ape-deploy.exampleyml | 27 ++++++++++++++------ .github/workflows/git-ape-destroy.exampleyml | 20 ++++++++++++--- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/.github/workflows/git-ape-deploy.exampleyml b/.github/workflows/git-ape-deploy.exampleyml index 0b1ba55..59042df 100644 --- 
a/.github/workflows/git-ape-deploy.exampleyml +++ b/.github/workflows/git-ape-deploy.exampleyml @@ -625,15 +625,26 @@ jobs: - name: Post deployment result if: always() && github.event_name == 'issue_comment' uses: actions/github-script@v8 + env: + # Pass all repo-controlled / command-derived values via env so they are + # read with process.env and never interpolated into the script body + # (prevents JavaScript injection from crafted values). + DEPLOYMENT_ID: ${{ matrix.deployment_id }} + DEPLOY_STATUS: ${{ steps.deploy.outputs.deploy_status }} + DEPLOY_DURATION: ${{ steps.deploy.outputs.deploy_duration }} + DEPLOY_OUTPUTS: ${{ steps.deploy.outputs.deploy_outputs }} + DEPLOY_RESOURCES: ${{ steps.tests.outputs.resources }} + TEST_ENDPOINTS: ${{ steps.tests.outputs.test_endpoints }} + DEPLOY_ERROR: ${{ steps.deploy.outputs.deploy_error }} with: script: | - const deploymentId = '${{ matrix.deployment_id }}'; - const status = '${{ steps.deploy.outputs.deploy_status }}' || 'failed'; - const duration = '${{ steps.deploy.outputs.deploy_duration }}'; - const outputs = `${{ steps.deploy.outputs.deploy_outputs }}`; - const resources = `${{ steps.tests.outputs.resources }}`; - const testEndpoints = `${{ steps.tests.outputs.test_endpoints }}`; - const runUrl = `${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`; + const deploymentId = process.env.DEPLOYMENT_ID; + const status = process.env.DEPLOY_STATUS || 'failed'; + const duration = process.env.DEPLOY_DURATION; + const outputs = process.env.DEPLOY_OUTPUTS || ''; + const resources = process.env.DEPLOY_RESOURCES || ''; + const testEndpoints = process.env.TEST_ENDPOINTS || ''; + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; let comment = `## Git-Ape Deploy: \`${deploymentId}\`\n\n`; @@ -661,7 +672,7 @@ jobs: } else { comment += `### โŒ Deployment Failed\n\n`; comment += `- **Workflow Run:** [View logs](${runUrl})\n\n`; - const 
error = `${{ steps.deploy.outputs.deploy_error }}`; + const error = process.env.DEPLOY_ERROR || ''; if (error) { comment += `\`\`\`\n${error.substring(0, 2000)}\n\`\`\`\n\n`; } diff --git a/.github/workflows/git-ape-destroy.exampleyml b/.github/workflows/git-ape-destroy.exampleyml index 45df805..a59ddc6 100644 --- a/.github/workflows/git-ape-destroy.exampleyml +++ b/.github/workflows/git-ape-destroy.exampleyml @@ -51,19 +51,31 @@ jobs: - name: Find destroy-requested deployments id: find + env: + # Pass dispatch inputs via env so they are never expanded by the shell + # (prevents command injection from crafted workflow_dispatch values). + INPUT_CONFIRM: ${{ inputs.confirm }} + INPUT_DEPLOYMENT_ID: ${{ inputs.deployment_id }} run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then - CONFIRM="${{ inputs.confirm }}" - if [[ "$CONFIRM" != "destroy" ]]; then + if [[ "$INPUT_CONFIRM" != "destroy" ]]; then echo "::error::Confirmation must be 'destroy'" echo "has_destroys=false" >> "$GITHUB_OUTPUT" echo "deployment_ids=[]" >> "$GITHUB_OUTPUT" exit 1 fi - DEPLOYMENT_IDS='["${{ inputs.deployment_id }}"]' + # Validate the deployment ID against the allowed charset for + # deployment directory names before using it anywhere. + if [[ ! "$INPUT_DEPLOYMENT_ID" =~ ^[A-Za-z0-9._-]+$ ]]; then + echo "::error::Invalid deployment_id (allowed: A-Z a-z 0-9 . 
_ -)" + echo "has_destroys=false" >> "$GITHUB_OUTPUT" + echo "deployment_ids=[]" >> "$GITHUB_OUTPUT" + exit 1 + fi + DEPLOYMENT_IDS=$(jq -cn --arg id "$INPUT_DEPLOYMENT_ID" '[$id]') echo "has_destroys=true" >> "$GITHUB_OUTPUT" echo "deployment_ids=$DEPLOYMENT_IDS" >> "$GITHUB_OUTPUT" - echo "Manual destroy requested: ${{ inputs.deployment_id }}" + echo "Manual destroy requested: $INPUT_DEPLOYMENT_ID" exit 0 fi From 95dc3def40e94b01d9c37458ace7528b5edb76f5 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Thu, 4 Jun 2026 09:57:52 +0800 Subject: [PATCH 17/18] feat(skills): emit partially-destroyed in local destroy scripts - track delete attempts and emit partially-destroyed on incomplete delete - aligns local CLI/VS Code destroy state model with CI workflow --- .../azure-stack-destroy/scripts/destroy-stack.ps1 | 12 ++++++++++++ .../azure-stack-destroy/scripts/destroy-stack.sh | 12 ++++++++++++ 2 files changed, 24 insertions(+) diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 index 124df95..b6b422e 100644 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.ps1 @@ -136,12 +136,17 @@ if (-not $Yes) { $StackDeleted = $false $RgDeleted = $false $AlreadyGone = $true +# Tracks whether a stack/RG delete command was actually invoked. Used to +# distinguish a partial failure (attempted but did not complete -> +# partially-destroyed) from the catch-all destroy-failed, mirroring CI. 
+$DeleteAttempted = $false $StartTime = Get-Date if ($StackId) { $stackExists = az stack sub show --name $DeploymentId --query 'id' -o tsv 2>$null if ($stackExists) { $AlreadyGone = $false + $DeleteAttempted = $true if ($Wait) { Write-Color "๐Ÿ—‘๏ธ Deleting deployment stack (sync wait): $DeploymentId" Blue # --bypass-stack-out-of-sync-error: a destroy run is one-shot; we @@ -248,6 +253,7 @@ if (-not $StackId -and $RgName) { $rgExists = az group exists --name $RgName 2>$null if ($rgExists -eq 'true') { $AlreadyGone = $false + $DeleteAttempted = $true Write-Color "๐Ÿ—‘๏ธ Deleting resource group: $RgName" Blue az group delete --name $RgName --yes if ($LASTEXITCODE -eq 0) { $RgDeleted = $true } @@ -324,6 +330,12 @@ $Status = if ($AlreadyGone) { 'already-destroyed' } elseif ($StackDeleted -or $RgDeleted) { if ($RetainedCount -gt 0) { 'retained-soft-deleted' } else { 'destroyed' } +} elseif ($DeleteAttempted) { + # A stack/RG existed and a delete was invoked, but it did not complete + # (e.g. fast-mode poll timeout or a failed delete command). Some resources + # may remain. Mirrors CI: stack/RG delete status == failed -> + # partially-destroyed (distinct from the destroy-failed catch-all). + 'partially-destroyed' } else { 'destroy-failed' } diff --git a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh index d08cc50..7d8839b 100755 --- a/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh +++ b/.github/skills/azure-stack-destroy/scripts/destroy-stack.sh @@ -119,6 +119,10 @@ fi STACK_DELETED="false" RG_DELETED="false" ALREADY_GONE="true" +# Tracks whether a stack/RG delete command was actually invoked. Used to +# distinguish a partial failure (attempted but did not complete โ†’ +# partially-destroyed) from the catch-all destroy-failed, mirroring CI. 
+DELETE_ATTEMPTED="false" START_TIME=$(date +%s) # Primary path: stack delete @@ -140,6 +144,7 @@ if [[ -n "$STACK_ID" ]]; then STACK_EXISTS=$(az stack sub show --name "$DEPLOYMENT_ID" --query "id" -o tsv 2>/dev/null || echo "") if [[ -n "$STACK_EXISTS" ]]; then ALREADY_GONE="false" + DELETE_ATTEMPTED="true" if [[ "$WAIT_FLAG" == "true" ]]; then echo -e "${BLUE}🗑️ Deleting deployment stack (sync wait): $DEPLOYMENT_ID${NC}" # --bypass-stack-out-of-sync-error: a destroy run is one-shot; we @@ -244,6 +249,7 @@ if [[ -z "$STACK_ID" && -n "$RG_NAME" ]]; then RG_EXISTS=$(az group exists --name "$RG_NAME" 2>/dev/null || echo "false") if [[ "$RG_EXISTS" == "true" ]]; then ALREADY_GONE="false" + DELETE_ATTEMPTED="true" echo -e "${BLUE}🗑️ Deleting resource group: $RG_NAME${NC}" if az group delete --name "$RG_NAME" --yes 2>&1; then RG_DELETED="true" @@ -330,6 +336,12 @@ elif [[ "$STACK_DELETED" == "true" || "$RG_DELETED" == "true" ]]; then else STATUS="destroyed" fi +elif [[ "$DELETE_ATTEMPTED" == "true" ]]; then + # A stack/RG existed and a delete was invoked, but it did not complete + # (e.g. fast-mode poll timeout or a failed delete command). Some resources + # may remain. Mirrors CI: stack/RG delete status == failed → + # partially-destroyed (distinct from the destroy-failed catch-all).
+ STATUS="partially-destroyed" else STATUS="destroy-failed" fi From 236e84066fcd0a65e72cbcb6a65fb9cce9f42083 Mon Sep 17 00:00:00 2001 From: Arnaud Lheureux Date: Thu, 4 Jun 2026 18:54:10 +0800 Subject: [PATCH 18/18] fix(workflows): preserve FAILED counter in subscription resource cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - replace piped while-read loop with process substitution so the counter survives - pass sub_resources step output via env to harden against injection 🐛 - Generated by Copilot --- .github/workflows/git-ape-destroy.exampleyml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/git-ape-destroy.exampleyml b/.github/workflows/git-ape-destroy.exampleyml index a59ddc6..a6bc3b8 100644 --- a/.github/workflows/git-ape-destroy.exampleyml +++ b/.github/workflows/git-ape-destroy.exampleyml @@ -335,17 +335,22 @@ jobs: steps.check.outputs.stack_exists != 'true' && steps.check.outputs.rg_exists == 'true' && steps.check.outputs.sub_count != '0' + env: + SUB_RESOURCES: ${{ steps.check.outputs.sub_resources }} run: | echo "🗑️ Deleting subscription-scoped resources first..." FAILED=0 - echo '${{ steps.check.outputs.sub_resources }}' | jq -r '.[].id' | while read -r RESOURCE_ID; do + # Use process substitution so the FAILED counter survives. A piped + # `... | while read` would run the loop body in a subshell, and the + # incremented counter would be lost when the subshell exits. + while read -r RESOURCE_ID; do echo " Deleting: $RESOURCE_ID" if ! az resource delete --ids "$RESOURCE_ID" 2>&1; then echo "::warning::Failed to delete $RESOURCE_ID" FAILED=$((FAILED + 1)) fi - done + done < <(echo "$SUB_RESOURCES" | jq -r '.[].id') if [[ "$FAILED" -gt 0 ]]; then echo "::warning::$FAILED subscription-scoped resource(s) failed to delete"