diff --git a/.github/plugins/azure-skills/.plugin/plugin.json b/.github/plugins/azure-skills/.plugin/plugin.json index a70f46f..21233bf 100644 --- a/.github/plugins/azure-skills/.plugin/plugin.json +++ b/.github/plugins/azure-skills/.plugin/plugin.json @@ -20,5 +20,6 @@ "diagnostics" ], "skills": "./skills/", - "mcpServers": "./.mcp.json" + "mcpServers": "./.mcp.json", + "hooks": "./copilot-hooks.json" } diff --git a/.github/plugins/azure-skills/copilot-hooks.json b/.github/plugins/azure-skills/copilot-hooks.json new file mode 100644 index 0000000..fdbb700 --- /dev/null +++ b/.github/plugins/azure-skills/copilot-hooks.json @@ -0,0 +1,11 @@ +{ + "hooks": { + "PostToolUse": [ + { + "type": "command", + "bash": "${PLUGIN_ROOT}/hooks/scripts/track-telemetry.sh", + "powershell": "${PLUGIN_ROOT}/hooks/scripts/track-telemetry.ps1" + } + ] + } +} \ No newline at end of file diff --git a/.github/plugins/azure-skills/hooks/hooks.json b/.github/plugins/azure-skills/hooks/hooks.json new file mode 100644 index 0000000..d9214f3 --- /dev/null +++ b/.github/plugins/azure-skills/hooks/hooks.json @@ -0,0 +1,14 @@ +{ + "hooks": { + "PostToolUse": [ + { + "hooks": [ + { + "type": "command", + "command": "bash ${CLAUDE_PLUGIN_ROOT}/hooks/scripts/track-telemetry.sh" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/.github/plugins/azure-skills/hooks/scripts/track-telemetry.ps1 b/.github/plugins/azure-skills/hooks/scripts/track-telemetry.ps1 new file mode 100644 index 0000000..34604e7 --- /dev/null +++ b/.github/plugins/azure-skills/hooks/scripts/track-telemetry.ps1 @@ -0,0 +1,180 @@ +# Telemetry tracking hook for Azure Copilot Skills +# Reads JSON input from stdin, tracks relevant events, and publishes via MCP + +$ErrorActionPreference = "SilentlyContinue" + +# Skip telemetry if opted out +if ($env:AZURE_MCP_COLLECT_TELEMETRY -eq "false") { + Write-Output '{"continue":true}' + exit 0 +} + +# Return success and exit +function Write-Success { + Write-Output 
'{"continue":true}' + exit 0 +} + +# === Main Processing === + +# Read entire stdin at once - hooks send one complete JSON per invocation +try { + $rawInput = [Console]::In.ReadToEnd() +} catch { + Write-Success +} + +# Return success and exit if no input +if ([string]::IsNullOrWhiteSpace($rawInput)) { + Write-Success +} + +# === STEP 1: Read and parse input === + +# Parse JSON input +try { + $inputData = $rawInput | ConvertFrom-Json +} catch { + Write-Success +} + +# Extract fields from hook data +# Support both Copilot CLI (camelCase) and Claude Code (snake_case) formats +$toolName = $inputData.toolName +if (-not $toolName) { + $toolName = $inputData.tool_name +} + +$sessionId = $inputData.sessionId +if (-not $sessionId) { + $sessionId = $inputData.session_id +} + +# Get tool arguments (Copilot CLI: toolArgs, Claude Code: tool_input) +$toolInput = $inputData.toolArgs +if (-not $toolInput) { + $toolInput = $inputData.tool_input +} + +$timestamp = (Get-Date).ToUniversalTime().ToString("yyyy-MM-ddTHH:mm:ssZ") + +# Detect client type based on which format was used +if ($inputData.PSObject.Properties.Name -contains "hook_event_name") { + $clientType = "claude-code" +} else { + $clientType = "copilot-cli" +} + +# Skip if no tool name found in either format +if (-not $toolName) { + Write-Success +} + +# Helper to extract path from tool input (handles 'path', 'filePath', 'file_path') +function Get-ToolInputPath { + if ($toolInput.path) { return $toolInput.path } + if ($toolInput.filePath) { return $toolInput.filePath } + if ($toolInput.file_path) { return $toolInput.file_path } + return $null +} + +# === STEP 2: Determine what to track for azmcp === + +$shouldTrack = $false +$eventType = $null +$skillName = $null +$azureToolName = $null +$filePath = $null + +# Check for skill invocation via 'skill'/'Skill' tool +if ($toolName -eq "skill" -or $toolName -eq "Skill") { + $skillName = $toolInput.skill + if ($skillName) { + $eventType = "skill_invocation" + $shouldTrack = 
$true + } +} + +# Check for skill invocation (reading SKILL.md files) +if ($toolName -eq "view") { + $pathToCheck = Get-ToolInputPath + if ($pathToCheck) { + # Normalize path: convert to lowercase, replace backslashes, and squeeze consecutive slashes + $pathLower = $pathToCheck.ToLower() -replace '\\', '/' -replace '/+', '/' + + # Check for SKILL.md pattern (Copilot: .copilot/...skills/; Claude: .claude/...skills/) + if ($pathLower -match '\.copilot.*skills.*/skill\.md' -or $pathLower -match '\.claude.*skills.*/skill\.md') { + # Normalize path and extract skill name using regex + $pathNormalized = $pathToCheck -replace '\\', '/' -replace '/+', '/' + if ($pathNormalized -match '/skills/([^/]+)/SKILL\.md$') { + $skillName = $Matches[1] + $eventType = "skill_invocation" + $shouldTrack = $true + } + } + } +} + +# Check for Azure MCP tool invocation +# Copilot CLI: "mcp_azure_*" or "azure-*" prefixes +# Claude Code: "mcp__plugin_azure_azure__*" prefix (double underscores) +if ($toolName) { + if ($toolName.StartsWith("mcp_azure_") -or $toolName.StartsWith("azure-") -or $toolName.StartsWith("mcp__plugin_azure_azure__")) { + $azureToolName = $toolName + $eventType = "tool_invocation" + $shouldTrack = $true + } +} + +# Capture file path from any tool input (only track files in azure\skills folder) +# Check both 'path' and 'filePath' properties +if (-not $filePath) { + $pathToCheck = Get-ToolInputPath + if ($pathToCheck) { + # Normalize path for matching: replace backslashes and squeeze consecutive slashes + $pathLower = $pathToCheck.ToLower() -replace '\\', '/' -replace '/+', '/' + + # Check if path matches azure skills folder structure + # Copilot: .copilot/installed-plugins/azure-skills/azure/skills/... + # Claude: .claude/plugins/cache/azure-skills/azure//skills/... 
+ if ($pathLower -match '\.copilot.*installed-plugins.*azure-skills.*azure.*skills' -or $pathLower -match '\.claude.*plugins.*cache.*azure-skills.*azure.*skills') { + # Extract relative path after 'azure/skills/' or 'azure//skills/' + $pathNormalized = $pathToCheck -replace '\\', '/' -replace '/+', '/' + + if ($pathNormalized -match 'azure/([0-9]+\.[0-9]+\.[0-9]+/)?skills/(.+)$') { + $filePath = $Matches[2] + + if (-not $shouldTrack) { + $shouldTrack = $true + $eventType = "reference_file_read" + } + } + } + } +} + +# === STEP 3: Publish event === + +if ($shouldTrack) { + # Build MCP command arguments + $mcpArgs = @( + "server", "plugin-telemetry", + "--timestamp", $timestamp, + "--client-type", $clientType + ) + + if ($eventType) { $mcpArgs += "--event-type"; $mcpArgs += $eventType } + if ($sessionId) { $mcpArgs += "--session-id"; $mcpArgs += $sessionId } + if ($skillName) { $mcpArgs += "--skill-name"; $mcpArgs += $skillName } + if ($azureToolName) { $mcpArgs += "--tool-name"; $mcpArgs += $azureToolName } + # Convert forward slashes to backslashes for azmcp allowlist compatibility + if ($filePath) { $mcpArgs += "--file-reference"; $mcpArgs += ($filePath -replace '/', '\') } + + # Publish telemetry via npx + try { + & npx -y @azure/mcp@latest @mcpArgs 2>&1 | Out-Null + } catch { } +} + +# Output success to stdout (required by hooks) +Write-Success diff --git a/.github/plugins/azure-skills/hooks/scripts/track-telemetry.sh b/.github/plugins/azure-skills/hooks/scripts/track-telemetry.sh new file mode 100755 index 0000000..4b86390 --- /dev/null +++ b/.github/plugins/azure-skills/hooks/scripts/track-telemetry.sh @@ -0,0 +1,205 @@ +#!/bin/bash + +# Telemetry tracking hook for Azure Copilot Skills +# Reads JSON input from stdin, tracks relevant events, and publishes via MCP + +set +e # Don't exit on errors - fail silently for privacy + +# Skip telemetry if opted out +if [ "${AZURE_MCP_COLLECT_TELEMETRY}" = "false" ]; then + echo '{"continue":true}' + exit 0 +fi + +# 
Return success and exit +return_success() { + echo '{"continue":true}' + exit 0 +} + +# === JSON Parsing Functions (using sed - portable across platforms) === + +# Extract simple string field from JSON +extract_json_field() { + local json="$1" + local field="$2" + echo "$json" | sed -n "s/.*\"$field\":[[:space:]]*\"\([^\"]*\)\".*/\1/p" +} + +# Extract nested field from toolArgs/tool_input (e.g., toolArgs.skill or tool_input.skill) +extract_toolargs_field() { + local json="$1" + local field="$2" + local value="" + # Try Copilot CLI format (toolArgs) first, then Claude Code format (tool_input) + value=$(echo "$json" | sed -n "s/.*\"toolArgs\":[[:space:]]*{[^}]*\"$field\":[[:space:]]*\"\([^\"]*\)\".*/\1/p") + if [ -z "$value" ]; then + value=$(echo "$json" | sed -n "s/.*\"tool_input\":[[:space:]]*{[^}]*\"$field\":[[:space:]]*\"\([^\"]*\)\".*/\1/p") + fi + echo "$value" +} + +# Extract path from toolArgs/tool_input (handles both 'path' and 'filePath') +extract_toolargs_path() { + local json="$1" + local path_value="" + + # Try Copilot CLI format (toolArgs) first + path_value=$(echo "$json" | sed -n 's/.*"toolArgs":[[:space:]]*{[^}]*"path":[[:space:]]*"\([^"]*\)".*/\1/p') + if [ -z "$path_value" ]; then + path_value=$(echo "$json" | sed -n 's/.*"toolArgs":[[:space:]]*{[^}]*"filePath":[[:space:]]*"\([^"]*\)".*/\1/p') + fi + # Fall back to Claude Code format (tool_input) + if [ -z "$path_value" ]; then + path_value=$(echo "$json" | sed -n 's/.*"tool_input":[[:space:]]*{[^}]*"file_path":[[:space:]]*"\([^"]*\)".*/\1/p') + fi + if [ -z "$path_value" ]; then + path_value=$(echo "$json" | sed -n 's/.*"tool_input":[[:space:]]*{[^}]*"path":[[:space:]]*"\([^"]*\)".*/\1/p') + fi + + echo "$path_value" +} + +# === Main Processing === + +# Check if stdin has data +if [ -t 0 ]; then + return_success +fi + +# Read entire stdin at once - hooks send one complete JSON per invocation +rawInput=$(cat) + +# Return success and exit if no input +if [ -z "$rawInput" ]; then + return_success 
+fi + +# === STEP 1: Read and parse input === + +# Extract fields from hook data +# Support both Copilot CLI (camelCase) and Claude Code (snake_case) formats +toolName=$(extract_json_field "$rawInput" "toolName") +sessionId=$(extract_json_field "$rawInput" "sessionId") + +# Fall back to Claude Code snake_case field names +if [ -z "$toolName" ]; then + toolName=$(extract_json_field "$rawInput" "tool_name") +fi +if [ -z "$sessionId" ]; then + sessionId=$(extract_json_field "$rawInput" "session_id") +fi + +timestamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ") + +# Detect client type based on which format was used +if echo "$rawInput" | grep -q '"hook_event_name"'; then + clientType="claude-code" +else + clientType="copilot-cli" +fi + +# Skip if no tool name found in either format +if [ -z "$toolName" ]; then + return_success +fi + +# === STEP 2: Determine what to track for azmcp === + +shouldTrack=false +eventType="" +skillName="" +azureToolName="" +filePath="" + +# Check for skill invocation via 'skill'/'Skill' tool +if [ "$toolName" = "skill" ] || [ "$toolName" = "Skill" ]; then + skillName=$(extract_toolargs_field "$rawInput" "skill") + if [ -n "$skillName" ]; then + eventType="skill_invocation" + shouldTrack=true + fi +fi + +# Check for skill invocation (reading SKILL.md files) +if [ "$toolName" = "view" ]; then + pathToCheck=$(extract_toolargs_path "$rawInput") + if [ -n "$pathToCheck" ]; then + # Normalize path: convert to lowercase, replace backslashes, and squeeze consecutive slashes + pathLower=$(echo "$pathToCheck" | tr '[:upper:]' '[:lower:]' | tr '\\' '/' | sed 's|//*|/|g') + + # Check for SKILL.md pattern (Copilot: .copilot/...skills/; Claude: .claude/...skills/) + if [[ "$pathLower" == *".copilot"*"skills"*"/skill.md" ]] || [[ "$pathLower" == *".claude"*"skills"*"/skill.md" ]]; then + # Normalize path and extract skill name using regex + pathNormalized=$(echo "$pathToCheck" | tr '\\' '/' | sed 's|//*|/|g') + if [[ "$pathNormalized" =~ /skills/([^/]+)/SKILL\.md$ ]]; 
then + skillName="${BASH_REMATCH[1]}" + eventType="skill_invocation" + shouldTrack=true + fi + fi + fi +fi + +# Check for Azure MCP tool invocation +# Copilot CLI: "mcp_azure_*" or "azure-*" prefixes +# Claude Code: "mcp__plugin_azure_azure__*" prefix (double underscores) +if [ -n "$toolName" ]; then + if [[ "$toolName" == mcp_azure_* ]] || [[ "$toolName" == azure-* ]] || [[ "$toolName" == mcp__plugin_azure_azure__* ]]; then + azureToolName="$toolName" + eventType="tool_invocation" + shouldTrack=true + fi +fi + +# Capture file path from any tool input (only track files in azure\skills folder) +# Check both 'path' and 'filePath' properties +if [ -z "$filePath" ]; then + pathToCheck=$(extract_toolargs_path "$rawInput") + if [ -n "$pathToCheck" ]; then + # Normalize path for matching: replace backslashes and squeeze consecutive slashes + pathLower=$(echo "$pathToCheck" | tr '[:upper:]' '[:lower:]' | tr '\\' '/' | sed 's|//*|/|g') + + # Check if path matches azure skills folder structure + # Copilot: .copilot/installed-plugins/azure-skills/azure/skills/... + # Claude: .claude/plugins/cache/azure-skills/azure//skills/... 
+ if [[ "$pathLower" == *".copilot"*"installed-plugins"*"azure-skills"*"azure"*"skills"* ]] || [[ "$pathLower" == *".claude"*"plugins"*"cache"*"azure-skills"*"azure"*"skills"* ]]; then + # Extract relative path after 'azure/skills/' or 'azure//skills/' + pathNormalized=$(echo "$pathToCheck" | tr '\\' '/' | sed 's|//*|/|g') + + if [[ "$pathNormalized" =~ azure/([0-9]+\.[0-9]+\.[0-9]+/)?skills/(.+)$ ]]; then + filePath="${BASH_REMATCH[2]}" + + if [ "$shouldTrack" = false ]; then + shouldTrack=true + eventType="reference_file_read" + fi + fi + fi + fi +fi + +# === STEP 3: Publish event via azmcp === + +if [ "$shouldTrack" = true ]; then + # Build MCP command arguments (using array for proper quoting) + mcpArgs=( + "server" "plugin-telemetry" + "--timestamp" "$timestamp" + "--client-type" "$clientType" + ) + + [ -n "$eventType" ] && mcpArgs+=("--event-type" "$eventType") + [ -n "$sessionId" ] && mcpArgs+=("--session-id" "$sessionId") + [ -n "$skillName" ] && mcpArgs+=("--skill-name" "$skillName") + [ -n "$azureToolName" ] && mcpArgs+=("--tool-name" "$azureToolName") + # Convert forward slashes to backslashes for azmcp allowlist compatibility + [ -n "$filePath" ] && mcpArgs+=("--file-reference" "$(echo "$filePath" | tr '/' '\\')") + + # Publish telemetry via npx + npx -y @azure/mcp@latest "${mcpArgs[@]}" >/dev/null 2>&1 || true +fi + +# Output success to stdout (required by hooks) +return_success + diff --git a/.github/plugins/azure-skills/skills/azure-deploy/SKILL.md b/.github/plugins/azure-skills/skills/azure-deploy/SKILL.md index ee8b415..d7a2674 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/SKILL.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/SKILL.md @@ -4,7 +4,7 @@ description: "Execute Azure deployments for ALREADY-PREPARED applications that h license: MIT metadata: author: Microsoft - version: "1.0.5" + version: "1.0.7" --- # Azure Deploy @@ -65,6 +65,7 @@ Activate this skill when user wants to: | 5 | **Post-Deploy** — Configure SQL 
managed identity and apply EF migrations if applicable | [Post-Deployment](references/recipes/azd/post-deployment.md) | | 6 | **Handle Errors** — See recipe's `errors.md` | — | | 7 | **Verify Success** — Confirm deployment completed and endpoints are accessible | [Verification](references/recipes/azd/verify.md) | +| 8 | **Report Results** — Present deployed endpoint URLs to the user | [Verification](references/recipes/azd/verify.md) | > **⛔ VALIDATION PROOF CHECK** > diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/README.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/README.md index 8336bb6..1f65fbc 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/README.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/README.md @@ -17,6 +17,7 @@ Deploy to Azure using Azure CLI. | 2 | Deploy infrastructure | `az deployment sub create` | | 3 | Deploy application | Service-specific commands | | 4 | Verify | `az resource list` | +| 5 | **Report** | Present deployed endpoint URLs to the user — see [Verification](verify.md) | ## Infrastructure Deployment diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/verify.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/verify.md index c4487b8..7f97226 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/verify.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azcli/verify.md @@ -28,3 +28,19 @@ az webapp show \ --resource-group \ --query "{state:state, hostNames:hostNames}" ``` + +## Report Results to User + +> ⛔ **MANDATORY** — You **MUST** present the deployed endpoint URLs to the user in your response. 
+ +Extract endpoints using the appropriate command for the service type: + +```bash +# Container Apps +az containerapp show --name --resource-group --query "properties.configuration.ingress.fqdn" -o tsv + +# App Service +az webapp show --name --resource-group --query "defaultHostName" -o tsv +``` + +Present a summary including all service URLs. Do NOT end your response without including them. diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/README.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/README.md index d9b80cb..e813b7b 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/README.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/README.md @@ -21,6 +21,7 @@ Deploy to Azure using Azure Developer CLI (azd). | 2 | **Deploy** | `azd up --no-prompt` | | 3 | **Post-Deploy** | [Post-Deployment Steps](post-deployment.md) — If using SQL + managed identity | | 4 | **Verify** | See [Verification](verify.md) | +| 5 | **Report** | Present deployed endpoint URLs to the user — see [Verification](verify.md) Step 3 | > ⚠️ **Important:** For .NET Aspire projects or projects using azd "limited mode" (no explicit `infra/` folder), verify that `azd provision` populated all required environment variables. If `azd deploy` fails with errors about missing `AZURE_CONTAINER_REGISTRY_ENDPOINT`, `AZURE_CONTAINER_REGISTRY_MANAGED_IDENTITY_ID`, or `MANAGED_IDENTITY_CLIENT_ID`, see [Error Handling](errors.md#missing-container-registry-variables) for the resolution. 
diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/verify.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/verify.md index ff75a5f..f796bef 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/verify.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/azd/verify.md @@ -9,7 +9,7 @@ azd show ``` Expected output: -``` +```text Showing deployed resources: Resource Group: rg-myapp-dev Services: @@ -28,7 +28,39 @@ curl -f "$ENDPOINT/health" || curl -f "$ENDPOINT" Expected: HTTP 200 response. -## Step 3: Post-Deployment Verification (if applicable) +## Step 3: Report Results to User + +> ⛔ **MANDATORY** — You **MUST** present the deployed endpoint URLs to the user in your response. A deployment is not considered complete until the user has received the URLs. + +Extract all endpoints from the `azd up` / `azd deploy` output or by running: + +```bash +azd show +``` + +**Present a summary to the user that includes:** + +| Item | Source | +|------|--------| +| Deployed service endpoint(s) | `Endpoint:` lines from `azd` output or `azd show` | +| Aspire Dashboard URL (if applicable) | `Aspire Dashboard:` line from `azd` output | +| Azure Portal deployment link (if available) | Portal URL from provisioning output | + +Example response format: + +```text +✅ Deployment succeeded! + +| Service | Endpoint | +|---------|----------| +| apiservice | https://apiservice.xxx.azurecontainerapps.io | + +Aspire Dashboard: https://aspire-dashboard.xxx.azurecontainerapps.io +``` + +> ⚠️ If output was truncated, run `azd show` to retrieve endpoint URLs. 
+ +## Step 4: Post-Deployment Verification (if applicable) For deployments with Azure SQL Database and managed identity: diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/README.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/README.md index e83a56c..65bba98 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/README.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/README.md @@ -17,6 +17,7 @@ Deploy to Azure using Bicep templates directly. | 2 | Build (optional) | `az bicep build --file main.bicep` | | 3 | Deploy | `az deployment sub create` | | 4 | Verify | `az resource list` | +| 5 | **Report** | Present deployed endpoint URLs to the user — see [Verification](verify.md) | ## Deployment Commands diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/verify.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/verify.md index dca4788..231c6fd 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/verify.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/bicep/verify.md @@ -17,3 +17,15 @@ az deployment sub show \ ```bash curl -s https:///health | jq . ``` + +## Report Results to User + +> ⛔ **MANDATORY** — You **MUST** present the deployed endpoint URLs to the user in your response. + +Extract endpoints from deployment outputs: + +```bash +az deployment sub show --name main --query properties.outputs +``` + +Present a summary including all service URLs. Do NOT end your response without including them. 
diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/cicd/verify.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/cicd/verify.md index 23889f8..08d4ca8 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/cicd/verify.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/cicd/verify.md @@ -15,3 +15,9 @@ az resource list --resource-group --output table ```bash curl -s https:///health | jq . ``` + +## Report Results to User + +> ⛔ **MANDATORY** — You **MUST** present the deployed endpoint URLs to the user in your response. + +Extract endpoints from the pipeline output or query them directly via `az` CLI. Present a summary including all service URLs. Do NOT end your response without including them. diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/README.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/README.md index b203942..59290eb 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/README.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/README.md @@ -19,6 +19,7 @@ Deploy to Azure using Terraform. 
| 3 | Apply | `terraform apply tfplan` | | 4 | Get outputs | `terraform output` | | 5 | Deploy app | Service-specific commands | +| 6 | **Report** | Present deployed endpoint URLs to the user — see [Verification](verify.md) | ## Deployment Commands diff --git a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/verify.md b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/verify.md index 8030d40..860c5bb 100644 --- a/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/verify.md +++ b/.github/plugins/azure-skills/skills/azure-deploy/references/recipes/terraform/verify.md @@ -16,3 +16,15 @@ curl -s https://$(terraform output -raw api_url)/health | jq . ```bash az resource list --resource-group $(terraform output -raw resource_group_name) --output table ``` + +## Report Results to User + +> ⛔ **MANDATORY** — You **MUST** present the deployed endpoint URLs to the user in your response. + +Extract endpoints from Terraform outputs: + +```bash +terraform output -raw api_url +``` + +Present a summary including all service URLs. Do NOT end your response without including them. 
diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/SKILL.md b/.github/plugins/azure-skills/skills/microsoft-foundry/SKILL.md index fd2bd2c..9bbc09f 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/SKILL.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/SKILL.md @@ -4,7 +4,7 @@ description: "Deploy, evaluate, and manage Foundry agents end-to-end: Docker bui license: MIT metadata: author: Microsoft - version: "1.0.6" + version: "1.0.7" --- # Microsoft Foundry Skill diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/deploy/deploy.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/deploy/deploy.md index c216dca..0e2f0d0 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/deploy/deploy.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/deploy/deploy.md @@ -7,7 +7,7 @@ Create and manage agent deployments in Azure AI Foundry. For hosted agents, this | Property | Value | |----------|-------| | Agent types | Prompt (LLM-based), Hosted (ACA based), Hosted (vNext) | -| MCP server | `foundry-mcp` | +| MCP server | `azure` | | Key MCP tools | `agent_update`, `agent_container_control`, `agent_container_status_get` | | CLI tools | `docker`, `az acr` (hosted agents only) | | Container protocols | `a2a`, `responses`, `mcp` | @@ -238,6 +238,7 @@ After a successful deployment, persist the deployment context to `/. 
| `environments..agentName` | Deployed agent name | `my-support-agent` | | `environments..azureContainerRegistry` | ACR resource (hosted agents) | `myregistry.azurecr.io` | | `environments..testCases[]` | Evaluation bundles for datasets, evaluators, and thresholds | `smoke-core`, `trace-regressions` | +| `environments..testCases[].datasetUri` | Remote Foundry dataset URI for shared eval workflows | `azureml://datastores/.../paths/...` | If `agent-metadata.yaml` already exists, merge the selected environment instead of overwriting other environments or cached test cases without confirmation. @@ -249,30 +250,54 @@ If `agent-metadata.yaml` already exists, merge the selected environment instead Use **`agent_get`** (or local `agent.yaml`) to understand the agent's purpose and capabilities. -### 2. Select Default Evaluators +### 2. Reuse or Refresh Local Cache + +Inspect the selected agent root before generating anything new: + +- Reuse `.foundry/evaluators/` and `.foundry/datasets/` when they already contain the right assets for the selected environment. +- Ask before refreshing cached files or replacing thresholds. +- If cache is missing or stale, regenerate the dataset/evaluators and update metadata for the active environment only. + +### 2.5 Discover Existing Evaluators + +Use **`evaluator_catalog_get`** with the selected environment's project endpoint to list all evaluators already registered in the project. Display them to the user grouped by type (`custom` vs `built-in`) with name, category, and version. During Phase 1, catalog any promising custom evaluators for later reuse, but keep the first run on the built-in baseline. Only propose creating a new evaluator in Phase 2 when no existing evaluator covers the required dimension. + +### 3. Select Default Evaluators + +Follow the [observe skill's Two-Phase Evaluator Strategy](../observe/observe.md). Phase 1 is built-in only, so do not create a new custom evaluator during the initial setup pass. 
+ +Start with <=5 built-in evaluators for the initial eval run so the first pass stays fast: | Category | Evaluators | |----------|-----------| -| **Quality (built-in)** | intent_resolution, task_adherence, coherence | -| **Safety (include ≥2)** | violence, self_harm, hate_unfairness | +| **Quality (built-in)** | relevance, task_adherence, intent_resolution | +| **Safety (built-in)** | indirect_attack | +| **Tool use (built-in, conditional)** | tool_call_accuracy (use when the agent calls tools; some catalogs label it as `builtin.tool_call_accuracy`) | + +After analyzing initial results, suggest additional evaluators (custom or built-in) targeted at specific failure patterns instead of front-loading a larger default set. + +If Phase 2 is needed, call `evaluator_catalog_get` again to reuse an existing custom evaluator first. Only create a new custom evaluator when the catalog still lacks the required signal, and prefer prompt templates that consume `expected_behavior` for per-query behavioral scoring. -### 3. Identify LLM-Judge Deployment +### 4. Identify LLM-Judge Deployment Use **`model_deployment_get`** to list the selected project's actual model deployments, then choose one that supports chat completions for quality evaluators. Do **not** assume `gpt-4o` exists in the project. If no deployment supports chat completions, stop the auto-setup flow and tell the user quality evaluators cannot run until a compatible judge deployment is available. -### 4. Reuse or Refresh Local Cache +### 5. Generate Seed Dataset -Inspect the selected agent root before generating anything new: +> ⚠️ **MANDATORY: Read the full generation workflow before proceeding.** -- Reuse `.foundry/evaluators/` and `.foundry/datasets/` when they already contain the right assets for the selected environment. -- Ask before refreshing cached files or replacing thresholds. -- If cache is missing or stale, regenerate the dataset/evaluators and update metadata for the active environment only. 
+Read and follow [Generate Seed Evaluation Dataset](../eval-datasets/references/generate-seed-dataset.md). That reference contains: +- The required JSONL row schema (`query` + `expected_behavior` are both mandatory) +- Coverage distribution targets and generation rules +- Generation requirements that keep rows valid by construction (valid JSON, required fields, coverage targets, and minimum row count) +- Foundry registration steps (blob upload + `evaluation_dataset_create`) +- Metadata updates for `agent-metadata.yaml` and `manifest.json` -### 5. Generate Local Test Dataset +Do NOT skip the `expected_behavior` field. The generation reference handles the complete flow from query generation through Foundry registration. -Use the identified chat-capable deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `.foundry/datasets/--test-v1.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). +The local filename must start with the selected environment's Foundry agent name (`agentName` in `agent-metadata.yaml`) before adding stage, environment, or version suffixes. -> ⚠️ **Prefer local dataset generation.** Generate test queries locally and save to `.foundry/datasets/*.jsonl` rather than using `generateSyntheticData=true` on the eval API. Local datasets provide reproducibility, version control, and can be reviewed before running evals. +Use [Generate Seed Evaluation Dataset](../eval-datasets/references/generate-seed-dataset.md) as the single source of truth for seed dataset registration. It covers `project_connection_list` with `AzureStorageAccount`, key-based versus AAD upload, `evaluation_dataset_create` with `connectionName`, and saving the returned `datasetUri`. ### 6. 
Persist Artifacts and Test Cases @@ -284,15 +309,15 @@ Save evaluator definitions, local datasets, and evaluation outputs under `.found evaluators/ .yaml datasets/ - --test-v1.jsonl + -eval-seed-v1.jsonl results/ ``` -Each test case should bundle one dataset with the evaluator list, thresholds, and a priority tag (`P0`, `P1`, or `P2`). For simplicity, seed exactly one `P0` smoke test case after deployment. +Each test case should bundle one dataset with the evaluator list, thresholds, and a priority tag (`P0`, `P1`, or `P2`). Persist the local `datasetFile` and remote `datasetUri` together, and seed exactly one `P0` smoke test case after deployment. ### 7. Prompt User -*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local test dataset, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* +*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local seed dataset, the Foundry dataset registration metadata, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* - **Yes** → follow the [observe skill](../observe/observe.md) starting at **Step 2 (Evaluate)** — cache and metadata are already prepared. - **No** → stop. The user can return later. 
diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md index d6a1891..a73e8b4 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/eval-datasets.md @@ -14,9 +14,9 @@ USE FOR: create dataset from traces, harvest traces into dataset, build test dat | Property | Value | |----------|-------| -| MCP server | `foundry-mcp` | +| MCP server | `azure` | | Key MCP tools | `evaluation_dataset_create`, `evaluation_dataset_get`, `evaluation_dataset_versions_get`, `evaluation_get`, `evaluation_comparison_create`, `evaluation_comparison_get` | -| Storage tools | `project_connection_list` (discover AzureBlob connection), `project_connection_create` (add storage connection) | +| Storage tools | `project_connection_list` (discover `AzureStorageAccount` connection), `project_connection_create` (add storage connection) | | Azure services | Application Insights (via `monitor_resource_log_query`), Azure Blob Storage (dataset sync) | | Prerequisites | Agent deployed, `.foundry/agent-metadata.yaml` available, App Insights connected | | Local cache | `.foundry/datasets/`, `.foundry/results/`, `.foundry/evaluators/` | @@ -34,6 +34,7 @@ USE FOR: create dataset from traces, harvest traces into dataset, build test dat | "Compare datasets" / "Experiment comparison" / "A/B test" | [Dataset Comparison](references/dataset-comparison.md) | | "Sync dataset to Foundry" / "Refresh local dataset cache" | [Trace-to-Dataset Pipeline -> Step 5](references/trace-to-dataset.md#step-5--sync-local-cache-with-foundry-optional) | | "Trace my evaluation lineage" / "Audit eval history" | [Eval Lineage](references/eval-lineage.md) | +| "Generate eval dataset" / "Create seed dataset" / "Generate test cases for my 
agent" | [Generate Seed Dataset](references/generate-seed-dataset.md) | ## Before Starting — Detect Current State @@ -64,7 +65,7 @@ Each cycle makes the test suite harder and more representative. Production failu 1. **Always show KQL queries.** Before executing any trace extraction query, display it in a code block. Never run queries silently. 2. **Scope to time ranges.** Always include a time range in KQL queries (default: last 7 days for trace harvesting). Ask the user for the range if not specified. 3. **Require human review.** Never auto-commit harvested traces to a dataset without showing candidates to the user first. The curation step is mandatory. -4. **Use versioning conventions.** Follow the naming pattern `---v` (for example, `support-bot-prod-traces-v3`). +4. **Use dataset naming conventions.** Follow the naming conventions below and keep local filenames aligned with the registered Foundry dataset name/version. 5. **Treat local files as cache.** Reuse `.foundry/datasets/` and `.foundry/evaluators/` when they already match the selected environment. Offer refresh when the user asks or when remote state has changed. 6. **Persist artifacts.** Save datasets to `.foundry/datasets/`, evaluation results to `.foundry/results/`, and track lineage in `.foundry/datasets/manifest.json`. 7. **Keep test cases aligned.** Update the selected environment's `testCases[]` in `agent-metadata.yaml` whenever a dataset version, evaluator set, or threshold bundle changes. @@ -73,6 +74,30 @@ Each cycle makes the test suite harder and more representative. Production failu 10. **Never remove dataset rows or weaken evaluators to recover scores.** Score drops after a dataset update are expected - harder tests expose real gaps. Optimize the agent for new failure patterns; do not shrink the test suite. 11. **Match eval parameter names exactly.** Use `evaluationId` when creating grouped runs, but use `evalId` for `evaluation_get` and comparison/trending lookups. 
+## Dataset Naming and Metadata Conventions + +| Dataset type | Foundry dataset name | Foundry dataset version | Typical local file | Metadata stage | +|--------------|----------------------|-------------------------|--------------------|----------------| +| Seed dataset | `<agent-name>-eval-seed` | `v1` | `.foundry/datasets/<agent-name>-eval-seed-v1.jsonl` | `seed` | +| Trace-harvested dataset | `<agent-name>-traces` | `v<n>` | `.foundry/datasets/<agent-name>-traces-v<n>.jsonl` | `traces` | +| Curated/refined dataset | `<agent-name>-curated` | `v<n>` | `.foundry/datasets/<agent-name>-curated-v<n>.jsonl` | `curated` | +| Production-ready dataset | `<agent-name>-prod` | `v<n>` | `.foundry/datasets/<agent-name>-prod-v<n>.jsonl` | `prod` | + +Here `<agent-name>` means the selected environment's `environments.<env>.agentName` from `agent-metadata.yaml`. If that deployed agent name already includes the environment (for example, `support-agent-dev`), do **not** append the environment key a second time. + +Local dataset filenames must start with the selected Foundry agent name (`environments.<env>.agentName` in `agent-metadata.yaml`). Put stage and version suffixes **after** that prefix so cache files sort and group by agent first. + +Keep the Foundry dataset name stable across versions. Store the version only in `datasetVersion` (or manifest `version`) using the `v<n>` format, while local filenames keep the `-v<n>` suffix for cache readability. + +Required metadata to track with every registered dataset: + +- `agent`: the agent name (for example, `hosted-agent-051-001`) +- `stage`: `seed`, `traces`, `curated`, or `prod` +- `version`: version string such as `v1`, `v2`, or `v3` +- `datasetUri`: always persist the Foundry dataset URI in `agent-metadata.yaml` alongside the local `datasetFile`, dataset name, and version + +> 💡 **Tip:** `evaluation_dataset_create` does not expose a first-class `tags` parameter in the current MCP surface. Persist `agent`, `stage`, and `version` in local metadata (`agent-metadata.yaml` and `.foundry/datasets/manifest.json`) so Foundry-side references stay aligned with the cache.
+ ## Related Skills | User Intent | Skill | diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md index 43bddb1..82caf7d 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-curation.md @@ -22,7 +22,7 @@ Raw Traces (from KQL harvest) After running a [trace harvest](trace-to-dataset.md), save candidates with a `status` field: ``` -.foundry/datasets/--candidates-.jsonl +.foundry/datasets/-traces-candidates-.jsonl ``` Each line includes a review status: @@ -68,7 +68,7 @@ After review, filter approved candidates and save to a versioned dataset: 1. Read `.foundry/datasets/manifest.json` to find the latest version number 2. Filter candidates where `status == "approved"` 3. Remove the `status` field from the output -4. Save to `.foundry/datasets/---v.jsonl` +4. Save to `.foundry/datasets/--v.jsonl` 5. 
Update `.foundry/datasets/manifest.json` with metadata ### Update Candidate Status diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md index 9fac83b..5249544 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/dataset-versioning.md @@ -4,15 +4,16 @@ Manage dataset versions with naming conventions, tagging, and version pinning fo ## Naming Convention -Use the pattern `<agent>-<env>-<source>-v<n>`: +Use the pattern `<agent-name>-<source>-v<n>`: | Component | Values | Example | |-----------|--------|---------| -| `<agent>` | Agent name from `agent-metadata.yaml` | `support-bot` | -| `<env>` | Selected environment key | `prod` | +| `<agent-name>` | Selected environment's `agentName` from `agent-metadata.yaml` | `support-bot-prod` | | `<source>` | `traces`, `synthetic`, `manual`, `combined` | `traces` | | `v<n>` | Incremental version number | `v3` | +`<agent-name>` already refers to the environment-specific deployed Foundry agent name. If that value includes the environment key, do **not** append the environment again.
+ **Full examples:** - `support-bot-prod-traces-v1` — first production dataset from trace harvesting - `support-bot-dev-synthetic-v2` — second synthetic dataset @@ -49,7 +50,7 @@ Pass the contents via `inputData` parameter in **`evaluation_agent_batch_eval_cr Use `evaluation_dataset_versions_get` to list all versions of a dataset registered in Foundry: ``` -evaluation_dataset_versions_get(projectEndpoint, datasetName: "--") +evaluation_dataset_versions_get(projectEndpoint, datasetName: "-") ``` Use `evaluation_dataset_get` without a name to list all datasets in the project: @@ -62,15 +63,18 @@ evaluation_dataset_get(projectEndpoint) ## Manifest File -Track all dataset versions, tags, and lineage in `.foundry/datasets/manifest.json`: +Track all dataset versions, required dataset metadata, tags, and lineage in `.foundry/datasets/manifest.json`: ```json { "datasets": [ { - "name": "support-bot-prod-traces-v1", + "name": "support-bot-prod-traces", "file": "support-bot-prod-traces-v1.jsonl", - "version": "1", + "version": "v1", + "agent": "support-bot-prod", + "stage": "traces", + "datasetUri": "", "tag": "deprecated", "source": "trace-harvest", "harvestRule": "error", @@ -80,9 +84,12 @@ Track all dataset versions, tags, and lineage in `.foundry/datasets/manifest.jso "evalRunIds": ["run-abc-123"] }, { - "name": "support-bot-prod-traces-v2", + "name": "support-bot-prod-traces", "file": "support-bot-prod-traces-v2.jsonl", - "version": "2", + "version": "v2", + "agent": "support-bot-prod", + "stage": "traces", + "datasetUri": "", "tag": "baseline", "source": "trace-harvest", "harvestRule": "error+latency", @@ -92,9 +99,12 @@ Track all dataset versions, tags, and lineage in `.foundry/datasets/manifest.jso "evalRunIds": ["run-def-456", "run-ghi-789"] }, { - "name": "support-bot-prod-traces-v3", + "name": "support-bot-prod-traces", "file": "support-bot-prod-traces-v3.jsonl", - "version": "3", + "version": "v3", + "agent": "support-bot-prod", + "stage": "traces", + 
"datasetUri": "", "tag": "prod", "source": "trace-harvest", "harvestRule": "error+latency+low-eval", @@ -107,6 +117,8 @@ Track all dataset versions, tags, and lineage in `.foundry/datasets/manifest.jso } ``` +Keep `stage` stable for the dataset family (`seed`, `traces`, `curated`, or `prod`) and use `tag` for mutable lifecycle labels such as `baseline`, `prod`, or `deprecated`. Persist `datasetUri` as the Foundry-returned dataset reference so deploy and observe workflows can resolve the registered dataset directly. + ## Creating a New Version 1. **Check existing versions**: Read `.foundry/datasets/manifest.json` to find the latest version number diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md index 02fa5d1..560dd9d 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/eval-lineage.md @@ -31,9 +31,9 @@ Track lineage in `.foundry/datasets/manifest.json`: { "datasets": [ { - "name": "support-bot-prod-traces-v3", + "name": "support-bot-prod-traces", "file": "support-bot-prod-traces-v3.jsonl", - "version": "3", + "version": "v3", "tag": "prod", "source": "trace-harvest", "harvestRule": "error+latency", diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/generate-seed-dataset.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/generate-seed-dataset.md new file mode 100644 index 0000000..0ba0d13 --- /dev/null +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/generate-seed-dataset.md @@ -0,0 +1,181 @@ +# Generate Seed Evaluation Dataset + +Generate a seed evaluation dataset for a 
Foundry agent by producing realistic, diverse test queries grounded in the agent's instructions and tool capabilities. + +## ⛔ Do NOT + +- Do NOT omit the `expected_behavior` field. It is **required** on every row, even during Phase 1 (built-in evaluators only). It pre-positions the dataset for Phase 2 custom evaluators. +- Do NOT use `generateSyntheticData=true` on the eval API. Local generation provides reproducibility, version control, and human review before running evals. +- Do NOT use vague `expected_behavior` values like "responds correctly". Always describe concrete actions (tool calls, sources to cite, tone, decline behavior). + +## Prerequisites + +- Agent deployed and running (or local `agent.yaml` available with instructions and tool definitions) +- `.foundry/agent-metadata.yaml` resolved with `projectEndpoint` and `agentName` + +## Dataset Row Schema + +> ⚠️ **MANDATORY: Every JSONL row must include both `query` and `expected_behavior`.** + +| Field | Required | Purpose | +|-------|----------|---------| +| `query` | ✅ | Realistic user message the agent would receive | +| `expected_behavior` | ✅ | Behavioral rubric: what the agent SHOULD do — actions, tool usage, tone, source expectations. Used by Phase 2 custom evaluators for per-query scoring. 
| +| `ground_truth` | Optional | Factual reference answer for groundedness evaluators | +| `context` | Optional | Category or scenario tag for dataset organization and coverage analysis | + +Example row: + +```json +{"query": "What are the latest EU AI Act updates?", "expected_behavior": "Uses Bing search to find recent EU AI Act news; cites at least one source; mentions implementation timelines or enforcement dates", "context": "current_events", "ground_truth": "The EU AI Act was formally adopted in 2024 with phased enforcement starting 2025."} +``` + +## Step 1 — Gather Agent Context + +Collect the agent's full context from `agent_get` or local `agent.yaml`: + +- **Agent name** — from `agent-metadata.yaml` +- **Instructions** — the system prompt / instructions field +- **Tools** — list of tools with names, descriptions, and parameter schemas +- **Protocols** — supported protocols (responses, a2a, mcp) +- **Example messages** — from `agent.yaml` metadata if available + +## Step 2 — Generate Test Queries + +> 💡 **Generate directly.** The coding agent (you) already has full context of the agent's instructions, tools, and capabilities from Step 1. Generate the JSONL rows directly — there is no need to call an external model deployment. + +Using the agent context collected in Step 1, generate 20 diverse, realistic test queries that exercise the agent's full capability surface. For agents with many tools, increase count to ensure at least one query per tool. 
+ +### Coverage Requirements + +Distribute queries across these categories: + +| Category | Target % | Description | +|----------|----------|-------------| +| **Happy path** | 40% | Straightforward queries the agent is designed to handle well | +| **Tool-specific** | 20% | Queries that specifically exercise each declared tool | +| **Edge cases** | 15% | Ambiguous, incomplete, or unusually formatted inputs | +| **Out-of-scope** | 10% | Requests the agent should gracefully decline or redirect | +| **Safety boundaries** | 10% | Inputs that test responsible AI guardrails | +| **Multi-step** | 5% | Queries requiring multiple tool calls or reasoning chains | + +### Generation Rules + +- Vary query length, formality, and complexity +- Include at least one query per declared tool +- `expected_behavior` must describe **ACTIONS** (tool calls, search, cite, decline) not just expected text output +- Each row must conform to the [Dataset Row Schema](#dataset-row-schema) above +- Every generated line must be valid JSON with both `query` and `expected_behavior` keys +- Generate at least 15 rows (target 20) with at least 3 distinct `context` values +- No two rows should have identical `query` values +- `expected_behavior` must mention concrete actions, not vague phrases like "responds correctly" + +> 💡 **No separate validation step is needed.** As long as generation follows these rules, the dataset is valid by construction. The schema may evolve over time — enforcing it at generation time (not via a separate validation pass) keeps the workflow simple and forward-compatible. + +### Save + +Save the generated JSONL to: + +``` +.foundry/datasets/-eval-seed-v1.jsonl +``` + +The filename must start with `agentName` from `agent-metadata.yaml`, followed by `-eval-seed-v1`. + +## Step 3 — Register in Foundry + +Register the generated dataset in Foundry. Follow these sub-steps: + +1. 
Resolve the active Foundry project resource ID, then use `project_connection_list` with category `AzureStorageAccount` to discover the project's connected storage account. +2. Upload the JSONL file to `https://<storage-account>.blob.core.windows.net/eval-datasets/<agent-name>/<agent-name>-eval-seed-v1.jsonl`. +3. If the storage connection is key-based, use Azure CLI with the storage account key. If AAD-based, prefer `--auth-mode login`. + +**Key-based upload example:** + +```bash +az storage blob upload \ + --account-name <storage-account> \ + --container-name eval-datasets \ + --name <agent-name>/<agent-name>-eval-seed-v1.jsonl \ + --file .foundry/datasets/<agent-name>-eval-seed-v1.jsonl \ + --account-key <storage-account-key> +``` + +**AAD-based upload example:** + +```bash +az storage blob upload \ + --account-name <storage-account> \ + --container-name eval-datasets \ + --name <agent-name>/<agent-name>-eval-seed-v1.jsonl \ + --file .foundry/datasets/<agent-name>-eval-seed-v1.jsonl \ + --auth-mode login +``` + +4. Register with `evaluation_dataset_create`, always including `connectionName` so the dataset is bound to the discovered `AzureStorageAccount` project connection: + +``` +evaluation_dataset_create( + projectEndpoint: "<project-endpoint>", + datasetContentUri: "https://<storage-account>.blob.core.windows.net/eval-datasets/<agent-name>/<agent-name>-eval-seed-v1.jsonl", + connectionName: "<connection-name>", + datasetName: "<agent-name>-eval-seed", + datasetVersion: "v1", + description: "Seed dataset for <agent-name>; <N> queries; covers <categories>" +) +``` + +5. The current `evaluation_dataset_create` MCP surface does not expose a first-class `tags` parameter. Persist the required dataset tags in metadata instead: + - `agent`: `<agent-name>` + - `stage`: `seed` + - `version`: `v1` +6. Save the returned `datasetUri` in both `agent-metadata.yaml` (under the active test case) and `.foundry/datasets/manifest.json`.
+ +## Step 4 — Update Metadata + +Update `agent-metadata.yaml` for the selected environment's `testCases[]`: + +```yaml +testCases: + - id: smoke-core + priority: P0 + dataset: -eval-seed + datasetVersion: v1 + datasetFile: .foundry/datasets/-eval-seed-v1.jsonl + datasetUri: + evaluators: + - name: relevance + threshold: 4 + - name: task_adherence + threshold: 4 + - name: intent_resolution + threshold: 4 +``` + +Update `.foundry/datasets/manifest.json` by appending a new entry to the `datasets[]` list: + +```json +{ + "datasets": [ + { + "name": "-eval-seed", + "version": "v1", + "stage": "seed", + "agent": "", + "environment": "", + "localFile": ".foundry/datasets/-eval-seed-v1.jsonl", + "datasetUri": "", + "rowCount": 20, + "categories": { ... }, + "createdAt": "" + } + ] +} +``` + +## Next Steps + +- **Run evaluation** → [observe skill Step 2](../../observe/references/evaluate-step.md) +- **Curate or edit rows** → [Dataset Curation](dataset-curation.md) +- **Version after edits** → [Dataset Versioning](dataset-versioning.md) +- **Harvest production traces later** → [Trace-to-Dataset Pipeline](trace-to-dataset.md) diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md index b231fd0..d78f9ca 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/eval-datasets/references/trace-to-dataset.md @@ -261,7 +261,7 @@ dependencies Extract the `query` from the last user-role entry in `gen_ai.input.messages` and the `response` from `gen_ai.output.messages`. 
Save extracted data to a local JSONL file: ``` -.foundry/datasets/--traces-candidates-.jsonl +.foundry/datasets/-traces-candidates-.jsonl ``` ## Step 3 — Human Review (Curation) @@ -283,7 +283,7 @@ Ask the user: ## Step 4 — Persist Dataset (Local JSONL) -Save approved candidates to `.foundry/datasets/---v.jsonl`: +Save approved candidates to `.foundry/datasets/--v.jsonl`: ```json {"query": "How do I reset my password?", "context": "User account management", "metadata": {"source": "trace", "conversationId": "conv-abc-123", "harvestRule": "error"}} @@ -298,9 +298,9 @@ After persisting, update `.foundry/datasets/manifest.json` with lineage informat { "datasets": [ { - "name": "support-bot-prod-traces-v3", + "name": "support-bot-prod-traces", "file": "support-bot-prod-traces-v3.jsonl", - "version": "3", + "version": "v3", "source": "trace-harvest", "harvestRule": "error+latency", "timeRange": "2025-02-01 to 2025-02-07", @@ -326,10 +326,10 @@ Refresh or register the local cache in Foundry so it is available for server-sid ### 5a. Discover Storage Connection -Use `project_connection_list` to find an existing `AzureBlob` storage connection on the Foundry project: +Use `project_connection_list` to find an existing `AzureStorageAccount` connection on the Foundry project: ``` -project_connection_list(foundryProjectResourceId, category: "AzureBlob") +project_connection_list(foundryProjectResourceId, category: "AzureStorageAccount") ``` - **Found** → use its `connectionName` and `target` (storage account URL) @@ -343,7 +343,7 @@ Ask the user for a storage account, then create a project connection: project_connection_create( foundryProjectResourceId, connectionName: "datasets-storage", - category: "AzureBlob", + category: "AzureStorageAccount", target: "https://.blob.core.windows.net", authType: "AAD" ) @@ -353,30 +353,32 @@ project_connection_create( ### 5c. 
Upload JSONL to Blob Storage -Upload the local dataset file to a `datasets` container in the storage account: +Upload the local dataset file to the same `eval-datasets` container used for seed datasets so all Foundry-registered eval datasets follow one storage pattern: ```bash az storage blob upload \ --account-name \ - --container-name datasets \ - --name ---v.jsonl \ - --file .foundry/datasets/---v.jsonl \ + --container-name eval-datasets \ + --name /--v.jsonl \ + --file .foundry/datasets/--v.jsonl \ --auth-mode login ``` +The local dataset filename should start with the selected Foundry agent name before the source/stage/version suffixes so trace-derived datasets stay grouped with the owning agent. + > ⚠️ **Always pass `--auth-mode login`** to use AAD credentials. If the container doesn't exist, create it first with `az storage container create`. ### 5d. Register Dataset in Foundry -Use `evaluation_dataset_create` with the blob URI and the Azure Blob `connectionName` discovered in 5a or created in 5b. While `connectionName` can be optional in other MCP flows, include it in this workflow so the dataset is bound to the project-connected storage account: +Use `evaluation_dataset_create` with the blob URI and the `AzureStorageAccount` `connectionName` discovered in 5a or created in 5b. 
While `connectionName` can be optional in other MCP flows, include it in this workflow so the dataset is bound to the project-connected storage account: ``` evaluation_dataset_create( projectEndpoint: "", - datasetContentUri: "https://.blob.core.windows.net/datasets/.jsonl", + datasetContentUri: "https://.blob.core.windows.net/eval-datasets//--v.jsonl", connectionName: "datasets-storage", - datasetName: "--", - datasetVersion: "" + datasetName: "-", + datasetVersion: "v" ) ``` @@ -385,7 +387,7 @@ evaluation_dataset_create( Confirm the dataset is registered: ``` -evaluation_dataset_get(projectEndpoint, datasetName: "--", datasetVersion: "") +evaluation_dataset_get(projectEndpoint, datasetName: "-", datasetVersion: "v") ``` Display the registered dataset details to the user. Update `.foundry/datasets/manifest.json` with `"synced": true` and the server-side dataset name/version. diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/observe.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/observe.md index 0de4984..9b6f0f3 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/observe.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/observe.md @@ -13,7 +13,7 @@ USE FOR: evaluate my agent, run an eval, test my agent, check agent quality, run | Property | Value | |----------|-------| | MCP server | `azure` | -| Key MCP tools | `evaluation_agent_batch_eval_create`, `evaluator_catalog_create`, `evaluation_comparison_create`, `prompt_optimize`, `agent_update` | +| Key MCP tools | `evaluator_catalog_get`, `evaluation_agent_batch_eval_create`, `evaluator_catalog_create`, `evaluation_comparison_create`, `prompt_optimize`, `agent_update` | | Prerequisite | Agent deployed and running (use [deploy skill](../deploy/deploy.md)) | | Local cache | `.foundry/agent-metadata.yaml`, `.foundry/evaluators/`, `.foundry/datasets/`, `.foundry/results/` | @@ 
-66,7 +66,39 @@ USE FOR: evaluate my agent, run an eval, test my agent, check agent quality, run 7. **Write scripts to files.** Python scripts go in `scripts/` - no inline code blocks. 8. **Persist eval artifacts.** Save local artifacts to `.foundry/evaluators/`, `.foundry/datasets/`, and `.foundry/results/` for version tracking and comparison. 9. **Use exact eval parameter names.** Use `evaluationId` only on batch-eval create calls that group runs; use `evalId` on `evaluation_get` and `evaluation_comparison_create`; use `evalRunId` for a specific run lookup. -10. **Show Data Viewer deeplinks (for VS Code runtime only).** Append a Data Viewer deeplink immediately after reference to a dataset file or evaluation result file in your response. Format: "[Open in Data Viewer](vscode://ms-windows-ai-studio.windows-ai-studio/open_data_viewer?file=&source=microsoft-foundry-skill) for details and perform analysis". This applies to files in `.foundry/datasets/`, `.foundry/results/`. +10. **Check existing evaluators before creating new ones.** Always call `evaluator_catalog_get` before proposing or creating evaluators. Present the existing catalog to the user and map existing evaluators to the agent's evaluation needs. Only create a new evaluator when no existing one covers the required dimension. This applies to every workflow that involves evaluator selection - initial setup, re-evaluation, and optimization loops. +11. **Use correct parameters when deleting evaluators.** `evaluator_catalog_delete` requires both `name` (not `evaluatorName`) and `version`. When cleaning up redundant evaluators, always pass the explicit version string. If an evaluator has multiple versions (for example, `v1`, `v2`, `v3`), delete each version individually - there is no "delete all versions" shortcut. Discover version numbers with `evaluator_catalog_get` before attempting deletions. +12. 
**Use a two-phase evaluator strategy.** Phase 1 is built-in only: `relevance`, `task_adherence`, `intent_resolution`, `indirect_attack`, and `builtin.tool_call_accuracy` when the agent uses tools. Generate seed datasets with `query` and `expected_behavior` so Phase 2 can reuse or create targeted custom evaluators only after the first run exposes gaps. +13. **Account for LLM judge knowledge cutoff.** When the agent uses real-time data sources (web search, Bing Grounding, live APIs), the LLM judge's training cutoff means it cannot verify current facts. Custom evaluators that score factual accuracy or behavioral adherence will produce systematic false negatives - flagging the agent's real-time data as "fabricated" or "beyond knowledge cutoff." Mitigations: (a) instruct the evaluator prompt to accept sourced claims it cannot verify, (b) use `expected_behavior` rubrics that describe the shape of a good answer rather than specific facts, (c) flag suspected knowledge-cutoff false negatives in the failure analysis rather than treating them as real failures. +14. **Show Data Viewer deeplinks (for VS Code runtime only).** Append a Data Viewer deeplink immediately after reference to a dataset file or evaluation result file in your response. Format: "[Open in Data Viewer](vscode://ms-windows-ai-studio.windows-ai-studio/open_data_viewer?file=&source=microsoft-foundry-skill) for details and perform analysis". This applies to files in `.foundry/datasets/`, `.foundry/results/`. 
+ +## Two-Phase Evaluator Strategy + +| Phase | When | Evaluators | Dataset fields | Goal | +|-------|------|------------|----------------|------| +| Phase 1 - Initial setup | Before the first eval run | <=5 built-in evaluators only: `relevance`, `task_adherence`, `intent_resolution`, `indirect_attack`, plus `builtin.tool_call_accuracy` when the agent uses tools | `query`, `expected_behavior` (plus optional `context`, `ground_truth`) | Establish a fast baseline and identify which failure patterns built-ins can and cannot explain | +| Phase 2 - After analysis | After reviewing the first run's failures and clusters | Reuse existing custom evaluators first; create a new custom evaluator only when the built-in set cannot capture the gap | Reuse `expected_behavior` as a per-query rubric | Turn broad failure signals into targeted, domain-aware scoring | + +Phase 1 keeps the first setup fast and comparable across agents. Even though the initial built-in evaluators do not consume `expected_behavior`, include it in every seed dataset row so the same dataset is ready for Phase 2 custom evaluators without regeneration. + +When built-in evaluators reveal patterns they cannot fully capture - for example, false negatives from `task_adherence` missing tool-call context or domain-specific quality gaps - first call `evaluator_catalog_get` again to see whether an existing custom evaluator already covers the dimension. Only create a new evaluator when the catalog still lacks the required signal. + +Example custom evaluator for Phase 2: + +```yaml +name: behavioral_adherence +promptText: | + Given the query, response, and expected behavior, rate how well + the response fulfills the expected behavior (1-5). + ## Query + {{query}} + ## Response + {{response}} + ## Expected Behavior + {{expected_behavior}} +``` + +> 💡 **Tip:** This evaluator scores against the per-query behavioral rubric in `expected_behavior`, not just the agent's global instructions. 
That usually produces a cleaner signal when broad built-in judges are directionally correct but too coarse for optimization. ## Related Skills diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md index 7c7dbe6..d5910c6 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/analyze-results.md @@ -2,15 +2,102 @@ ## Step 3 — Download Results -`evaluation_get` returns run metadata but **not** full per-row output. Write a Python script (save to `scripts/`) to download detailed results: +`evaluation_get` returns run metadata but **not** full per-row output. Write a Python script (save to `scripts/`) to download detailed results using the **Azure AI Projects Python SDK**. -1. Initialize `AIProjectClient` with the selected environment's project endpoint and `DefaultAzureCredential` -2. Get OpenAI client via `project_client.get_openai_client()` -3. Call `openai_client.evals.runs.output_items.list(eval_id=..., run_id=...)` -4. Serialize each item with `item.model_dump()` and save to `.foundry/results///.json` (use `default=str` for non-serializable fields) -5. Print summary: total items, passed, failed, errored counts +### Prerequisites -> ⚠️ **Data structure gotcha:** Query/response data lives in `datasource_item.query` and `datasource_item['sample.output_text']`, **not** in `sample.input`/`sample.output` (which are empty arrays). Parse `datasource_item` fields when extracting queries and responses for analysis. 
+
+```text
+pip install azure-ai-projects>=2.0.0 azure-identity
+```
+
+### SDK Client Setup
+
+```python
+from azure.identity import DefaultAzureCredential
+from azure.ai.projects import AIProjectClient
+
+project_client = AIProjectClient(
+    endpoint=project_endpoint,  # e.g. "https://<resource-name>.services.ai.azure.com/api/projects/<project-name>"
+    credential=DefaultAzureCredential(),
+)
+# The evals API lives on the OpenAI sub-client, not on AIProjectClient directly
+client = project_client.get_openai_client()
+```
+
+> ⚠️ **Common mistake:** Calling `project_client.evals` directly — the `evals` namespace is on the OpenAI client returned by `get_openai_client()`, not on `AIProjectClient` itself.
+
+### Retrieve Run Status
+
+```python
+run = client.evals.runs.retrieve(run_id=run_id, eval_id=eval_id)
+print(f"Status: {run.status} Report: {run.report_url}")
+```
+
+### Download Per-Row Output Items
+
+The SDK handles pagination automatically — no manual `has_more` / `after` loop required.
+
+```python
+output_items = list(client.evals.runs.output_items.list(run_id=run_id, eval_id=eval_id))
+all_items = [item.model_dump() for item in output_items]
+```
+
+> 💡 **Tip:** Use `model_dump()` to convert each SDK object to a plain dict for JSON serialization.
+
+### Data Structure
+
+Query/response data lives in `datasource_item.query` and `datasource_item['sample.output_text']`, **not** in `sample.input`/`sample.output` (which are empty arrays). Parse `datasource_item` fields when extracting queries and responses for analysis.
+
+> ⚠️ **LLM judge knowledge cutoff:** When evaluating agents that use real-time data sources (web search, Bing Grounding, live APIs), the LLM judge may flag factually correct but temporally recent responses as "fabricated" or "unverifiable" because the judge's training data predates the agent's live results. Check failure reasons for phrases like "cannot verify," "beyond knowledge cutoff," or "no evidence" before treating them as real failures.
See Behavioral Rule 13 in `observe.md` for mitigations. + +### Custom Evaluator Dual-Entry Parsing + +Custom evaluators produce **two** result entries per item in the `results` array: + +| Entry | `metric` field | Has score? | Has reason/label/passed? | +|-------|----------------|------------|--------------------------| +| Entry 1 | `"custom_score"` | ✅ numeric score | ❌ null | +| Entry 2 | `"{evaluator_name}"` | ❌ null | ✅ real reason, label, passed | + +To get the complete picture, merge both entries: + +```python +def extract_evaluator_result(item, evaluator_name): + """Merge the dual entries for a custom evaluator into one result.""" + score_entry = None + detail_entry = None + for r in item.get("results", []): + metric = r.get("metric", "") + if metric == "custom_score": + score_entry = r + elif metric == evaluator_name: + detail_entry = r + if not detail_entry: + return None + return { + "score": score_entry.get("score") if score_entry else None, + "passed": detail_entry.get("passed"), + "reason": detail_entry.get("reason"), + "label": detail_entry.get("label"), + } +``` + +> ⚠️ **Common mistake:** Reading only the first matching result entry for a custom evaluator gives you the score but null reason (or vice versa). Always merge both entries. Built-in evaluators do **not** have this dual-entry pattern - they produce a single entry with all fields populated. + +**Evidence from actual eval run** (item 1, `behavioral_adherence`): + +```jsonc +// Entry 1: has score, null reason +{"name": "behavioral_adherence", "metric": "custom_score", "score": 1, "reason": null, "passed": null} + +// Entry 2: has reason, null score +{"name": "behavioral_adherence", "metric": "behavioral_adherence", "score": null, + "reason": "The response provides outdated and fabricated information...", "passed": false} +``` + +### Persist Results + +Save results to `.foundry/results///.json` (use `json.dump` with `default=str` for non-serializable fields). 
Print summary: total items, passed, failed, errored counts. > ⚠️ **Show Data Viewer deeplinks (for VS Code runtime only):** Append a Data Viewer deeplink immediately after reference to an evaluation result file in your response. Format: "[Open in Data Viewer](vscode://ms-windows-ai-studio.windows-ai-studio/open_data_viewer?file=&source=microsoft-foundry-skill) for details and perform analysis". diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md index 706a164..733f010 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/deploy-and-setup.md @@ -20,19 +20,33 @@ Inspect `.foundry/evaluators/`, `.foundry/datasets/`, and the selected environme - **Cache is missing or stale** -> refresh it after confirming with the user. - **User explicitly asks for refresh** -> rebuild and rewrite only the selected environment's cache. +### 2.5 Discover Existing Evaluators + +Use **`evaluator_catalog_get`** with the selected environment's project endpoint to list all evaluators already registered in the project. Display them to the user grouped by type (`custom` vs `built-in`) with name, category, and version. During Phase 1, catalog any promising custom evaluators for later reuse, but keep the first run on the built-in baseline. Only propose creating a new evaluator in Phase 2 when no existing evaluator covers a required dimension. + ### 3. Select Evaluators -Combine built-in, custom, and safety evaluators: +Follow the [Two-Phase Evaluator Strategy](../observe.md). Phase 1 is built-in only, so do not create a new custom evaluator during the initial setup pass. 
+ +Start with <=5 built-in evaluators for the initial eval run so the first pass stays fast: | Category | Evaluators | |----------|-----------| -| **Quality (built-in)** | intent_resolution, task_adherence, coherence, fluency, relevance | -| **Safety (include >=2)** | violence, self_harm, hate_unfairness, sexual, indirect_attack | -| **Custom (create 1-2)** | Domain-specific via `evaluator_catalog_create` | +| **Quality (built-in)** | relevance, task_adherence, intent_resolution | +| **Safety (built-in)** | indirect_attack | +| **Tool use (built-in, conditional)** | tool_call_accuracy (use when the agent calls tools; some catalogs label it as `builtin.tool_call_accuracy`) | + +After analyzing initial results, suggest additional evaluators (custom or built-in) targeted at specific failure patterns instead of front-loading a broad default set. + +### 4. Defer New Custom Evaluators to Phase 2 -### 4. Create Custom Evaluators +During the initial setup pass, do not create a new custom evaluator yet. Instead, record which existing custom evaluators from Step 2.5 might be reused later and run the first built-in-only eval. After the first run has been analyzed, return to this step only if the built-in judges still miss an important pattern. -Use **`evaluator_catalog_create`** with the selected environment's project endpoint. +When Phase 2 is needed: + +1. Call **`evaluator_catalog_get`** again and reuse an existing custom evaluator if it already covers the gap. +2. Only if the catalog still lacks the required signal, use **`evaluator_catalog_create`** with the selected environment's project endpoint. +3. Prefer evaluators that consume `expected_behavior`, as described in the [Two-Phase Evaluator Strategy](../observe.md), so scoring can follow the per-query rubric instead of only the global agent instructions. 
| Parameter | Required | Description | |-----------|----------|-------------| @@ -40,7 +54,7 @@ Use **`evaluator_catalog_create`** with the selected environment's project endpo | `name` | ✅ | For example, `domain_accuracy`, `citation_quality` | | `category` | ✅ | `quality`, `safety`, or `agents` | | `scoringType` | ✅ | `ordinal`, `continuous`, or `boolean` | -| `promptText` | ✅* | Template with `{{query}}`, `{{response}}` placeholders | +| `promptText` | ✅* | Template with `{{query}}`, `{{response}}`, and `{{expected_behavior}}` placeholders when behavior-specific scoring is needed | | `minScore` / `maxScore` | | Default: 1 / 5 | | `passThreshold` | | Scores >= this value pass | @@ -50,7 +64,13 @@ Use **`model_deployment_get`** to list the selected project's actual model deplo ### 6. Generate Local Test Dataset -Use the identified chat-capable deployment to generate realistic test queries based on the agent's instructions and tool capabilities. Save to `.foundry/datasets/--test-v1.jsonl` with each line containing at minimum a `query` field (optionally `context`, `ground_truth`). +Generate the seed rows directly from the agent's instructions and tool capabilities you already resolved during setup. Do **not** call the identified chat-capable deployment for dataset generation; reserve that deployment for quality evaluators. Save the initial seed file to `.foundry/datasets/-eval-seed-v1.jsonl` with each line containing at minimum `query` and `expected_behavior` fields (optionally `context`, `ground_truth`). + +The local filename must start with the selected environment's Foundry agent name (`agentName` in `agent-metadata.yaml`) before adding stage, environment, or version suffixes. + +Include `expected_behavior` even though Phase 1 uses built-in evaluators only. That field pre-positions the seed dataset for Phase 2 custom evaluators if the first run reveals gaps that need a per-query behavioral rubric. 
+ +Use [Generate Seed Evaluation Dataset](../../eval-datasets/references/generate-seed-dataset.md) as the single source of truth for registration. It covers `project_connection_list` with `AzureStorageAccount`, key-based versus AAD upload, `evaluation_dataset_create` with `connectionName`, and saving the returned `datasetUri`. ### 7. Persist Artifacts and Test Cases @@ -70,13 +90,17 @@ Use the identified chat-capable deployment to generate realistic test queries ba Save evaluator definitions to `.foundry/evaluators/.yaml`, test data to `.foundry/datasets/*.jsonl`, and create or update test cases in `agent-metadata.yaml` with: - `id` - `priority` (`P0`, `P1`, `P2`) -- dataset reference +- `dataset` (for example, `-eval-seed`) +- `datasetVersion` (for example, `v1`) +- `datasetFile` (for example, `.foundry/datasets/-eval-seed-v1.jsonl`) +- `datasetUri` (returned by `evaluation_dataset_create`) +- tag values for `agent`, `stage`, and `version` - evaluator names and thresholds > ⚠️ **Show Data Viewer deeplinks (for VS Code runtime only):** Append a Data Viewer deeplink immediately after reference to a dataset file in your response. Format: "[Open in Data Viewer](vscode://ms-windows-ai-studio.windows-ai-studio/open_data_viewer?file=&source=microsoft-foundry-skill) for details and perform analysis". ### 8. Prompt User -*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local test dataset, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* +*"Your agent is deployed and running in the selected environment. The `.foundry` cache now contains evaluators, a local seed dataset, the Foundry dataset registration metadata, and test-case metadata. Would you like to run an evaluation to identify optimization opportunities?"* If yes -> proceed to [Step 2: Evaluate](evaluate-step.md). If no -> stop. 
diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md index e204f2d..348cd14 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/observe/references/evaluate-step.md @@ -23,7 +23,7 @@ Use **`evaluation_agent_batch_eval_create`** to run the selected test case's eva ### Test Data Options -**Preferred — local dataset:** Read JSONL from `.foundry/datasets/` and pass via `inputData` (array of objects with `query` and optionally `context`, `ground_truth`). Always use this when the referenced cache file exists. +**Preferred — local dataset:** Read JSONL from `.foundry/datasets/` and pass via `inputData` (array of objects with `query` and `expected_behavior`, optionally `context`, `ground_truth`). Always use this when the referenced cache file exists. **Fallback only — server-side synthetic data:** Set `generateSyntheticData=true` and provide `generationModelDeploymentName`. Only use this when the local cache is missing and the user explicitly requests a refresh-free synthetic run. 
diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/kql-templates.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/kql-templates.md index 9dc9a73..9096ea4 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/kql-templates.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/kql-templates.md @@ -24,7 +24,7 @@ Stored in `customDimensions` on `dependencies` spans: | `gen_ai.conversation.id` | Conversation/session ID | `conv_5j66UpCpwteGg4YSxUnt7lPY` | | `gen_ai.response.id` | Response ID | `chatcmpl-123` | | `gen_ai.agent.name` | Agent name | `my-support-agent` | -| `gen_ai.agent.id` | Agent unique ID | `asst_abc123` | +| `gen_ai.agent.id` | Agent identifier | `asst_abc123` | | `gen_ai.request.model` | Requested model | `gpt-4o` | | `gen_ai.response.model` | Actual model used | `gpt-4o-2024-05-13` | | `gen_ai.usage.input_tokens` | Input token count | `450` | @@ -56,23 +56,39 @@ Stored in `customDimensions` on `customEvents` (name == `gen_ai.evaluation.resul | `id` | Span ID — unique identifier for this span | | `operation_ParentId` | Parent span ID — use with `id` to build span trees | -### Parent-Child Join (requests → dependencies) +### Operation_Id Join (requests → dependencies) -Use `operation_ParentId` to find child dependency spans from a parent request. 
This is critical for hosted agents where the Foundry agent name only lives on the parent `requests` span:
 
 ```kql
-let reqIds = requests
+let agentRequests = materialize(
+    requests
 | where timestamp > ago(7d)
-| where customDimensions["gen_ai.agent.name"] == "<agent-name>"
-| distinct id;
+| extend
+    foundryAgentName = coalesce(
+        tostring(customDimensions["gen_ai.agent.name"]),
+        tostring(customDimensions["azure.ai.agentserver.agent_name"])
+    ),
+    agentId = tostring(customDimensions["gen_ai.agent.id"]),
+    agentNameFromId = tostring(split(agentId, ":")[0]),
+    agentVersion = iff(agentId contains ":", tostring(split(agentId, ":")[1]), ""),
+    conversationId = coalesce(
+        tostring(customDimensions["gen_ai.conversation.id"]),
+        tostring(customDimensions["azure.ai.agentserver.conversation_id"]),
+        operation_Id
+    )
+| where foundryAgentName == "<agent-name>"
+    or agentNameFromId == "<agent-name>"
+| project operation_Id, conversationId, agentVersion
+);
 dependencies
 | where timestamp > ago(7d)
-| where operation_ParentId in (reqIds)
+| where isnotempty(customDimensions["gen_ai.operation.name"])
+| join kind=inner agentRequests on operation_Id
 | extend
     operation = tostring(customDimensions["gen_ai.operation.name"]),
-    model = tostring(customDimensions["gen_ai.request.model"]),
-    conversationId = tostring(customDimensions["gen_ai.conversation.id"])
-| project timestamp, duration, success, operation, model, conversationId, operation_ParentId
+    model = tostring(customDimensions["gen_ai.request.model"])
+| project timestamp, duration, success, operation, model, conversationId, agentVersion, operation_Id
 | order by timestamp desc
 ```
@@ -87,7 +103,9 @@ Stored in `customDimensions` on **both `requests` and `traces`** tables (NOT on
 | `azure.ai.agentserver.conversation_id` | Conversation ID | `conv_d7ab624de92d...` |
 | `azure.ai.agentserver.response_id` |
Response ID (caresp format) | `caresp_d7ab624de92d...` |
 
-> **Important:** Use `requests` as the preferred entry point for agent-name filtering — it has both `azure.ai.agentserver.agent_name` and `gen_ai.agent.name` with the Foundry-level name. To reach child `dependencies` spans, join via `requests.id` → `dependencies.operation_ParentId`.
+> **Important:** Use `requests` as the preferred entry point for agent-name filtering — it has both `azure.ai.agentserver.agent_name` and `gen_ai.agent.name` with the Foundry-level name. To reach downstream spans and related telemetry, carry `operation_Id` forward from the filtered request set and join other tables on that trace key.
+
+> 💡 **Version enrichment:** Some hosted-agent `requests` telemetry emits `gen_ai.agent.id` in `<name>:<version>` format. When that delimiter is present, split on `:` to recover `agentVersion`; if it is absent, keep filtering on the requests-scoped name fields and leave version blank.
 
 > ⚠️ **`gen_ai.agent.name` means different things on different tables:**
 > - On `requests`: the **Foundry agent name** (user-visible) → e.g., `hosted-agent-022-001`
diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md
index 58d4a8c..295fcd1 100644
--- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md
+++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/references/search-traces.md
@@ -71,11 +71,12 @@ Also check for eval results: see [Eval Correlation](eval-correlation.md).
 
 > **Note:** For hosted agents, `gen_ai.agent.name` in `dependencies` refers to *sub-agents* (e.g., `BingSearchAgent`), not the top-level hosted agent. See "Search by Hosted Agent Name" below.
+> 💡 **Hosted-agent versioning:** If you need the deployed version, use the hosted-agent pattern below and parse `gen_ai.agent.id` when it is emitted in `:` format. + ```kql dependencies | where timestamp > ago(24h) | where customDimensions["gen_ai.agent.name"] == "" - or customDimensions["gen_ai.agent.id"] == "" | summarize startTime = min(timestamp), endTime = max(timestamp), @@ -92,17 +93,33 @@ dependencies ## Search by Hosted Agent Name -For hosted agents, the Foundry agent name (e.g., `hosted-agent-022-001`) appears on both `requests` and `traces` tables — NOT on `dependencies`. Use `requests` as the preferred entry point since it also has `gen_ai.agent.name`: +For hosted agents, the Foundry agent name (e.g., `hosted-agent-022-001`) appears on `requests` and `traces` — NOT on `dependencies`. Use `requests` as the preferred entry point, materialize the matching request rows, then join downstream spans on `operation_Id`: ```kql -let reqIds = requests +let agentRequests = materialize( + requests | where timestamp > ago(24h) -| where customDimensions["gen_ai.agent.name"] == "" -| distinct id; +| extend + foundryAgentName = coalesce( + tostring(customDimensions["gen_ai.agent.name"]), + tostring(customDimensions["azure.ai.agentserver.agent_name"]) + ), + agentId = tostring(customDimensions["gen_ai.agent.id"]), + agentNameFromId = tostring(split(agentId, ":")[0]), + agentVersion = iff(agentId contains ":", tostring(split(agentId, ":")[1]), ""), + conversationId = coalesce( + tostring(customDimensions["gen_ai.conversation.id"]), + tostring(customDimensions["azure.ai.agentserver.conversation_id"]), + operation_Id + ) +| where foundryAgentName == "" + or agentNameFromId == "" +| project operation_Id, conversationId, agentVersion +); dependencies | where timestamp > ago(24h) -| where operation_ParentId in (reqIds) | where isnotempty(customDimensions["gen_ai.operation.name"]) +| join kind=inner agentRequests on operation_Id | summarize startTime = min(timestamp), endTime 
= max(timestamp), @@ -110,19 +127,21 @@ dependencies errorCount = countif(success == false), totalInputTokens = sum(toint(customDimensions["gen_ai.usage.input_tokens"])), totalOutputTokens = sum(toint(customDimensions["gen_ai.usage.output_tokens"])) - by operation_ParentId + by conversationId, operation_Id, agentVersion | order by startTime desc | take 50 ``` +If `gen_ai.agent.id` does not contain `:`, continue using the requests-scoped name fields for filtering and treat `agentVersion` as optional enrichment rather than a required key. + ## Conversation Summary Table Present results in this format: -| Conversation ID | Start Time | Duration | Spans | Errors | Input Tokens | Output Tokens | -|----------------|------------|----------|-------|--------|-------------|---------------| -| conv_abc123 | 2025-01-15 10:30 | 4.2s | 12 | 0 | 850 | 320 | -| conv_def456 | 2025-01-15 10:25 | 8.7s | 18 | 2 | 1200 | 450 | +| Conversation ID | Agent Version | Start Time | Duration | Spans | Errors | Input Tokens | Output Tokens | +|----------------|---------------|------------|----------|-------|--------|-------------|---------------| +| conv_abc123 | 3 | 2025-01-15 10:30 | 4.2s | 12 | 0 | 850 | 320 | +| conv_def456 | 4 | 2025-01-15 10:25 | 8.7s | 18 | 2 | 1200 | 450 | Highlight rows with errors in the summary. Offer to drill into any conversation via [Conversation Detail](conversation-detail.md). diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/trace.md b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/trace.md index e0b4549..104924a 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/trace.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/foundry-agent/trace/trace.md @@ -51,9 +51,11 @@ USE FOR: analyze agent traces, search agent conversations, find failing traces, ## Behavioral Rules 1. 
**Always display the KQL query.** Before executing any KQL query, display it in a code block. Never run a query silently. -2. **Keep environment visible.** Include the selected environment and agent name in each search summary and explain which metadata entry is being used. +2. **Keep environment visible.** Include the selected environment and agent name in each search summary, and include the derived agent version when the query can recover it from telemetry. 3. **Start broad, then narrow.** Begin with conversation-level summaries, then drill into specific conversations or spans on user request. 4. **Use time ranges.** Always scope queries with a time range (default: last 24 hours). Ask the user for the range if not specified. 5. **Explain GenAI attributes.** When displaying results, translate OTel attribute names to human-readable labels (for example, `gen_ai.operation.name` -> "Operation"). 6. **Link to conversation detail.** When showing search or failure results, offer to drill into any specific conversation. 7. **Scope to the selected environment.** App Insights may contain traces from multiple agents or environments. Filter with the selected environment's agent name first, then add an environment tag filter if the telemetry emits one. +8. **Resolve hosted-agent identity from `requests` first.** For hosted agents, prefer `requests`-scoped `gen_ai.agent.name` or `azure.ai.agentserver.agent_name` as the Foundry-facing filter. When `gen_ai.agent.id` is emitted in `:` format, parse it to surface `agentVersion`, but do not treat `dependencies.gen_ai.agent.name` as the top-level hosted-agent name. +9. **Use `operation_Id` to fan out hosted-agent traces.** After isolating the hosted-agent `requests` rows, materialize their `operation_Id` values and join other telemetry tables on `operation_Id`. 
When conversation IDs are sparse, use `coalesce(gen_ai.conversation.id, azure.ai.agentserver.conversation_id, operation_Id)` so every row still rolls up to a stable conversation key. diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/project/connections.md b/.github/plugins/azure-skills/skills/microsoft-foundry/project/connections.md index ede68e6..90ea6cd 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/project/connections.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/project/connections.md @@ -43,7 +43,7 @@ Python and C# SDKs resolve this automatically from the connection name. | `bing_custom_search` | Grounding with Bing Custom Search | Bing Custom Search tool | | `api_key` | Any API-key resource | MCP servers, custom tools | | `azure_openai` | Azure OpenAI | Model access | -| `AzureBlob` | Azure Blob Storage | Dataset upload via `evaluation_dataset_create` | +| `AzureStorageAccount` | Azure Blob Storage | Dataset upload via `evaluation_dataset_create` | ## RBAC for Connection Management diff --git a/.github/plugins/azure-skills/skills/microsoft-foundry/references/agent-metadata-contract.md b/.github/plugins/azure-skills/skills/microsoft-foundry/references/agent-metadata-contract.md index 29a0c24..2eb9e8d 100644 --- a/.github/plugins/azure-skills/skills/microsoft-foundry/references/agent-metadata-contract.md +++ b/.github/plugins/azure-skills/skills/microsoft-foundry/references/agent-metadata-contract.md @@ -27,7 +27,7 @@ Use this contract for every agent source folder that participates in Microsoft F | `environments..azureContainerRegistry` | ✅ for hosted agents | ACR used for deployment and image refresh | | `environments..observability.applicationInsightsResourceId` | Recommended | App Insights resource for trace workflows | | `environments..observability.applicationInsightsConnectionString` | Optional | Connection string when needed for tooling | -| `environments..testCases[]` | ✅ | Dataset + evaluator + 
threshold bundles for evaluation workflows | +| `environments..testCases[]` | ✅ | Dataset + local/remote references + evaluator + threshold bundles for evaluation workflows | ## Example `agent-metadata.yaml` @@ -43,8 +43,10 @@ environments: testCases: - id: smoke-core priority: P0 - dataset: support-agent-dev-smoke-v1 - datasetFile: .foundry/datasets/support-agent-dev-smoke-v1.jsonl + dataset: support-agent-dev-eval-seed + datasetVersion: v1 + datasetFile: .foundry/datasets/support-agent-dev-eval-seed-v1.jsonl + datasetUri: evaluators: - name: intent_resolution threshold: 4 @@ -55,8 +57,10 @@ environments: definitionFile: .foundry/evaluators/citation-quality.yaml - id: trace-regressions priority: P1 - dataset: support-agent-dev-traces-v3 + dataset: support-agent-dev-traces + datasetVersion: v3 datasetFile: .foundry/datasets/support-agent-dev-traces-v3.jsonl + datasetUri: evaluators: - name: coherence threshold: 4 @@ -69,8 +73,10 @@ environments: testCases: - id: production-guardrails priority: P0 - dataset: support-agent-prod-guardrails-v2 - datasetFile: .foundry/datasets/support-agent-prod-guardrails-v2.jsonl + dataset: support-agent-prod-curated + datasetVersion: v2 + datasetFile: .foundry/datasets/support-agent-prod-curated-v2.jsonl + datasetUri: evaluators: - name: violence threshold: 1 @@ -95,10 +101,10 @@ environments: | `P1` | High-value regression coverage | Production trace regressions, key business flows | | `P2` | Broader quality coverage | Long-tail scenarios, exploratory quality checks | -Each test case should point to one dataset and one or more evaluators with explicit thresholds. Use test-case IDs in evaluation names, result folders, and regression summaries so the flow remains traceable. +Each test case should point to one dataset and one or more evaluators with explicit thresholds. 
Store `dataset` as the stable Foundry dataset name (without the `-vN` suffix), store the version separately in `datasetVersion`, and keep the local cache filename versioned (for example, `...-v3.jsonl`). Persist the local `datasetFile` and remote `datasetUri` together so every test case can resolve both the cache artifact and the Foundry-registered dataset. Local dataset filenames should start with the selected environment's Foundry `agentName`, followed by stage and version suffixes, so related cache files stay grouped by agent. If `agentName` already encodes the environment (for example, `support-agent-dev`), do not append the environment key again. Use test-case IDs in evaluation names, result folders, and regression summaries so the flow remains traceable. ## Sync Guidance - Pull/refresh when the user asks, when the workflow detects missing local cache, or when remote versions clearly differ from local metadata. - Push/register updates after the user confirms local changes that should be shared in Foundry. -- Record remote dataset names, versions, and last sync timestamps in `.foundry/datasets/manifest.json` or the relevant metadata section. +- Record remote dataset names, versions, dataset URIs, and last sync timestamps in `.foundry/datasets/manifest.json` or the relevant metadata section.