diff --git a/eng/pipelines/scripts/Get-Test-Logs.ps1 b/eng/pipelines/scripts/Get-Test-Logs.ps1
index d840a71ca2ae..d4afe231a76a 100644
--- a/eng/pipelines/scripts/Get-Test-Logs.ps1
+++ b/eng/pipelines/scripts/Get-Test-Logs.ps1
@@ -1,13 +1,19 @@
 <#
 .SYNOPSIS
-Captures any test.log files in the build directory and moves them to a staging directory for artifact publishing.
+Captures any test.log files, JVM crash logs, surefire dumpstream files, and jstack dumps in the build directory
+and moves them to a staging directory for artifact publishing.
 
 .DESCRIPTION
-This script is used to capture any test.log files in the build directory and move them to a staging directory for
-artifact publishing. It also sets a pipeline variable to indicate whether any test.log files were found.
+This script is used to capture diagnostic files from the build directory and move them to a staging directory for
+artifact publishing. It also sets a pipeline variable to indicate whether any diagnostic files were found.
+Collected files include:
+  - *test.log (test logs)
+  - hs_err_pid*.log (JVM crash reports)
+  - *.dumpstream (Surefire forked JVM crash/corruption reports)
+  - jstack-dumps.log (periodic jstack thread dumps from the Java process monitor)
 
 .PARAMETER StagingDirectory
-The directory where the test.log files will be moved to.
+The directory where the diagnostic files will be moved to.
 
 .PARAMETER TestLogsArtifactName
 The name of the artifact to be created.
@@ -22,11 +28,21 @@ param(
 )
 
 $testLogs = Get-ChildItem -Path . -Recurse -Filter *test.log -File -Depth 4
+$jvmCrashLogs = Get-ChildItem -Path . -Recurse -Filter hs_err_pid*.log -File -Depth 6
+$dumpstreamFiles = Get-ChildItem -Path . -Recurse -Filter *.dumpstream -File -Depth 6
+$jstackDumps = Get-ChildItem -Path "$StagingDirectory/troubleshooting" -Filter jstack-dumps.log -File -ErrorAction SilentlyContinue
 
-if ($testLogs.Count -gt 0) {
+$allFiles = @()
+if ($testLogs) { $allFiles += $testLogs }
+if ($jvmCrashLogs) { $allFiles += $jvmCrashLogs }
+if ($dumpstreamFiles) { $allFiles += $dumpstreamFiles }
+if ($jstackDumps) { $allFiles += $jstackDumps }
+
+if ($allFiles.Count -gt 0) {
     if (-not (Test-Path "$StagingDirectory/troubleshooting")) {
         New-Item -ItemType Directory -Path "$StagingDirectory/troubleshooting" | Out-Null
     }
     Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true"
-    Compress-Archive -Path $testLogs -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip"
+    Write-Host "Found $($testLogs.Count) test log(s), $($jvmCrashLogs.Count) JVM crash log(s), $($dumpstreamFiles.Count) dumpstream file(s), $($jstackDumps.Count) jstack dump(s)"
+    Compress-Archive -Path $allFiles -DestinationPath "$StagingDirectory/troubleshooting/$TestLogsArtifactName.zip"
 }
diff --git a/eng/pipelines/scripts/Monitor-Java-Processes.ps1 b/eng/pipelines/scripts/Monitor-Java-Processes.ps1
new file mode 100644
index 000000000000..283db7b9f678
--- /dev/null
+++ b/eng/pipelines/scripts/Monitor-Java-Processes.ps1
@@ -0,0 +1,109 @@
+<#
+.SYNOPSIS
+Monitors Java processes by taking periodic jstack thread dumps.
+
+.DESCRIPTION
+This script runs in the background, periodically capturing thread dumps of all running Java processes.
+It uses both 'ps' (to reliably find Java processes on Linux) and 'jstack' (for thread dumps).
+It writes the output to a log file in the troubleshooting directory. This is useful for diagnosing CI pipeline
+hangs caused by deadlocked or stuck Java processes.
+
+.PARAMETER StagingDirectory
+The directory where jstack dump files will be written.
+
+.PARAMETER IntervalSeconds
+The interval in seconds between captures. Default is 120 (2 minutes).
+
+.PARAMETER DurationMinutes
+The maximum duration in minutes to run the monitor. Default is 55 minutes.
+#>
+
+param(
+    [Parameter(Mandatory = $true)]
+    [string]$StagingDirectory,
+
+    [Parameter(Mandatory = $false)]
+    [int]$IntervalSeconds = 120,
+
+    [Parameter(Mandatory = $false)]
+    [int]$DurationMinutes = 55
+)
+
+$troubleshootingDir = "$StagingDirectory/troubleshooting"
+if (-not (Test-Path $troubleshootingDir)) {
+    New-Item -ItemType Directory -Path $troubleshootingDir | Out-Null
+}
+
+$outputFile = "$troubleshootingDir/jstack-dumps.log"
+$endTime = (Get-Date).AddMinutes($DurationMinutes)
+
+Add-Content -Path $outputFile -Value "Monitor started at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
+Add-Content -Path $outputFile -Value "JAVA_HOME=$($env:JAVA_HOME)"
+
+while ((Get-Date) -lt $endTime) {
+    Start-Sleep -Seconds $IntervalSeconds
+
+    $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
+    Add-Content -Path $outputFile -Value "`n========== Snapshot at $timestamp =========="
+
+    # Use 'ps' to find Java processes (more reliable than jps on CI agents)
+    try {
+        if ($IsLinux -or $IsMacOS) {
+            $psOutput = bash -c "ps aux | grep '[j]ava'" 2>&1
+        } else {
+            $psOutput = Get-Process -Name java -ErrorAction SilentlyContinue | Format-Table Id, CPU, WorkingSet64, CommandLine -AutoSize | Out-String
+        }
+        Add-Content -Path $outputFile -Value "`n--- Java processes (ps) ---"
+        if ($psOutput) {
+            Add-Content -Path $outputFile -Value $psOutput
+        } else {
+            Add-Content -Path $outputFile -Value "(no Java processes found)"
+        }
+    } catch {
+        Add-Content -Path $outputFile -Value "Error listing processes: $_"
+    }
+
+    # Also try jps for comparison
+    $javaHome = $env:JAVA_HOME
+    $jpsPath = if ($javaHome) { "$javaHome/bin/jps" } else { "jps" }
+    $jstackPath = if ($javaHome) { "$javaHome/bin/jstack" } else { "jstack" }
+
+    try {
+        $jpsOutput = & $jpsPath -l 2>&1
+        Add-Content -Path $outputFile -Value "`n--- Java processes (jps -l) ---"
+        Add-Content -Path $outputFile -Value $jpsOutput
+    } catch {
+        Add-Content -Path $outputFile -Value "Error running jps: $_"
+    }
+
+    # Extract PIDs from ps output and take jstack dumps
+    if ($IsLinux -or $IsMacOS) {
+        try {
+            # The backtick keeps PowerShell from expanding $1 inside the double-quoted string, so awk
+            # receives the literal field reference; a backslash is not an escape character in PowerShell.
+            $javaPids = bash -c "ps -eo pid,comm | grep '[j]ava' | awk '{print `$1}'" 2>&1
+            if ($javaPids) {
+                # The loop variable must not be named $pid: $PID is a read-only automatic variable, and
+                # assigning to it throws. Non-numeric lines (e.g. merged stderr text) are skipped.
+                foreach ($javaPid in ($javaPids -split "`n" | Where-Object { "$_".Trim() -match '^\d+$' })) {
+                    $javaPid = "$javaPid".Trim()
+                    Add-Content -Path $outputFile -Value "`n--- jstack for PID $javaPid ---"
+                    try {
+                        $stackTrace = & $jstackPath $javaPid 2>&1
+                        Add-Content -Path $outputFile -Value $stackTrace
+                    } catch {
+                        Add-Content -Path $outputFile -Value "Failed to get jstack for PID $javaPid : $_"
+                    }
+                }
+            }
+        } catch {
+            Add-Content -Path $outputFile -Value "Error extracting PIDs: $_"
+        }
+    }
+}
+
+Add-Content -Path $outputFile -Value "`nMonitor finished at $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
+# Mark that we have troubleshooting artifacts
+if (Test-Path $outputFile) {
+    Write-Host "##vso[task.setvariable variable=HAS_TROUBLESHOOTING]true"
+}
diff --git a/sdk/parents/azure-client-sdk-parent/pom.xml b/sdk/parents/azure-client-sdk-parent/pom.xml
index fb194a4906c6..c80d6d63c1b9 100644
--- a/sdk/parents/azure-client-sdk-parent/pom.xml
+++ b/sdk/parents/azure-client-sdk-parent/pom.xml
@@ -894,6 +894,7 @@
 debug
 1
+ 1800
 false
 ${defaultSurefireArgLine}
@@ -944,6 +945,7 @@
 debug
 1
+ 1800
 false
 ${defaultFailsafeArgLine}
diff --git a/sdk/spring/ci.yml b/sdk/spring/ci.yml
index b91fedb2a87a..b76556b50240 100644
--- a/sdk/spring/ci.yml
+++ b/sdk/spring/ci.yml
@@ -254,6 +254,17 @@ extends:
   template: ../../eng/pipelines/templates/stages/archetype-sdk-client.yml
   parameters:
     ServiceDirectory: spring
+    PreBuildSteps:
+      - bash: |
+          nohup pwsh -File "$(Build.SourcesDirectory)/eng/pipelines/scripts/Monitor-Java-Processes.ps1" \
+            -StagingDirectory "$(System.DefaultWorkingDirectory)" \
+            -IntervalSeconds 180 \
+            -DurationMinutes 55 \
+            > /dev/null 2>&1 &
+          echo "Java process monitor started in background (PID: $!)"
+        displayName: 'Start Java process monitor (background)'
+        continueOnError: true
+        condition: always()
     Artifacts:
       - name: azure-spring-data-cosmos
         groupId: com.azure
diff --git a/sdk/spring/pom.xml b/sdk/spring/pom.xml
index d85cf0f56bc8..be007e1921d0 100644
--- a/sdk/spring/pom.xml
+++ b/sdk/spring/pom.xml
@@ -136,6 +136,7 @@
 azure-spring-data-cosmos
+ monitor