From 12008d2e44ef48be60a5a40e89de20d479460d06 Mon Sep 17 00:00:00 2001 From: Andreas Linde Date: Sun, 28 Dec 2025 14:44:54 +0100 Subject: [PATCH 1/3] Add test to reproduce CPU spin bug on connection errors When connections are closed externally (e.g., network interface changes on Windows), the recv4/recv6 loops spin at 100% CPU because ReadFrom returns errors immediately and the code continues without backoff. This test simulates the condition by closing connections without going through the normal Shutdown path, then measures allocation rate to detect the spin (31M+ allocations in 500ms confirms the bug). --- service_test.go | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/service_test.go b/service_test.go index d2f39b4f..bf98a776 100644 --- a/service_test.go +++ b/service_test.go @@ -3,6 +3,7 @@ package zeroconf import ( "context" "log" + "runtime" "testing" "time" ) @@ -202,3 +203,72 @@ func TestSubtype(t *testing.T) { } }) } + +// TestCPUSpinOnConnectionClose demonstrates the CPU spinning bug that occurs +// when connections are closed externally (simulating network interface changes). +// This reproduces the issue seen on Windows after hours of running. +func TestCPUSpinOnConnectionClose(t *testing.T) { + server, err := Register(mdnsName, mdnsService, mdnsDomain, mdnsPort, + []string{"txtv=0", "lo=1", "la=2"}, nil) + if err != nil { + t.Fatal(err) + } + + // Let the server start and stabilize + time.Sleep(100 * time.Millisecond) + + // Measure baseline goroutine count + baselineGoroutines := runtime.NumGoroutine() + t.Logf("Baseline goroutines: %d", baselineGoroutines) + + // Get baseline CPU stats + var baselineStats runtime.MemStats + runtime.ReadMemStats(&baselineStats) + baselineMallocs := baselineStats.Mallocs + + // Close connections WITHOUT triggering shutdown + // This simulates what happens when network interfaces change on Windows + if server.ipv4conn != nil { + server.ipv4conn.Close() + } + if server.ipv6conn != nil { + server.ipv6conn.Close() + } + + t.Log("Connections closed - recv loops should now be spinning on errors") + + // Wait and measure - if spinning, we'll see high allocation rate + // because the tight loop keeps running + time.Sleep(500 * time.Millisecond) + + var afterStats runtime.MemStats + runtime.ReadMemStats(&afterStats) + allocsDuring := afterStats.Mallocs - baselineMallocs + + t.Logf("Allocations during 500ms after connection close: %d", allocsDuring) + t.Logf("Current goroutines: %d", runtime.NumGoroutine()) + + // A spinning loop will have many more allocations than a properly blocked one + // This threshold is somewhat arbitrary but a blocked recv should have near-zero + // while a spinning one will have thousands + if allocsDuring > 10000 { + t.Errorf("DETECTED CPU SPIN: %d allocations in 500ms indicates tight loop", allocsDuring) + t.Log("This confirms the bug: recv loops spin when ReadFrom returns errors") + } else { + t.Logf("Allocation count (%d) suggests recv loops may be handling errors correctly", allocsDuring) + } + + // Clean up - this should work even with closed connections + done := make(chan struct{}) + go func() { + defer close(done) + server.Shutdown() + }() + + select { + case <-done: + t.Log("Shutdown completed successfully") + case <-time.After(2 * time.Second): + t.Error("Shutdown timed out - recv loops may be stuck") + } +} From 347175df450451a15cab4e6ec8ebc33d5ee7b48d Mon Sep 17 00:00:00 2001 From: Andreas Linde Date: Sun, 28 Dec 2025 14:47:57 +0100 Subject: [PATCH 2/3] Fix CPU spin in recv4/recv6 when ReadFrom returns errors When connections are closed externally (e.g., network interface changes on Windows), ReadFrom returns errors immediately. The previous code used a bare 'continue' which caused a tight loop spinning at 100% CPU. Add 50ms backoff on errors with shutdown check to prevent CPU exhaustion while still allowing clean shutdown during the backoff. Fixes issue where Windows systems would consume all CPU after hours of running when network conditions change. --- server.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/server.go b/server.go index 71b7bf8c..07e30fab 100644 --- a/server.go +++ b/server.go @@ -274,7 +274,13 @@ func (s *Server) recv4(c *ipv4.PacketConn) { var ifIndex int n, cm, from, err := c.ReadFrom(buf) if err != nil { - continue + // Backoff to prevent CPU spin on persistent errors + select { + case <-s.shouldShutdown: + return + case <-time.After(50 * time.Millisecond): + continue + } } if cm != nil { ifIndex = cm.IfIndex @@ -299,7 +305,13 @@ func (s *Server) recv6(c *ipv6.PacketConn) { var ifIndex int n, cm, from, err := c.ReadFrom(buf) if err != nil { - continue + // Backoff to prevent CPU spin on persistent errors + select { + case <-s.shouldShutdown: + return + case <-time.After(50 * time.Millisecond): + continue + } } if cm != nil { ifIndex = cm.IfIndex From 9297361a4eea3b1d3e0381afc964ea3101c86c3d Mon Sep 17 00:00:00 2001 From: Andreas Linde Date: Sun, 28 Dec 2025 19:30:46 +0100 Subject: [PATCH 3/3] fix(ci): Replace unavailable pl-strflt/uci reusable workflows The pl-strflt/uci reusable workflows are no longer accessible. Replace with self-contained workflow definitions: - go-test.yml: Full test suite on Ubuntu (mDNS multicast works), build + unit tests only on macOS/Windows (multicast blocked in CI) - go-check.yml: Run gofmt, go vet, staticcheck, and go mod tidy checks Also: - Add dev branch to push triggers - Increase TTL test timeout from 5s to 10s for CI timing tolerance --- .github/workflows/go-check.yml | 39 +++++++++++++++++++++-- .github/workflows/go-test.yml | 58 ++++++++++++++++++++++++++++++++-- service_test.go | 3 +- 3 files changed, 93 insertions(+), 7 deletions(-) diff --git a/.github/workflows/go-check.yml b/.github/workflows/go-check.yml index 9ce6260d..4b40112a 100644 --- a/.github/workflows/go-check.yml +++ b/.github/workflows/go-check.yml @@ -3,7 +3,7 @@ name: Go Checks on: pull_request: push: - branches: ["master"] + branches: ["master", "dev"] workflow_dispatch: permissions: @@ -14,5 +14,38 @@ concurrency: cancel-in-progress: true jobs: - go-check: - uses: pl-strflt/uci/.github/workflows/go-check.yml@v0.0 + check: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - name: Check formatting + run: | + if [ -n "$(gofmt -l .)" ]; then + echo "Code is not formatted. Run 'gofmt -w .'" + gofmt -d . + exit 1 + fi + + - name: Run go vet + run: go vet ./... + + - name: Run staticcheck + uses: dominikh/staticcheck-action@v1 + with: + version: "latest" + + - name: Check go mod tidy + run: | + go mod tidy + if [ -n "$(git status --porcelain go.mod go.sum)" ]; then + echo "go.mod or go.sum is not tidy. Run 'go mod tidy'" + git diff go.mod go.sum + exit 1 + fi diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml index d4ca588c..c8aa8d30 100644 --- a/.github/workflows/go-test.yml +++ b/.github/workflows/go-test.yml @@ -3,7 +3,7 @@ name: Go Test on: pull_request: push: - branches: ["master"] + branches: ["master", "dev"] workflow_dispatch: permissions: @@ -14,5 +14,57 @@ concurrency: cancel-in-progress: true jobs: - go-test: - uses: pl-strflt/uci/.github/workflows/go-test.yml@v0.0 + # Full test suite on Ubuntu where mDNS multicast works + test: + strategy: + matrix: + go: ["1.22", "1.23"] + fail-fast: false + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Download dependencies + run: go mod download + + - name: Run tests + run: go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... + + - name: Upload coverage + if: matrix.go == '1.23' + uses: codecov/codecov-action@v4 + with: + files: ./coverage.txt + fail_ci_if_error: false + + # Build verification on other platforms (mDNS multicast doesn't work in CI) + build: + strategy: + matrix: + os: [macos-latest, windows-latest] + go: ["1.22", "1.23"] + fail-fast: false + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Download dependencies + run: go mod download + + - name: Build + run: go build ./... + + - name: Run unit tests (no network) + run: go test -v -race -run "TestQuickShutdown|TestCPUSpinOnConnectionClose" ./... diff --git a/service_test.go b/service_test.go index bf98a776..176107ec 100644 --- a/service_test.go +++ b/service_test.go @@ -182,7 +182,8 @@ func TestSubtype(t *testing.T) { initialQueryInterval = 100 * time.Millisecond cleanupFreq = 100 * time.Millisecond - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + // Use longer timeout for CI environments where timing can be inconsistent + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() startMDNS(t, mdnsPort, mdnsName, mdnsSubtype, mdnsDomain)