diff --git a/.github/workflows/go-check.yml b/.github/workflows/go-check.yml index 9ce6260d..4b40112a 100644 --- a/.github/workflows/go-check.yml +++ b/.github/workflows/go-check.yml @@ -3,7 +3,7 @@ name: Go Checks on: pull_request: push: - branches: ["master"] + branches: ["master", "dev"] workflow_dispatch: permissions: @@ -14,5 +14,38 @@ concurrency: cancel-in-progress: true jobs: - go-check: - uses: pl-strflt/uci/.github/workflows/go-check.yml@v0.0 + check: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: "1.23" + + - name: Check formatting + run: | + if [ -n "$(gofmt -l .)" ]; then + echo "Code is not formatted. Run 'gofmt -w .'" + gofmt -d . + exit 1 + fi + + - name: Run go vet + run: go vet ./... + + - name: Run staticcheck + uses: dominikh/staticcheck-action@v1 + with: + version: "latest" + + - name: Check go mod tidy + run: | + go mod tidy + if [ -n "$(git status --porcelain go.mod go.sum)" ]; then + echo "go.mod or go.sum is not tidy. Run 'go mod tidy'" + git diff go.mod go.sum + exit 1 + fi diff --git a/.github/workflows/go-test.yml b/.github/workflows/go-test.yml index d4ca588c..c8aa8d30 100644 --- a/.github/workflows/go-test.yml +++ b/.github/workflows/go-test.yml @@ -3,7 +3,7 @@ name: Go Test on: pull_request: push: - branches: ["master"] + branches: ["master", "dev"] workflow_dispatch: permissions: @@ -14,5 +14,57 @@ concurrency: cancel-in-progress: true jobs: - go-test: - uses: pl-strflt/uci/.github/workflows/go-test.yml@v0.0 + # Full test suite on Ubuntu where mDNS multicast works + test: + strategy: + matrix: + go: ["1.22", "1.23"] + fail-fast: false + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Download dependencies + run: go mod download + + - name: Run tests + run: go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... + + - name: Upload coverage + if: matrix.go == '1.23' + uses: codecov/codecov-action@v4 + with: + files: ./coverage.txt + fail_ci_if_error: false + + # Build verification on other platforms (mDNS multicast doesn't work in CI) + build: + strategy: + matrix: + os: [macos-latest, windows-latest] + go: ["1.22", "1.23"] + fail-fast: false + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: ${{ matrix.go }} + + - name: Download dependencies + run: go mod download + + - name: Build + run: go build ./... + + - name: Run unit tests (no network) + run: go test -v -race -run "TestQuickShutdown|TestCPUSpinOnConnectionClose" ./... diff --git a/server.go b/server.go index 71b7bf8c..07e30fab 100644 --- a/server.go +++ b/server.go @@ -274,7 +274,13 @@ func (s *Server) recv4(c *ipv4.PacketConn) { var ifIndex int n, cm, from, err := c.ReadFrom(buf) if err != nil { - continue + // Backoff to prevent CPU spin on persistent errors + select { + case <-s.shouldShutdown: + return + case <-time.After(50 * time.Millisecond): + continue + } } if cm != nil { ifIndex = cm.IfIndex @@ -299,7 +305,13 @@ func (s *Server) recv6(c *ipv6.PacketConn) { var ifIndex int n, cm, from, err := c.ReadFrom(buf) if err != nil { - continue + // Backoff to prevent CPU spin on persistent errors + select { + case <-s.shouldShutdown: + return + case <-time.After(50 * time.Millisecond): + continue + } } if cm != nil { ifIndex = cm.IfIndex diff --git a/service_test.go b/service_test.go index d2f39b4f..176107ec 100644 --- a/service_test.go +++ b/service_test.go @@ -3,6 +3,7 @@ package zeroconf import ( "context" "log" + "runtime" "testing" "time" ) @@ -181,7 +182,8 @@ func TestSubtype(t *testing.T) { initialQueryInterval = 100 * time.Millisecond cleanupFreq = 100 * time.Millisecond - ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + // Use longer timeout for CI environments where timing can be inconsistent + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() startMDNS(t, mdnsPort, mdnsName, mdnsSubtype, mdnsDomain) @@ -202,3 +204,72 @@ func TestSubtype(t *testing.T) { } }) } + +// TestCPUSpinOnConnectionClose demonstrates the CPU spinning bug that occurs +// when connections are closed externally (simulating network interface changes). +// This reproduces the issue seen on Windows after hours of running. +func TestCPUSpinOnConnectionClose(t *testing.T) { + server, err := Register(mdnsName, mdnsService, mdnsDomain, mdnsPort, + []string{"txtv=0", "lo=1", "la=2"}, nil) + if err != nil { + t.Fatal(err) + } + + // Let the server start and stabilize + time.Sleep(100 * time.Millisecond) + + // Measure baseline goroutine count + baselineGoroutines := runtime.NumGoroutine() + t.Logf("Baseline goroutines: %d", baselineGoroutines) + + // Get baseline CPU stats + var baselineStats runtime.MemStats + runtime.ReadMemStats(&baselineStats) + baselineMallocs := baselineStats.Mallocs + + // Close connections WITHOUT triggering shutdown + // This simulates what happens when network interfaces change on Windows + if server.ipv4conn != nil { + server.ipv4conn.Close() + } + if server.ipv6conn != nil { + server.ipv6conn.Close() + } + + t.Log("Connections closed - recv loops should now be spinning on errors") + + // Wait and measure - if spinning, we'll see high allocation rate + // because the tight loop keeps running + time.Sleep(500 * time.Millisecond) + + var afterStats runtime.MemStats + runtime.ReadMemStats(&afterStats) + allocsDuring := afterStats.Mallocs - baselineMallocs + + t.Logf("Allocations during 500ms after connection close: %d", allocsDuring) + t.Logf("Current goroutines: %d", runtime.NumGoroutine()) + + // A spinning loop will have many more allocations than a properly blocked one + // This threshold is somewhat arbitrary but a blocked recv should have near-zero + // while a spinning one will have thousands + if allocsDuring > 10000 { + t.Errorf("DETECTED CPU SPIN: %d allocations in 500ms indicates tight loop", allocsDuring) + t.Log("This confirms the bug: recv loops spin when ReadFrom returns errors") + } else { + t.Logf("Allocation count (%d) suggests recv loops may be handling errors correctly", allocsDuring) + } + + // Clean up - this should work even with closed connections + done := make(chan struct{}) + go func() { + defer close(done) + server.Shutdown() + }() + + select { + case <-done: + t.Log("Shutdown completed successfully") + case <-time.After(2 * time.Second): + t.Error("Shutdown timed out - recv loops may be stuck") + } +}