From 98cd374f8150a8ac03f331a08d39c97754952a8d Mon Sep 17 00:00:00 2001 From: yangk Date: Fri, 12 Jun 2026 14:59:37 +0800 Subject: [PATCH] fix(builder): add timeouts to mirror pull client and probe real manifest path A mirror that passed the /v2/ liveness probe (docker.xuanyuan.me) accepted connections but never answered manifest requests for some images, and the mirror RegistryHost client had no timeouts while the containerd pull path has no outer context deadline -- builds hung indefinitely. Add dial/TLS/response-header timeouts so a stalling mirror costs at most ~30s before the resolver falls through to the next host. Probe a real manifest path (200 or 401 counts as alive) so ping-only frontends are exercised end to end. --- builder/mirror/prober.go | 26 ++++++++++++++++++++------ builder/mirror/prober_test.go | 33 +++++++++++++++++++++++++++------ builder/sources/mirror_hosts.go | 12 +++++++++++- 3 files changed, 58 insertions(+), 13 deletions(-) diff --git a/builder/mirror/prober.go b/builder/mirror/prober.go index 2756444b3..b46653823 100644 --- a/builder/mirror/prober.go +++ b/builder/mirror/prober.go @@ -35,11 +35,19 @@ type probeResult struct { latency time.Duration } -// Probe checks every candidate's /v2/ registry endpoint concurrently and -// returns only the alive ones, sorted by ascending latency. A mirror is alive -// when /v2/ answers 200 or 401 (an auth challenge still proves a working -// registry frontend). Candidates keep their scheme; entries without one are -// probed via https. +// probeManifestPath is a real, tiny docker.io manifest. Probing it instead of +// the bare /v2/ ping exercises the mirror's actual proxy path, so a frontend +// that answers pings but stalls on manifests fails the probe timeout. +const probeManifestPath = "/v2/library/alpine/manifests/latest" + +// Probe fetches a real manifest from every candidate concurrently and returns +// only the alive ones, sorted by ascending latency. 
200 and 401 both count as +// alive: token-auth mirrors (e.g. daocloud) answer 401 to anonymous manifest +// requests, and the containerd/BuildKit token flow handles that during real +// pulls. The probe cannot catch every stall (a mirror may serve alpine fine +// and hang on another image) — the pull-side client timeout is the hard +// safety net; the probe only filters and orders. Candidates keep their +// scheme; entries without one are probed via https. func Probe(ctx context.Context, candidates []string, timeout time.Duration) []string { if len(candidates) == 0 { return nil @@ -76,12 +84,18 @@ func probeOne(ctx context.Context, client *http.Client, mirrorURL string) (time. if !strings.HasPrefix(endpoint, "http://") && !strings.HasPrefix(endpoint, "https://") { endpoint = "https://" + endpoint } - endpoint = strings.TrimSuffix(endpoint, "/") + "/v2/" + endpoint = strings.TrimSuffix(endpoint, "/") + probeManifestPath req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) if err != nil { logrus.Debugf("probe mirror %s: build request failure: %v", mirrorURL, err) return 0, false } + req.Header.Set("Accept", strings.Join([]string{ + "application/vnd.docker.distribution.manifest.v2+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.oci.image.index.v1+json", + }, ", ")) start := time.Now() resp, err := client.Do(req) if err != nil { diff --git a/builder/mirror/prober_test.go b/builder/mirror/prober_test.go index 2ab9e38c9..b9506dcf8 100644 --- a/builder/mirror/prober_test.go +++ b/builder/mirror/prober_test.go @@ -11,12 +11,33 @@ import ( // capability_id: rainbond.builder.dynamic-mirror-probe func TestProbeFiltersAndSortsByLatency(t *testing.T) { slowOK := newRegistryStub(t, http.StatusOK, 300*time.Millisecond) - fastUnauthorized := newRegistryStub(t, http.StatusUnauthorized, 0) + fastOK := newRegistryStub(t, http.StatusOK, 0) dead := newRegistryStub(t, 
http.StatusInternalServerError, 0) - got := Probe(context.Background(), []string{slowOK, fastUnauthorized, dead}, 2*time.Second) + got := Probe(context.Background(), []string{slowOK, fastOK, dead}, 2*time.Second) - assertStringSlice(t, got, []string{fastUnauthorized, slowOK}) + assertStringSlice(t, got, []string{fastOK, slowOK}) +} + +// token 认证类 mirror(如 daocloud)对匿名 manifest 请求回 401,但真实拉取时 +// resolver 会走 token 流程,因此 401 必须判活。 +func TestProbeTokenAuthMirrorIsAlive(t *testing.T) { + authChallenge := newRegistryStub(t, http.StatusUnauthorized, 0) + got := Probe(context.Background(), []string{authChallenge}, time.Second) + assertStringSlice(t, got, []string{authChallenge}) +} + +// manifest 路径挂起不响应的“假活”源要在探活超时内排除 +// (docker.xuanyuan.me 卡死构建事故的场景之一)。 +func TestProbeManifestStallTreatedAsDead(t *testing.T) { + stalled := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(2 * time.Second) // longer than probe timeout + })) + t.Cleanup(stalled.Close) + got := Probe(context.Background(), []string{stalled.URL}, 500*time.Millisecond) + if len(got) != 0 { + t.Fatalf("stalled mirror must be treated as dead, got %v", got) + } } func TestProbeUnreachableHostDropped(t *testing.T) { @@ -32,12 +53,12 @@ func TestProbeEmptyInput(t *testing.T) { } } -// newRegistryStub serves /v2/ with the given status after an artificial delay -// and returns the server base URL. +// newRegistryStub serves the probe manifest path with the given status after +// an artificial delay and returns the server base URL. 
func newRegistryStub(t *testing.T, status int, delay time.Duration) string { t.Helper() srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/v2/" { + if r.URL.Path != probeManifestPath { w.WriteHeader(http.StatusNotFound) return } diff --git a/builder/sources/mirror_hosts.go b/builder/sources/mirror_hosts.go index 801f094fb..781befd9c 100644 --- a/builder/sources/mirror_hosts.go +++ b/builder/sources/mirror_hosts.go @@ -20,8 +20,10 @@ package sources import ( "crypto/tls" + "net" "net/http" "strings" + "time" refdocker "github.com/containerd/containerd/reference/docker" "github.com/containerd/containerd/remotes/docker" @@ -60,8 +62,16 @@ func mirrorRegistryHost(mirrorURL string) docker.RegistryHost { } host = strings.TrimSuffix(host, "/") return docker.RegistryHost{ + // 必须带超时:探活通过的 mirror 仍可能在取 manifest/blob 时挂起不响应, + // containerd 这条 pull 路径外层没有 ctx 超时,没有这里的超时会把构建卡死。 Client: &http.Client{ - Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}, + Transport: &http.Transport{ + DialContext: (&net.Dialer{Timeout: 10 * time.Second}).DialContext, + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + TLSHandshakeTimeout: 10 * time.Second, + ResponseHeaderTimeout: 30 * time.Second, + IdleConnTimeout: 90 * time.Second, + }, }, Authorizer: docker.NewDockerAuthorizer(), Host: host,