diff --git a/builder/mirror/prober.go b/builder/mirror/prober.go index 2756444b3..b46653823 100644 --- a/builder/mirror/prober.go +++ b/builder/mirror/prober.go @@ -35,11 +35,19 @@ type probeResult struct { latency time.Duration } -// Probe checks every candidate's /v2/ registry endpoint concurrently and -// returns only the alive ones, sorted by ascending latency. A mirror is alive -// when /v2/ answers 200 or 401 (an auth challenge still proves a working -// registry frontend). Candidates keep their scheme; entries without one are -// probed via https. +// probeManifestPath is a real, tiny docker.io manifest. Probing it instead of +// the bare /v2/ ping exercises the mirror's actual proxy path, so a frontend +// that answers pings but stalls on manifests fails the probe timeout. +const probeManifestPath = "/v2/library/alpine/manifests/latest" + +// Probe fetches a real manifest from every candidate concurrently and returns +// only the alive ones, sorted by ascending latency. 200 and 401 both count as +// alive: token-auth mirrors (e.g. daocloud) answer 401 to anonymous manifest +// requests, and the containerd/BuildKit token flow handles that during real +// pulls. The probe cannot catch every stall (a mirror may serve alpine fine +// and hang on another image) — the pull-side client timeout is the hard +// safety net; the probe only filters and orders. Candidates keep their +// scheme; entries without one are probed via https. func Probe(ctx context.Context, candidates []string, timeout time.Duration) []string { if len(candidates) == 0 { return nil @@ -76,12 +84,18 @@ func probeOne(ctx context.Context, client *http.Client, mirrorURL string) (time. if !strings.HasPrefix(endpoint, "http://") && !strings.HasPrefix(endpoint, "https://") { endpoint = "https://" + endpoint } - endpoint = strings.TrimSuffix(endpoint, "/") + "/v2/" + endpoint = strings.TrimSuffix(endpoint, "/") + probeManifestPath req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil) if err != nil { logrus.Debugf("probe mirror %s: build request failure: %v", mirrorURL, err) return 0, false } + req.Header.Set("Accept", strings.Join([]string{ + "application/vnd.docker.distribution.manifest.v2+json", + "application/vnd.docker.distribution.manifest.list.v2+json", + "application/vnd.oci.image.manifest.v1+json", + "application/vnd.oci.image.index.v1+json", + }, ", ")) start := time.Now() resp, err := client.Do(req) if err != nil { diff --git a/builder/mirror/prober_test.go b/builder/mirror/prober_test.go index 2ab9e38c9..b9506dcf8 100644 --- a/builder/mirror/prober_test.go +++ b/builder/mirror/prober_test.go @@ -11,12 +11,33 @@ import ( // capability_id: rainbond.builder.dynamic-mirror-probe func TestProbeFiltersAndSortsByLatency(t *testing.T) { slowOK := newRegistryStub(t, http.StatusOK, 300*time.Millisecond) - fastUnauthorized := newRegistryStub(t, http.StatusUnauthorized, 0) + fastOK := newRegistryStub(t, http.StatusOK, 0) dead := newRegistryStub(t, http.StatusInternalServerError, 0) - got := Probe(context.Background(), []string{slowOK, fastUnauthorized, dead}, 2*time.Second) + got := Probe(context.Background(), []string{slowOK, fastOK, dead}, 2*time.Second) - assertStringSlice(t, got, []string{fastUnauthorized, slowOK}) + assertStringSlice(t, got, []string{fastOK, slowOK}) +} + +// token 认证类 mirror(如 daocloud)对匿名 manifest 请求回 401,但真实拉取时 +// resolver 会走 token 流程,因此 401 必须判活。 +func TestProbeTokenAuthMirrorIsAlive(t *testing.T) { + authChallenge := newRegistryStub(t, http.StatusUnauthorized, 0) + got := Probe(context.Background(), []string{authChallenge}, time.Second) + assertStringSlice(t, got, []string{authChallenge}) +} + +// manifest 路径挂起不响应的“假活”源要在探活超时内排除 +// (docker.xuanyuan.me 卡死构建事故的场景之一)。 +func TestProbeManifestStallTreatedAsDead(t *testing.T) { + stalled := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + time.Sleep(2 * time.Second) // longer than probe timeout + })) + t.Cleanup(stalled.Close) + got := Probe(context.Background(), []string{stalled.URL}, 500*time.Millisecond) + if len(got) != 0 { + t.Fatalf("stalled mirror must be treated as dead, got %v", got) + } } func TestProbeUnreachableHostDropped(t *testing.T) { @@ -32,12 +53,12 @@ func TestProbeEmptyInput(t *testing.T) { } } -// newRegistryStub serves /v2/ with the given status after an artificial delay -// and returns the server base URL. +// newRegistryStub serves the probe manifest path with the given status after +// an artificial delay and returns the server base URL. func newRegistryStub(t *testing.T, status int, delay time.Duration) string { t.Helper() srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - if r.URL.Path != "/v2/" { + if r.URL.Path != probeManifestPath { w.WriteHeader(http.StatusNotFound) return } diff --git a/builder/sources/mirror_hosts.go b/builder/sources/mirror_hosts.go index 801f094fb..781befd9c 100644 --- a/builder/sources/mirror_hosts.go +++ b/builder/sources/mirror_hosts.go @@ -20,8 +20,10 @@ package sources import ( "crypto/tls" + "net" "net/http" "strings" + "time" refdocker "github.com/containerd/containerd/reference/docker" "github.com/containerd/containerd/remotes/docker" @@ -60,8 +62,16 @@ func mirrorRegistryHost(mirrorURL string) docker.RegistryHost { } host = strings.TrimSuffix(host, "/") return docker.RegistryHost{ + // 必须带超时:探活通过的 mirror 仍可能在取 manifest/blob 时挂起不响应, + // containerd 这条 pull 路径外层没有 ctx 超时,没有这里的超时会把构建卡死。 Client: &http.Client{ - Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}, + Transport: &http.Transport{ + DialContext: (&net.Dialer{Timeout: 10 * time.Second}).DialContext, + TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, + TLSHandshakeTimeout: 10 * time.Second, + ResponseHeaderTimeout: 30 * time.Second, + IdleConnTimeout: 90 * time.Second, + }, }, Authorizer: docker.NewDockerAuthorizer(), Host: host,