From ade92b918e3883a1edeee619666b43eae94cf483 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 23 Jun 2026 07:38:41 +0000 Subject: [PATCH 01/24] bundle/fuzz: add create-payload parity fuzz test for terraform vs direct Implements the first technique from DECO-25361: generate random job configs and check for differences in the create payload between the terraform and direct deploy engines. Both engines run the same `bundle deploy` pipeline in-process (via testcli) against a testserver, differing only in DATABRICKS_BUNDLE_ENGINE, and the POST /api/2.2/jobs/create body each sends is captured and diffed. Because only the engine differs, shared mutators cancel out and any remaining diff is a genuine engine divergence. The fuzzer already surfaced two real (benign) divergences, documented in DefaultIgnorePaths: - num_workers: 0 is sent explicitly by terraform but dropped by direct (omitempty). - the terraform provider strips the deprecated spark conf "spark.databricks.delta.preview.enabled"; direct forwards it. Run with: go test ./bundle/fuzz -run TestJobCreateParity (FUZZ_SEEDS overrides the seed count; auto-skips when terraform is not provisioned via acceptance/install_terraform.py). 
--- bundle/fuzz/capture.go | 59 +++++ bundle/fuzz/capture_deploy.go | 145 ++++++++++++ bundle/fuzz/capture_deploy_test.go | 35 +++ bundle/fuzz/compare.go | 204 +++++++++++++++++ bundle/fuzz/compare_test.go | 95 ++++++++ bundle/fuzz/fuzz_test.go | 68 ++++++ bundle/fuzz/generate.go | 349 +++++++++++++++++++++++++++++ bundle/fuzz/generate_test.go | 47 ++++ bundle/fuzz/rand.go | 47 ++++ 9 files changed, 1049 insertions(+) create mode 100644 bundle/fuzz/capture.go create mode 100644 bundle/fuzz/capture_deploy.go create mode 100644 bundle/fuzz/capture_deploy_test.go create mode 100644 bundle/fuzz/compare.go create mode 100644 bundle/fuzz/compare_test.go create mode 100644 bundle/fuzz/fuzz_test.go create mode 100644 bundle/fuzz/generate.go create mode 100644 bundle/fuzz/generate_test.go create mode 100644 bundle/fuzz/rand.go diff --git a/bundle/fuzz/capture.go b/bundle/fuzz/capture.go new file mode 100644 index 0000000000..330f485f82 --- /dev/null +++ b/bundle/fuzz/capture.go @@ -0,0 +1,59 @@ +package fuzz + +import ( + "encoding/json" + "sync" + + "github.com/databricks/cli/libs/testserver" +) + +// jobsCreatePath is the Jobs API route both engines must hit on create. The +// direct engine posts here via the SDK; the terraform provider is expected to +// post here too, and a mismatch (e.g. a different API version) is itself a +// divergence worth surfacing. +const jobsCreatePath = "/api/2.2/jobs/create" + +// CapturedRequest is a single mutating API request observed by the testserver. +type CapturedRequest struct { + Method string + Path string + Body json.RawMessage +} + +// recorder collects request bodies sent to a testserver. It is safe for +// concurrent use because the SDK and terraform may issue requests from multiple +// goroutines. 
+type recorder struct { + mu sync.Mutex + requests []CapturedRequest +} + +func (r *recorder) callback(req *testserver.Request) { + r.mu.Lock() + defer r.mu.Unlock() + + var body json.RawMessage + if json.Valid(req.Body) { + // Copy: testserver reuses the underlying buffer across requests. + body = append(json.RawMessage(nil), req.Body...) + } + + r.requests = append(r.requests, CapturedRequest{ + Method: req.Method, + Path: req.URL.Path, + Body: body, + }) +} + +// find returns the body of the first recorded request matching method and path. +func (r *recorder) find(method, path string) (json.RawMessage, bool) { + r.mu.Lock() + defer r.mu.Unlock() + + for _, req := range r.requests { + if req.Method == method && req.Path == path { + return req.Body, true + } + } + return nil, false +} diff --git a/bundle/fuzz/capture_deploy.go b/bundle/fuzz/capture_deploy.go new file mode 100644 index 0000000000..6f06487bf3 --- /dev/null +++ b/bundle/fuzz/capture_deploy.go @@ -0,0 +1,145 @@ +package fuzz + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "testing" + + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/internal/testcli" + "github.com/databricks/cli/libs/testserver" +) + +const ( + // bundleResourceKey is the map key the generated job is registered under. + bundleResourceKey = "fuzz_job" + fakeToken = "testtoken" +) + +// CaptureJobCreate deploys a bundle containing job through the given engine +// ("direct" or "terraform") and returns the create request body sent to the +// Jobs API. +// +// Both engines run the full `bundle deploy` pipeline against an in-process +// testserver, so the only difference between two captures with different engines +// is the engine itself. That is what makes the resulting payloads directly +// comparable: shared mutators (deployment metadata, presets, ...) are applied +// identically on both sides and cancel out in the diff. 
+//
+// The terraform engine additionally requires DATABRICKS_TF_EXEC_PATH and
+// DATABRICKS_TF_CLI_CONFIG_FILE to point at a provisioned terraform binary and
+// provider mirror; see RequireTerraform.
+//
+// An error is returned when the bundle cannot be written, when the deploy
+// fails, when no create request was observed, or when the observed create
+// request carried no JSON body (the recorder stores a nil body for those).
+func CaptureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, engine string) (json.RawMessage, error) {
+	rec := &recorder{}
+	server := testserver.New(t)
+	server.RequestCallback = rec.callback
+	testserver.AddDefaultHandlers(server)
+
+	dir := t.TempDir()
+	if err := writeJobBundle(dir, server.URL, job); err != nil {
+		return nil, err
+	}
+
+	t.Setenv("DATABRICKS_HOST", server.URL)
+	t.Setenv("DATABRICKS_TOKEN", fakeToken)
+	t.Setenv("DATABRICKS_BUNDLE_ENGINE", engine)
+	t.Chdir(dir)
+
+	stdout, stderr, err := testcli.NewRunner(t, ctx, "bundle", "deploy").Run()
+	if err != nil {
+		return nil, fmt.Errorf("bundle deploy (engine=%s) failed: %w\nstdout:\n%s\nstderr:\n%s",
+			engine, err, stdout.String(), stderr.String())
+	}
+
+	body, ok := rec.find("POST", jobsCreatePath)
+	if !ok {
+		return nil, fmt.Errorf("engine=%s did not POST %s during deploy", engine, jobsCreatePath)
+	}
+	// The recorder keeps a nil body for requests whose payload is not valid
+	// JSON. Surfacing that here prevents two empty captures from comparing as
+	// "equal" and passing the parity check vacuously.
+	if len(body) == 0 {
+		return nil, fmt.Errorf("engine=%s sent an empty or non-JSON body to POST %s", engine, jobsCreatePath)
+	}
+	return body, nil
+}
+
+// CompareJobEngines deploys job under both engines and returns the create-payload
+// differences that are not covered by DefaultIgnorePaths. An empty result means
+// the engines produced equivalent create payloads.
+func CompareJobEngines(ctx context.Context, t *testing.T, job *resources.Job) ([]Difference, error) {
+	direct, err := CaptureJobCreate(ctx, t, job, "direct")
+	if err != nil {
+		return nil, fmt.Errorf("capturing direct payload: %w", err)
+	}
+	terraform, err := CaptureJobCreate(ctx, t, job, "terraform")
+	if err != nil {
+		return nil, fmt.Errorf("capturing terraform payload: %w", err)
+	}
+	return DiffPayloads(direct, terraform, DefaultIgnorePaths)
+}
+
+// writeJobBundle writes a minimal databricks.yml describing a single job. The
+// document is emitted as JSON, which is valid YAML, so we can reuse the job's
+// own JSON marshaling (which honors ForceSendFields) without a YAML dependency.
+func writeJobBundle(dir, host string, job *resources.Job) error {
+	jobJSON, err := json.Marshal(job)
+	if err != nil {
+		return fmt.Errorf("marshaling job: %w", err)
+	}
+
+	// Round-trip through a generic map so the job can be nested under
+	// resources.jobs.<key> in the bundle document below.
+	var jobMap map[string]any
+	if err := json.Unmarshal(jobJSON, &jobMap); err != nil {
+		return fmt.Errorf("unmarshaling job: %w", err)
+	}
+
+	doc := map[string]any{
+		"bundle":    map[string]any{"name": "fuzz"},
+		"workspace": map[string]any{"host": host},
+		"resources": map[string]any{
+			"jobs": map[string]any{bundleResourceKey: jobMap},
+		},
+	}
+
+	data, err := json.MarshalIndent(doc, "", " ")
+	if err != nil {
+		return fmt.Errorf("marshaling bundle: %w", err)
+	}
+
+	return os.WriteFile(filepath.Join(dir, "databricks.yml"), data, 0o600)
+}
+
+// RequireTerraform points the terraform engine at the binary and provider mirror
+// provisioned by acceptance/install_terraform.py into <repo>/build, and skips the
+// test when the terraform binary is absent so the suite still runs where
+// terraform is not set up.
+func RequireTerraform(t testing.TB) {
+	buildDir := filepath.Join(repoRoot(t), "build")
+	execPath := filepath.Join(buildDir, "terraform")
+	cfgFile := filepath.Join(buildDir, ".terraformrc")
+
+	if _, err := os.Stat(execPath); err != nil {
+		t.Skipf("terraform not provisioned (%s); run: python3 acceptance/install_terraform.py --targetdir build", execPath)
+	}
+
+	t.Setenv("DATABRICKS_TF_EXEC_PATH", execPath)
+	t.Setenv("DATABRICKS_TF_CLI_CONFIG_FILE", cfgFile)
+	t.Setenv("TF_CLI_CONFIG_FILE", cfgFile)
+	// Terraform phones home to checkpoint-api.hashicorp.com otherwise; disable it
+	// so the testserver/network isn't hit. See acceptance_test.go.
+	t.Setenv("CHECKPOINT_DISABLE", "1")
+}
+
+// repoRoot returns the repository root by walking up from the current directory.
+func repoRoot(t testing.TB) string { + dir, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %s", err) + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + return dir + } + parent := filepath.Dir(dir) + if parent == dir { + t.Fatal("could not locate repo root (go.mod not found)") + } + dir = parent + } +} diff --git a/bundle/fuzz/capture_deploy_test.go b/bundle/fuzz/capture_deploy_test.go new file mode 100644 index 0000000000..2518265d75 --- /dev/null +++ b/bundle/fuzz/capture_deploy_test.go @@ -0,0 +1,35 @@ +package fuzz + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCaptureJobCreateDirect(t *testing.T) { + job := GenerateJob(newRNG(1)) + + body, err := CaptureJobCreate(t.Context(), t, job, "direct") + require.NoError(t, err) + require.NotEmpty(t, body) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + assert.Equal(t, job.Name, payload["name"]) + assert.Contains(t, payload, "tasks") +} + +func TestCaptureJobCreateTerraform(t *testing.T) { + RequireTerraform(t) + job := GenerateJob(newRNG(1)) + + body, err := CaptureJobCreate(t.Context(), t, job, "terraform") + require.NoError(t, err) + require.NotEmpty(t, body) + + var payload map[string]any + require.NoError(t, json.Unmarshal(body, &payload)) + assert.Equal(t, job.Name, payload["name"]) +} diff --git a/bundle/fuzz/compare.go b/bundle/fuzz/compare.go new file mode 100644 index 0000000000..48b7d3e648 --- /dev/null +++ b/bundle/fuzz/compare.go @@ -0,0 +1,204 @@ +package fuzz + +import ( + "bytes" + "encoding/json" + "fmt" + "regexp" + "slices" + "strconv" + "strings" +) + +// Difference is a single mismatch between the two engines' create payloads, +// located by a JSON-ish path (e.g. "tasks[0].new_cluster.num_workers"). 
+type Difference struct {
+	Path      string
+	Direct    any
+	Terraform any
+}
+
+// String renders the difference as "<path>: direct=<value> terraform=<value>",
+// with each value formatted by render.
+func (d Difference) String() string {
+	return fmt.Sprintf("%s: direct=%s terraform=%s", d.Path, render(d.Direct), render(d.Terraform))
+}
+
+// missing marks a value that is absent on one side.
+type missing struct{}
+
+// render formats one side of a Difference for display. An absent value is shown
+// as the literal "<missing>" so it cannot be confused with a present value;
+// anything else is shown as its JSON encoding, falling back to %v formatting if
+// it cannot be marshaled.
+func render(v any) string {
+	if _, ok := v.(missing); ok {
+		return "<missing>"
+	}
+	b, err := json.Marshal(v)
+	if err != nil {
+		return fmt.Sprintf("%v", v)
+	}
+	return string(b)
+}
+
+// DiffPayloads decodes both create payloads and returns every difference whose
+// path is not explicitly ignored. ignorePaths are matched exactly against the
+// rendered path, with "[*]" standing in for any slice index.
+func DiffPayloads(direct, terraform json.RawMessage, ignorePaths []string) ([]Difference, error) {
+	d, err := decode(direct)
+	if err != nil {
+		return nil, fmt.Errorf("decoding direct payload: %w", err)
+	}
+	tf, err := decode(terraform)
+	if err != nil {
+		return nil, fmt.Errorf("decoding terraform payload: %w", err)
+	}
+
+	var diffs []Difference
+	diffValue("", d, tf, &diffs)
+
+	ignore := make(map[string]bool, len(ignorePaths))
+	for _, p := range ignorePaths {
+		ignore[p] = true
+	}
+
+	// Filter in place: filtered shares diffs' backing array, which is safe
+	// because diffs is not used again after this loop.
+	filtered := diffs[:0]
+	for _, diff := range diffs {
+		if !ignore[normalizePath(diff.Path)] {
+			filtered = append(filtered, diff)
+		}
+	}
+	return filtered, nil
+}
+
+// decode unmarshals JSON using UseNumber so large int64 values (e.g. job ids,
+// spark_context_id) are not corrupted by float64 rounding. See the encoding rule
+// in the repo style guide.
+func decode(raw json.RawMessage) (any, error) { + if len(raw) == 0 { + return nil, nil + } + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + var v any + if err := dec.Decode(&v); err != nil { + return nil, err + } + return v, nil +} + +func diffValue(path string, a, b any, diffs *[]Difference) { + switch av := a.(type) { + case map[string]any: + bv, ok := b.(map[string]any) + if !ok { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + return + } + keys := unionKeys(av, bv) + for _, k := range keys { + achild, aok := av[k] + bchild, bok := bv[k] + child := joinKey(path, k) + switch { + case aok && bok: + diffValue(child, achild, bchild, diffs) + case aok: + *diffs = append(*diffs, Difference{Path: child, Direct: achild, Terraform: missing{}}) + default: + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bchild}) + } + } + case []any: + bv, ok := b.([]any) + if !ok { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + return + } + n := max(len(av), len(bv)) + for i := range n { + child := fmt.Sprintf("%s[%d]", path, i) + switch { + case i < len(av) && i < len(bv): + diffValue(child, av[i], bv[i], diffs) + case i < len(av): + *diffs = append(*diffs, Difference{Path: child, Direct: av[i], Terraform: missing{}}) + default: + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bv[i]}) + } + } + default: + if !scalarEqual(a, b) { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + } + } +} + +// scalarEqual compares two JSON scalars. json.Number is compared by its string +// form so 1 and 1.0 don't masquerade as equal across engines. 
+func scalarEqual(a, b any) bool {
+	an, aok := a.(json.Number)
+	bn, bok := b.(json.Number)
+	if aok && bok {
+		return an.String() == bn.String()
+	}
+	return a == b
+}
+
+// unionKeys returns the sorted set of keys present in a or b, so map
+// comparisons visit keys in a deterministic order.
+func unionKeys(a, b map[string]any) []string {
+	seen := map[string]bool{}
+	var keys []string
+	for k := range a {
+		if !seen[k] {
+			seen[k] = true
+			keys = append(keys, k)
+		}
+	}
+	for k := range b {
+		if !seen[k] {
+			seen[k] = true
+			keys = append(keys, k)
+		}
+	}
+	slices.Sort(keys)
+	return keys
+}
+
+// joinKey appends the map key to path: plain keys are dot-separated, while
+// empty keys and keys containing '.', '[', ']' or '"' become a bracketed,
+// strconv.Quote'd segment.
+func joinKey(path, key string) string {
+	// Map keys can themselves contain dots or brackets (e.g. spark_conf entries
+	// like "spark.databricks.delta.preview.enabled"). Render those as bracketed,
+	// quoted segments so the path stays unambiguous and ignore entries can target
+	// a single key.
+	if key == "" || strings.ContainsAny(key, `.[]"`) {
+		return path + "[" + strconv.Quote(key) + "]"
+	}
+	if path == "" {
+		return key
+	}
+	return path + "." + key
+}
+
+// indexRe matches numeric slice indices like "[12]". normalizePath applies it
+// only outside quoted map-key segments, so digits in brackets that are part of
+// a key (e.g. ["a[1]"]) are left alone.
+var indexRe = regexp.MustCompile(`\[\d+\]`)
+
+// normalizePath replaces concrete slice indices with [*] so a single ignore
+// entry can cover every element of a slice. Bracketed, quoted map keys written
+// by joinKey are copied verbatim: rewriting "[N]" inside a key would conflate
+// distinct keys and make an exact ignore entry for such a key unmatchable.
+func normalizePath(path string) string {
+	// Fast path: without quoted segments every "[N]" is a slice index.
+	if !strings.Contains(path, `["`) {
+		return indexRe.ReplaceAllString(path, "[*]")
+	}
+
+	var b strings.Builder
+	b.Grow(len(path))
+	rest := path
+	for {
+		// Outside quoted segments a path holds only plain keys, dots and
+		// "[N]" indices, so `["` can only be the start of a quoted key.
+		start := strings.Index(rest, `["`)
+		if start < 0 {
+			b.WriteString(indexRe.ReplaceAllString(rest, "[*]"))
+			return b.String()
+		}
+		b.WriteString(indexRe.ReplaceAllString(rest[:start], "[*]"))
+		end := quotedKeyEnd(rest, start)
+		b.WriteString(rest[start:end])
+		rest = rest[end:]
+	}
+}
+
+// quotedKeyEnd returns the index just past the bracketed, quoted key segment
+// that begins at s[start] (which must be `["`). It honors the backslash escapes
+// produced by strconv.Quote and returns len(s) if the segment is unterminated.
+func quotedKeyEnd(s string, start int) int {
+	for i := start + 2; i < len(s); i++ {
+		switch s[i] {
+		case '\\':
+			i++ // skip the escaped byte so \" does not end the key
+		case '"':
+			if i+1 < len(s) && s[i+1] == ']' {
+				return i + 2
+			}
+			return i + 1
+		}
+	}
+	return len(s)
+}
+
+// DefaultIgnorePaths lists create-payload paths that legitimately differ between
+// the engines and are not parity bugs. Keep this list small and well-justified;
+// every entry is a known, intentional divergence.
+var DefaultIgnorePaths = []string{
+	// num_workers is a zero-able int: when a cluster has num_workers: 0 the
+	// terraform provider serializes it explicitly while the direct engine drops
+	// it via omitempty. The backend treats absent and 0 identically, so this is a
+	// benign serialization difference. See the update_single_node acceptance test
+	// ("issues with zero conversion").
+ "tasks[*].new_cluster.num_workers", + "job_clusters[*].new_cluster.num_workers", + + // The terraform provider strips the deprecated/ignored spark conf + // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while + // the direct engine forwards it verbatim. The backend ignores the key either + // way, so this is a benign provider-side filter rather than a parity bug. + `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, + `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, +} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go new file mode 100644 index 0000000000..ec5818468b --- /dev/null +++ b/bundle/fuzz/compare_test.go @@ -0,0 +1,95 @@ +package fuzz + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDiffPayloads(t *testing.T) { + tests := []struct { + name string + direct string + terraform string + ignore []string + want []string + }{ + { + name: "identical", + direct: `{"name":"a","tasks":[{"task_key":"t"}]}`, + terraform: `{"name":"a","tasks":[{"task_key":"t"}]}`, + want: nil, + }, + { + name: "scalar mismatch", + direct: `{"name":"a"}`, + terraform: `{"name":"b"}`, + want: []string{"name"}, + }, + { + name: "missing on terraform", + direct: `{"name":"a","queue":{"enabled":true}}`, + terraform: `{"name":"a"}`, + want: []string{"queue"}, + }, + { + name: "missing on direct", + direct: `{"name":"a"}`, + terraform: `{"name":"a","max_concurrent_runs":1}`, + want: []string{"max_concurrent_runs"}, + }, + { + name: "nested slice element mismatch", + direct: `{"tasks":[{"task_key":"t","timeout_seconds":1}]}`, + terraform: `{"tasks":[{"task_key":"t","timeout_seconds":2}]}`, + want: []string{"tasks[0].timeout_seconds"}, + }, + { + name: "slice length mismatch", + direct: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + terraform: `{"tasks":[{"task_key":"a"}]}`, + want: 
[]string{"tasks[1]"}, + }, + { + name: "number 1 vs 1.0 differ", + direct: `{"n":1}`, + terraform: `{"n":1.0}`, + want: []string{"n"}, + }, + { + name: "ignored path", + direct: `{"tasks":[{"timeout_seconds":1}]}`, + terraform: `{"tasks":[{"timeout_seconds":2}]}`, + ignore: []string{"tasks[*].timeout_seconds"}, + want: nil, + }, + { + name: "dotted map key is bracket-quoted", + direct: `{"spark_conf":{"spark.x.y":"1"}}`, + terraform: `{"spark_conf":{}}`, + want: []string{`spark_conf["spark.x.y"]`}, + }, + { + name: "dotted map key can be ignored", + direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, + terraform: `{"c":{"spark_conf":{}}}`, + ignore: []string{`c.spark_conf["spark.x.y"]`}, + want: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + diffs, err := DiffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) + require.NoError(t, err) + + var paths []string + for _, d := range diffs { + paths = append(paths, d.Path) + } + assert.ElementsMatch(t, tt.want, paths) + }) + } +} diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go new file mode 100644 index 0000000000..55e52eb0bb --- /dev/null +++ b/bundle/fuzz/fuzz_test.go @@ -0,0 +1,68 @@ +package fuzz + +import ( + "encoding/json" + "os" + "strconv" + "testing" + + "github.com/stretchr/testify/require" +) + +// defaultParitySeeds is the number of random jobs TestJobCreateParity checks by +// default. Each seed runs two real deploys (direct + terraform), so the count is +// kept modest; override with FUZZ_SEEDS for a deeper local run. +const defaultParitySeeds = 20 + +// TestJobCreateParity is the first DECO-25361 technique: for many random job +// configs, assert the terraform and direct engines produce equivalent create +// payloads. On divergence it prints the seed and the generated job so the failure +// can be reproduced and inspected. 
+func TestJobCreateParity(t *testing.T) {
+	RequireTerraform(t)
+
+	seeds := defaultParitySeeds
+	if v := os.Getenv("FUZZ_SEEDS"); v != "" {
+		n, err := strconv.Atoi(v)
+		require.NoErrorf(t, err, "invalid FUZZ_SEEDS=%q", v)
+		// A zero or negative count would run no subtests and report success
+		// without checking anything, so reject it explicitly.
+		require.Greaterf(t, n, 0, "FUZZ_SEEDS must be positive, got %d", n)
+		seeds = n
+	}
+
+	for seed := int64(0); seed < int64(seeds); seed++ {
+		t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) {
+			checkJobParity(t, seed)
+		})
+	}
+}
+
+// FuzzJobCreateParity exposes the same parity check to Go's native fuzzer
+// (`go test -fuzz=FuzzJobCreateParity`). Note each input runs two real deploys,
+// so this is intended for ad-hoc deep runs, not the default `go test` path.
+func FuzzJobCreateParity(f *testing.F) {
+	RequireTerraform(f)
+	for seed := int64(0); seed < 5; seed++ {
+		f.Add(seed)
+	}
+	f.Fuzz(func(t *testing.T, seed int64) {
+		checkJobParity(t, seed)
+	})
+}
+
+// checkJobParity generates the job for seed, deploys it under both engines, and
+// fails the test with reproduction details if the create payloads diverge.
+func checkJobParity(t *testing.T, seed int64) {
+	t.Helper()
+	job := GenerateJob(newRNG(seed))
+
+	diffs, err := CompareJobEngines(t.Context(), t, job)
+	require.NoErrorf(t, err, "seed %d", seed)
+
+	if len(diffs) > 0 {
+		jobJSON, err := json.MarshalIndent(job, "", " ")
+		if err != nil {
+			// Keep the report useful even if the job cannot be rendered; the
+			// seed alone is still enough to regenerate it.
+			jobJSON = []byte("marshaling job failed: " + err.Error())
+		}
+		t.Errorf("seed %d: terraform/direct create payloads diverge (%d differences):", seed, len(diffs))
+		for _, d := range diffs {
+			t.Errorf(" %s", d)
+		}
+		t.Logf("reproduce with GenerateJob(newRNG(%d)):\n%s", seed, jobJSON)
+	}
+}
diff --git a/bundle/fuzz/generate.go b/bundle/fuzz/generate.go
new file mode 100644
index 0000000000..a7c5e6056f
--- /dev/null
+++ b/bundle/fuzz/generate.go
@@ -0,0 +1,349 @@
+// Package fuzz provides randomized generators and harnesses that compare how the
+// terraform and direct deploy engines translate the same bundle resource into an
+// API create payload. See DECO-25361.
+// +// The first technique implemented here generates a random resource config and +// checks for differences in the create payload between the terraform and direct +// engines. Generators are seeded so that any divergence found by the fuzz driver +// can be reproduced from the printed seed. +package fuzz + +import ( + "fmt" + "math/rand/v2" + + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/jobs" +) + +// Value pools are intentionally small and valid-looking: the goal is to exercise +// the engines' config->payload translation across many field combinations, not to +// stress the API with invalid values (which the testserver would reject before we +// can compare payloads). +var ( + sparkVersions = []string{"13.3.x-scala2.12", "14.3.x-scala2.12", "15.4.x-scala2.12", "16.4.x-scala2.12"} + nodeTypeIDs = []string{"i3.xlarge", "m5.large", "r5.xlarge", "Standard_DS3_v2"} + timezones = []string{"UTC", "America/Los_Angeles", "Europe/Amsterdam"} + cronExprs = []string{"0 0 12 * * ?", "0 15 10 ? * MON-FRI", "0 0/30 * * * ?"} + pauseStatuses = []jobs.PauseStatus{jobs.PauseStatusPaused, jobs.PauseStatusUnpaused} + performance = []jobs.PerformanceTarget{jobs.PerformanceTargetPerformanceOptimized, jobs.PerformanceTargetStandard} + timeUnits = []string{"HOURS", "DAYS", "WEEKS"} + healthMetrics = []string{"RUN_DURATION_SECONDS", "STREAMING_BACKLOG_BYTES", "STREAMING_BACKLOG_RECORDS"} + conditionOps = []string{"EQUAL_TO", "NOT_EQUAL", "GREATER_THAN", "LESS_THAN_OR_EQUAL"} + runIfs = []string{"ALL_SUCCESS", "AT_LEAST_ONE_SUCCESS", "NONE_FAILED", "ALL_DONE"} + gitProviders = []jobs.GitProvider{jobs.GitProviderGitHub, jobs.GitProviderGitLab, jobs.GitProviderAzureDevOpsServices} +) + +// GenerateJob builds a random, well-formed job config driven entirely by rng, so +// the same seed always produces the same job. 
It deliberately favors fields whose +// translation tends to differ between engines (tasks, clusters, schedules, +// notifications, tags, zero-able scalars). +func GenerateJob(rng *rand.Rand) *resources.Job { + job := &resources.Job{} + job.Name = randName(rng, "job") + + if chance(rng, 0.5) { + job.Description = randSentence(rng) + } + if chance(rng, 0.4) { + job.MaxConcurrentRuns = rng.IntN(10) + 1 + } + if chance(rng, 0.4) { + job.TimeoutSeconds = rng.IntN(7200) + } + if chance(rng, 0.3) { + job.PerformanceTarget = oneOf(rng, performance) + } + if chance(rng, 0.5) { + job.Tags = randTags(rng) + } + if chance(rng, 0.3) { + job.GitSource = randGitSource(rng) + } + + randScheduling(rng, job) + + if chance(rng, 0.3) { + job.EmailNotifications = randEmailNotifications(rng) + } + if chance(rng, 0.2) { + job.WebhookNotifications = randWebhookNotifications(rng) + } + if chance(rng, 0.3) { + job.NotificationSettings = &jobs.JobNotificationSettings{ + NoAlertForCanceledRuns: chance(rng, 0.5), + NoAlertForSkippedRuns: chance(rng, 0.5), + } + } + if chance(rng, 0.3) { + job.Health = randHealth(rng) + } + if chance(rng, 0.3) { + job.Parameters = randParameters(rng) + } + if chance(rng, 0.3) { + job.Queue = &jobs.QueueSettings{Enabled: chance(rng, 0.5)} + } + + // Generate shared job clusters first so tasks can reference them by key. + var jobClusterKeys []string + if chance(rng, 0.5) { + n := rng.IntN(2) + 1 + for i := range n { + key := fmt.Sprintf("cluster_%d", i) + jobClusterKeys = append(jobClusterKeys, key) + job.JobClusters = append(job.JobClusters, jobs.JobCluster{ + JobClusterKey: key, + NewCluster: randClusterSpec(rng), + }) + } + } + + nTasks := rng.IntN(3) + 1 + var taskKeys []string + for i := range nTasks { + task := randTask(rng, i, jobClusterKeys) + // Randomly chain dependencies onto previously generated tasks. 
+ if len(taskKeys) > 0 && chance(rng, 0.4) { + dep := taskKeys[rng.IntN(len(taskKeys))] + task.DependsOn = []jobs.TaskDependency{{TaskKey: dep}} + if chance(rng, 0.5) { + task.RunIf = jobs.RunIf(oneOf(rng, runIfs)) + } + } + taskKeys = append(taskKeys, task.TaskKey) + job.Tasks = append(job.Tasks, task) + } + + return job +} + +// randScheduling sets at most one of schedule/trigger/continuous, which are +// mutually exclusive ways to launch a job. +func randScheduling(rng *rand.Rand, job *resources.Job) { + switch rng.IntN(5) { + case 0: + job.Schedule = &jobs.CronSchedule{ + QuartzCronExpression: oneOf(rng, cronExprs), + TimezoneId: oneOf(rng, timezones), + PauseStatus: oneOf(rng, pauseStatuses), + } + case 1: + job.Trigger = &jobs.TriggerSettings{ + PauseStatus: oneOf(rng, pauseStatuses), + Periodic: &jobs.PeriodicTriggerConfiguration{ + Interval: rng.IntN(12) + 1, + Unit: jobs.PeriodicTriggerConfigurationTimeUnit(oneOf(rng, timeUnits)), + }, + } + case 2: + job.Trigger = &jobs.TriggerSettings{ + PauseStatus: oneOf(rng, pauseStatuses), + FileArrival: &jobs.FileArrivalTriggerConfiguration{ + Url: "s3://" + randWord(rng) + "/" + randWord(rng), + }, + } + case 3: + job.Continuous = &jobs.Continuous{PauseStatus: oneOf(rng, pauseStatuses)} + default: + // no scheduling + } +} + +func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { + task := jobs.Task{TaskKey: fmt.Sprintf("task_%d", idx)} + + // Use absolute workspace paths with source=WORKSPACE so the generated bundle + // never depends on local files existing on disk (which deploy would reject). + // condition_task needs no compute, so it is handled separately below. 
+ needsCompute := true + switch rng.IntN(4) { + case 0: + task.NotebookTask = &jobs.NotebookTask{ + NotebookPath: "/Workspace/Users/test/" + randName(rng, "nb"), + Source: jobs.SourceWorkspace, + } + case 1: + task.SparkPythonTask = &jobs.SparkPythonTask{ + PythonFile: "/Workspace/Users/test/" + randName(rng, "main") + ".py", + Source: jobs.SourceWorkspace, + } + case 2: + task.PythonWheelTask = &jobs.PythonWheelTask{ + PackageName: randName(rng, "pkg"), + EntryPoint: "main", + } + case 3: + task.ConditionTask = &jobs.ConditionTask{ + Left: randWord(rng), + Op: jobs.ConditionTaskOp(oneOf(rng, conditionOps)), + Right: randWord(rng), + } + needsCompute = false + } + + if needsCompute { + assignCompute(rng, &task, jobClusterKeys) + if chance(rng, 0.4) { + task.Libraries = randLibraries(rng) + } + } + + if chance(rng, 0.3) { + task.TimeoutSeconds = rng.IntN(3600) + } + if chance(rng, 0.3) { + task.MaxRetries = rng.IntN(5) + task.MinRetryIntervalMillis = rng.IntN(60000) + task.RetryOnTimeout = chance(rng, 0.5) + } + return task +} + +// assignCompute attaches exactly one compute source, which notebook/python/wheel +// tasks require: a shared job cluster (when available), a brand-new cluster, or an +// existing cluster id. 
+func assignCompute(rng *rand.Rand, task *jobs.Task, jobClusterKeys []string) { + const ( + computeNew = iota + computeExisting + computeShared + ) + options := []int{computeNew, computeExisting} + if len(jobClusterKeys) > 0 { + options = append(options, computeShared) + } + switch oneOf(rng, options) { + case computeNew: + spec := randClusterSpec(rng) + task.NewCluster = &spec + case computeExisting: + task.ExistingClusterId = randName(rng, "cluster") + case computeShared: + task.JobClusterKey = oneOf(rng, jobClusterKeys) + } +} + +func randClusterSpec(rng *rand.Rand) compute.ClusterSpec { + spec := compute.ClusterSpec{ + SparkVersion: oneOf(rng, sparkVersions), + NodeTypeId: oneOf(rng, nodeTypeIDs), + } + if chance(rng, 0.5) { + spec.NumWorkers = rng.IntN(8) + } else { + spec.Autoscale = &compute.AutoScale{ + MinWorkers: 1, + MaxWorkers: rng.IntN(8) + 2, + } + } + if chance(rng, 0.4) { + spec.SparkConf = map[string]string{ + "spark.databricks.delta.preview.enabled": "true", + "spark.speculation": fmt.Sprintf("%t", chance(rng, 0.5)), + } + } + if chance(rng, 0.3) { + spec.CustomTags = randTags(rng) + } + if chance(rng, 0.3) { + spec.SparkEnvVars = map[string]string{"PYSPARK_PYTHON": "/databricks/python3/bin/python3"} + } + if chance(rng, 0.3) { + spec.DriverNodeTypeId = oneOf(rng, nodeTypeIDs) + } + return spec +} + +func randGitSource(rng *rand.Rand) *jobs.GitSource { + src := &jobs.GitSource{ + GitProvider: oneOf(rng, gitProviders), + GitUrl: "https://example.com/" + randWord(rng) + "/" + randWord(rng) + ".git", + } + switch rng.IntN(3) { + case 0: + src.GitBranch = oneOf(rng, []string{"main", "develop", "release"}) + case 1: + src.GitTag = "v" + fmt.Sprintf("%d.%d.0", rng.IntN(5), rng.IntN(10)) + case 2: + src.GitCommit = fmt.Sprintf("%040x", rng.Int64()) + } + return src +} + +func randEmailNotifications(rng *rand.Rand) *jobs.JobEmailNotifications { + email := randWord(rng) + "@example.com" + n := &jobs.JobEmailNotifications{NoAlertForSkippedRuns: chance(rng, 
0.5)} + if chance(rng, 0.6) { + n.OnFailure = []string{email} + } + if chance(rng, 0.4) { + n.OnSuccess = []string{email} + } + if chance(rng, 0.3) { + n.OnStart = []string{email} + } + return n +} + +func randWebhookNotifications(rng *rand.Rand) *jobs.WebhookNotifications { + hook := []jobs.Webhook{{Id: randName(rng, "hook")}} + n := &jobs.WebhookNotifications{} + if chance(rng, 0.6) { + n.OnFailure = hook + } + if chance(rng, 0.4) { + n.OnSuccess = hook + } + return n +} + +func randHealth(rng *rand.Rand) *jobs.JobsHealthRules { + return &jobs.JobsHealthRules{ + Rules: []jobs.JobsHealthRule{ + { + Metric: jobs.JobsHealthMetric(oneOf(rng, healthMetrics)), + Op: jobs.JobsHealthOperatorGreaterThan, + Value: int64(rng.IntN(3600) + 1), + }, + }, + } +} + +func randLibraries(rng *rand.Rand) []compute.Library { + n := rng.IntN(2) + 1 + libs := make([]compute.Library, 0, n) + for range n { + switch rng.IntN(3) { + case 0: + libs = append(libs, compute.Library{Pypi: &compute.PythonPyPiLibrary{Package: randWord(rng)}}) + case 1: + libs = append(libs, compute.Library{Maven: &compute.MavenLibrary{Coordinates: "org.example:" + randWord(rng) + ":1.0.0"}}) + case 2: + libs = append(libs, compute.Library{Whl: "/Workspace/Users/test/" + randName(rng, "lib") + ".whl"}) + } + } + return libs +} + +func randParameters(rng *rand.Rand) []jobs.JobParameterDefinition { + n := rng.IntN(3) + 1 + params := make([]jobs.JobParameterDefinition, 0, n) + for i := range n { + params = append(params, jobs.JobParameterDefinition{ + Name: fmt.Sprintf("param_%d", i), + Default: randWord(rng), + }) + } + return params +} + +func randTags(rng *rand.Rand) map[string]string { + n := rng.IntN(3) + 1 + tags := make(map[string]string, n) + for i := range n { + tags[fmt.Sprintf("tag_%d", i)] = randWord(rng) + } + return tags +} diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go new file mode 100644 index 0000000000..524e84864c --- /dev/null +++ b/bundle/fuzz/generate_test.go @@ -0,0 
+1,47 @@ +package fuzz + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGenerateJobIsDeterministic(t *testing.T) { + a := GenerateJob(newRNG(42)) + b := GenerateJob(newRNG(42)) + assert.Equal(t, a, b, "same seed must produce identical job") +} + +func TestGenerateJobIsWellFormed(t *testing.T) { + for seed := int64(0); seed < 200; seed++ { + job := GenerateJob(newRNG(seed)) + require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) + require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) + + clusterKeys := map[string]bool{} + for _, jc := range job.JobClusters { + clusterKeys[jc.JobClusterKey] = true + } + + taskKeys := map[string]bool{} + for _, task := range job.Tasks { + require.NotEmptyf(t, task.TaskKey, "seed %d: task must have a key", seed) + taskKeys[task.TaskKey] = true + + // A task referencing a job cluster must reference one we generated. + if task.JobClusterKey != "" { + assert.Containsf(t, clusterKeys, task.JobClusterKey, + "seed %d: task %q references unknown job cluster %q", seed, task.TaskKey, task.JobClusterKey) + } + } + + // Every dependency must point at a task that exists in this job. + for _, task := range job.Tasks { + for _, dep := range task.DependsOn { + assert.Containsf(t, taskKeys, dep.TaskKey, + "seed %d: task %q depends on unknown task %q", seed, task.TaskKey, dep.TaskKey) + } + } + } +} diff --git a/bundle/fuzz/rand.go b/bundle/fuzz/rand.go new file mode 100644 index 0000000000..529e4da115 --- /dev/null +++ b/bundle/fuzz/rand.go @@ -0,0 +1,47 @@ +package fuzz + +import ( + "fmt" + "math/rand/v2" + "strings" +) + +var words = []string{ + "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", + "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa", +} + +// newRNG returns a deterministic RNG for the given seed, so any job the fuzzer +// flags can be regenerated from the printed seed alone. 
+func newRNG(seed int64) *rand.Rand { + return rand.New(rand.NewPCG(uint64(seed), 0)) +} + +// chance returns true with probability p (0..1). +func chance(rng *rand.Rand, p float64) bool { + return rng.Float64() < p +} + +// oneOf returns a random element of s. s must be non-empty. +func oneOf[T any](rng *rand.Rand, s []T) T { + return s[rng.IntN(len(s))] +} + +func randWord(rng *rand.Rand) string { + return oneOf(rng, words) +} + +// randName returns a deterministic-but-varied identifier with the given prefix, +// e.g. "job_alpha_4271". +func randName(rng *rand.Rand, prefix string) string { + return fmt.Sprintf("%s_%s_%d", prefix, randWord(rng), rng.IntN(10000)) +} + +func randSentence(rng *rand.Rand) string { + n := rng.IntN(4) + 2 + parts := make([]string, 0, n) + for range n { + parts = append(parts, randWord(rng)) + } + return strings.Join(parts, " ") +} From 9bd7d4ad7b54546516d8309c57895738c394232b Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 23 Jun 2026 08:06:00 +0000 Subject: [PATCH 02/24] bundle/fuzz: fix lint (intrange, perfsprint) and correct num_workers ignore Address golangci-lint failures (intrange loops, strconv.FormatBool over fmt.Sprintf) and tighten the create-payload ignore list: drop the dead job_clusters num_workers entry (those are at parity) and document the task-level num_workers divergence as a real CLI gap to fix separately. --- bundle/fuzz/compare.go | 14 ++++++++------ bundle/fuzz/fuzz_test.go | 4 ++-- bundle/fuzz/generate.go | 3 ++- bundle/fuzz/generate_test.go | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/bundle/fuzz/compare.go b/bundle/fuzz/compare.go index 48b7d3e648..e893ab443d 100644 --- a/bundle/fuzz/compare.go +++ b/bundle/fuzz/compare.go @@ -187,13 +187,15 @@ func normalizePath(path string) string { // the engines and are not parity bugs. Keep this list small and well-justified; // every entry is a known, intentional divergence. 
var DefaultIgnorePaths = []string{ - // num_workers is a zero-able int: when a cluster has num_workers: 0 the - // terraform provider serializes it explicitly while the direct engine drops - // it via omitempty. The backend treats absent and 0 identically, so this is a - // benign serialization difference. See the update_single_node acceptance test - // ("issues with zero conversion"). + // A single-node task cluster (num_workers: 0, no autoscale) diverges: the + // terraform provider sends num_workers: 0 while the direct engine omits it. + // JobClustersFixups.initializeNumWorkers force-sends num_workers for + // job_clusters but is NOT applied to task-level new_cluster, so the fix-up + // only covers job_clusters (those are at parity and need no ignore here). + // This is a real CLI gap surfaced by the fuzzer, tracked separately; ignore + // it here so the fuzz suite stays green until the fix-up is extended to task + // clusters. "tasks[*].new_cluster.num_workers", - "job_clusters[*].new_cluster.num_workers", // The terraform provider strips the deprecated/ignored spark conf // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 55e52eb0bb..ace7a5efd3 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -28,7 +28,7 @@ func TestJobCreateParity(t *testing.T) { seeds = n } - for seed := int64(0); seed < int64(seeds); seed++ { + for seed := range int64(seeds) { t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { checkJobParity(t, seed) }) @@ -40,7 +40,7 @@ func TestJobCreateParity(t *testing.T) { // so this is intended for ad-hoc deep runs, not the default `go test` path. 
func FuzzJobCreateParity(f *testing.F) { RequireTerraform(f) - for seed := int64(0); seed < 5; seed++ { + for seed := range int64(5) { f.Add(seed) } f.Fuzz(func(t *testing.T, seed int64) { diff --git a/bundle/fuzz/generate.go b/bundle/fuzz/generate.go index a7c5e6056f..98db7a70f5 100644 --- a/bundle/fuzz/generate.go +++ b/bundle/fuzz/generate.go @@ -11,6 +11,7 @@ package fuzz import ( "fmt" "math/rand/v2" + "strconv" "github.com/databricks/cli/bundle/config/resources" "github.com/databricks/databricks-sdk-go/service/compute" @@ -241,7 +242,7 @@ func randClusterSpec(rng *rand.Rand) compute.ClusterSpec { if chance(rng, 0.4) { spec.SparkConf = map[string]string{ "spark.databricks.delta.preview.enabled": "true", - "spark.speculation": fmt.Sprintf("%t", chance(rng, 0.5)), + "spark.speculation": strconv.FormatBool(chance(rng, 0.5)), } } if chance(rng, 0.3) { diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go index 524e84864c..f7a797e8f5 100644 --- a/bundle/fuzz/generate_test.go +++ b/bundle/fuzz/generate_test.go @@ -14,7 +14,7 @@ func TestGenerateJobIsDeterministic(t *testing.T) { } func TestGenerateJobIsWellFormed(t *testing.T) { - for seed := int64(0); seed < 200; seed++ { + for seed := range int64(200) { job := GenerateJob(newRNG(seed)) require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) From a40126469c4ff402bbde9c5e7771fdc683437287 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 23 Jun 2026 09:46:58 +0000 Subject: [PATCH 03/24] bundle/fuzz: wire parity tests into CI and harden harness - Add a `test-fuzz` task and a nightly CI job that provisions terraform and runs the create-payload parity tests. They previously always skipped because terraform was never provisioned in the test path. - Ignore repo-root build/ so the provisioned terraform binary and provider mirror are not accidentally committed. 
- Skip cleanly when build/ is only partially provisioned (missing provider mirror or .terraformrc) instead of failing mid-deploy. - Document that the harness covers jobs only for now (DECO-25361). --- .github/workflows/push.yml | 35 +++++++++++++++++++++++++++++++++++ .gitignore | 4 ++++ Taskfile.yml | 15 +++++++++++++++ bundle/fuzz/capture_deploy.go | 10 ++++++++-- bundle/fuzz/generate.go | 6 ++++++ 5 files changed, 68 insertions(+), 2 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index f80cfba7ad..3b2720ba3a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -370,6 +370,41 @@ jobs: run: | go tool -modfile=tools/task/go.mod task test-sandbox + test-fuzz: + needs: + - cleanups + + # The terraform/direct create-payload parity tests run two real `bundle deploy` + # invocations per seed, so they are too slow for every PR and too noisy to gate + # the merge queue. Run them on the nightly schedule to catch engine drift; not + # part of test-result for that reason. + if: ${{ github.event_name == 'schedule' }} + name: "task test-fuzz" + runs-on: + group: databricks-protected-runner-group-large + labels: linux-ubuntu-latest-large + + defaults: + run: + shell: bash + + permissions: + id-token: write + contents: read + + steps: + - name: Checkout repository and submodules + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Setup build environment + uses: ./.github/actions/setup-build-environment + with: + cache-key: test-fuzz + + - name: Run tests + run: | + go tool -modfile=tools/task/go.mod task test-fuzz + # This job groups the result of all the above test jobs. # It is a required check, so it blocks auto-merge and the merge queue. 
# diff --git a/.gitignore b/.gitignore index 116ec5e976..9da32ca035 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,10 @@ tools/testmask/testmask # Release artifacts dist/ +# Terraform binary + provider mirror provisioned by acceptance/install_terraform.py +# for the bundle/fuzz parity tests (see Taskfile `test-fuzz`). +/build/ + # Local development notes, tmp /pr-* /tmp/ diff --git a/Taskfile.yml b/Taskfile.yml index e44aadbd06..c12f76a936 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -701,6 +701,21 @@ tasks: --packages ./acceptance/... \ -- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/cmd/sandbox" + test-fuzz: + desc: Run terraform/direct create-payload parity fuzz tests (provisions terraform) + sources: + - bundle/fuzz/** + cmds: + # The parity harness expects terraform + the provider mirror at /build; + # RequireTerraform skips when it's absent, so provision it first. + - python3 acceptance/install_terraform.py --targetdir build + - | + {{.GO_TOOL}} gotestsum \ + --format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \ + --no-summary=skipped \ + --packages ./bundle/fuzz/... \ + -- -timeout=${LOCAL_TIMEOUT:-30m} + # --- Integration tests --- integration: diff --git a/bundle/fuzz/capture_deploy.go b/bundle/fuzz/capture_deploy.go index 6f06487bf3..0efeaa9ed1 100644 --- a/bundle/fuzz/capture_deploy.go +++ b/bundle/fuzz/capture_deploy.go @@ -114,8 +114,14 @@ func RequireTerraform(t testing.TB) { execPath := filepath.Join(buildDir, "terraform") cfgFile := filepath.Join(buildDir, ".terraformrc") - if _, err := os.Stat(execPath); err != nil { - t.Skipf("terraform not provisioned (%s); run: python3 acceptance/install_terraform.py --targetdir build", execPath) + // install_terraform.py provisions all three together; a partial build/ (e.g. + // the binary without the provider mirror or .terraformrc) would otherwise fail + // mid-deploy with a confusing error instead of skipping cleanly. 
+ tfpluginsDir := filepath.Join(buildDir, "tfplugins") + for _, p := range []string{execPath, cfgFile, tfpluginsDir} { + if _, err := os.Stat(p); err != nil { + t.Skipf("terraform not fully provisioned (%s); run: python3 acceptance/install_terraform.py --targetdir build", p) + } } t.Setenv("DATABRICKS_TF_EXEC_PATH", execPath) diff --git a/bundle/fuzz/generate.go b/bundle/fuzz/generate.go index 98db7a70f5..697748e03f 100644 --- a/bundle/fuzz/generate.go +++ b/bundle/fuzz/generate.go @@ -6,6 +6,9 @@ // checks for differences in the create payload between the terraform and direct // engines. Generators are seeded so that any divergence found by the fuzz driver // can be reproduced from the printed seed. +// +// Only jobs are covered for now. Extending the harness to other resource kinds +// (pipelines, apps, ...) is tracked as follow-up work under DECO-25361. package fuzz import ( @@ -40,6 +43,9 @@ var ( // the same seed always produces the same job. It deliberately favors fields whose // translation tends to differ between engines (tasks, clusters, schedules, // notifications, tags, zero-able scalars). +// +// TODO(DECO-25361): generalize the harness across resource kinds so pipelines, +// apps, etc. get the same create-payload parity coverage as jobs. func GenerateJob(rng *rand.Rand) *resources.Job { job := &resources.Job{} job.Name = randName(rng, "job") From 52a7830fdc47076fab12ce3d36c23c0a0af07106 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Wed, 24 Jun 2026 08:28:25 +0000 Subject: [PATCH 04/24] bundle/fuzz: rotate nightly seeds and add single-seed reproduction Make the create-payload parity fuzz suite explore new configs over time and be reproducible from a reported seed: - FUZZ_SEED (comma-separated) runs exactly those seeds, overriding the range, so a reported divergence reproduces with one command. The failure message now prints this knob. 
- FUZZ_SEED_OFFSET shifts the deterministic window; push.yml derives it from GITHUB_RUN_NUMBER so each nightly run checks seeds it has never tested before instead of re-checking a fixed set. Windows are non-overlapping because the run number is unique and monotonic. - Guard FUZZ_SEEDS > 0 so a negative value no longer panics make() and zero no longer passes as a no-op. - Drop the test-fuzz Task sources fingerprint: the seeds depend on env vars Task can't see, so skipping on an unchanged checksum would silently no-op a repro run or a shifted window. - Keep the nightly window modest (25); exploration comes from rotation, not size, and it can be raised once nightly timings are known. --- .github/workflows/push.yml | 14 ++++++++++ Taskfile.yml | 6 ++-- bundle/fuzz/fuzz_test.go | 57 +++++++++++++++++++++++++++++++++----- 3 files changed, 68 insertions(+), 9 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 3b2720ba3a..bf995d7555 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -402,7 +402,21 @@ jobs: cache-key: test-fuzz - name: Run tests + env: + # Shift the seed window by the run number every nightly run so CI + # explores configs it has never tested before instead of re-checking a + # fixed set. The window is kept modest (each seed runs two real deploys) + # since the exploration comes from rotating the window, not its size; + # raise it once nightly timings are known. A divergence prints + # FUZZ_SEED=<seed> for one-command reproduction. + # + # offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS. GITHUB_RUN_NUMBER is a + # built-in, monotonically increasing, unique-per-run integer, so as long + # as FUZZ_SEEDS is constant the windows are non-overlapping (gaps from + # non-schedule runs are fine; we only need fresh seeds, not every seed). 
+ FUZZ_SEEDS: "25" run: | + export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS )) go tool -modfile=tools/task/go.mod task test-fuzz # This job groups the result of all the above test jobs. diff --git a/Taskfile.yml b/Taskfile.yml index c12f76a936..edd95d7025 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -703,8 +703,10 @@ tasks: test-fuzz: desc: Run terraform/direct create-payload parity fuzz tests (provisions terraform) - sources: - - bundle/fuzz/** + # No `sources:` fingerprint: the seeds checked are a function of the FUZZ_SEED, + # FUZZ_SEEDS, and FUZZ_SEED_OFFSET env vars, which Task can't see. Skipping on + # an unchanged source checksum would silently no-op a FUZZ_SEED=<seed> repro run + # or a shifted nightly window, so always run. cmds: # The parity harness expects terraform + the provider mirror at /build; # RequireTerraform skips when it's absent, so provision it first. diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index ace7a5efd3..51471b3533 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -4,6 +4,7 @@ import ( "encoding/json" "os" "strconv" + "strings" "testing" "github.com/stretchr/testify/require" @@ -21,18 +22,60 @@ const defaultParitySeeds = 20 func TestJobCreateParity(t *testing.T) { RequireTerraform(t) - seeds := defaultParitySeeds + for _, seed := range paritySeeds(t) { + t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { + checkJobParity(t, seed) + }) + } +} + +// paritySeeds returns the seeds TestJobCreateParity should check. +// +// FUZZ_SEED (comma-separated list) runs exactly those seeds and overrides +// everything else. This is the knob the failure message prints so a single +// reported divergence can be reproduced with one command, without re-running +// every seed before it. +// +// Otherwise the test runs FUZZ_SEEDS seeds (default defaultParitySeeds) starting +// at FUZZ_SEED_OFFSET. 
The offset lets the nightly job shift the window every run +// (push.yml derives it from the run number) so CI explores configs it has never +// tested before instead of re-checking the same fixed set forever. +func paritySeeds(t *testing.T) []int64 { + if v := os.Getenv("FUZZ_SEED"); v != "" { + var seeds []int64 + for _, part := range strings.Split(v, ",") { + part = strings.TrimSpace(part) + if part == "" { + continue + } + n, err := strconv.ParseInt(part, 10, 64) + require.NoErrorf(t, err, "invalid FUZZ_SEED entry %q", part) + seeds = append(seeds, n) + } + require.NotEmptyf(t, seeds, "FUZZ_SEED=%q contained no seeds", v) + return seeds + } + + count := defaultParitySeeds if v := os.Getenv("FUZZ_SEEDS"); v != "" { n, err := strconv.Atoi(v) require.NoErrorf(t, err, "invalid FUZZ_SEEDS=%q", v) - seeds = n + require.Greaterf(t, n, 0, "FUZZ_SEEDS must be positive, got %d", n) + count = n } - for seed := range int64(seeds) { - t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { - checkJobParity(t, seed) - }) + var offset int64 + if v := os.Getenv("FUZZ_SEED_OFFSET"); v != "" { + n, err := strconv.ParseInt(v, 10, 64) + require.NoErrorf(t, err, "invalid FUZZ_SEED_OFFSET=%q", v) + offset = n + } + + seeds := make([]int64, 0, count) + for i := range int64(count) { + seeds = append(seeds, offset+i) } + return seeds } // FuzzJobCreateParity exposes the same parity check to Go's native fuzzer @@ -63,6 +106,6 @@ func checkJobParity(t *testing.T, seed int64) { for _, d := range diffs { t.Errorf(" %s", d) } - t.Logf("reproduce with GenerateJob(newRNG(%d)):\n%s", seed, jobJSON) + t.Logf("reproduce with: FUZZ_SEED=%d go test ./bundle/fuzz -run TestJobCreateParity\n%s", seed, jobJSON) } } From 93eccfd5cccafb460609c862c9bf8c5bee80d52a Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Wed, 24 Jun 2026 12:03:48 +0000 Subject: [PATCH 05/24] bundle: force-send num_workers for single-node task clusters The terraform provider force-sends num_workers: 0 for a single-node 
new_cluster (no autoscale) on both job_clusters and task-level clusters, but JobClustersFixups only applied initializeNumWorkers to job_clusters. The direct engine therefore omitted num_workers on task clusters, so the two engines produced divergent create payloads. This divergence was surfaced by the bundle/fuzz parity harness. Apply initializeNumWorkers to task new_cluster too so the direct engine matches terraform, and drop the now-obsolete tasks[*].new_cluster.num_workers entry from the fuzz DefaultIgnorePaths. --- acceptance/bundle/deploy/wal/chain-3-jobs/output.txt | 2 ++ .../bundle/deploy/wal/crash-after-create/output.txt | 1 + acceptance/bundle/override/job_tasks/output.txt | 2 ++ .../missing_map_key/out.validate.direct.json | 3 ++- .../missing_map_key/out.validate.terraform.json | 3 ++- .../config/mutator/resourcemutator/cluster_fixups.go | 1 + bundle/fuzz/compare.go | 10 ---------- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index f27bfaa3f2..f11dc173ee 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,6 +35,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -73,6 +74,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 2ab926a1dd..9cd95a0b5c 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -39,6 +39,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, 
"spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/override/job_tasks/output.txt b/acceptance/bundle/override/job_tasks/output.txt index 2bee9738e3..59b6fc1c39 100644 --- a/acceptance/bundle/override/job_tasks/output.txt +++ b/acceptance/bundle/override/job_tasks/output.txt @@ -18,6 +18,7 @@ }, { "new_cluster": { + "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { @@ -42,6 +43,7 @@ Exit code: 1 "tasks": [ { "new_cluster": { + "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json index cfd1427ce4..7279aaeba3 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json @@ -30,7 +30,8 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - } + }, + "num_workers": 0 }, "task_key": "test-task" } diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json index 3cdf58f84e..3bad6f4619 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json @@ -30,7 +30,8 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - } + }, + "num_workers": 0 }, "task_key": "test-task" } diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups.go b/bundle/config/mutator/resourcemutator/cluster_fixups.go index 893cd248aa..04ddef6cc2 100644 --- a/bundle/config/mutator/resourcemutator/cluster_fixups.go +++ b/bundle/config/mutator/resourcemutator/cluster_fixups.go @@ -94,6 +94,7 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) { for _, task := range js.Tasks { if task.NewCluster != 
nil { ModifyRequestOnInstancePool(task.NewCluster) + initializeNumWorkers(task.NewCluster) } } for ind := range js.JobClusters { diff --git a/bundle/fuzz/compare.go b/bundle/fuzz/compare.go index e893ab443d..de68171962 100644 --- a/bundle/fuzz/compare.go +++ b/bundle/fuzz/compare.go @@ -187,16 +187,6 @@ func normalizePath(path string) string { // the engines and are not parity bugs. Keep this list small and well-justified; // every entry is a known, intentional divergence. var DefaultIgnorePaths = []string{ - // A single-node task cluster (num_workers: 0, no autoscale) diverges: the - // terraform provider sends num_workers: 0 while the direct engine omits it. - // JobClustersFixups.initializeNumWorkers force-sends num_workers for - // job_clusters but is NOT applied to task-level new_cluster, so the fix-up - // only covers job_clusters (those are at parity and need no ignore here). - // This is a real CLI gap surfaced by the fuzzer, tracked separately; ignore - // it here so the fuzz suite stays green until the fix-up is extended to task - // clusters. - "tasks[*].new_cluster.num_workers", - // The terraform provider strips the deprecated/ignored spark conf // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while // the direct engine forwards it verbatim. The backend ignores the key either From 9da78a95fd2e7ae6f1df4b8a3ac4b2cf2a4cedc6 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Wed, 24 Jun 2026 12:04:02 +0000 Subject: [PATCH 06/24] bundle/fuzz: report nightly parity failures and fix create-path comment The nightly test-fuzz job is intentionally excluded from test-result, so a failure was only visible in the Actions tab. Add a failure step that opens (or comments on) a single deduped GitHub issue with a one-command repro. Also correct the jobsCreatePath comment: a different API version shows up as a capture failure (the testserver registers only this route, so a mismatched version 404s and the deploy fails), not as a payload diff. 
--- .github/workflows/push.yml | 37 +++++++++++++++++++++++++++++++++++++ bundle/fuzz/capture.go | 8 +++++--- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index bf995d7555..8c6f1e5f3a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -391,6 +391,8 @@ jobs: permissions: id-token: write contents: read + # Needed by the failure-reporting step below to open/comment a tracking issue. + issues: write steps: - name: Checkout repository and submodules @@ -419,6 +421,41 @@ jobs: export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS )) go tool -modfile=tools/task/go.mod task test-fuzz + # This job is intentionally excluded from test-result, so a failure here is + # invisible unless someone watches the Actions tab. Surface it as a GitHub + # issue instead. Reuse a single open issue (deduped by label) so a recurring + # divergence doesn't open one issue per night. + - name: Report failure + if: ${{ failure() }} + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh label create fuzz-nightly \ + --description "Nightly terraform/direct create-payload parity failures" \ + --color FBCA04 2>/dev/null || true + + body=$(cat <\`. + Reproduce locally with: + + \`\`\` + FUZZ_SEED= go test ./bundle/fuzz -run TestJobCreateParity + \`\`\` + EOF + ) + + existing=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number') + if [ -n "$existing" ]; then + gh issue comment "$existing" --body "$body" + else + gh issue create --title "Nightly fuzz parity failure" --label fuzz-nightly --body "$body" + fi + # This job groups the result of all the above test jobs. # It is a required check, so it blocks auto-merge and the merge queue. 
# diff --git a/bundle/fuzz/capture.go b/bundle/fuzz/capture.go index 330f485f82..fe10bc10be 100644 --- a/bundle/fuzz/capture.go +++ b/bundle/fuzz/capture.go @@ -8,9 +8,11 @@ import ( ) // jobsCreatePath is the Jobs API route both engines must hit on create. The -// direct engine posts here via the SDK; the terraform provider is expected to -// post here too, and a mismatch (e.g. a different API version) is itself a -// divergence worth surfacing. +// direct engine posts here via the SDK and the terraform provider is expected to +// as well. The testserver registers only this exact route, so if an engine ever +// posted to a different version the deploy would 404 and CaptureJobCreate would +// fail with "did not POST". A version skew therefore surfaces as a capture +// failure, not as a payload diff. const jobsCreatePath = "/api/2.2/jobs/create" // CapturedRequest is a single mutating API request observed by the testserver. From 3a14c6163af70fc421c2783e45c468b5ae75ea1e Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Wed, 24 Jun 2026 13:26:12 +0000 Subject: [PATCH 07/24] bundle/fuzz: make harness files test-only and add num_workers regression test Rename the capture/deploy/recorder helpers to *_test.go so the parity harness compiles only under `go test` instead of into the package's regular build, and add a committed regression test (cluster_fixups_test.go) covering the single-node task-cluster num_workers force-send fix so the divergence is guarded at PR time, not just in the nightly suite. 
--- .github/workflows/push.yml | 5 +- Taskfile.yml | 8 +- .../resourcemutator/cluster_fixups_test.go | 92 +++++++++++++++++++ bundle/fuzz/compare.go | 72 +++++++++++++++ bundle/fuzz/compare_test.go | 24 +++++ ...re_deploy_test.go => deploy_smoke_test.go} | 6 +- .../{capture_deploy.go => deploy_test.go} | 44 ++++++--- bundle/fuzz/fuzz_test.go | 81 ++++++++++++++-- bundle/fuzz/{capture.go => recorder_test.go} | 10 +- 9 files changed, 311 insertions(+), 31 deletions(-) create mode 100644 bundle/config/mutator/resourcemutator/cluster_fixups_test.go rename bundle/fuzz/{capture_deploy_test.go => deploy_smoke_test.go} (82%) rename bundle/fuzz/{capture_deploy.go => deploy_test.go} (73%) rename bundle/fuzz/{capture.go => recorder_test.go} (86%) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 8c6f1e5f3a..cc69da23f4 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -444,8 +444,11 @@ jobs: Reproduce locally with: \`\`\` - FUZZ_SEED= go test ./bundle/fuzz -run TestJobCreateParity + FUZZ_SEED= task test-fuzz \`\`\` + + Once fixed, add the seed to \`regressionSeeds\` in \`bundle/fuzz/fuzz_test.go\` + in the same PR so the divergence can never silently regress. EOF ) diff --git a/Taskfile.yml b/Taskfile.yml index edd95d7025..cec4c328da 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -707,9 +707,15 @@ tasks: # FUZZ_SEEDS, and FUZZ_SEED_OFFSET env vars, which Task can't see. Skipping on # an unchanged source checksum would silently no-op a FUZZ_SEED= repro run # or a shifted nightly window, so always run. + env: + # The terraform parity tests are opt-in (see requireFuzzOptIn): they skip + # unless a FUZZ_* var is set, so a leftover build/ never makes them run as + # part of a plain `task test`. This constant flag opts this target in + # without overriding the FUZZ_SEED(S)/OFFSET tuning knobs. 
+ FUZZ_PARITY: "1" cmds: # The parity harness expects terraform + the provider mirror at /build; - # RequireTerraform skips when it's absent, so provision it first. + # requireTerraform skips when it's absent, so provision it first. - python3 acceptance/install_terraform.py --targetdir build - | {{.GO_TOOL}} gotestsum \ diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go new file mode 100644 index 0000000000..5cb2e93749 --- /dev/null +++ b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go @@ -0,0 +1,92 @@ +package resourcemutator + +import ( + "testing" + + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/stretchr/testify/assert" +) + +func TestInitializeNumWorkers(t *testing.T) { + tests := []struct { + name string + spec compute.ClusterSpec + wantForceSend bool + }{ + { + name: "single-node cluster force-sends num_workers", + spec: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + wantForceSend: true, + }, + { + name: "autoscale cluster does not force-send", + spec: compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, + wantForceSend: false, + }, + { + name: "multi-node cluster does not force-send", + spec: compute.ClusterSpec{NumWorkers: 3}, + wantForceSend: false, + }, + { + name: "already force-sent stays force-sent without duplicating", + spec: compute.ClusterSpec{ForceSendFields: []string{"NumWorkers"}}, + wantForceSend: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + spec := tt.spec + initializeNumWorkers(&spec) + + count := 0 + for _, f := range spec.ForceSendFields { + if f == "NumWorkers" { + count++ + } + } + if tt.wantForceSend { + assert.Equal(t, 1, count, "NumWorkers must appear in ForceSendFields exactly once") + } else { + assert.Equal(t, 0, count, "NumWorkers must not be in 
ForceSendFields") + } + }) + } +} + +// TestPrepareJobSettingsForUpdateForcesNumWorkers locks the DECO-25361 fix: a +// single-node new_cluster must force-send num_workers on task-level clusters too, +// not just shared job_clusters. The terraform provider always sends num_workers:0 +// for such clusters, so missing it on the task side made the direct engine +// produce a divergent create payload. +func TestPrepareJobSettingsForUpdateForcesNumWorkers(t *testing.T) { + js := &jobs.JobSettings{ + Tasks: []jobs.Task{ + { + TaskKey: "single_node_task", + NewCluster: &compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + }, + { + TaskKey: "autoscale_task", + NewCluster: &compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, + }, + }, + JobClusters: []jobs.JobCluster{ + { + JobClusterKey: "single_node_cluster", + NewCluster: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + }, + }, + } + + prepareJobSettingsForUpdate(js) + + assert.Contains(t, js.Tasks[0].NewCluster.ForceSendFields, "NumWorkers", + "single-node task cluster must force-send num_workers") + assert.NotContains(t, js.Tasks[1].NewCluster.ForceSendFields, "NumWorkers", + "autoscale task cluster must not force-send num_workers") + assert.Contains(t, js.JobClusters[0].NewCluster.ForceSendFields, "NumWorkers", + "single-node job cluster must force-send num_workers") +} diff --git a/bundle/fuzz/compare.go b/bundle/fuzz/compare.go index de68171962..81c1bc7afb 100644 --- a/bundle/fuzz/compare.go +++ b/bundle/fuzz/compare.go @@ -110,6 +110,14 @@ func diffValue(path string, a, b any, diffs *[]Difference) { *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) return } + // Slices whose elements carry a natural identity key (tasks, job clusters) + // are matched by that key so an engine emitting the same elements in a + // different order is not reported as a difference. 
Everything else is + // compared positionally. + if key := identityKey(av, bv); key != "" { + diffKeyedSlice(path, key, av, bv, diffs) + return + } n := max(len(av), len(bv)) for i := range n { child := fmt.Sprintf("%s[%d]", path, i) @@ -129,6 +137,70 @@ func diffValue(path string, a, b any, diffs *[]Difference) { } } +// identityFields are the keys, in priority order, that uniquely identify the +// elements of a payload slice. Job tasks and shared job clusters are the slices +// whose order is not significant but which the engines may emit differently. +var identityFields = []string{"task_key", "job_cluster_key"} + +// identityKey returns the field that identifies every element of both slices, or +// "" if the elements are not uniformly keyed objects (in which case the caller +// falls back to positional comparison). +func identityKey(a, b []any) string { + for _, field := range identityFields { + if allHaveKey(a, field) && allHaveKey(b, field) { + return field + } + } + return "" +} + +func allHaveKey(s []any, field string) bool { + if len(s) == 0 { + return false + } + for _, el := range s { + m, ok := el.(map[string]any) + if !ok { + return false + } + if _, ok := m[field].(string); !ok { + return false + } + } + return true +} + +// diffKeyedSlice matches elements of a and b by the value of key (which is unique +// within each slice for tasks/job clusters) and diffs each matched pair, +// reporting unmatched elements as present-on-one-side. Paths keep numeric indices +// so ignore-path [*] normalization still applies. 
+func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { + bByKey := make(map[string]any, len(b)) + for _, el := range b { + bByKey[el.(map[string]any)[key].(string)] = el + } + + matched := make(map[string]bool, len(a)) + for i, el := range a { + child := fmt.Sprintf("%s[%d]", path, i) + k := el.(map[string]any)[key].(string) + matched[k] = true + if bel, ok := bByKey[k]; ok { + diffValue(child, el, bel, diffs) + } else { + *diffs = append(*diffs, Difference{Path: child, Direct: el, Terraform: missing{}}) + } + } + for j, el := range b { + k := el.(map[string]any)[key].(string) + if matched[k] { + continue + } + child := fmt.Sprintf("%s[%d]", path, j) + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: el}) + } +} + // scalarEqual compares two JSON scalars. json.Number is compared by its string // form so 1 and 1.0 don't masquerade as equal across engines. func scalarEqual(a, b any) bool { diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go index ec5818468b..46e506d75c 100644 --- a/bundle/fuzz/compare_test.go +++ b/bundle/fuzz/compare_test.go @@ -78,6 +78,30 @@ func TestDiffPayloads(t *testing.T) { ignore: []string{`c.spark_conf["spark.x.y"]`}, want: nil, }, + { + name: "tasks matched by key ignore order", + direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, + terraform: `{"tasks":[{"task_key":"b","timeout_seconds":2},{"task_key":"a","timeout_seconds":1}]}`, + want: nil, + }, + { + name: "tasks matched by key surface real diff at direct index", + direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, + terraform: `{"tasks":[{"task_key":"b","timeout_seconds":9},{"task_key":"a","timeout_seconds":1}]}`, + want: []string{"tasks[1].timeout_seconds"}, + }, + { + name: "task only on terraform reported at its index", + direct: `{"tasks":[{"task_key":"a"}]}`, + terraform: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + 
want: []string{"tasks[1]"}, + }, + { + name: "job_clusters matched by key ignore order", + direct: `{"job_clusters":[{"job_cluster_key":"x","new_cluster":{"num_workers":1}},{"job_cluster_key":"y","new_cluster":{"num_workers":2}}]}`, + terraform: `{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, + want: nil, + }, } for _, tt := range tests { diff --git a/bundle/fuzz/capture_deploy_test.go b/bundle/fuzz/deploy_smoke_test.go similarity index 82% rename from bundle/fuzz/capture_deploy_test.go rename to bundle/fuzz/deploy_smoke_test.go index 2518265d75..d501ee7808 100644 --- a/bundle/fuzz/capture_deploy_test.go +++ b/bundle/fuzz/deploy_smoke_test.go @@ -11,7 +11,7 @@ import ( func TestCaptureJobCreateDirect(t *testing.T) { job := GenerateJob(newRNG(1)) - body, err := CaptureJobCreate(t.Context(), t, job, "direct") + body, err := captureJobCreate(t.Context(), t, job, "direct") require.NoError(t, err) require.NotEmpty(t, body) @@ -22,10 +22,10 @@ func TestCaptureJobCreateDirect(t *testing.T) { } func TestCaptureJobCreateTerraform(t *testing.T) { - RequireTerraform(t) + requireTerraform(t) job := GenerateJob(newRNG(1)) - body, err := CaptureJobCreate(t.Context(), t, job, "terraform") + body, err := captureJobCreate(t.Context(), t, job, "terraform") require.NoError(t, err) require.NotEmpty(t, body) diff --git a/bundle/fuzz/capture_deploy.go b/bundle/fuzz/deploy_test.go similarity index 73% rename from bundle/fuzz/capture_deploy.go rename to bundle/fuzz/deploy_test.go index 0efeaa9ed1..e42dbb7434 100644 --- a/bundle/fuzz/capture_deploy.go +++ b/bundle/fuzz/deploy_test.go @@ -19,7 +19,7 @@ const ( fakeToken = "testtoken" ) -// CaptureJobCreate deploys a bundle containing job through the given engine +// captureJobCreate deploys a bundle containing job through the given engine // ("direct" or "terraform") and returns the create request body sent to the // Jobs API. 
// @@ -31,8 +31,8 @@ const ( // // The terraform engine additionally requires DATABRICKS_TF_EXEC_PATH and // DATABRICKS_TF_CLI_CONFIG_FILE to point at a provisioned terraform binary and -// provider mirror; see RequireTerraform. -func CaptureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, engine string) (json.RawMessage, error) { +// provider mirror; see requireTerraform. +func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, engine string) (json.RawMessage, error) { rec := &recorder{} server := testserver.New(t) server.RequestCallback = rec.callback @@ -61,15 +61,15 @@ func CaptureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, eng return body, nil } -// CompareJobEngines deploys job under both engines and returns the create-payload +// compareJobEngines deploys job under both engines and returns the create-payload // differences that are not covered by DefaultIgnorePaths. An empty result means // the engines produced equivalent create payloads. 
-func CompareJobEngines(ctx context.Context, t *testing.T, job *resources.Job) ([]Difference, error) { - direct, err := CaptureJobCreate(ctx, t, job, "direct") +func compareJobEngines(ctx context.Context, t *testing.T, job *resources.Job) ([]Difference, error) { + direct, err := captureJobCreate(ctx, t, job, "direct") if err != nil { return nil, fmt.Errorf("capturing direct payload: %w", err) } - terraform, err := CaptureJobCreate(ctx, t, job, "terraform") + terraform, err := captureJobCreate(ctx, t, job, "terraform") if err != nil { return nil, fmt.Errorf("capturing terraform payload: %w", err) } @@ -106,10 +106,32 @@ func writeJobBundle(dir, host string, job *resources.Job) error { return os.WriteFile(filepath.Join(dir, "databricks.yml"), data, 0o600) } -// RequireTerraform points the terraform engine at the binary and provider mirror -// provisioned by acceptance/install_terraform.py into /build, and skips the -// test when they are absent so the suite still runs where terraform is not set up. -func RequireTerraform(t testing.TB) { +// fuzzOptInVars are the environment variables that opt a run into the +// terraform-backed parity suite. FUZZ_SEED / FUZZ_SEEDS / FUZZ_SEED_OFFSET double +// as the tuning knobs (see paritySeeds), so setting any of them implies opt-in; +// FUZZ_PARITY is a no-tuning switch used by `task test-fuzz`. +var fuzzOptInVars = []string{"FUZZ_PARITY", "FUZZ_SEED", "FUZZ_SEEDS", "FUZZ_SEED_OFFSET"} + +// requireFuzzOptIn skips unless the run explicitly opted into the terraform +// parity suite. Gating on an env var rather than on the presence of build/ keeps +// a leftover terraform install (from a prior `task test-fuzz` or acceptance run) +// from silently turning a plain `task test` into dozens of real deploys. 
+func requireFuzzOptIn(t testing.TB) { + for _, name := range fuzzOptInVars { + if os.Getenv(name) != "" { + return + } + } + t.Skip("terraform parity suite is opt-in; run `task test-fuzz` or set FUZZ_SEED= to reproduce a single seed") +} + +// requireTerraform opts in via requireFuzzOptIn, then points the terraform engine +// at the binary and provider mirror provisioned by acceptance/install_terraform.py +// into /build, skipping when they are absent so the suite still skips +// cleanly where terraform is not set up. +func requireTerraform(t testing.TB) { + requireFuzzOptIn(t) + buildDir := filepath.Join(repoRoot(t), "build") execPath := filepath.Join(buildDir, "terraform") cfgFile := filepath.Join(buildDir, ".terraformrc") diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 51471b3533..7b0d0df8ea 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -7,6 +7,7 @@ import ( "strings" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -15,12 +16,27 @@ import ( // kept modest; override with FUZZ_SEEDS for a deeper local run. const defaultParitySeeds = 20 +// regressionSeeds are seeds that previously surfaced a terraform/direct create +// payload divergence. They are always checked (in addition to the rotating +// nightly window) so a fixed divergence can never silently regress, even though +// the nightly window moves on every run and would otherwise never revisit them. +// +// When the nightly job reports a new failing FUZZ_SEED, add it here in the same +// PR that fixes the divergence. +// +// - 29: first seed that generates a single-node task-level new_cluster +// (num_workers 0, no autoscale). The direct engine omitted num_workers on +// task clusters while terraform force-sent num_workers:0, so the create +// payloads diverged. Fixed by applying initializeNumWorkers to task clusters +// in resourcemutator.prepareJobSettingsForUpdate. 
+var regressionSeeds = []int64{29} + // TestJobCreateParity is the first DECO-25361 technique: for many random job // configs, assert the terraform and direct engines produce equivalent create // payloads. On divergence it prints the seed and the generated job so the failure // can be reproduced and inspected. func TestJobCreateParity(t *testing.T) { - RequireTerraform(t) + requireTerraform(t) for _, seed := range paritySeeds(t) { t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { @@ -36,10 +52,12 @@ func TestJobCreateParity(t *testing.T) { // reported divergence can be reproduced with one command, without re-running // every seed before it. // -// Otherwise the test runs FUZZ_SEEDS seeds (default defaultParitySeeds) starting -// at FUZZ_SEED_OFFSET. The offset lets the nightly job shift the window every run -// (push.yml derives it from the run number) so CI explores configs it has never -// tested before instead of re-checking the same fixed set forever. +// Otherwise the test runs the regressionSeeds plus FUZZ_SEEDS seeds (default +// defaultParitySeeds) starting at FUZZ_SEED_OFFSET. The offset lets the nightly +// job shift the window every run (push.yml derives it from the run number) so CI +// explores configs it has never tested before instead of re-checking the same +// fixed set forever; the regressionSeeds are always included on top so known +// past divergences keep being verified. 
func paritySeeds(t *testing.T) []int64 { if v := os.Getenv("FUZZ_SEED"); v != "" { var seeds []int64 @@ -71,21 +89,64 @@ func paritySeeds(t *testing.T) []int64 { offset = n } - seeds := make([]int64, 0, count) + seeds := make([]int64, 0, len(regressionSeeds)+count) + seen := make(map[int64]bool, len(regressionSeeds)+count) + for _, s := range regressionSeeds { + if !seen[s] { + seen[s] = true + seeds = append(seeds, s) + } + } for i := range int64(count) { - seeds = append(seeds, offset+i) + s := offset + i + if !seen[s] { + seen[s] = true + seeds = append(seeds, s) + } } return seeds } +func TestParitySeeds(t *testing.T) { + t.Run("default includes regression seeds then window", func(t *testing.T) { + t.Setenv("FUZZ_SEEDS", "3") + t.Setenv("FUZZ_SEED_OFFSET", "100") + want := append(append([]int64{}, regressionSeeds...), 100, 101, 102) + assert.Equal(t, want, paritySeeds(t)) + }) + + t.Run("window overlapping a regression seed is deduplicated", func(t *testing.T) { + t.Setenv("FUZZ_SEEDS", "5") + t.Setenv("FUZZ_SEED_OFFSET", "27") + seeds := paritySeeds(t) + count := 0 + for _, s := range seeds { + if s == 29 { + count++ + } + } + assert.Equal(t, 1, count, "seed 29 must appear once even though it is both a regression seed and inside the window") + }) + + t.Run("FUZZ_SEED override ignores regression seeds", func(t *testing.T) { + t.Setenv("FUZZ_SEED", "7, 8") + assert.Equal(t, []int64{7, 8}, paritySeeds(t)) + }) +} + // FuzzJobCreateParity exposes the same parity check to Go's native fuzzer // (`go test -fuzz=FuzzJobCreateParity`). Note each input runs two real deploys, // so this is intended for ad-hoc deep runs, not the default `go test` path. func FuzzJobCreateParity(f *testing.F) { - RequireTerraform(f) + requireTerraform(f) for seed := range int64(5) { f.Add(seed) } + // Seed the corpus with known past divergences so the fuzzer always starts + // from inputs that previously exposed a bug. 
+ for _, seed := range regressionSeeds { + f.Add(seed) + } f.Fuzz(func(t *testing.T, seed int64) { checkJobParity(t, seed) }) @@ -97,7 +158,7 @@ func checkJobParity(t *testing.T, seed int64) { t.Helper() job := GenerateJob(newRNG(seed)) - diffs, err := CompareJobEngines(t.Context(), t, job) + diffs, err := compareJobEngines(t.Context(), t, job) require.NoErrorf(t, err, "seed %d", seed) if len(diffs) > 0 { @@ -106,6 +167,6 @@ func checkJobParity(t *testing.T, seed int64) { for _, d := range diffs { t.Errorf(" %s", d) } - t.Logf("reproduce with: FUZZ_SEED=%d go test ./bundle/fuzz -run TestJobCreateParity\n%s", seed, jobJSON) + t.Logf("reproduce with: FUZZ_SEED=%d task test-fuzz\nonce fixed, add %d to regressionSeeds in bundle/fuzz/fuzz_test.go\n%s", seed, seed, jobJSON) } } diff --git a/bundle/fuzz/capture.go b/bundle/fuzz/recorder_test.go similarity index 86% rename from bundle/fuzz/capture.go rename to bundle/fuzz/recorder_test.go index fe10bc10be..244cb81480 100644 --- a/bundle/fuzz/capture.go +++ b/bundle/fuzz/recorder_test.go @@ -10,13 +10,13 @@ import ( // jobsCreatePath is the Jobs API route both engines must hit on create. The // direct engine posts here via the SDK and the terraform provider is expected to // as well. The testserver registers only this exact route, so if an engine ever -// posted to a different version the deploy would 404 and CaptureJobCreate would +// posted to a different version the deploy would 404 and captureJobCreate would // fail with "did not POST". A version skew therefore surfaces as a capture // failure, not as a payload diff. const jobsCreatePath = "/api/2.2/jobs/create" -// CapturedRequest is a single mutating API request observed by the testserver. -type CapturedRequest struct { +// capturedRequest is a single mutating API request observed by the testserver. +type capturedRequest struct { Method string Path string Body json.RawMessage @@ -27,7 +27,7 @@ type CapturedRequest struct { // goroutines. 
type recorder struct { mu sync.Mutex - requests []CapturedRequest + requests []capturedRequest } func (r *recorder) callback(req *testserver.Request) { @@ -40,7 +40,7 @@ func (r *recorder) callback(req *testserver.Request) { body = append(json.RawMessage(nil), req.Body...) } - r.requests = append(r.requests, CapturedRequest{ + r.requests = append(r.requests, capturedRequest{ Method: req.Method, Path: req.URL.Path, Body: body, From 94ecaaa800d8d4665c7d5efe68ba6b03fa6ffa4a Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Wed, 24 Jun 2026 13:34:51 +0000 Subject: [PATCH 08/24] bundle/fuzz: make the whole package test-only and harden parity reporting Move the remaining generator/diff/rand implementation into _test.go files (keeping only a doc.go for the package comment) so nothing in the harness compiles into the regular build, since no product code imports it. Distinguish deploy/capture failures from create-payload divergences in checkJobParity: skip when neither engine deploys the generated config, fail distinctly when exactly one engine accepts it (an acceptance divergence, not a payload diff), and only diff payloads when both deploys succeed. This keeps nightly triage from misdirecting a deploy failure into regressionSeeds. Also document the unique-identity-key assumption in diffKeyedSlice. 
--- bundle/fuzz/compare.go | 268 ----------------- bundle/fuzz/compare_cases_test.go | 119 ++++++++ bundle/fuzz/compare_test.go | 374 +++++++++++++++++------- bundle/fuzz/deploy_test.go | 15 - bundle/fuzz/doc.go | 17 ++ bundle/fuzz/fuzz_test.go | 27 +- bundle/fuzz/generate.go | 356 ---------------------- bundle/fuzz/generate_invariants_test.go | 47 +++ bundle/fuzz/generate_test.go | 358 +++++++++++++++++++++-- bundle/fuzz/{rand.go => rand_test.go} | 0 10 files changed, 800 insertions(+), 781 deletions(-) delete mode 100644 bundle/fuzz/compare.go create mode 100644 bundle/fuzz/compare_cases_test.go create mode 100644 bundle/fuzz/doc.go delete mode 100644 bundle/fuzz/generate.go create mode 100644 bundle/fuzz/generate_invariants_test.go rename bundle/fuzz/{rand.go => rand_test.go} (100%) diff --git a/bundle/fuzz/compare.go b/bundle/fuzz/compare.go deleted file mode 100644 index 81c1bc7afb..0000000000 --- a/bundle/fuzz/compare.go +++ /dev/null @@ -1,268 +0,0 @@ -package fuzz - -import ( - "bytes" - "encoding/json" - "fmt" - "regexp" - "slices" - "strconv" - "strings" -) - -// Difference is a single mismatch between the two engines' create payloads, -// located by a JSON-ish path (e.g. "tasks[0].new_cluster.num_workers"). -type Difference struct { - Path string - Direct any - Terraform any -} - -func (d Difference) String() string { - return fmt.Sprintf("%s: direct=%s terraform=%s", d.Path, render(d.Direct), render(d.Terraform)) -} - -// missing marks a value that is absent on one side. -type missing struct{} - -func render(v any) string { - if _, ok := v.(missing); ok { - return "" - } - b, err := json.Marshal(v) - if err != nil { - return fmt.Sprintf("%v", v) - } - return string(b) -} - -// DiffPayloads decodes both create payloads and returns every difference whose -// path is not explicitly ignored. ignorePaths are matched exactly against the -// rendered path, with "[*]" standing in for any slice index. 
-func DiffPayloads(direct, terraform json.RawMessage, ignorePaths []string) ([]Difference, error) { - d, err := decode(direct) - if err != nil { - return nil, fmt.Errorf("decoding direct payload: %w", err) - } - tf, err := decode(terraform) - if err != nil { - return nil, fmt.Errorf("decoding terraform payload: %w", err) - } - - var diffs []Difference - diffValue("", d, tf, &diffs) - - ignore := make(map[string]bool, len(ignorePaths)) - for _, p := range ignorePaths { - ignore[p] = true - } - - filtered := diffs[:0] - for _, diff := range diffs { - if !ignore[normalizePath(diff.Path)] { - filtered = append(filtered, diff) - } - } - return filtered, nil -} - -// decode unmarshals JSON using UseNumber so large int64 values (e.g. job ids, -// spark_context_id) are not corrupted by float64 rounding. See the encoding rule -// in the repo style guide. -func decode(raw json.RawMessage) (any, error) { - if len(raw) == 0 { - return nil, nil - } - dec := json.NewDecoder(bytes.NewReader(raw)) - dec.UseNumber() - var v any - if err := dec.Decode(&v); err != nil { - return nil, err - } - return v, nil -} - -func diffValue(path string, a, b any, diffs *[]Difference) { - switch av := a.(type) { - case map[string]any: - bv, ok := b.(map[string]any) - if !ok { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) - return - } - keys := unionKeys(av, bv) - for _, k := range keys { - achild, aok := av[k] - bchild, bok := bv[k] - child := joinKey(path, k) - switch { - case aok && bok: - diffValue(child, achild, bchild, diffs) - case aok: - *diffs = append(*diffs, Difference{Path: child, Direct: achild, Terraform: missing{}}) - default: - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bchild}) - } - } - case []any: - bv, ok := b.([]any) - if !ok { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) - return - } - // Slices whose elements carry a natural identity key (tasks, job clusters) - // are matched by that 
key so an engine emitting the same elements in a - // different order is not reported as a difference. Everything else is - // compared positionally. - if key := identityKey(av, bv); key != "" { - diffKeyedSlice(path, key, av, bv, diffs) - return - } - n := max(len(av), len(bv)) - for i := range n { - child := fmt.Sprintf("%s[%d]", path, i) - switch { - case i < len(av) && i < len(bv): - diffValue(child, av[i], bv[i], diffs) - case i < len(av): - *diffs = append(*diffs, Difference{Path: child, Direct: av[i], Terraform: missing{}}) - default: - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bv[i]}) - } - } - default: - if !scalarEqual(a, b) { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) - } - } -} - -// identityFields are the keys, in priority order, that uniquely identify the -// elements of a payload slice. Job tasks and shared job clusters are the slices -// whose order is not significant but which the engines may emit differently. -var identityFields = []string{"task_key", "job_cluster_key"} - -// identityKey returns the field that identifies every element of both slices, or -// "" if the elements are not uniformly keyed objects (in which case the caller -// falls back to positional comparison). -func identityKey(a, b []any) string { - for _, field := range identityFields { - if allHaveKey(a, field) && allHaveKey(b, field) { - return field - } - } - return "" -} - -func allHaveKey(s []any, field string) bool { - if len(s) == 0 { - return false - } - for _, el := range s { - m, ok := el.(map[string]any) - if !ok { - return false - } - if _, ok := m[field].(string); !ok { - return false - } - } - return true -} - -// diffKeyedSlice matches elements of a and b by the value of key (which is unique -// within each slice for tasks/job clusters) and diffs each matched pair, -// reporting unmatched elements as present-on-one-side. Paths keep numeric indices -// so ignore-path [*] normalization still applies. 
-func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { - bByKey := make(map[string]any, len(b)) - for _, el := range b { - bByKey[el.(map[string]any)[key].(string)] = el - } - - matched := make(map[string]bool, len(a)) - for i, el := range a { - child := fmt.Sprintf("%s[%d]", path, i) - k := el.(map[string]any)[key].(string) - matched[k] = true - if bel, ok := bByKey[k]; ok { - diffValue(child, el, bel, diffs) - } else { - *diffs = append(*diffs, Difference{Path: child, Direct: el, Terraform: missing{}}) - } - } - for j, el := range b { - k := el.(map[string]any)[key].(string) - if matched[k] { - continue - } - child := fmt.Sprintf("%s[%d]", path, j) - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: el}) - } -} - -// scalarEqual compares two JSON scalars. json.Number is compared by its string -// form so 1 and 1.0 don't masquerade as equal across engines. -func scalarEqual(a, b any) bool { - an, aok := a.(json.Number) - bn, bok := b.(json.Number) - if aok && bok { - return an.String() == bn.String() - } - return a == b -} - -func unionKeys(a, b map[string]any) []string { - seen := map[string]bool{} - var keys []string - for k := range a { - if !seen[k] { - seen[k] = true - keys = append(keys, k) - } - } - for k := range b { - if !seen[k] { - seen[k] = true - keys = append(keys, k) - } - } - slices.Sort(keys) - return keys -} - -func joinKey(path, key string) string { - // Map keys can themselves contain dots or brackets (e.g. spark_conf entries - // like "spark.databricks.delta.preview.enabled"). Render those as bracketed, - // quoted segments so the path stays unambiguous and ignore entries can target - // a single key. - if key == "" || strings.ContainsAny(key, `.[]"`) { - return path + "[" + strconv.Quote(key) + "]" - } - if path == "" { - return key - } - return path + "." + key -} - -// indexRe matches numeric slice indices like "[12]" but not quoted string keys -// like ["spark.x"]. 
-var indexRe = regexp.MustCompile(`\[\d+\]`) - -// normalizePath replaces concrete slice indices with [*] so a single ignore -// entry can cover every element of a slice. -func normalizePath(path string) string { - return indexRe.ReplaceAllString(path, "[*]") -} - -// DefaultIgnorePaths lists create-payload paths that legitimately differ between -// the engines and are not parity bugs. Keep this list small and well-justified; -// every entry is a known, intentional divergence. -var DefaultIgnorePaths = []string{ - // The terraform provider strips the deprecated/ignored spark conf - // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while - // the direct engine forwards it verbatim. The backend ignores the key either - // way, so this is a benign provider-side filter rather than a parity bug. - `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, - `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, -} diff --git a/bundle/fuzz/compare_cases_test.go b/bundle/fuzz/compare_cases_test.go new file mode 100644 index 0000000000..46e506d75c --- /dev/null +++ b/bundle/fuzz/compare_cases_test.go @@ -0,0 +1,119 @@ +package fuzz + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestDiffPayloads(t *testing.T) { + tests := []struct { + name string + direct string + terraform string + ignore []string + want []string + }{ + { + name: "identical", + direct: `{"name":"a","tasks":[{"task_key":"t"}]}`, + terraform: `{"name":"a","tasks":[{"task_key":"t"}]}`, + want: nil, + }, + { + name: "scalar mismatch", + direct: `{"name":"a"}`, + terraform: `{"name":"b"}`, + want: []string{"name"}, + }, + { + name: "missing on terraform", + direct: `{"name":"a","queue":{"enabled":true}}`, + terraform: `{"name":"a"}`, + want: []string{"queue"}, + }, + { + name: "missing on direct", + direct: `{"name":"a"}`, + terraform: 
`{"name":"a","max_concurrent_runs":1}`, + want: []string{"max_concurrent_runs"}, + }, + { + name: "nested slice element mismatch", + direct: `{"tasks":[{"task_key":"t","timeout_seconds":1}]}`, + terraform: `{"tasks":[{"task_key":"t","timeout_seconds":2}]}`, + want: []string{"tasks[0].timeout_seconds"}, + }, + { + name: "slice length mismatch", + direct: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + terraform: `{"tasks":[{"task_key":"a"}]}`, + want: []string{"tasks[1]"}, + }, + { + name: "number 1 vs 1.0 differ", + direct: `{"n":1}`, + terraform: `{"n":1.0}`, + want: []string{"n"}, + }, + { + name: "ignored path", + direct: `{"tasks":[{"timeout_seconds":1}]}`, + terraform: `{"tasks":[{"timeout_seconds":2}]}`, + ignore: []string{"tasks[*].timeout_seconds"}, + want: nil, + }, + { + name: "dotted map key is bracket-quoted", + direct: `{"spark_conf":{"spark.x.y":"1"}}`, + terraform: `{"spark_conf":{}}`, + want: []string{`spark_conf["spark.x.y"]`}, + }, + { + name: "dotted map key can be ignored", + direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, + terraform: `{"c":{"spark_conf":{}}}`, + ignore: []string{`c.spark_conf["spark.x.y"]`}, + want: nil, + }, + { + name: "tasks matched by key ignore order", + direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, + terraform: `{"tasks":[{"task_key":"b","timeout_seconds":2},{"task_key":"a","timeout_seconds":1}]}`, + want: nil, + }, + { + name: "tasks matched by key surface real diff at direct index", + direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, + terraform: `{"tasks":[{"task_key":"b","timeout_seconds":9},{"task_key":"a","timeout_seconds":1}]}`, + want: []string{"tasks[1].timeout_seconds"}, + }, + { + name: "task only on terraform reported at its index", + direct: `{"tasks":[{"task_key":"a"}]}`, + terraform: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + want: []string{"tasks[1]"}, + }, + { + name: "job_clusters matched by 
key ignore order", + direct: `{"job_clusters":[{"job_cluster_key":"x","new_cluster":{"num_workers":1}},{"job_cluster_key":"y","new_cluster":{"num_workers":2}}]}`, + terraform: `{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, + want: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + diffs, err := DiffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) + require.NoError(t, err) + + var paths []string + for _, d := range diffs { + paths = append(paths, d.Path) + } + assert.ElementsMatch(t, tt.want, paths) + }) + } +} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go index 46e506d75c..fd6807b56c 100644 --- a/bundle/fuzz/compare_test.go +++ b/bundle/fuzz/compare_test.go @@ -1,119 +1,273 @@ package fuzz import ( + "bytes" "encoding/json" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + "fmt" + "regexp" + "slices" + "strconv" + "strings" ) -func TestDiffPayloads(t *testing.T) { - tests := []struct { - name string - direct string - terraform string - ignore []string - want []string - }{ - { - name: "identical", - direct: `{"name":"a","tasks":[{"task_key":"t"}]}`, - terraform: `{"name":"a","tasks":[{"task_key":"t"}]}`, - want: nil, - }, - { - name: "scalar mismatch", - direct: `{"name":"a"}`, - terraform: `{"name":"b"}`, - want: []string{"name"}, - }, - { - name: "missing on terraform", - direct: `{"name":"a","queue":{"enabled":true}}`, - terraform: `{"name":"a"}`, - want: []string{"queue"}, - }, - { - name: "missing on direct", - direct: `{"name":"a"}`, - terraform: `{"name":"a","max_concurrent_runs":1}`, - want: []string{"max_concurrent_runs"}, - }, - { - name: "nested slice element mismatch", - direct: `{"tasks":[{"task_key":"t","timeout_seconds":1}]}`, - terraform: `{"tasks":[{"task_key":"t","timeout_seconds":2}]}`, - want: 
[]string{"tasks[0].timeout_seconds"}, - }, - { - name: "slice length mismatch", - direct: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - terraform: `{"tasks":[{"task_key":"a"}]}`, - want: []string{"tasks[1]"}, - }, - { - name: "number 1 vs 1.0 differ", - direct: `{"n":1}`, - terraform: `{"n":1.0}`, - want: []string{"n"}, - }, - { - name: "ignored path", - direct: `{"tasks":[{"timeout_seconds":1}]}`, - terraform: `{"tasks":[{"timeout_seconds":2}]}`, - ignore: []string{"tasks[*].timeout_seconds"}, - want: nil, - }, - { - name: "dotted map key is bracket-quoted", - direct: `{"spark_conf":{"spark.x.y":"1"}}`, - terraform: `{"spark_conf":{}}`, - want: []string{`spark_conf["spark.x.y"]`}, - }, - { - name: "dotted map key can be ignored", - direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, - terraform: `{"c":{"spark_conf":{}}}`, - ignore: []string{`c.spark_conf["spark.x.y"]`}, - want: nil, - }, - { - name: "tasks matched by key ignore order", - direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, - terraform: `{"tasks":[{"task_key":"b","timeout_seconds":2},{"task_key":"a","timeout_seconds":1}]}`, - want: nil, - }, - { - name: "tasks matched by key surface real diff at direct index", - direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, - terraform: `{"tasks":[{"task_key":"b","timeout_seconds":9},{"task_key":"a","timeout_seconds":1}]}`, - want: []string{"tasks[1].timeout_seconds"}, - }, - { - name: "task only on terraform reported at its index", - direct: `{"tasks":[{"task_key":"a"}]}`, - terraform: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - want: []string{"tasks[1]"}, - }, - { - name: "job_clusters matched by key ignore order", - direct: `{"job_clusters":[{"job_cluster_key":"x","new_cluster":{"num_workers":1}},{"job_cluster_key":"y","new_cluster":{"num_workers":2}}]}`, - terraform: 
`{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, - want: nil, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - diffs, err := DiffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) - require.NoError(t, err) - - var paths []string - for _, d := range diffs { - paths = append(paths, d.Path) +// Difference is a single mismatch between the two engines' create payloads, +// located by a JSON-ish path (e.g. "tasks[0].new_cluster.num_workers"). +type Difference struct { + Path string + Direct any + Terraform any +} + +func (d Difference) String() string { + return fmt.Sprintf("%s: direct=%s terraform=%s", d.Path, render(d.Direct), render(d.Terraform)) +} + +// missing marks a value that is absent on one side. +type missing struct{} + +func render(v any) string { + if _, ok := v.(missing); ok { + return "" + } + b, err := json.Marshal(v) + if err != nil { + return fmt.Sprintf("%v", v) + } + return string(b) +} + +// DiffPayloads decodes both create payloads and returns every difference whose +// path is not explicitly ignored. ignorePaths are matched exactly against the +// rendered path, with "[*]" standing in for any slice index. 
+func DiffPayloads(direct, terraform json.RawMessage, ignorePaths []string) ([]Difference, error) { + d, err := decode(direct) + if err != nil { + return nil, fmt.Errorf("decoding direct payload: %w", err) + } + tf, err := decode(terraform) + if err != nil { + return nil, fmt.Errorf("decoding terraform payload: %w", err) + } + + var diffs []Difference + diffValue("", d, tf, &diffs) + + ignore := make(map[string]bool, len(ignorePaths)) + for _, p := range ignorePaths { + ignore[p] = true + } + + filtered := diffs[:0] + for _, diff := range diffs { + if !ignore[normalizePath(diff.Path)] { + filtered = append(filtered, diff) + } + } + return filtered, nil +} + +// decode unmarshals JSON using UseNumber so large int64 values (e.g. job ids, +// spark_context_id) are not corrupted by float64 rounding. See the encoding rule +// in the repo style guide. +func decode(raw json.RawMessage) (any, error) { + if len(raw) == 0 { + return nil, nil + } + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + var v any + if err := dec.Decode(&v); err != nil { + return nil, err + } + return v, nil +} + +func diffValue(path string, a, b any, diffs *[]Difference) { + switch av := a.(type) { + case map[string]any: + bv, ok := b.(map[string]any) + if !ok { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + return + } + keys := unionKeys(av, bv) + for _, k := range keys { + achild, aok := av[k] + bchild, bok := bv[k] + child := joinKey(path, k) + switch { + case aok && bok: + diffValue(child, achild, bchild, diffs) + case aok: + *diffs = append(*diffs, Difference{Path: child, Direct: achild, Terraform: missing{}}) + default: + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bchild}) + } + } + case []any: + bv, ok := b.([]any) + if !ok { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + return + } + // Slices whose elements carry a natural identity key (tasks, job clusters) + // are matched by that 
key so an engine emitting the same elements in a + // different order is not reported as a difference. Everything else is + // compared positionally. + if key := identityKey(av, bv); key != "" { + diffKeyedSlice(path, key, av, bv, diffs) + return + } + n := max(len(av), len(bv)) + for i := range n { + child := fmt.Sprintf("%s[%d]", path, i) + switch { + case i < len(av) && i < len(bv): + diffValue(child, av[i], bv[i], diffs) + case i < len(av): + *diffs = append(*diffs, Difference{Path: child, Direct: av[i], Terraform: missing{}}) + default: + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bv[i]}) } - assert.ElementsMatch(t, tt.want, paths) - }) + } + default: + if !scalarEqual(a, b) { + *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + } + } +} + +// identityFields are the keys, in priority order, that uniquely identify the +// elements of a payload slice. Job tasks and shared job clusters are the slices +// whose order is not significant but which the engines may emit differently. +var identityFields = []string{"task_key", "job_cluster_key"} + +// identityKey returns the field that identifies every element of both slices, or +// "" if the elements are not uniformly keyed objects (in which case the caller +// falls back to positional comparison). +func identityKey(a, b []any) string { + for _, field := range identityFields { + if allHaveKey(a, field) && allHaveKey(b, field) { + return field + } + } + return "" +} + +func allHaveKey(s []any, field string) bool { + if len(s) == 0 { + return false + } + for _, el := range s { + m, ok := el.(map[string]any) + if !ok { + return false + } + if _, ok := m[field].(string); !ok { + return false + } + } + return true +} + +// diffKeyedSlice matches elements of a and b by the value of key (which is unique +// within each slice for tasks/job clusters) and diffs each matched pair, +// reporting unmatched elements as present-on-one-side. 
Paths keep numeric indices +// so ignore-path [*] normalization still applies. +func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { + // identityFields are unique within a slice by API contract (no two job tasks + // share a task_key, no two job_clusters share a job_cluster_key), so keying by + // them is unambiguous. If a payload ever repeated a key, last-one-wins here and + // the duplicate would be mismatched rather than reported precisely; callers + // outside the job-create harness must not rely on this for non-unique keys. + bByKey := make(map[string]any, len(b)) + for _, el := range b { + bByKey[el.(map[string]any)[key].(string)] = el + } + + matched := make(map[string]bool, len(a)) + for i, el := range a { + child := fmt.Sprintf("%s[%d]", path, i) + k := el.(map[string]any)[key].(string) + matched[k] = true + if bel, ok := bByKey[k]; ok { + diffValue(child, el, bel, diffs) + } else { + *diffs = append(*diffs, Difference{Path: child, Direct: el, Terraform: missing{}}) + } + } + for j, el := range b { + k := el.(map[string]any)[key].(string) + if matched[k] { + continue + } + child := fmt.Sprintf("%s[%d]", path, j) + *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: el}) + } +} + +// scalarEqual compares two JSON scalars. json.Number is compared by its string +// form so 1 and 1.0 don't masquerade as equal across engines. +func scalarEqual(a, b any) bool { + an, aok := a.(json.Number) + bn, bok := b.(json.Number) + if aok && bok { + return an.String() == bn.String() } + return a == b +} + +func unionKeys(a, b map[string]any) []string { + seen := map[string]bool{} + var keys []string + for k := range a { + if !seen[k] { + seen[k] = true + keys = append(keys, k) + } + } + for k := range b { + if !seen[k] { + seen[k] = true + keys = append(keys, k) + } + } + slices.Sort(keys) + return keys +} + +func joinKey(path, key string) string { + // Map keys can themselves contain dots or brackets (e.g. 
spark_conf entries + // like "spark.databricks.delta.preview.enabled"). Render those as bracketed, + // quoted segments so the path stays unambiguous and ignore entries can target + // a single key. + if key == "" || strings.ContainsAny(key, `.[]"`) { + return path + "[" + strconv.Quote(key) + "]" + } + if path == "" { + return key + } + return path + "." + key +} + +// indexRe matches numeric slice indices like "[12]" but not quoted string keys +// like ["spark.x"]. +var indexRe = regexp.MustCompile(`\[\d+\]`) + +// normalizePath replaces concrete slice indices with [*] so a single ignore +// entry can cover every element of a slice. +func normalizePath(path string) string { + return indexRe.ReplaceAllString(path, "[*]") +} + +// DefaultIgnorePaths lists create-payload paths that legitimately differ between +// the engines and are not parity bugs. Keep this list small and well-justified; +// every entry is a known, intentional divergence. +var DefaultIgnorePaths = []string{ + // The terraform provider strips the deprecated/ignored spark conf + // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while + // the direct engine forwards it verbatim. The backend ignores the key either + // way, so this is a benign provider-side filter rather than a parity bug. + `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, + `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, } diff --git a/bundle/fuzz/deploy_test.go b/bundle/fuzz/deploy_test.go index e42dbb7434..2328e0354e 100644 --- a/bundle/fuzz/deploy_test.go +++ b/bundle/fuzz/deploy_test.go @@ -61,21 +61,6 @@ func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, eng return body, nil } -// compareJobEngines deploys job under both engines and returns the create-payload -// differences that are not covered by DefaultIgnorePaths. An empty result means -// the engines produced equivalent create payloads. 
-func compareJobEngines(ctx context.Context, t *testing.T, job *resources.Job) ([]Difference, error) { - direct, err := captureJobCreate(ctx, t, job, "direct") - if err != nil { - return nil, fmt.Errorf("capturing direct payload: %w", err) - } - terraform, err := captureJobCreate(ctx, t, job, "terraform") - if err != nil { - return nil, fmt.Errorf("capturing terraform payload: %w", err) - } - return DiffPayloads(direct, terraform, DefaultIgnorePaths) -} - // writeJobBundle writes a minimal databricks.yml describing a single job. The // document is emitted as JSON, which is valid YAML, so we can reuse the job's // own JSON marshaling (which honors ForceSendFields) without a YAML dependency. diff --git a/bundle/fuzz/doc.go b/bundle/fuzz/doc.go new file mode 100644 index 0000000000..cf898d3ec1 --- /dev/null +++ b/bundle/fuzz/doc.go @@ -0,0 +1,17 @@ +// Package fuzz provides randomized generators and harnesses that compare how the +// terraform and direct deploy engines translate the same bundle resource into an +// API create payload. See DECO-25361. +// +// The first technique implemented here generates a random resource config and +// checks for differences in the create payload between the terraform and direct +// engines. Generators are seeded so that any divergence found by the fuzz driver +// can be reproduced from the printed seed. +// +// Only jobs are covered for now. Extending the harness to other resource kinds +// (pipelines, apps, ...) is tracked as follow-up work under DECO-25361. +// +// Everything else in the package lives in _test.go files: the package is a +// test-only utility and nothing in the product imports it, so keeping the logic +// out of the regular build avoids shipping dead code. This file exists only to +// carry the package documentation in a non-test file. 
+package fuzz diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 7b0d0df8ea..88a3c5a3b6 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -154,12 +154,35 @@ func FuzzJobCreateParity(f *testing.F) { // checkJobParity generates the job for seed, deploys it under both engines, and // fails the test with reproduction details if the create payloads diverge. +// +// A deploy/capture failure is not a create-payload divergence, so the three +// outcomes are handled distinctly to keep nightly triage from misdirecting a +// deploy failure into regressionSeeds (which is only for real payload diffs): +// - neither engine deployed: the generator produced a config nothing accepts, +// so skip (logging both errors) rather than flag a parity bug. +// - exactly one engine deployed: the engines disagree on whether the config is +// even valid. That is a real divergence worth failing on, but an acceptance +// divergence, not a payload diff, so it is reported as such. +// - both deployed: compare the captured create payloads. 
func checkJobParity(t *testing.T, seed int64) { t.Helper() job := GenerateJob(newRNG(seed)) - diffs, err := compareJobEngines(t.Context(), t, job) - require.NoErrorf(t, err, "seed %d", seed) + ctx := t.Context() + direct, directErr := captureJobCreate(ctx, t, job, "direct") + terraform, tfErr := captureJobCreate(ctx, t, job, "terraform") + + switch { + case directErr != nil && tfErr != nil: + t.Skipf("seed %d: config did not deploy under either engine (not a parity divergence)\ndirect: %v\nterraform: %v", seed, directErr, tfErr) + case directErr != nil: + t.Fatalf("seed %d: direct rejected a config terraform accepted (engine acceptance divergence, not a payload diff): %v", seed, directErr) + case tfErr != nil: + t.Fatalf("seed %d: terraform rejected a config direct accepted (engine acceptance divergence, not a payload diff): %v", seed, tfErr) + } + + diffs, err := DiffPayloads(direct, terraform, DefaultIgnorePaths) + require.NoErrorf(t, err, "seed %d: comparing create payloads", seed) if len(diffs) > 0 { jobJSON, _ := json.MarshalIndent(job, "", " ") diff --git a/bundle/fuzz/generate.go b/bundle/fuzz/generate.go deleted file mode 100644 index 697748e03f..0000000000 --- a/bundle/fuzz/generate.go +++ /dev/null @@ -1,356 +0,0 @@ -// Package fuzz provides randomized generators and harnesses that compare how the -// terraform and direct deploy engines translate the same bundle resource into an -// API create payload. See DECO-25361. -// -// The first technique implemented here generates a random resource config and -// checks for differences in the create payload between the terraform and direct -// engines. Generators are seeded so that any divergence found by the fuzz driver -// can be reproduced from the printed seed. -// -// Only jobs are covered for now. Extending the harness to other resource kinds -// (pipelines, apps, ...) is tracked as follow-up work under DECO-25361. 
-package fuzz - -import ( - "fmt" - "math/rand/v2" - "strconv" - - "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/jobs" -) - -// Value pools are intentionally small and valid-looking: the goal is to exercise -// the engines' config->payload translation across many field combinations, not to -// stress the API with invalid values (which the testserver would reject before we -// can compare payloads). -var ( - sparkVersions = []string{"13.3.x-scala2.12", "14.3.x-scala2.12", "15.4.x-scala2.12", "16.4.x-scala2.12"} - nodeTypeIDs = []string{"i3.xlarge", "m5.large", "r5.xlarge", "Standard_DS3_v2"} - timezones = []string{"UTC", "America/Los_Angeles", "Europe/Amsterdam"} - cronExprs = []string{"0 0 12 * * ?", "0 15 10 ? * MON-FRI", "0 0/30 * * * ?"} - pauseStatuses = []jobs.PauseStatus{jobs.PauseStatusPaused, jobs.PauseStatusUnpaused} - performance = []jobs.PerformanceTarget{jobs.PerformanceTargetPerformanceOptimized, jobs.PerformanceTargetStandard} - timeUnits = []string{"HOURS", "DAYS", "WEEKS"} - healthMetrics = []string{"RUN_DURATION_SECONDS", "STREAMING_BACKLOG_BYTES", "STREAMING_BACKLOG_RECORDS"} - conditionOps = []string{"EQUAL_TO", "NOT_EQUAL", "GREATER_THAN", "LESS_THAN_OR_EQUAL"} - runIfs = []string{"ALL_SUCCESS", "AT_LEAST_ONE_SUCCESS", "NONE_FAILED", "ALL_DONE"} - gitProviders = []jobs.GitProvider{jobs.GitProviderGitHub, jobs.GitProviderGitLab, jobs.GitProviderAzureDevOpsServices} -) - -// GenerateJob builds a random, well-formed job config driven entirely by rng, so -// the same seed always produces the same job. It deliberately favors fields whose -// translation tends to differ between engines (tasks, clusters, schedules, -// notifications, tags, zero-able scalars). -// -// TODO(DECO-25361): generalize the harness across resource kinds so pipelines, -// apps, etc. get the same create-payload parity coverage as jobs. 
-func GenerateJob(rng *rand.Rand) *resources.Job { - job := &resources.Job{} - job.Name = randName(rng, "job") - - if chance(rng, 0.5) { - job.Description = randSentence(rng) - } - if chance(rng, 0.4) { - job.MaxConcurrentRuns = rng.IntN(10) + 1 - } - if chance(rng, 0.4) { - job.TimeoutSeconds = rng.IntN(7200) - } - if chance(rng, 0.3) { - job.PerformanceTarget = oneOf(rng, performance) - } - if chance(rng, 0.5) { - job.Tags = randTags(rng) - } - if chance(rng, 0.3) { - job.GitSource = randGitSource(rng) - } - - randScheduling(rng, job) - - if chance(rng, 0.3) { - job.EmailNotifications = randEmailNotifications(rng) - } - if chance(rng, 0.2) { - job.WebhookNotifications = randWebhookNotifications(rng) - } - if chance(rng, 0.3) { - job.NotificationSettings = &jobs.JobNotificationSettings{ - NoAlertForCanceledRuns: chance(rng, 0.5), - NoAlertForSkippedRuns: chance(rng, 0.5), - } - } - if chance(rng, 0.3) { - job.Health = randHealth(rng) - } - if chance(rng, 0.3) { - job.Parameters = randParameters(rng) - } - if chance(rng, 0.3) { - job.Queue = &jobs.QueueSettings{Enabled: chance(rng, 0.5)} - } - - // Generate shared job clusters first so tasks can reference them by key. - var jobClusterKeys []string - if chance(rng, 0.5) { - n := rng.IntN(2) + 1 - for i := range n { - key := fmt.Sprintf("cluster_%d", i) - jobClusterKeys = append(jobClusterKeys, key) - job.JobClusters = append(job.JobClusters, jobs.JobCluster{ - JobClusterKey: key, - NewCluster: randClusterSpec(rng), - }) - } - } - - nTasks := rng.IntN(3) + 1 - var taskKeys []string - for i := range nTasks { - task := randTask(rng, i, jobClusterKeys) - // Randomly chain dependencies onto previously generated tasks. 
- if len(taskKeys) > 0 && chance(rng, 0.4) { - dep := taskKeys[rng.IntN(len(taskKeys))] - task.DependsOn = []jobs.TaskDependency{{TaskKey: dep}} - if chance(rng, 0.5) { - task.RunIf = jobs.RunIf(oneOf(rng, runIfs)) - } - } - taskKeys = append(taskKeys, task.TaskKey) - job.Tasks = append(job.Tasks, task) - } - - return job -} - -// randScheduling sets at most one of schedule/trigger/continuous, which are -// mutually exclusive ways to launch a job. -func randScheduling(rng *rand.Rand, job *resources.Job) { - switch rng.IntN(5) { - case 0: - job.Schedule = &jobs.CronSchedule{ - QuartzCronExpression: oneOf(rng, cronExprs), - TimezoneId: oneOf(rng, timezones), - PauseStatus: oneOf(rng, pauseStatuses), - } - case 1: - job.Trigger = &jobs.TriggerSettings{ - PauseStatus: oneOf(rng, pauseStatuses), - Periodic: &jobs.PeriodicTriggerConfiguration{ - Interval: rng.IntN(12) + 1, - Unit: jobs.PeriodicTriggerConfigurationTimeUnit(oneOf(rng, timeUnits)), - }, - } - case 2: - job.Trigger = &jobs.TriggerSettings{ - PauseStatus: oneOf(rng, pauseStatuses), - FileArrival: &jobs.FileArrivalTriggerConfiguration{ - Url: "s3://" + randWord(rng) + "/" + randWord(rng), - }, - } - case 3: - job.Continuous = &jobs.Continuous{PauseStatus: oneOf(rng, pauseStatuses)} - default: - // no scheduling - } -} - -func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { - task := jobs.Task{TaskKey: fmt.Sprintf("task_%d", idx)} - - // Use absolute workspace paths with source=WORKSPACE so the generated bundle - // never depends on local files existing on disk (which deploy would reject). - // condition_task needs no compute, so it is handled separately below. 
- needsCompute := true - switch rng.IntN(4) { - case 0: - task.NotebookTask = &jobs.NotebookTask{ - NotebookPath: "/Workspace/Users/test/" + randName(rng, "nb"), - Source: jobs.SourceWorkspace, - } - case 1: - task.SparkPythonTask = &jobs.SparkPythonTask{ - PythonFile: "/Workspace/Users/test/" + randName(rng, "main") + ".py", - Source: jobs.SourceWorkspace, - } - case 2: - task.PythonWheelTask = &jobs.PythonWheelTask{ - PackageName: randName(rng, "pkg"), - EntryPoint: "main", - } - case 3: - task.ConditionTask = &jobs.ConditionTask{ - Left: randWord(rng), - Op: jobs.ConditionTaskOp(oneOf(rng, conditionOps)), - Right: randWord(rng), - } - needsCompute = false - } - - if needsCompute { - assignCompute(rng, &task, jobClusterKeys) - if chance(rng, 0.4) { - task.Libraries = randLibraries(rng) - } - } - - if chance(rng, 0.3) { - task.TimeoutSeconds = rng.IntN(3600) - } - if chance(rng, 0.3) { - task.MaxRetries = rng.IntN(5) - task.MinRetryIntervalMillis = rng.IntN(60000) - task.RetryOnTimeout = chance(rng, 0.5) - } - return task -} - -// assignCompute attaches exactly one compute source, which notebook/python/wheel -// tasks require: a shared job cluster (when available), a brand-new cluster, or an -// existing cluster id. 
-func assignCompute(rng *rand.Rand, task *jobs.Task, jobClusterKeys []string) { - const ( - computeNew = iota - computeExisting - computeShared - ) - options := []int{computeNew, computeExisting} - if len(jobClusterKeys) > 0 { - options = append(options, computeShared) - } - switch oneOf(rng, options) { - case computeNew: - spec := randClusterSpec(rng) - task.NewCluster = &spec - case computeExisting: - task.ExistingClusterId = randName(rng, "cluster") - case computeShared: - task.JobClusterKey = oneOf(rng, jobClusterKeys) - } -} - -func randClusterSpec(rng *rand.Rand) compute.ClusterSpec { - spec := compute.ClusterSpec{ - SparkVersion: oneOf(rng, sparkVersions), - NodeTypeId: oneOf(rng, nodeTypeIDs), - } - if chance(rng, 0.5) { - spec.NumWorkers = rng.IntN(8) - } else { - spec.Autoscale = &compute.AutoScale{ - MinWorkers: 1, - MaxWorkers: rng.IntN(8) + 2, - } - } - if chance(rng, 0.4) { - spec.SparkConf = map[string]string{ - "spark.databricks.delta.preview.enabled": "true", - "spark.speculation": strconv.FormatBool(chance(rng, 0.5)), - } - } - if chance(rng, 0.3) { - spec.CustomTags = randTags(rng) - } - if chance(rng, 0.3) { - spec.SparkEnvVars = map[string]string{"PYSPARK_PYTHON": "/databricks/python3/bin/python3"} - } - if chance(rng, 0.3) { - spec.DriverNodeTypeId = oneOf(rng, nodeTypeIDs) - } - return spec -} - -func randGitSource(rng *rand.Rand) *jobs.GitSource { - src := &jobs.GitSource{ - GitProvider: oneOf(rng, gitProviders), - GitUrl: "https://example.com/" + randWord(rng) + "/" + randWord(rng) + ".git", - } - switch rng.IntN(3) { - case 0: - src.GitBranch = oneOf(rng, []string{"main", "develop", "release"}) - case 1: - src.GitTag = "v" + fmt.Sprintf("%d.%d.0", rng.IntN(5), rng.IntN(10)) - case 2: - src.GitCommit = fmt.Sprintf("%040x", rng.Int64()) - } - return src -} - -func randEmailNotifications(rng *rand.Rand) *jobs.JobEmailNotifications { - email := randWord(rng) + "@example.com" - n := &jobs.JobEmailNotifications{NoAlertForSkippedRuns: chance(rng, 
0.5)} - if chance(rng, 0.6) { - n.OnFailure = []string{email} - } - if chance(rng, 0.4) { - n.OnSuccess = []string{email} - } - if chance(rng, 0.3) { - n.OnStart = []string{email} - } - return n -} - -func randWebhookNotifications(rng *rand.Rand) *jobs.WebhookNotifications { - hook := []jobs.Webhook{{Id: randName(rng, "hook")}} - n := &jobs.WebhookNotifications{} - if chance(rng, 0.6) { - n.OnFailure = hook - } - if chance(rng, 0.4) { - n.OnSuccess = hook - } - return n -} - -func randHealth(rng *rand.Rand) *jobs.JobsHealthRules { - return &jobs.JobsHealthRules{ - Rules: []jobs.JobsHealthRule{ - { - Metric: jobs.JobsHealthMetric(oneOf(rng, healthMetrics)), - Op: jobs.JobsHealthOperatorGreaterThan, - Value: int64(rng.IntN(3600) + 1), - }, - }, - } -} - -func randLibraries(rng *rand.Rand) []compute.Library { - n := rng.IntN(2) + 1 - libs := make([]compute.Library, 0, n) - for range n { - switch rng.IntN(3) { - case 0: - libs = append(libs, compute.Library{Pypi: &compute.PythonPyPiLibrary{Package: randWord(rng)}}) - case 1: - libs = append(libs, compute.Library{Maven: &compute.MavenLibrary{Coordinates: "org.example:" + randWord(rng) + ":1.0.0"}}) - case 2: - libs = append(libs, compute.Library{Whl: "/Workspace/Users/test/" + randName(rng, "lib") + ".whl"}) - } - } - return libs -} - -func randParameters(rng *rand.Rand) []jobs.JobParameterDefinition { - n := rng.IntN(3) + 1 - params := make([]jobs.JobParameterDefinition, 0, n) - for i := range n { - params = append(params, jobs.JobParameterDefinition{ - Name: fmt.Sprintf("param_%d", i), - Default: randWord(rng), - }) - } - return params -} - -func randTags(rng *rand.Rand) map[string]string { - n := rng.IntN(3) + 1 - tags := make(map[string]string, n) - for i := range n { - tags[fmt.Sprintf("tag_%d", i)] = randWord(rng) - } - return tags -} diff --git a/bundle/fuzz/generate_invariants_test.go b/bundle/fuzz/generate_invariants_test.go new file mode 100644 index 0000000000..f7a797e8f5 --- /dev/null +++ 
b/bundle/fuzz/generate_invariants_test.go @@ -0,0 +1,47 @@ +package fuzz + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestGenerateJobIsDeterministic(t *testing.T) { + a := GenerateJob(newRNG(42)) + b := GenerateJob(newRNG(42)) + assert.Equal(t, a, b, "same seed must produce identical job") +} + +func TestGenerateJobIsWellFormed(t *testing.T) { + for seed := range int64(200) { + job := GenerateJob(newRNG(seed)) + require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) + require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) + + clusterKeys := map[string]bool{} + for _, jc := range job.JobClusters { + clusterKeys[jc.JobClusterKey] = true + } + + taskKeys := map[string]bool{} + for _, task := range job.Tasks { + require.NotEmptyf(t, task.TaskKey, "seed %d: task must have a key", seed) + taskKeys[task.TaskKey] = true + + // A task referencing a job cluster must reference one we generated. + if task.JobClusterKey != "" { + assert.Containsf(t, clusterKeys, task.JobClusterKey, + "seed %d: task %q references unknown job cluster %q", seed, task.TaskKey, task.JobClusterKey) + } + } + + // Every dependency must point at a task that exists in this job. 
+ for _, task := range job.Tasks { + for _, dep := range task.DependsOn { + assert.Containsf(t, taskKeys, dep.TaskKey, + "seed %d: task %q depends on unknown task %q", seed, task.TaskKey, dep.TaskKey) + } + } + } +} diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go index f7a797e8f5..1b0acf55b0 100644 --- a/bundle/fuzz/generate_test.go +++ b/bundle/fuzz/generate_test.go @@ -1,47 +1,345 @@ package fuzz import ( - "testing" + "fmt" + "math/rand/v2" + "strconv" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/jobs" ) -func TestGenerateJobIsDeterministic(t *testing.T) { - a := GenerateJob(newRNG(42)) - b := GenerateJob(newRNG(42)) - assert.Equal(t, a, b, "same seed must produce identical job") -} +// Value pools are intentionally small and valid-looking: the goal is to exercise +// the engines' config->payload translation across many field combinations, not to +// stress the API with invalid values (which the testserver would reject before we +// can compare payloads). +var ( + sparkVersions = []string{"13.3.x-scala2.12", "14.3.x-scala2.12", "15.4.x-scala2.12", "16.4.x-scala2.12"} + nodeTypeIDs = []string{"i3.xlarge", "m5.large", "r5.xlarge", "Standard_DS3_v2"} + timezones = []string{"UTC", "America/Los_Angeles", "Europe/Amsterdam"} + cronExprs = []string{"0 0 12 * * ?", "0 15 10 ? 
* MON-FRI", "0 0/30 * * * ?"} + pauseStatuses = []jobs.PauseStatus{jobs.PauseStatusPaused, jobs.PauseStatusUnpaused} + performance = []jobs.PerformanceTarget{jobs.PerformanceTargetPerformanceOptimized, jobs.PerformanceTargetStandard} + timeUnits = []string{"HOURS", "DAYS", "WEEKS"} + healthMetrics = []string{"RUN_DURATION_SECONDS", "STREAMING_BACKLOG_BYTES", "STREAMING_BACKLOG_RECORDS"} + conditionOps = []string{"EQUAL_TO", "NOT_EQUAL", "GREATER_THAN", "LESS_THAN_OR_EQUAL"} + runIfs = []string{"ALL_SUCCESS", "AT_LEAST_ONE_SUCCESS", "NONE_FAILED", "ALL_DONE"} + gitProviders = []jobs.GitProvider{jobs.GitProviderGitHub, jobs.GitProviderGitLab, jobs.GitProviderAzureDevOpsServices} +) + +// GenerateJob builds a random, well-formed job config driven entirely by rng, so +// the same seed always produces the same job. It deliberately favors fields whose +// translation tends to differ between engines (tasks, clusters, schedules, +// notifications, tags, zero-able scalars). +// +// TODO(DECO-25361): generalize the harness across resource kinds so pipelines, +// apps, etc. get the same create-payload parity coverage as jobs. 
+func GenerateJob(rng *rand.Rand) *resources.Job { + job := &resources.Job{} + job.Name = randName(rng, "job") + + if chance(rng, 0.5) { + job.Description = randSentence(rng) + } + if chance(rng, 0.4) { + job.MaxConcurrentRuns = rng.IntN(10) + 1 + } + if chance(rng, 0.4) { + job.TimeoutSeconds = rng.IntN(7200) + } + if chance(rng, 0.3) { + job.PerformanceTarget = oneOf(rng, performance) + } + if chance(rng, 0.5) { + job.Tags = randTags(rng) + } + if chance(rng, 0.3) { + job.GitSource = randGitSource(rng) + } -func TestGenerateJobIsWellFormed(t *testing.T) { - for seed := range int64(200) { - job := GenerateJob(newRNG(seed)) - require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) - require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) + randScheduling(rng, job) - clusterKeys := map[string]bool{} - for _, jc := range job.JobClusters { - clusterKeys[jc.JobClusterKey] = true + if chance(rng, 0.3) { + job.EmailNotifications = randEmailNotifications(rng) + } + if chance(rng, 0.2) { + job.WebhookNotifications = randWebhookNotifications(rng) + } + if chance(rng, 0.3) { + job.NotificationSettings = &jobs.JobNotificationSettings{ + NoAlertForCanceledRuns: chance(rng, 0.5), + NoAlertForSkippedRuns: chance(rng, 0.5), } + } + if chance(rng, 0.3) { + job.Health = randHealth(rng) + } + if chance(rng, 0.3) { + job.Parameters = randParameters(rng) + } + if chance(rng, 0.3) { + job.Queue = &jobs.QueueSettings{Enabled: chance(rng, 0.5)} + } - taskKeys := map[string]bool{} - for _, task := range job.Tasks { - require.NotEmptyf(t, task.TaskKey, "seed %d: task must have a key", seed) - taskKeys[task.TaskKey] = true + // Generate shared job clusters first so tasks can reference them by key. 
+ var jobClusterKeys []string + if chance(rng, 0.5) { + n := rng.IntN(2) + 1 + for i := range n { + key := fmt.Sprintf("cluster_%d", i) + jobClusterKeys = append(jobClusterKeys, key) + job.JobClusters = append(job.JobClusters, jobs.JobCluster{ + JobClusterKey: key, + NewCluster: randClusterSpec(rng), + }) + } + } - // A task referencing a job cluster must reference one we generated. - if task.JobClusterKey != "" { - assert.Containsf(t, clusterKeys, task.JobClusterKey, - "seed %d: task %q references unknown job cluster %q", seed, task.TaskKey, task.JobClusterKey) + nTasks := rng.IntN(3) + 1 + var taskKeys []string + for i := range nTasks { + task := randTask(rng, i, jobClusterKeys) + // Randomly chain dependencies onto previously generated tasks. + if len(taskKeys) > 0 && chance(rng, 0.4) { + dep := taskKeys[rng.IntN(len(taskKeys))] + task.DependsOn = []jobs.TaskDependency{{TaskKey: dep}} + if chance(rng, 0.5) { + task.RunIf = jobs.RunIf(oneOf(rng, runIfs)) } } + taskKeys = append(taskKeys, task.TaskKey) + job.Tasks = append(job.Tasks, task) + } - // Every dependency must point at a task that exists in this job. - for _, task := range job.Tasks { - for _, dep := range task.DependsOn { - assert.Containsf(t, taskKeys, dep.TaskKey, - "seed %d: task %q depends on unknown task %q", seed, task.TaskKey, dep.TaskKey) - } + return job +} + +// randScheduling sets at most one of schedule/trigger/continuous, which are +// mutually exclusive ways to launch a job. 
+func randScheduling(rng *rand.Rand, job *resources.Job) { + switch rng.IntN(5) { + case 0: + job.Schedule = &jobs.CronSchedule{ + QuartzCronExpression: oneOf(rng, cronExprs), + TimezoneId: oneOf(rng, timezones), + PauseStatus: oneOf(rng, pauseStatuses), + } + case 1: + job.Trigger = &jobs.TriggerSettings{ + PauseStatus: oneOf(rng, pauseStatuses), + Periodic: &jobs.PeriodicTriggerConfiguration{ + Interval: rng.IntN(12) + 1, + Unit: jobs.PeriodicTriggerConfigurationTimeUnit(oneOf(rng, timeUnits)), + }, + } + case 2: + job.Trigger = &jobs.TriggerSettings{ + PauseStatus: oneOf(rng, pauseStatuses), + FileArrival: &jobs.FileArrivalTriggerConfiguration{ + Url: "s3://" + randWord(rng) + "/" + randWord(rng), + }, + } + case 3: + job.Continuous = &jobs.Continuous{PauseStatus: oneOf(rng, pauseStatuses)} + default: + // no scheduling + } +} + +func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { + task := jobs.Task{TaskKey: fmt.Sprintf("task_%d", idx)} + + // Use absolute workspace paths with source=WORKSPACE so the generated bundle + // never depends on local files existing on disk (which deploy would reject). + // condition_task needs no compute, so it is handled separately below. 
+ needsCompute := true + switch rng.IntN(4) { + case 0: + task.NotebookTask = &jobs.NotebookTask{ + NotebookPath: "/Workspace/Users/test/" + randName(rng, "nb"), + Source: jobs.SourceWorkspace, + } + case 1: + task.SparkPythonTask = &jobs.SparkPythonTask{ + PythonFile: "/Workspace/Users/test/" + randName(rng, "main") + ".py", + Source: jobs.SourceWorkspace, + } + case 2: + task.PythonWheelTask = &jobs.PythonWheelTask{ + PackageName: randName(rng, "pkg"), + EntryPoint: "main", + } + case 3: + task.ConditionTask = &jobs.ConditionTask{ + Left: randWord(rng), + Op: jobs.ConditionTaskOp(oneOf(rng, conditionOps)), + Right: randWord(rng), + } + needsCompute = false + } + + if needsCompute { + assignCompute(rng, &task, jobClusterKeys) + if chance(rng, 0.4) { + task.Libraries = randLibraries(rng) + } + } + + if chance(rng, 0.3) { + task.TimeoutSeconds = rng.IntN(3600) + } + if chance(rng, 0.3) { + task.MaxRetries = rng.IntN(5) + task.MinRetryIntervalMillis = rng.IntN(60000) + task.RetryOnTimeout = chance(rng, 0.5) + } + return task +} + +// assignCompute attaches exactly one compute source, which notebook/python/wheel +// tasks require: a shared job cluster (when available), a brand-new cluster, or an +// existing cluster id. 
+func assignCompute(rng *rand.Rand, task *jobs.Task, jobClusterKeys []string) { + const ( + computeNew = iota + computeExisting + computeShared + ) + options := []int{computeNew, computeExisting} + if len(jobClusterKeys) > 0 { + options = append(options, computeShared) + } + switch oneOf(rng, options) { + case computeNew: + spec := randClusterSpec(rng) + task.NewCluster = &spec + case computeExisting: + task.ExistingClusterId = randName(rng, "cluster") + case computeShared: + task.JobClusterKey = oneOf(rng, jobClusterKeys) + } +} + +func randClusterSpec(rng *rand.Rand) compute.ClusterSpec { + spec := compute.ClusterSpec{ + SparkVersion: oneOf(rng, sparkVersions), + NodeTypeId: oneOf(rng, nodeTypeIDs), + } + if chance(rng, 0.5) { + spec.NumWorkers = rng.IntN(8) + } else { + spec.Autoscale = &compute.AutoScale{ + MinWorkers: 1, + MaxWorkers: rng.IntN(8) + 2, + } + } + if chance(rng, 0.4) { + spec.SparkConf = map[string]string{ + "spark.databricks.delta.preview.enabled": "true", + "spark.speculation": strconv.FormatBool(chance(rng, 0.5)), } } + if chance(rng, 0.3) { + spec.CustomTags = randTags(rng) + } + if chance(rng, 0.3) { + spec.SparkEnvVars = map[string]string{"PYSPARK_PYTHON": "/databricks/python3/bin/python3"} + } + if chance(rng, 0.3) { + spec.DriverNodeTypeId = oneOf(rng, nodeTypeIDs) + } + return spec +} + +func randGitSource(rng *rand.Rand) *jobs.GitSource { + src := &jobs.GitSource{ + GitProvider: oneOf(rng, gitProviders), + GitUrl: "https://example.com/" + randWord(rng) + "/" + randWord(rng) + ".git", + } + switch rng.IntN(3) { + case 0: + src.GitBranch = oneOf(rng, []string{"main", "develop", "release"}) + case 1: + src.GitTag = "v" + fmt.Sprintf("%d.%d.0", rng.IntN(5), rng.IntN(10)) + case 2: + src.GitCommit = fmt.Sprintf("%040x", rng.Int64()) + } + return src +} + +func randEmailNotifications(rng *rand.Rand) *jobs.JobEmailNotifications { + email := randWord(rng) + "@example.com" + n := &jobs.JobEmailNotifications{NoAlertForSkippedRuns: chance(rng, 
0.5)} + if chance(rng, 0.6) { + n.OnFailure = []string{email} + } + if chance(rng, 0.4) { + n.OnSuccess = []string{email} + } + if chance(rng, 0.3) { + n.OnStart = []string{email} + } + return n +} + +func randWebhookNotifications(rng *rand.Rand) *jobs.WebhookNotifications { + hook := []jobs.Webhook{{Id: randName(rng, "hook")}} + n := &jobs.WebhookNotifications{} + if chance(rng, 0.6) { + n.OnFailure = hook + } + if chance(rng, 0.4) { + n.OnSuccess = hook + } + return n +} + +func randHealth(rng *rand.Rand) *jobs.JobsHealthRules { + return &jobs.JobsHealthRules{ + Rules: []jobs.JobsHealthRule{ + { + Metric: jobs.JobsHealthMetric(oneOf(rng, healthMetrics)), + Op: jobs.JobsHealthOperatorGreaterThan, + Value: int64(rng.IntN(3600) + 1), + }, + }, + } +} + +func randLibraries(rng *rand.Rand) []compute.Library { + n := rng.IntN(2) + 1 + libs := make([]compute.Library, 0, n) + for range n { + switch rng.IntN(3) { + case 0: + libs = append(libs, compute.Library{Pypi: &compute.PythonPyPiLibrary{Package: randWord(rng)}}) + case 1: + libs = append(libs, compute.Library{Maven: &compute.MavenLibrary{Coordinates: "org.example:" + randWord(rng) + ":1.0.0"}}) + case 2: + libs = append(libs, compute.Library{Whl: "/Workspace/Users/test/" + randName(rng, "lib") + ".whl"}) + } + } + return libs +} + +func randParameters(rng *rand.Rand) []jobs.JobParameterDefinition { + n := rng.IntN(3) + 1 + params := make([]jobs.JobParameterDefinition, 0, n) + for i := range n { + params = append(params, jobs.JobParameterDefinition{ + Name: fmt.Sprintf("param_%d", i), + Default: randWord(rng), + }) + } + return params +} + +func randTags(rng *rand.Rand) map[string]string { + n := rng.IntN(3) + 1 + tags := make(map[string]string, n) + for i := range n { + tags[fmt.Sprintf("tag_%d", i)] = randWord(rng) + } + return tags } diff --git a/bundle/fuzz/rand.go b/bundle/fuzz/rand_test.go similarity index 100% rename from bundle/fuzz/rand.go rename to bundle/fuzz/rand_test.go From 
d12822cd9021b932795bfd3159b830a8b7d34f54 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Thu, 25 Jun 2026 11:42:04 +0000 Subject: [PATCH 09/24] bundle/fuzz: fix lint (stringsseq, testifylint) in paritySeeds Use strings.SplitSeq instead of ranging over strings.Split (modernize stringsseq) and require.Positivef instead of require.Greaterf(t, n, 0) (testifylint negative-positive). --- bundle/fuzz/fuzz_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 88a3c5a3b6..79c5b55c18 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -61,7 +61,7 @@ func TestJobCreateParity(t *testing.T) { func paritySeeds(t *testing.T) []int64 { if v := os.Getenv("FUZZ_SEED"); v != "" { var seeds []int64 - for _, part := range strings.Split(v, ",") { + for part := range strings.SplitSeq(v, ",") { part = strings.TrimSpace(part) if part == "" { continue @@ -78,7 +78,7 @@ func paritySeeds(t *testing.T) []int64 { if v := os.Getenv("FUZZ_SEEDS"); v != "" { n, err := strconv.Atoi(v) require.NoErrorf(t, err, "invalid FUZZ_SEEDS=%q", v) - require.Greaterf(t, n, 0, "FUZZ_SEEDS must be positive, got %d", n) + require.Positivef(t, n, "FUZZ_SEEDS must be positive, got %d", n) count = n } From e55b6f477e994642cf8b3e2a437e234f75211e48 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Thu, 25 Jun 2026 11:53:11 +0000 Subject: [PATCH 10/24] bundle/fuzz: fix nightly issue dedup and document paritySeeds test The failure-reporting step used `gh issue list --jq '.[0].number'`, which prints the literal "null" when no open issue exists, so it always took the comment branch and tried to comment on issue "null" instead of creating one. Use `// empty` so the create branch runs on the first divergence. 
--- .github/workflows/push.yml | 2 +- bundle/fuzz/fuzz_test.go | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index cc69da23f4..6621b15705 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -452,7 +452,7 @@ jobs: EOF ) - existing=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number') + existing=$(gh issue list --state open --label fuzz-nightly --json number --jq '.[0].number // empty') if [ -n "$existing" ]; then gh issue comment "$existing" --body "$body" else diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 79c5b55c18..3b15ea5e14 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -107,6 +107,8 @@ func paritySeeds(t *testing.T) []int64 { return seeds } +// TestParitySeeds verifies paritySeeds composes the regression seeds with the +// rotating window, deduplicates overlaps, and lets FUZZ_SEED override both. func TestParitySeeds(t *testing.T) { t.Run("default includes regression seeds then window", func(t *testing.T) { t.Setenv("FUZZ_SEEDS", "3") From c460fe9d148a8d686fec8fed481ad8733cc1b692 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Thu, 25 Jun 2026 17:41:57 +0000 Subject: [PATCH 11/24] bundle/fuzz: document divergences instead of fixing them Revert the num_workers single-node task-cluster fix along with its unit test and acceptance updates so this PR adds only the parity harness. Both terraform/direct divergences the harness found are now documented and suppressed via DefaultIgnorePaths rather than fixed (fixes follow separately): num_workers on single-node task clusters (seed 29) and the spark.databricks.delta.preview.enabled spark conf key. 
--- .../bundle/deploy/wal/chain-3-jobs/output.txt | 2 - .../deploy/wal/crash-after-create/output.txt | 1 - .../bundle/override/job_tasks/output.txt | 2 - .../missing_map_key/out.validate.direct.json | 3 +- .../out.validate.terraform.json | 3 +- .../mutator/resourcemutator/cluster_fixups.go | 1 - .../resourcemutator/cluster_fixups_test.go | 92 ------------------- bundle/fuzz/compare_test.go | 9 ++ bundle/fuzz/fuzz_test.go | 19 ++-- bundle/fuzz/recorder_test.go | 8 +- 10 files changed, 25 insertions(+), 115 deletions(-) delete mode 100644 bundle/config/mutator/resourcemutator/cluster_fixups_test.go diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index f11dc173ee..f27bfaa3f2 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,7 +35,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -74,7 +73,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9cd95a0b5c..2ab926a1dd 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -39,7 +39,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/override/job_tasks/output.txt b/acceptance/bundle/override/job_tasks/output.txt index 59b6fc1c39..2bee9738e3 100644 --- a/acceptance/bundle/override/job_tasks/output.txt +++ b/acceptance/bundle/override/job_tasks/output.txt @@ -18,7 +18,6 @@ }, { "new_cluster": { - 
"num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { @@ -43,7 +42,6 @@ Exit code: 1 "tasks": [ { "new_cluster": { - "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json index 7279aaeba3..cfd1427ce4 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json @@ -30,8 +30,7 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - }, - "num_workers": 0 + } }, "task_key": "test-task" } diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json index 3bad6f4619..3cdf58f84e 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json @@ -30,8 +30,7 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - }, - "num_workers": 0 + } }, "task_key": "test-task" } diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups.go b/bundle/config/mutator/resourcemutator/cluster_fixups.go index 04ddef6cc2..893cd248aa 100644 --- a/bundle/config/mutator/resourcemutator/cluster_fixups.go +++ b/bundle/config/mutator/resourcemutator/cluster_fixups.go @@ -94,7 +94,6 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) { for _, task := range js.Tasks { if task.NewCluster != nil { ModifyRequestOnInstancePool(task.NewCluster) - initializeNumWorkers(task.NewCluster) } } for ind := range js.JobClusters { diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go deleted file mode 100644 index 5cb2e93749..0000000000 --- 
a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go +++ /dev/null @@ -1,92 +0,0 @@ -package resourcemutator - -import ( - "testing" - - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/jobs" - "github.com/stretchr/testify/assert" -) - -func TestInitializeNumWorkers(t *testing.T) { - tests := []struct { - name string - spec compute.ClusterSpec - wantForceSend bool - }{ - { - name: "single-node cluster force-sends num_workers", - spec: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - wantForceSend: true, - }, - { - name: "autoscale cluster does not force-send", - spec: compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, - wantForceSend: false, - }, - { - name: "multi-node cluster does not force-send", - spec: compute.ClusterSpec{NumWorkers: 3}, - wantForceSend: false, - }, - { - name: "already force-sent stays force-sent without duplicating", - spec: compute.ClusterSpec{ForceSendFields: []string{"NumWorkers"}}, - wantForceSend: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - spec := tt.spec - initializeNumWorkers(&spec) - - count := 0 - for _, f := range spec.ForceSendFields { - if f == "NumWorkers" { - count++ - } - } - if tt.wantForceSend { - assert.Equal(t, 1, count, "NumWorkers must appear in ForceSendFields exactly once") - } else { - assert.Equal(t, 0, count, "NumWorkers must not be in ForceSendFields") - } - }) - } -} - -// TestPrepareJobSettingsForUpdateForcesNumWorkers locks the DECO-25361 fix: a -// single-node new_cluster must force-send num_workers on task-level clusters too, -// not just shared job_clusters. The terraform provider always sends num_workers:0 -// for such clusters, so missing it on the task side made the direct engine -// produce a divergent create payload. 
-func TestPrepareJobSettingsForUpdateForcesNumWorkers(t *testing.T) { - js := &jobs.JobSettings{ - Tasks: []jobs.Task{ - { - TaskKey: "single_node_task", - NewCluster: &compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - }, - { - TaskKey: "autoscale_task", - NewCluster: &compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, - }, - }, - JobClusters: []jobs.JobCluster{ - { - JobClusterKey: "single_node_cluster", - NewCluster: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - }, - }, - } - - prepareJobSettingsForUpdate(js) - - assert.Contains(t, js.Tasks[0].NewCluster.ForceSendFields, "NumWorkers", - "single-node task cluster must force-send num_workers") - assert.NotContains(t, js.Tasks[1].NewCluster.ForceSendFields, "NumWorkers", - "autoscale task cluster must not force-send num_workers") - assert.Contains(t, js.JobClusters[0].NewCluster.ForceSendFields, "NumWorkers", - "single-node job cluster must force-send num_workers") -} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go index fd6807b56c..f53d2f3b30 100644 --- a/bundle/fuzz/compare_test.go +++ b/bundle/fuzz/compare_test.go @@ -270,4 +270,13 @@ var DefaultIgnorePaths = []string{ // way, so this is a benign provider-side filter rather than a parity bug. `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, + + // For a single-node task-level new_cluster (no autoscale, num_workers unset) + // the terraform provider force-sends num_workers:0 while the direct engine + // omits the field, so the create payloads diverge. This is a real + // terraform/direct divergence the harness found (seed 29); it is documented + // and suppressed here rather than fixed in this PR. Tracked under DECO-25361. 
+ // Shared job_clusters are not affected: resourcemutator already force-sends + // num_workers for them under both engines, so only the task path diverges. + `tasks[*].new_cluster.num_workers`, } diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 3b15ea5e14..7836574d15 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -18,17 +18,18 @@ const defaultParitySeeds = 20 // regressionSeeds are seeds that previously surfaced a terraform/direct create // payload divergence. They are always checked (in addition to the rotating -// nightly window) so a fixed divergence can never silently regress, even though -// the nightly window moves on every run and would otherwise never revisit them. +// nightly window) so the divergence keeps being exercised even though the +// nightly window moves on every run and would otherwise never revisit them. // -// When the nightly job reports a new failing FUZZ_SEED, add it here in the same -// PR that fixes the divergence. +// When the nightly job reports a new failing FUZZ_SEED, add it here. // -// - 29: first seed that generates a single-node task-level new_cluster -// (num_workers 0, no autoscale). The direct engine omitted num_workers on -// task clusters while terraform force-sent num_workers:0, so the create -// payloads diverged. Fixed by applying initializeNumWorkers to task clusters -// in resourcemutator.prepareJobSettingsForUpdate. +// - 29: generates a single-node task-level new_cluster (num_workers 0, no +// autoscale). The direct engine omits num_workers on task clusters while +// terraform force-sends num_workers:0, so the create payloads diverge. This +// divergence is documented and currently suppressed via DefaultIgnorePaths +// (tasks[*].new_cluster.num_workers), not fixed in this PR; tracked under +// DECO-25361. The seed stays here so that once the divergence is fixed and +// its ignore entry removed, this seed guards against regression. 
var regressionSeeds = []int64{29} // TestJobCreateParity is the first DECO-25361 technique: for many random job diff --git a/bundle/fuzz/recorder_test.go b/bundle/fuzz/recorder_test.go index 244cb81480..a5e7d4d707 100644 --- a/bundle/fuzz/recorder_test.go +++ b/bundle/fuzz/recorder_test.go @@ -9,10 +9,10 @@ import ( // jobsCreatePath is the Jobs API route both engines must hit on create. The // direct engine posts here via the SDK and the terraform provider is expected to -// as well. The testserver registers only this exact route, so if an engine ever -// posted to a different version the deploy would 404 and captureJobCreate would -// fail with "did not POST". A version skew therefore surfaces as a capture -// failure, not as a payload diff. +// as well. The testserver registers only this version of the jobs/create route, +// so if an engine ever posted to a different version the deploy would 404 and +// captureJobCreate would fail with "did not POST". A version skew therefore +// surfaces as a capture failure, not as a payload diff. const jobsCreatePath = "/api/2.2/jobs/create" // capturedRequest is a single mutating API request observed by the testserver. From 4f8eafbb1aea2c08446b980fc1e7e964264d0461 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Fri, 26 Jun 2026 07:41:24 +0000 Subject: [PATCH 12/24] bundle/fuzz: narrow num_workers ignore and tidy parity harness Address review feedback on the create-payload parity harness: - Replace the path-only ignore list with value-conditional ignore rules so the documented num_workers divergence (direct omits, terraform force-sends 0) is suppressed only for that exact shape; a real value mismatch at the same path now fails again. - Unexport package-internal identifiers (generateJob, diffPayloads, difference, defaultIgnoreRules) that are only used within the package. - Document why TestCaptureJobCreateDirect is intentionally not opt-in. 
- Reword the one-sided-deploy failures as deploy/capture differences rather than asserting one engine "rejected" the config. - Make TestParitySeeds hermetic against ambient FUZZ_* env vars. - Correct the seed 29 comment to reflect that the divergence is suppressed. --- bundle/fuzz/compare_cases_test.go | 37 ++++++++-- bundle/fuzz/compare_test.go | 96 ++++++++++++++++--------- bundle/fuzz/deploy_smoke_test.go | 10 ++- bundle/fuzz/fuzz_test.go | 35 +++++---- bundle/fuzz/generate_invariants_test.go | 6 +- bundle/fuzz/generate_test.go | 4 +- 6 files changed, 133 insertions(+), 55 deletions(-) diff --git a/bundle/fuzz/compare_cases_test.go b/bundle/fuzz/compare_cases_test.go index 46e506d75c..95c732750b 100644 --- a/bundle/fuzz/compare_cases_test.go +++ b/bundle/fuzz/compare_cases_test.go @@ -13,7 +13,7 @@ func TestDiffPayloads(t *testing.T) { name string direct string terraform string - ignore []string + ignore []ignoreRule want []string }{ { @@ -62,7 +62,7 @@ func TestDiffPayloads(t *testing.T) { name: "ignored path", direct: `{"tasks":[{"timeout_seconds":1}]}`, terraform: `{"tasks":[{"timeout_seconds":2}]}`, - ignore: []string{"tasks[*].timeout_seconds"}, + ignore: []ignoreRule{{Path: "tasks[*].timeout_seconds"}}, want: nil, }, { @@ -75,7 +75,7 @@ func TestDiffPayloads(t *testing.T) { name: "dotted map key can be ignored", direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, terraform: `{"c":{"spark_conf":{}}}`, - ignore: []string{`c.spark_conf["spark.x.y"]`}, + ignore: []ignoreRule{{Path: `c.spark_conf["spark.x.y"]`}}, want: nil, }, { @@ -102,11 +102,29 @@ func TestDiffPayloads(t *testing.T) { terraform: `{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, want: nil, }, + { + // The documented single-node divergence: direct omits num_workers, + // terraform force-sends 0. defaultIgnoreRules suppresses exactly this. 
+ name: "task num_workers absent-vs-zero is ignored", + direct: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x"}}]}`, + terraform: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":0}}]}`, + ignore: defaultIgnoreRules, + want: nil, + }, + { + // A real num_workers value mismatch shares the path but is NOT the + // benign shape, so the narrowed rule must still report it. + name: "task num_workers value mismatch still surfaces", + direct: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":3}}]}`, + terraform: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":5}}]}`, + ignore: defaultIgnoreRules, + want: []string{"tasks[0].new_cluster.num_workers"}, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - diffs, err := DiffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) + diffs, err := diffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) require.NoError(t, err) var paths []string @@ -117,3 +135,14 @@ func TestDiffPayloads(t *testing.T) { }) } } + +func TestIsBenignTaskNumWorkers(t *testing.T) { + assert.True(t, isBenignTaskNumWorkers(difference{Direct: missing{}, Terraform: json.Number("0")}), + "direct absent + terraform 0 is the documented divergence") + assert.False(t, isBenignTaskNumWorkers(difference{Direct: json.Number("3"), Terraform: json.Number("5")}), + "two differing counts is a real divergence") + assert.False(t, isBenignTaskNumWorkers(difference{Direct: missing{}, Terraform: json.Number("2")}), + "direct absent but terraform non-zero is not the benign shape") + assert.False(t, isBenignTaskNumWorkers(difference{Direct: json.Number("0"), Terraform: missing{}}), + "reversed sides are not the benign shape") +} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go index f53d2f3b30..de34c18aaf 100644 --- a/bundle/fuzz/compare_test.go +++ b/bundle/fuzz/compare_test.go @@ -10,15 
+10,15 @@ import ( "strings" ) -// Difference is a single mismatch between the two engines' create payloads, +// difference is a single mismatch between the two engines' create payloads, // located by a JSON-ish path (e.g. "tasks[0].new_cluster.num_workers"). -type Difference struct { +type difference struct { Path string Direct any Terraform any } -func (d Difference) String() string { +func (d difference) String() string { return fmt.Sprintf("%s: direct=%s terraform=%s", d.Path, render(d.Direct), render(d.Terraform)) } @@ -36,10 +36,20 @@ func render(v any) string { return string(b) } -// DiffPayloads decodes both create payloads and returns every difference whose -// path is not explicitly ignored. ignorePaths are matched exactly against the -// rendered path, with "[*]" standing in for any slice index. -func DiffPayloads(direct, terraform json.RawMessage, ignorePaths []string) ([]Difference, error) { +// ignoreRule suppresses a known, intentional engine divergence. A rule matches a +// difference when the difference's normalized path equals Path and, if Match is +// non-nil, Match also reports true for the two values. A nil Match ignores any +// difference at Path; a non-nil Match narrows the rule to specific values so a +// genuine mismatch at the same path is still reported. +type ignoreRule struct { + Path string + Match func(d difference) bool +} + +// diffPayloads decodes both create payloads and returns every difference that no +// ignore rule suppresses. Paths are matched with "[*]" standing in for any slice +// index (see normalizePath). 
+func diffPayloads(direct, terraform json.RawMessage, ignore []ignoreRule) ([]difference, error) { d, err := decode(direct) if err != nil { return nil, fmt.Errorf("decoding direct payload: %w", err) @@ -49,23 +59,32 @@ func DiffPayloads(direct, terraform json.RawMessage, ignorePaths []string) ([]Di return nil, fmt.Errorf("decoding terraform payload: %w", err) } - var diffs []Difference + var diffs []difference diffValue("", d, tf, &diffs) - ignore := make(map[string]bool, len(ignorePaths)) - for _, p := range ignorePaths { - ignore[p] = true - } - filtered := diffs[:0] for _, diff := range diffs { - if !ignore[normalizePath(diff.Path)] { + if !ignored(diff, ignore) { filtered = append(filtered, diff) } } return filtered, nil } +// ignored reports whether any rule suppresses d. +func ignored(d difference, rules []ignoreRule) bool { + norm := normalizePath(d.Path) + for _, r := range rules { + if r.Path != norm { + continue + } + if r.Match == nil || r.Match(d) { + return true + } + } + return false +} + // decode unmarshals JSON using UseNumber so large int64 values (e.g. job ids, // spark_context_id) are not corrupted by float64 rounding. See the encoding rule // in the repo style guide. 
@@ -82,12 +101,12 @@ func decode(raw json.RawMessage) (any, error) { return v, nil } -func diffValue(path string, a, b any, diffs *[]Difference) { +func diffValue(path string, a, b any, diffs *[]difference) { switch av := a.(type) { case map[string]any: bv, ok := b.(map[string]any) if !ok { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) return } keys := unionKeys(av, bv) @@ -99,15 +118,15 @@ func diffValue(path string, a, b any, diffs *[]Difference) { case aok && bok: diffValue(child, achild, bchild, diffs) case aok: - *diffs = append(*diffs, Difference{Path: child, Direct: achild, Terraform: missing{}}) + *diffs = append(*diffs, difference{Path: child, Direct: achild, Terraform: missing{}}) default: - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bchild}) + *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: bchild}) } } case []any: bv, ok := b.([]any) if !ok { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) return } // Slices whose elements carry a natural identity key (tasks, job clusters) @@ -125,14 +144,14 @@ func diffValue(path string, a, b any, diffs *[]Difference) { case i < len(av) && i < len(bv): diffValue(child, av[i], bv[i], diffs) case i < len(av): - *diffs = append(*diffs, Difference{Path: child, Direct: av[i], Terraform: missing{}}) + *diffs = append(*diffs, difference{Path: child, Direct: av[i], Terraform: missing{}}) default: - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: bv[i]}) + *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: bv[i]}) } } default: if !scalarEqual(a, b) { - *diffs = append(*diffs, Difference{Path: path, Direct: a, Terraform: b}) + *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) } } } @@ 
-174,7 +193,7 @@ func allHaveKey(s []any, field string) bool { // within each slice for tasks/job clusters) and diffs each matched pair, // reporting unmatched elements as present-on-one-side. Paths keep numeric indices // so ignore-path [*] normalization still applies. -func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { +func diffKeyedSlice(path, key string, a, b []any, diffs *[]difference) { // identityFields are unique within a slice by API contract (no two job tasks // share a task_key, no two job_clusters share a job_cluster_key), so keying by // them is unambiguous. If a payload ever repeated a key, last-one-wins here and @@ -193,7 +212,7 @@ func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { if bel, ok := bByKey[k]; ok { diffValue(child, el, bel, diffs) } else { - *diffs = append(*diffs, Difference{Path: child, Direct: el, Terraform: missing{}}) + *diffs = append(*diffs, difference{Path: child, Direct: el, Terraform: missing{}}) } } for j, el := range b { @@ -202,7 +221,7 @@ func diffKeyedSlice(path, key string, a, b []any, diffs *[]Difference) { continue } child := fmt.Sprintf("%s[%d]", path, j) - *diffs = append(*diffs, Difference{Path: child, Direct: missing{}, Terraform: el}) + *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: el}) } } @@ -260,16 +279,16 @@ func normalizePath(path string) string { return indexRe.ReplaceAllString(path, "[*]") } -// DefaultIgnorePaths lists create-payload paths that legitimately differ between -// the engines and are not parity bugs. Keep this list small and well-justified; -// every entry is a known, intentional divergence. -var DefaultIgnorePaths = []string{ +// defaultIgnoreRules lists create-payload divergences that are known, intentional +// engine differences and not parity bugs. Keep this list small and +// well-justified; every entry is a documented divergence. 
+var defaultIgnoreRules = []ignoreRule{ // The terraform provider strips the deprecated/ignored spark conf // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while // the direct engine forwards it verbatim. The backend ignores the key either // way, so this is a benign provider-side filter rather than a parity bug. - `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, - `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, + {Path: `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`}, + {Path: `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`}, // For a single-node task-level new_cluster (no autoscale, num_workers unset) // the terraform provider force-sends num_workers:0 while the direct engine @@ -278,5 +297,18 @@ var DefaultIgnorePaths = []string{ // and suppressed here rather than fixed in this PR. Tracked under DECO-25361. // Shared job_clusters are not affected: resourcemutator already force-sends // num_workers for them under both engines, so only the task path diverges. - `tasks[*].new_cluster.num_workers`, + // + // Match narrows this to exactly that shape (direct absent, terraform 0); a + // genuine num_workers value mismatch at the same path is still reported. + {Path: `tasks[*].new_cluster.num_workers`, Match: isBenignTaskNumWorkers}, +} + +// isBenignTaskNumWorkers reports whether d is the single documented num_workers +// divergence: the direct engine omits num_workers while terraform force-sends 0. +// Any other pair of values (in particular two differing non-zero counts) is a +// real divergence and must not be suppressed. 
+func isBenignTaskNumWorkers(d difference) bool { + _, directAbsent := d.Direct.(missing) + n, ok := d.Terraform.(json.Number) + return directAbsent && ok && n.String() == "0" } diff --git a/bundle/fuzz/deploy_smoke_test.go b/bundle/fuzz/deploy_smoke_test.go index d501ee7808..dd90b35990 100644 --- a/bundle/fuzz/deploy_smoke_test.go +++ b/bundle/fuzz/deploy_smoke_test.go @@ -8,8 +8,14 @@ import ( "github.com/stretchr/testify/require" ) +// TestCaptureJobCreateDirect is intentionally NOT gated behind requireFuzzOptIn, +// unlike the terraform parity suite. The direct engine needs no provisioned +// terraform, and one deterministic direct deploy is cheap, so this runs on every +// `task test` as a smoke test that the capture harness and the direct create path +// still work. The expensive part the opt-in protects against is the terraform +// side (two real deploys per seed), which stays opt-in via requireTerraform. func TestCaptureJobCreateDirect(t *testing.T) { - job := GenerateJob(newRNG(1)) + job := generateJob(newRNG(1)) body, err := captureJobCreate(t.Context(), t, job, "direct") require.NoError(t, err) @@ -23,7 +29,7 @@ func TestCaptureJobCreateDirect(t *testing.T) { func TestCaptureJobCreateTerraform(t *testing.T) { requireTerraform(t) - job := GenerateJob(newRNG(1)) + job := generateJob(newRNG(1)) body, err := captureJobCreate(t.Context(), t, job, "terraform") require.NoError(t, err) diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 7836574d15..596ac99f7d 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -25,11 +25,12 @@ const defaultParitySeeds = 20 // // - 29: generates a single-node task-level new_cluster (num_workers 0, no // autoscale). The direct engine omits num_workers on task clusters while -// terraform force-sends num_workers:0, so the create payloads diverge. 
This -// divergence is documented and currently suppressed via DefaultIgnorePaths -// (tasks[*].new_cluster.num_workers), not fixed in this PR; tracked under -// DECO-25361. The seed stays here so that once the divergence is fixed and -// its ignore entry removed, this seed guards against regression. +// terraform force-sends num_workers:0, so the create payloads diverge. That +// specific shape is suppressed by defaultIgnoreRules (see +// isBenignTaskNumWorkers), so seed 29 currently asserts only that nothing +// else about this config diverges. Once the divergence is fixed and its +// ignore rule removed, this seed becomes a full guard against it regressing. +// Tracked under DECO-25361. var regressionSeeds = []int64{29} // TestJobCreateParity is the first DECO-25361 technique: for many random job @@ -111,6 +112,14 @@ func paritySeeds(t *testing.T) []int64 { // TestParitySeeds verifies paritySeeds composes the regression seeds with the // rotating window, deduplicates overlaps, and lets FUZZ_SEED override both. func TestParitySeeds(t *testing.T) { + // Isolate from any ambient FUZZ_* in the developer's environment. FUZZ_SEED in + // particular would short-circuit paritySeeds and break the cases below; an + // inherited FUZZ_SEEDS/OFFSET would skew the expected window. paritySeeds + // treats "" as unset, and subtests set only what they need on top. + t.Setenv("FUZZ_SEED", "") + t.Setenv("FUZZ_SEEDS", "") + t.Setenv("FUZZ_SEED_OFFSET", "") + t.Run("default includes regression seeds then window", func(t *testing.T) { t.Setenv("FUZZ_SEEDS", "3") t.Setenv("FUZZ_SEED_OFFSET", "100") @@ -163,13 +172,15 @@ func FuzzJobCreateParity(f *testing.F) { // deploy failure into regressionSeeds (which is only for real payload diffs): // - neither engine deployed: the generator produced a config nothing accepts, // so skip (logging both errors) rather than flag a parity bug. -// - exactly one engine deployed: the engines disagree on whether the config is -// even valid. 
That is a real divergence worth failing on, but an acceptance -// divergence, not a payload diff, so it is reported as such. +// - exactly one engine deployed: the engines disagree on whether the config +// deploys at all. That is worth failing on, but it is a deploy/capture +// difference rather than a payload diff, so it is reported separately. The +// failing side's error (an API rejection, an unregistered route, etc.) is +// included so triage can tell a true acceptance divergence from a harness gap. // - both deployed: compare the captured create payloads. func checkJobParity(t *testing.T, seed int64) { t.Helper() - job := GenerateJob(newRNG(seed)) + job := generateJob(newRNG(seed)) ctx := t.Context() direct, directErr := captureJobCreate(ctx, t, job, "direct") @@ -179,12 +190,12 @@ func checkJobParity(t *testing.T, seed int64) { case directErr != nil && tfErr != nil: t.Skipf("seed %d: config did not deploy under either engine (not a parity divergence)\ndirect: %v\nterraform: %v", seed, directErr, tfErr) case directErr != nil: - t.Fatalf("seed %d: direct rejected a config terraform accepted (engine acceptance divergence, not a payload diff): %v", seed, directErr) + t.Fatalf("seed %d: terraform deployed but direct did not (deploy/capture difference, not a payload diff): %v", seed, directErr) case tfErr != nil: - t.Fatalf("seed %d: terraform rejected a config direct accepted (engine acceptance divergence, not a payload diff): %v", seed, tfErr) + t.Fatalf("seed %d: direct deployed but terraform did not (deploy/capture difference, not a payload diff): %v", seed, tfErr) } - diffs, err := DiffPayloads(direct, terraform, DefaultIgnorePaths) + diffs, err := diffPayloads(direct, terraform, defaultIgnoreRules) require.NoErrorf(t, err, "seed %d: comparing create payloads", seed) if len(diffs) > 0 { diff --git a/bundle/fuzz/generate_invariants_test.go b/bundle/fuzz/generate_invariants_test.go index f7a797e8f5..9ca3b5cc93 100644 --- 
a/bundle/fuzz/generate_invariants_test.go +++ b/bundle/fuzz/generate_invariants_test.go @@ -8,14 +8,14 @@ import ( ) func TestGenerateJobIsDeterministic(t *testing.T) { - a := GenerateJob(newRNG(42)) - b := GenerateJob(newRNG(42)) + a := generateJob(newRNG(42)) + b := generateJob(newRNG(42)) assert.Equal(t, a, b, "same seed must produce identical job") } func TestGenerateJobIsWellFormed(t *testing.T) { for seed := range int64(200) { - job := GenerateJob(newRNG(seed)) + job := generateJob(newRNG(seed)) require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go index 1b0acf55b0..6472957b2f 100644 --- a/bundle/fuzz/generate_test.go +++ b/bundle/fuzz/generate_test.go @@ -28,14 +28,14 @@ var ( gitProviders = []jobs.GitProvider{jobs.GitProviderGitHub, jobs.GitProviderGitLab, jobs.GitProviderAzureDevOpsServices} ) -// GenerateJob builds a random, well-formed job config driven entirely by rng, so +// generateJob builds a random, well-formed job config driven entirely by rng, so // the same seed always produces the same job. It deliberately favors fields whose // translation tends to differ between engines (tasks, clusters, schedules, // notifications, tags, zero-able scalars). // // TODO(DECO-25361): generalize the harness across resource kinds so pipelines, // apps, etc. get the same create-payload parity coverage as jobs. 
-func GenerateJob(rng *rand.Rand) *resources.Job { +func generateJob(rng *rand.Rand) *resources.Job { job := &resources.Job{} job.Name = randName(rng, "job") From 4027412b3994e0115a0e132d52c6e0b48b5044df Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Fri, 26 Jun 2026 08:21:24 +0000 Subject: [PATCH 13/24] bundle: force-send num_workers for single-node task clusters The terraform provider force-sends num_workers:0 for a single-node new_cluster on task-level clusters too, not just shared job_clusters, but prepareJobSettingsForUpdate only applied initializeNumWorkers to job_clusters. The direct engine therefore omitted num_workers on task clusters and the two engines produced divergent create payloads (found by the bundle/fuzz parity harness, seed 29). Apply initializeNumWorkers to task new_cluster too so the direct engine matches terraform, drop the now-obsolete tasks[*].new_cluster.num_workers ignore entry, and simplify the fuzz ignore list to a plain []string now that value-conditional matching is no longer needed. 
--- .github/workflows/push.yml | 26 ++--- Taskfile.yml | 15 +-- .../bundle/deploy/wal/chain-3-jobs/output.txt | 2 + .../deploy/wal/crash-after-create/output.txt | 1 + .../bundle/override/job_tasks/output.txt | 2 + .../missing_map_key/out.validate.direct.json | 3 +- .../out.validate.terraform.json | 3 +- .../mutator/resourcemutator/cluster_fixups.go | 3 + .../resourcemutator/cluster_fixups_test.go | 92 +++++++++++++++ bundle/fuzz/compare_cases_test.go | 35 +----- bundle/fuzz/compare_test.go | 109 ++++-------------- bundle/fuzz/deploy_smoke_test.go | 9 +- bundle/fuzz/deploy_test.go | 46 +++----- bundle/fuzz/doc.go | 21 +--- bundle/fuzz/fuzz_test.go | 83 +++++-------- bundle/fuzz/generate_test.go | 23 ++-- bundle/fuzz/recorder_test.go | 7 +- 17 files changed, 205 insertions(+), 275 deletions(-) create mode 100644 bundle/config/mutator/resourcemutator/cluster_fixups_test.go diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 6621b15705..2370163d50 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -374,10 +374,8 @@ jobs: needs: - cleanups - # The terraform/direct create-payload parity tests run two real `bundle deploy` - # invocations per seed, so they are too slow for every PR and too noisy to gate - # the merge queue. Run them on the nightly schedule to catch engine drift; not - # part of test-result for that reason. + # Two real deploys per seed: too slow for every PR, so nightly only and not part + # of test-result. if: ${{ github.event_name == 'schedule' }} name: "task test-fuzz" runs-on: @@ -405,26 +403,16 @@ jobs: - name: Run tests env: - # Shift the seed window by the run number every nightly run so CI - # explores configs it has never tested before instead of re-checking a - # fixed set. The window is kept modest (each seed runs two real deploys) - # since the exploration comes from rotating the window, not its size; - # raise it once nightly timings are known. 
A divergence prints - # FUZZ_SEED= for one-command reproduction. - # - # offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS. GITHUB_RUN_NUMBER is a - # built-in, monotonically increasing, unique-per-run integer, so as long - # as FUZZ_SEEDS is constant the windows are non-overlapping (gaps from - # non-schedule runs are fine; we only need fresh seeds, not every seed). + # Shift the seed window each nightly run so CI explores new configs. + # offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS keeps windows non-overlapping + # (GITHUB_RUN_NUMBER is monotonic). A divergence prints FUZZ_SEED=. FUZZ_SEEDS: "25" run: | export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS )) go tool -modfile=tools/task/go.mod task test-fuzz - # This job is intentionally excluded from test-result, so a failure here is - # invisible unless someone watches the Actions tab. Surface it as a GitHub - # issue instead. Reuse a single open issue (deduped by label) so a recurring - # divergence doesn't open one issue per night. + # Excluded from test-result, so surface failures as a GitHub issue. Reuse one + # open issue (deduped by label) so a recurring divergence doesn't spam nightly. - name: Report failure if: ${{ failure() }} env: diff --git a/Taskfile.yml b/Taskfile.yml index cec4c328da..fb81bf644d 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -703,19 +703,14 @@ tasks: test-fuzz: desc: Run terraform/direct create-payload parity fuzz tests (provisions terraform) - # No `sources:` fingerprint: the seeds checked are a function of the FUZZ_SEED, - # FUZZ_SEEDS, and FUZZ_SEED_OFFSET env vars, which Task can't see. Skipping on - # an unchanged source checksum would silently no-op a FUZZ_SEED= repro run - # or a shifted nightly window, so always run. + # No `sources:` fingerprint: the seeds depend on FUZZ_* env vars Task can't see, + # so always run rather than no-op a repro or a shifted nightly window. 
env: - # The terraform parity tests are opt-in (see requireFuzzOptIn): they skip - # unless a FUZZ_* var is set, so a leftover build/ never makes them run as - # part of a plain `task test`. This constant flag opts this target in - # without overriding the FUZZ_SEED(S)/OFFSET tuning knobs. + # Opt this target into the parity suite (see requireFuzzOptIn) without + # overriding the FUZZ_SEED(S)/OFFSET tuning knobs. FUZZ_PARITY: "1" cmds: - # The parity harness expects terraform + the provider mirror at /build; - # requireTerraform skips when it's absent, so provision it first. + # requireTerraform expects terraform + provider mirror at /build. - python3 acceptance/install_terraform.py --targetdir build - | {{.GO_TOOL}} gotestsum \ diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index f27bfaa3f2..f11dc173ee 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,6 +35,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -73,6 +74,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 2ab926a1dd..9cd95a0b5c 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -39,6 +39,7 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", + "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/override/job_tasks/output.txt b/acceptance/bundle/override/job_tasks/output.txt index 2bee9738e3..59b6fc1c39 100644 --- 
a/acceptance/bundle/override/job_tasks/output.txt +++ b/acceptance/bundle/override/job_tasks/output.txt @@ -18,6 +18,7 @@ }, { "new_cluster": { + "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { @@ -42,6 +43,7 @@ Exit code: 1 "tasks": [ { "new_cluster": { + "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json index cfd1427ce4..7279aaeba3 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json @@ -30,7 +30,8 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - } + }, + "num_workers": 0 }, "task_key": "test-task" } diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json index 3cdf58f84e..3bad6f4619 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json @@ -30,7 +30,8 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - } + }, + "num_workers": 0 }, "task_key": "test-task" } diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups.go b/bundle/config/mutator/resourcemutator/cluster_fixups.go index 893cd248aa..ee4ee04c8b 100644 --- a/bundle/config/mutator/resourcemutator/cluster_fixups.go +++ b/bundle/config/mutator/resourcemutator/cluster_fixups.go @@ -94,6 +94,9 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) { for _, task := range js.Tasks { if task.NewCluster != nil { ModifyRequestOnInstancePool(task.NewCluster) + // Match terraform, which force-sends num_workers:0 for single-node + // task clusters too, not just shared job_clusters (DECO-25361). 
+ initializeNumWorkers(task.NewCluster) } } for ind := range js.JobClusters { diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go new file mode 100644 index 0000000000..5cb2e93749 --- /dev/null +++ b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go @@ -0,0 +1,92 @@ +package resourcemutator + +import ( + "testing" + + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/stretchr/testify/assert" +) + +func TestInitializeNumWorkers(t *testing.T) { + tests := []struct { + name string + spec compute.ClusterSpec + wantForceSend bool + }{ + { + name: "single-node cluster force-sends num_workers", + spec: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + wantForceSend: true, + }, + { + name: "autoscale cluster does not force-send", + spec: compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, + wantForceSend: false, + }, + { + name: "multi-node cluster does not force-send", + spec: compute.ClusterSpec{NumWorkers: 3}, + wantForceSend: false, + }, + { + name: "already force-sent stays force-sent without duplicating", + spec: compute.ClusterSpec{ForceSendFields: []string{"NumWorkers"}}, + wantForceSend: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + spec := tt.spec + initializeNumWorkers(&spec) + + count := 0 + for _, f := range spec.ForceSendFields { + if f == "NumWorkers" { + count++ + } + } + if tt.wantForceSend { + assert.Equal(t, 1, count, "NumWorkers must appear in ForceSendFields exactly once") + } else { + assert.Equal(t, 0, count, "NumWorkers must not be in ForceSendFields") + } + }) + } +} + +// TestPrepareJobSettingsForUpdateForcesNumWorkers locks the DECO-25361 fix: a +// single-node new_cluster must force-send num_workers on task-level clusters too, +// not just shared job_clusters. 
The terraform provider always sends num_workers:0 +// for such clusters, so missing it on the task side made the direct engine +// produce a divergent create payload. +func TestPrepareJobSettingsForUpdateForcesNumWorkers(t *testing.T) { + js := &jobs.JobSettings{ + Tasks: []jobs.Task{ + { + TaskKey: "single_node_task", + NewCluster: &compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + }, + { + TaskKey: "autoscale_task", + NewCluster: &compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, + }, + }, + JobClusters: []jobs.JobCluster{ + { + JobClusterKey: "single_node_cluster", + NewCluster: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, + }, + }, + } + + prepareJobSettingsForUpdate(js) + + assert.Contains(t, js.Tasks[0].NewCluster.ForceSendFields, "NumWorkers", + "single-node task cluster must force-send num_workers") + assert.NotContains(t, js.Tasks[1].NewCluster.ForceSendFields, "NumWorkers", + "autoscale task cluster must not force-send num_workers") + assert.Contains(t, js.JobClusters[0].NewCluster.ForceSendFields, "NumWorkers", + "single-node job cluster must force-send num_workers") +} diff --git a/bundle/fuzz/compare_cases_test.go b/bundle/fuzz/compare_cases_test.go index 95c732750b..1549b3de23 100644 --- a/bundle/fuzz/compare_cases_test.go +++ b/bundle/fuzz/compare_cases_test.go @@ -13,7 +13,7 @@ func TestDiffPayloads(t *testing.T) { name string direct string terraform string - ignore []ignoreRule + ignore []string want []string }{ { @@ -62,7 +62,7 @@ func TestDiffPayloads(t *testing.T) { name: "ignored path", direct: `{"tasks":[{"timeout_seconds":1}]}`, terraform: `{"tasks":[{"timeout_seconds":2}]}`, - ignore: []ignoreRule{{Path: "tasks[*].timeout_seconds"}}, + ignore: []string{"tasks[*].timeout_seconds"}, want: nil, }, { @@ -75,7 +75,7 @@ func TestDiffPayloads(t *testing.T) { name: "dotted map key can be ignored", direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, 
terraform: `{"c":{"spark_conf":{}}}`, - ignore: []ignoreRule{{Path: `c.spark_conf["spark.x.y"]`}}, + ignore: []string{`c.spark_conf["spark.x.y"]`}, want: nil, }, { @@ -102,24 +102,6 @@ func TestDiffPayloads(t *testing.T) { terraform: `{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, want: nil, }, - { - // The documented single-node divergence: direct omits num_workers, - // terraform force-sends 0. defaultIgnoreRules suppresses exactly this. - name: "task num_workers absent-vs-zero is ignored", - direct: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x"}}]}`, - terraform: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":0}}]}`, - ignore: defaultIgnoreRules, - want: nil, - }, - { - // A real num_workers value mismatch shares the path but is NOT the - // benign shape, so the narrowed rule must still report it. - name: "task num_workers value mismatch still surfaces", - direct: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":3}}]}`, - terraform: `{"tasks":[{"task_key":"t","new_cluster":{"spark_version":"x","num_workers":5}}]}`, - ignore: defaultIgnoreRules, - want: []string{"tasks[0].new_cluster.num_workers"}, - }, } for _, tt := range tests { @@ -135,14 +117,3 @@ func TestDiffPayloads(t *testing.T) { }) } } - -func TestIsBenignTaskNumWorkers(t *testing.T) { - assert.True(t, isBenignTaskNumWorkers(difference{Direct: missing{}, Terraform: json.Number("0")}), - "direct absent + terraform 0 is the documented divergence") - assert.False(t, isBenignTaskNumWorkers(difference{Direct: json.Number("3"), Terraform: json.Number("5")}), - "two differing counts is a real divergence") - assert.False(t, isBenignTaskNumWorkers(difference{Direct: missing{}, Terraform: json.Number("2")}), - "direct absent but terraform non-zero is not the benign shape") - assert.False(t, isBenignTaskNumWorkers(difference{Direct: json.Number("0"), 
Terraform: missing{}}), - "reversed sides are not the benign shape") -} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go index de34c18aaf..1681e17179 100644 --- a/bundle/fuzz/compare_test.go +++ b/bundle/fuzz/compare_test.go @@ -36,20 +36,10 @@ func render(v any) string { return string(b) } -// ignoreRule suppresses a known, intentional engine divergence. A rule matches a -// difference when the difference's normalized path equals Path and, if Match is -// non-nil, Match also reports true for the two values. A nil Match ignores any -// difference at Path; a non-nil Match narrows the rule to specific values so a -// genuine mismatch at the same path is still reported. -type ignoreRule struct { - Path string - Match func(d difference) bool -} - -// diffPayloads decodes both create payloads and returns every difference that no -// ignore rule suppresses. Paths are matched with "[*]" standing in for any slice -// index (see normalizePath). -func diffPayloads(direct, terraform json.RawMessage, ignore []ignoreRule) ([]difference, error) { +// diffPayloads decodes both create payloads and returns every difference whose +// normalized path is not in ignore ("[*]" stands in for any slice index, see +// normalizePath). +func diffPayloads(direct, terraform json.RawMessage, ignore []string) ([]difference, error) { d, err := decode(direct) if err != nil { return nil, fmt.Errorf("decoding direct payload: %w", err) @@ -64,30 +54,15 @@ func diffPayloads(direct, terraform json.RawMessage, ignore []ignoreRule) ([]dif filtered := diffs[:0] for _, diff := range diffs { - if !ignored(diff, ignore) { + if !slices.Contains(ignore, normalizePath(diff.Path)) { filtered = append(filtered, diff) } } return filtered, nil } -// ignored reports whether any rule suppresses d. 
-func ignored(d difference, rules []ignoreRule) bool { - norm := normalizePath(d.Path) - for _, r := range rules { - if r.Path != norm { - continue - } - if r.Match == nil || r.Match(d) { - return true - } - } - return false -} - -// decode unmarshals JSON using UseNumber so large int64 values (e.g. job ids, -// spark_context_id) are not corrupted by float64 rounding. See the encoding rule -// in the repo style guide. +// decode unmarshals JSON with UseNumber so large int64 values (job ids, +// spark_context_id) aren't corrupted by float64 rounding. func decode(raw json.RawMessage) (any, error) { if len(raw) == 0 { return nil, nil @@ -129,10 +104,8 @@ func diffValue(path string, a, b any, diffs *[]difference) { *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) return } - // Slices whose elements carry a natural identity key (tasks, job clusters) - // are matched by that key so an engine emitting the same elements in a - // different order is not reported as a difference. Everything else is - // compared positionally. + // Match keyed slices (tasks, job clusters) by identity so a different emit + // order isn't a difference; everything else is compared positionally. if key := identityKey(av, bv); key != "" { diffKeyedSlice(path, key, av, bv, diffs) return @@ -157,13 +130,11 @@ func diffValue(path string, a, b any, diffs *[]difference) { } // identityFields are the keys, in priority order, that uniquely identify the -// elements of a payload slice. Job tasks and shared job clusters are the slices -// whose order is not significant but which the engines may emit differently. +// elements of order-insensitive payload slices (job tasks, shared job clusters). var identityFields = []string{"task_key", "job_cluster_key"} // identityKey returns the field that identifies every element of both slices, or -// "" if the elements are not uniformly keyed objects (in which case the caller -// falls back to positional comparison). 
+// "" if they are not uniformly keyed objects (caller then compares positionally). func identityKey(a, b []any) string { for _, field := range identityFields { if allHaveKey(a, field) && allHaveKey(b, field) { @@ -189,16 +160,11 @@ func allHaveKey(s []any, field string) bool { return true } -// diffKeyedSlice matches elements of a and b by the value of key (which is unique -// within each slice for tasks/job clusters) and diffs each matched pair, -// reporting unmatched elements as present-on-one-side. Paths keep numeric indices -// so ignore-path [*] normalization still applies. +// diffKeyedSlice matches elements of a and b by key (unique within each slice for +// tasks/job clusters by API contract) and diffs each matched pair, reporting +// unmatched elements as present-on-one-side. Paths keep numeric indices so [*] +// normalization still applies. Duplicate keys would be last-one-wins. func diffKeyedSlice(path, key string, a, b []any, diffs *[]difference) { - // identityFields are unique within a slice by API contract (no two job tasks - // share a task_key, no two job_clusters share a job_cluster_key), so keying by - // them is unambiguous. If a payload ever repeated a key, last-one-wins here and - // the duplicate would be mismatched rather than reported precisely; callers - // outside the job-create harness must not rely on this for non-unique keys. bByKey := make(map[string]any, len(b)) for _, el := range b { bByKey[el.(map[string]any)[key].(string)] = el @@ -256,10 +222,8 @@ func unionKeys(a, b map[string]any) []string { } func joinKey(path, key string) string { - // Map keys can themselves contain dots or brackets (e.g. spark_conf entries - // like "spark.databricks.delta.preview.enabled"). Render those as bracketed, - // quoted segments so the path stays unambiguous and ignore entries can target - // a single key. + // Map keys can contain dots/brackets (e.g. 
spark_conf keys), so render those as + // bracketed quoted segments to keep the path unambiguous. if key == "" || strings.ContainsAny(key, `.[]"`) { return path + "[" + strconv.Quote(key) + "]" } @@ -279,36 +243,11 @@ func normalizePath(path string) string { return indexRe.ReplaceAllString(path, "[*]") } -// defaultIgnoreRules lists create-payload divergences that are known, intentional -// engine differences and not parity bugs. Keep this list small and -// well-justified; every entry is a documented divergence. -var defaultIgnoreRules = []ignoreRule{ - // The terraform provider strips the deprecated/ignored spark conf - // "spark.databricks.delta.preview.enabled" from new_cluster.spark_conf, while - // the direct engine forwards it verbatim. The backend ignores the key either - // way, so this is a benign provider-side filter rather than a parity bug. - {Path: `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`}, - {Path: `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`}, - - // For a single-node task-level new_cluster (no autoscale, num_workers unset) - // the terraform provider force-sends num_workers:0 while the direct engine - // omits the field, so the create payloads diverge. This is a real - // terraform/direct divergence the harness found (seed 29); it is documented - // and suppressed here rather than fixed in this PR. Tracked under DECO-25361. - // Shared job_clusters are not affected: resourcemutator already force-sends - // num_workers for them under both engines, so only the task path diverges. - // - // Match narrows this to exactly that shape (direct absent, terraform 0); a - // genuine num_workers value mismatch at the same path is still reported. - {Path: `tasks[*].new_cluster.num_workers`, Match: isBenignTaskNumWorkers}, -} - -// isBenignTaskNumWorkers reports whether d is the single documented num_workers -// divergence: the direct engine omits num_workers while terraform force-sends 0. 
-// Any other pair of values (in particular two differing non-zero counts) is a -// real divergence and must not be suppressed. -func isBenignTaskNumWorkers(d difference) bool { - _, directAbsent := d.Direct.(missing) - n, ok := d.Terraform.(json.Number) - return directAbsent && ok && n.String() == "0" +// defaultIgnorePaths lists known, intentional engine divergences. Keep it small; +// every entry is a documented difference, not a parity bug. +var defaultIgnorePaths = []string{ + // Terraform strips the deprecated "spark.databricks.delta.preview.enabled" from + // spark_conf while direct forwards it. The backend ignores it either way. + `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, + `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, } diff --git a/bundle/fuzz/deploy_smoke_test.go b/bundle/fuzz/deploy_smoke_test.go index dd90b35990..f6f9e5ea39 100644 --- a/bundle/fuzz/deploy_smoke_test.go +++ b/bundle/fuzz/deploy_smoke_test.go @@ -8,12 +8,9 @@ import ( "github.com/stretchr/testify/require" ) -// TestCaptureJobCreateDirect is intentionally NOT gated behind requireFuzzOptIn, -// unlike the terraform parity suite. The direct engine needs no provisioned -// terraform, and one deterministic direct deploy is cheap, so this runs on every -// `task test` as a smoke test that the capture harness and the direct create path -// still work. The expensive part the opt-in protects against is the terraform -// side (two real deploys per seed), which stays opt-in via requireTerraform. +// TestCaptureJobCreateDirect is intentionally NOT opt-in gated: a single direct +// deploy is cheap, so it runs on every `task test` as a smoke test of the capture +// harness. The expensive terraform side stays opt-in via requireTerraform. 
func TestCaptureJobCreateDirect(t *testing.T) { job := generateJob(newRNG(1)) diff --git a/bundle/fuzz/deploy_test.go b/bundle/fuzz/deploy_test.go index 2328e0354e..3a738b9cfa 100644 --- a/bundle/fuzz/deploy_test.go +++ b/bundle/fuzz/deploy_test.go @@ -20,18 +20,10 @@ const ( ) // captureJobCreate deploys a bundle containing job through the given engine -// ("direct" or "terraform") and returns the create request body sent to the -// Jobs API. -// -// Both engines run the full `bundle deploy` pipeline against an in-process -// testserver, so the only difference between two captures with different engines -// is the engine itself. That is what makes the resulting payloads directly -// comparable: shared mutators (deployment metadata, presets, ...) are applied -// identically on both sides and cancel out in the diff. -// -// The terraform engine additionally requires DATABRICKS_TF_EXEC_PATH and -// DATABRICKS_TF_CLI_CONFIG_FILE to point at a provisioned terraform binary and -// provider mirror; see requireTerraform. +// ("direct" or "terraform") and returns the create request body sent to the Jobs +// API. Both engines run the full `bundle deploy` against an in-process testserver, +// so shared mutators cancel out and the only difference in the payloads is the +// engine itself. Terraform additionally needs the env from requireTerraform. func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, engine string) (json.RawMessage, error) { rec := &recorder{} server := testserver.New(t) @@ -61,9 +53,8 @@ func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, eng return body, nil } -// writeJobBundle writes a minimal databricks.yml describing a single job. The -// document is emitted as JSON, which is valid YAML, so we can reuse the job's -// own JSON marshaling (which honors ForceSendFields) without a YAML dependency. +// writeJobBundle writes a minimal databricks.yml for a single job. 
It emits JSON +// (valid YAML) to reuse the job's own marshaling, which honors ForceSendFields. func writeJobBundle(dir, host string, job *resources.Job) error { jobJSON, err := json.Marshal(job) if err != nil { @@ -91,16 +82,12 @@ func writeJobBundle(dir, host string, job *resources.Job) error { return os.WriteFile(filepath.Join(dir, "databricks.yml"), data, 0o600) } -// fuzzOptInVars are the environment variables that opt a run into the -// terraform-backed parity suite. FUZZ_SEED / FUZZ_SEEDS / FUZZ_SEED_OFFSET double -// as the tuning knobs (see paritySeeds), so setting any of them implies opt-in; -// FUZZ_PARITY is a no-tuning switch used by `task test-fuzz`. +// fuzzOptInVars opt a run into the terraform parity suite. FUZZ_SEED(S)/OFFSET also +// tune it (see paritySeeds); FUZZ_PARITY is a no-tuning switch for `task test-fuzz`. var fuzzOptInVars = []string{"FUZZ_PARITY", "FUZZ_SEED", "FUZZ_SEEDS", "FUZZ_SEED_OFFSET"} -// requireFuzzOptIn skips unless the run explicitly opted into the terraform -// parity suite. Gating on an env var rather than on the presence of build/ keeps -// a leftover terraform install (from a prior `task test-fuzz` or acceptance run) -// from silently turning a plain `task test` into dozens of real deploys. +// requireFuzzOptIn skips unless a FUZZ_* var is set. Gating on an env var rather +// than on a leftover build/ keeps a plain `task test` from running real deploys. func requireFuzzOptIn(t testing.TB) { for _, name := range fuzzOptInVars { if os.Getenv(name) != "" { @@ -111,9 +98,8 @@ func requireFuzzOptIn(t testing.TB) { } // requireTerraform opts in via requireFuzzOptIn, then points the terraform engine -// at the binary and provider mirror provisioned by acceptance/install_terraform.py -// into /build, skipping when they are absent so the suite still skips -// cleanly where terraform is not set up. 
+// at the binary and provider mirror that acceptance/install_terraform.py provisions +// into /build, skipping cleanly when they are absent. func requireTerraform(t testing.TB) { requireFuzzOptIn(t) @@ -121,9 +107,8 @@ func requireTerraform(t testing.TB) { execPath := filepath.Join(buildDir, "terraform") cfgFile := filepath.Join(buildDir, ".terraformrc") - // install_terraform.py provisions all three together; a partial build/ (e.g. - // the binary without the provider mirror or .terraformrc) would otherwise fail - // mid-deploy with a confusing error instead of skipping cleanly. + // Require all three together; a partial build/ would otherwise fail mid-deploy + // instead of skipping cleanly. tfpluginsDir := filepath.Join(buildDir, "tfplugins") for _, p := range []string{execPath, cfgFile, tfpluginsDir} { if _, err := os.Stat(p); err != nil { @@ -134,8 +119,7 @@ func requireTerraform(t testing.TB) { t.Setenv("DATABRICKS_TF_EXEC_PATH", execPath) t.Setenv("DATABRICKS_TF_CLI_CONFIG_FILE", cfgFile) t.Setenv("TF_CLI_CONFIG_FILE", cfgFile) - // Terraform phones home to checkpoint-api.hashicorp.com otherwise; disable it - // so the testserver/network isn't hit. See acceptance_test.go. + // Disable terraform's checkpoint-api.hashicorp.com phone-home. See acceptance_test.go. t.Setenv("CHECKPOINT_DISABLE", "1") } diff --git a/bundle/fuzz/doc.go b/bundle/fuzz/doc.go index cf898d3ec1..10608ae248 100644 --- a/bundle/fuzz/doc.go +++ b/bundle/fuzz/doc.go @@ -1,17 +1,8 @@ -// Package fuzz provides randomized generators and harnesses that compare how the -// terraform and direct deploy engines translate the same bundle resource into an -// API create payload. See DECO-25361. +// Package fuzz compares how the terraform and direct deploy engines translate the +// same bundle resource into an API create payload, catching divergences during the +// migration off terraform. Generators are seeded so any divergence reproduces from +// the printed seed. Jobs only for now (DECO-25361). 
// -// The first technique implemented here generates a random resource config and -// checks for differences in the create payload between the terraform and direct -// engines. Generators are seeded so that any divergence found by the fuzz driver -// can be reproduced from the printed seed. -// -// Only jobs are covered for now. Extending the harness to other resource kinds -// (pipelines, apps, ...) is tracked as follow-up work under DECO-25361. -// -// Everything else in the package lives in _test.go files: the package is a -// test-only utility and nothing in the product imports it, so keeping the logic -// out of the regular build avoids shipping dead code. This file exists only to -// carry the package documentation in a non-test file. +// Everything lives in _test.go files: the package is test-only and nothing in the +// product imports it. This file exists only to carry the package doc. package fuzz diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 596ac99f7d..33c0b3963e 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -11,32 +11,21 @@ import ( "github.com/stretchr/testify/require" ) -// defaultParitySeeds is the number of random jobs TestJobCreateParity checks by -// default. Each seed runs two real deploys (direct + terraform), so the count is -// kept modest; override with FUZZ_SEEDS for a deeper local run. +// defaultParitySeeds is how many random jobs TestJobCreateParity checks by default. +// Each seed runs two real deploys, so keep it modest; override with FUZZ_SEEDS. const defaultParitySeeds = 20 -// regressionSeeds are seeds that previously surfaced a terraform/direct create -// payload divergence. They are always checked (in addition to the rotating -// nightly window) so the divergence keeps being exercised even though the -// nightly window moves on every run and would otherwise never revisit them. +// regressionSeeds are seeds that previously surfaced a divergence. 
They are always +// checked (on top of the rotating nightly window, which never revisits them) so a +// fixed divergence can't silently regress. When the nightly job reports a new +// failing FUZZ_SEED, add it here in the PR that fixes the divergence. // -// When the nightly job reports a new failing FUZZ_SEED, add it here. -// -// - 29: generates a single-node task-level new_cluster (num_workers 0, no -// autoscale). The direct engine omits num_workers on task clusters while -// terraform force-sends num_workers:0, so the create payloads diverge. That -// specific shape is suppressed by defaultIgnoreRules (see -// isBenignTaskNumWorkers), so seed 29 currently asserts only that nothing -// else about this config diverges. Once the divergence is fixed and its -// ignore rule removed, this seed becomes a full guard against it regressing. -// Tracked under DECO-25361. +// - 29: single-node task new_cluster; direct omitted num_workers while terraform +// force-sent 0. Fixed by initializeNumWorkers on task clusters (DECO-25361). var regressionSeeds = []int64{29} -// TestJobCreateParity is the first DECO-25361 technique: for many random job -// configs, assert the terraform and direct engines produce equivalent create -// payloads. On divergence it prints the seed and the generated job so the failure -// can be reproduced and inspected. +// TestJobCreateParity asserts the terraform and direct engines produce equivalent +// create payloads for many random jobs, printing the seed on divergence. func TestJobCreateParity(t *testing.T) { requireTerraform(t) @@ -49,17 +38,11 @@ func TestJobCreateParity(t *testing.T) { // paritySeeds returns the seeds TestJobCreateParity should check. // -// FUZZ_SEED (comma-separated list) runs exactly those seeds and overrides -// everything else. This is the knob the failure message prints so a single -// reported divergence can be reproduced with one command, without re-running -// every seed before it. 
-// -// Otherwise the test runs the regressionSeeds plus FUZZ_SEEDS seeds (default -// defaultParitySeeds) starting at FUZZ_SEED_OFFSET. The offset lets the nightly -// job shift the window every run (push.yml derives it from the run number) so CI -// explores configs it has never tested before instead of re-checking the same -// fixed set forever; the regressionSeeds are always included on top so known -// past divergences keep being verified. +// FUZZ_SEED (comma-separated) runs exactly those seeds and overrides everything, +// so a reported divergence reproduces with one command. Otherwise it runs +// regressionSeeds plus FUZZ_SEEDS seeds (default defaultParitySeeds) from +// FUZZ_SEED_OFFSET; the nightly job shifts the offset each run so CI keeps +// exploring new configs. func paritySeeds(t *testing.T) []int64 { if v := os.Getenv("FUZZ_SEED"); v != "" { var seeds []int64 @@ -112,10 +95,8 @@ func paritySeeds(t *testing.T) []int64 { // TestParitySeeds verifies paritySeeds composes the regression seeds with the // rotating window, deduplicates overlaps, and lets FUZZ_SEED override both. func TestParitySeeds(t *testing.T) { - // Isolate from any ambient FUZZ_* in the developer's environment. FUZZ_SEED in - // particular would short-circuit paritySeeds and break the cases below; an - // inherited FUZZ_SEEDS/OFFSET would skew the expected window. paritySeeds - // treats "" as unset, and subtests set only what they need on top. + // Isolate from ambient FUZZ_* in the dev environment (paritySeeds treats "" as + // unset); subtests set only what they need. t.Setenv("FUZZ_SEED", "") t.Setenv("FUZZ_SEEDS", "") t.Setenv("FUZZ_SEED_OFFSET", "") @@ -146,16 +127,14 @@ func TestParitySeeds(t *testing.T) { }) } -// FuzzJobCreateParity exposes the same parity check to Go's native fuzzer -// (`go test -fuzz=FuzzJobCreateParity`). Note each input runs two real deploys, -// so this is intended for ad-hoc deep runs, not the default `go test` path. 
+// FuzzJobCreateParity exposes the parity check to Go's native fuzzer. Each input +// runs two real deploys, so it's for ad-hoc deep runs, not the default test path. func FuzzJobCreateParity(f *testing.F) { requireTerraform(f) for seed := range int64(5) { f.Add(seed) } - // Seed the corpus with known past divergences so the fuzzer always starts - // from inputs that previously exposed a bug. + // Seed the corpus with known past divergences. for _, seed := range regressionSeeds { f.Add(seed) } @@ -164,20 +143,12 @@ func FuzzJobCreateParity(f *testing.F) { }) } -// checkJobParity generates the job for seed, deploys it under both engines, and -// fails the test with reproduction details if the create payloads diverge. -// -// A deploy/capture failure is not a create-payload divergence, so the three -// outcomes are handled distinctly to keep nightly triage from misdirecting a -// deploy failure into regressionSeeds (which is only for real payload diffs): -// - neither engine deployed: the generator produced a config nothing accepts, -// so skip (logging both errors) rather than flag a parity bug. -// - exactly one engine deployed: the engines disagree on whether the config -// deploys at all. That is worth failing on, but it is a deploy/capture -// difference rather than a payload diff, so it is reported separately. The -// failing side's error (an API rejection, an unregistered route, etc.) is -// included so triage can tell a true acceptance divergence from a harness gap. -// - both deployed: compare the captured create payloads. +// checkJobParity deploys the seed's job under both engines and fails if the create +// payloads diverge. A deploy/capture failure is not a payload divergence, so the +// outcomes are kept distinct: +// - neither deployed: skip (the config is unacceptable to both engines). +// - one deployed: fail separately as a deploy/capture difference, not a diff. +// - both deployed: compare the captured payloads. 
func checkJobParity(t *testing.T, seed int64) { t.Helper() job := generateJob(newRNG(seed)) @@ -195,7 +166,7 @@ func checkJobParity(t *testing.T, seed int64) { t.Fatalf("seed %d: direct deployed but terraform did not (deploy/capture difference, not a payload diff): %v", seed, tfErr) } - diffs, err := diffPayloads(direct, terraform, defaultIgnoreRules) + diffs, err := diffPayloads(direct, terraform, defaultIgnorePaths) require.NoErrorf(t, err, "seed %d: comparing create payloads", seed) if len(diffs) > 0 { diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go index 6472957b2f..7a96c1868c 100644 --- a/bundle/fuzz/generate_test.go +++ b/bundle/fuzz/generate_test.go @@ -11,9 +11,8 @@ import ( ) // Value pools are intentionally small and valid-looking: the goal is to exercise -// the engines' config->payload translation across many field combinations, not to -// stress the API with invalid values (which the testserver would reject before we -// can compare payloads). +// config->payload translation across many field combinations, not to stress the +// API with invalid values the testserver would reject. var ( sparkVersions = []string{"13.3.x-scala2.12", "14.3.x-scala2.12", "15.4.x-scala2.12", "16.4.x-scala2.12"} nodeTypeIDs = []string{"i3.xlarge", "m5.large", "r5.xlarge", "Standard_DS3_v2"} @@ -29,12 +28,10 @@ var ( ) // generateJob builds a random, well-formed job config driven entirely by rng, so -// the same seed always produces the same job. It deliberately favors fields whose -// translation tends to differ between engines (tasks, clusters, schedules, -// notifications, tags, zero-able scalars). +// the same seed always produces the same job. It favors fields whose translation +// tends to differ between engines. // -// TODO(DECO-25361): generalize the harness across resource kinds so pipelines, -// apps, etc. get the same create-payload parity coverage as jobs. +// TODO(DECO-25361): generalize the harness across resource kinds. 
func generateJob(rng *rand.Rand) *resources.Job { job := &resources.Job{} job.Name = randName(rng, "job") @@ -150,9 +147,8 @@ func randScheduling(rng *rand.Rand, job *resources.Job) { func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { task := jobs.Task{TaskKey: fmt.Sprintf("task_%d", idx)} - // Use absolute workspace paths with source=WORKSPACE so the generated bundle - // never depends on local files existing on disk (which deploy would reject). - // condition_task needs no compute, so it is handled separately below. + // Use absolute workspace paths so deploy never depends on local files. + // condition_task needs no compute, handled separately below. needsCompute := true switch rng.IntN(4) { case 0: @@ -197,9 +193,8 @@ func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { return task } -// assignCompute attaches exactly one compute source, which notebook/python/wheel -// tasks require: a shared job cluster (when available), a brand-new cluster, or an -// existing cluster id. +// assignCompute attaches exactly one compute source: a shared job cluster (when +// available), a new cluster, or an existing cluster id. func assignCompute(rng *rand.Rand, task *jobs.Task, jobClusterKeys []string) { const ( computeNew = iota diff --git a/bundle/fuzz/recorder_test.go b/bundle/fuzz/recorder_test.go index a5e7d4d707..73620d00e1 100644 --- a/bundle/fuzz/recorder_test.go +++ b/bundle/fuzz/recorder_test.go @@ -8,11 +8,8 @@ import ( ) // jobsCreatePath is the Jobs API route both engines must hit on create. The -// direct engine posts here via the SDK and the terraform provider is expected to -// as well. The testserver registers only this version of the jobs/create route, -// so if an engine ever posted to a different version the deploy would 404 and -// captureJobCreate would fail with "did not POST". A version skew therefore -// surfaces as a capture failure, not as a payload diff. 
+// testserver registers only this version, so an engine posting to a different one +// surfaces as a capture failure ("did not POST"), not a payload diff. const jobsCreatePath = "/api/2.2/jobs/create" // capturedRequest is a single mutating API request observed by the testserver. From 5cf31b416031b0f5f4cb551d76454390ccda6cf5 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Fri, 26 Jun 2026 15:44:13 +0000 Subject: [PATCH 14/24] bundle/fuzz: replace terraform/direct parity with invariant testing Switch the fuzz suite from comparing terraform and direct create payloads to asserting invariants on the direct engine's payload. Terraform and direct can disagree for legitimate reasons, so a payload diff is noisy; an invariant has no legitimate reason to fail, so a failure is a real bug. This drops the payload diff and its ignore-list of documented divergences, and removes terraform from the harness (each seed is now one in-process direct deploy). Gate on `bundle validate` so the suite distinguishes the two fuzzing outcomes: an invalid config skips (it can't violate an invariant), while a validated config that fails to deploy or breaks an invariant fails. This is the distinction a looser, schema-driven generator will rely on. Revert the num_workers:0 force-send for single-node task clusters (and its acceptance goldens): it only matched terraform's payload, with no demonstrated behavior benefit, and direct has shipped without it. If a real backend requirement is confirmed, it can return as a standalone change. 
--- .github/workflows/push.yml | 17 +- .gitignore | 4 - Taskfile.yml | 8 +- .../bundle/deploy/wal/chain-3-jobs/output.txt | 2 - .../deploy/wal/crash-after-create/output.txt | 1 - .../bundle/override/job_tasks/output.txt | 2 - .../missing_map_key/out.validate.direct.json | 3 +- .../out.validate.terraform.json | 3 +- .../mutator/resourcemutator/cluster_fixups.go | 3 - .../resourcemutator/cluster_fixups_test.go | 92 ------- bundle/fuzz/compare_cases_test.go | 119 -------- bundle/fuzz/compare_test.go | 253 ------------------ bundle/fuzz/deploy_smoke_test.go | 25 +- bundle/fuzz/deploy_test.go | 90 +++---- bundle/fuzz/doc.go | 11 +- bundle/fuzz/fuzz_test.go | 120 ++++----- bundle/fuzz/generate_test.go | 6 +- bundle/fuzz/invariants_cases_test.go | 93 +++++++ bundle/fuzz/invariants_test.go | 175 ++++++++++++ bundle/fuzz/recorder_test.go | 9 +- 20 files changed, 377 insertions(+), 659 deletions(-) delete mode 100644 bundle/config/mutator/resourcemutator/cluster_fixups_test.go delete mode 100644 bundle/fuzz/compare_cases_test.go delete mode 100644 bundle/fuzz/compare_test.go create mode 100644 bundle/fuzz/invariants_cases_test.go create mode 100644 bundle/fuzz/invariants_test.go diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 2370163d50..0f0aab8f7d 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -374,8 +374,9 @@ jobs: needs: - cleanups - # Two real deploys per seed: too slow for every PR, so nightly only and not part - # of test-result. + # A real deploy per seed across a wide rotating window: too slow for every PR, + # so nightly only and not part of test-result. (The package's un-gated smoke + # test still checks the invariants on one seed on every PR.) if: ${{ github.event_name == 'schedule' }} name: "task test-fuzz" runs-on: @@ -405,14 +406,14 @@ jobs: env: # Shift the seed window each nightly run so CI explores new configs. 
# offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS keeps windows non-overlapping - # (GITHUB_RUN_NUMBER is monotonic). A divergence prints FUZZ_SEED=. + # (GITHUB_RUN_NUMBER is monotonic). A failure prints FUZZ_SEED=. FUZZ_SEEDS: "25" run: | export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS )) go tool -modfile=tools/task/go.mod task test-fuzz # Excluded from test-result, so surface failures as a GitHub issue. Reuse one - # open issue (deduped by label) so a recurring divergence doesn't spam nightly. + # open issue (deduped by label) so a recurring failure doesn't spam nightly. - name: Report failure if: ${{ failure() }} env: @@ -420,11 +421,11 @@ jobs: RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | gh label create fuzz-nightly \ - --description "Nightly terraform/direct create-payload parity failures" \ + --description "Nightly create-payload invariant failures" \ --color FBCA04 2>/dev/null || true body=$(cat </build. - - python3 acceptance/install_terraform.py --targetdir build - | {{.GO_TOOL}} gotestsum \ --format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \ diff --git a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt index f11dc173ee..f27bfaa3f2 100644 --- a/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt +++ b/acceptance/bundle/deploy/wal/chain-3-jobs/output.txt @@ -35,7 +35,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { @@ -74,7 +73,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/deploy/wal/crash-after-create/output.txt b/acceptance/bundle/deploy/wal/crash-after-create/output.txt index 9cd95a0b5c..2ab926a1dd 100644 --- a/acceptance/bundle/deploy/wal/crash-after-create/output.txt +++ 
b/acceptance/bundle/deploy/wal/crash-after-create/output.txt @@ -39,7 +39,6 @@ Exit code: [KILLED] { "new_cluster": { "node_type_id": "[NODE_TYPE_ID]", - "num_workers": 0, "spark_version": "15.4.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/override/job_tasks/output.txt b/acceptance/bundle/override/job_tasks/output.txt index 59b6fc1c39..2bee9738e3 100644 --- a/acceptance/bundle/override/job_tasks/output.txt +++ b/acceptance/bundle/override/job_tasks/output.txt @@ -18,7 +18,6 @@ }, { "new_cluster": { - "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { @@ -43,7 +42,6 @@ Exit code: 1 "tasks": [ { "new_cluster": { - "num_workers": 0, "spark_version": "13.3.x-scala2.12" }, "spark_python_task": { diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json index 7279aaeba3..cfd1427ce4 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.direct.json @@ -30,8 +30,7 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - }, - "num_workers": 0 + } }, "task_key": "test-task" } diff --git a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json index 3bad6f4619..3cdf58f84e 100644 --- a/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json +++ b/acceptance/bundle/resource_deps/missing_map_key/out.validate.terraform.json @@ -30,8 +30,7 @@ "new_cluster": { "custom_tags": { "ResourceClass": "SingleNode" - }, - "num_workers": 0 + } }, "task_key": "test-task" } diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups.go b/bundle/config/mutator/resourcemutator/cluster_fixups.go index ee4ee04c8b..893cd248aa 100644 --- a/bundle/config/mutator/resourcemutator/cluster_fixups.go +++ 
b/bundle/config/mutator/resourcemutator/cluster_fixups.go @@ -94,9 +94,6 @@ func prepareJobSettingsForUpdate(js *jobs.JobSettings) { for _, task := range js.Tasks { if task.NewCluster != nil { ModifyRequestOnInstancePool(task.NewCluster) - // Match terraform, which force-sends num_workers:0 for single-node - // task clusters too, not just shared job_clusters (DECO-25361). - initializeNumWorkers(task.NewCluster) } } for ind := range js.JobClusters { diff --git a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go b/bundle/config/mutator/resourcemutator/cluster_fixups_test.go deleted file mode 100644 index 5cb2e93749..0000000000 --- a/bundle/config/mutator/resourcemutator/cluster_fixups_test.go +++ /dev/null @@ -1,92 +0,0 @@ -package resourcemutator - -import ( - "testing" - - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/jobs" - "github.com/stretchr/testify/assert" -) - -func TestInitializeNumWorkers(t *testing.T) { - tests := []struct { - name string - spec compute.ClusterSpec - wantForceSend bool - }{ - { - name: "single-node cluster force-sends num_workers", - spec: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - wantForceSend: true, - }, - { - name: "autoscale cluster does not force-send", - spec: compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, - wantForceSend: false, - }, - { - name: "multi-node cluster does not force-send", - spec: compute.ClusterSpec{NumWorkers: 3}, - wantForceSend: false, - }, - { - name: "already force-sent stays force-sent without duplicating", - spec: compute.ClusterSpec{ForceSendFields: []string{"NumWorkers"}}, - wantForceSend: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - spec := tt.spec - initializeNumWorkers(&spec) - - count := 0 - for _, f := range spec.ForceSendFields { - if f == "NumWorkers" { - count++ - } - } - if tt.wantForceSend { - assert.Equal(t, 1, 
count, "NumWorkers must appear in ForceSendFields exactly once") - } else { - assert.Equal(t, 0, count, "NumWorkers must not be in ForceSendFields") - } - }) - } -} - -// TestPrepareJobSettingsForUpdateForcesNumWorkers locks the DECO-25361 fix: a -// single-node new_cluster must force-send num_workers on task-level clusters too, -// not just shared job_clusters. The terraform provider always sends num_workers:0 -// for such clusters, so missing it on the task side made the direct engine -// produce a divergent create payload. -func TestPrepareJobSettingsForUpdateForcesNumWorkers(t *testing.T) { - js := &jobs.JobSettings{ - Tasks: []jobs.Task{ - { - TaskKey: "single_node_task", - NewCluster: &compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - }, - { - TaskKey: "autoscale_task", - NewCluster: &compute.ClusterSpec{Autoscale: &compute.AutoScale{MinWorkers: 1, MaxWorkers: 4}}, - }, - }, - JobClusters: []jobs.JobCluster{ - { - JobClusterKey: "single_node_cluster", - NewCluster: compute.ClusterSpec{SparkVersion: "15.4.x-scala2.12", NodeTypeId: "i3.xlarge"}, - }, - }, - } - - prepareJobSettingsForUpdate(js) - - assert.Contains(t, js.Tasks[0].NewCluster.ForceSendFields, "NumWorkers", - "single-node task cluster must force-send num_workers") - assert.NotContains(t, js.Tasks[1].NewCluster.ForceSendFields, "NumWorkers", - "autoscale task cluster must not force-send num_workers") - assert.Contains(t, js.JobClusters[0].NewCluster.ForceSendFields, "NumWorkers", - "single-node job cluster must force-send num_workers") -} diff --git a/bundle/fuzz/compare_cases_test.go b/bundle/fuzz/compare_cases_test.go deleted file mode 100644 index 1549b3de23..0000000000 --- a/bundle/fuzz/compare_cases_test.go +++ /dev/null @@ -1,119 +0,0 @@ -package fuzz - -import ( - "encoding/json" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestDiffPayloads(t *testing.T) { - tests := []struct { - name string - direct 
string - terraform string - ignore []string - want []string - }{ - { - name: "identical", - direct: `{"name":"a","tasks":[{"task_key":"t"}]}`, - terraform: `{"name":"a","tasks":[{"task_key":"t"}]}`, - want: nil, - }, - { - name: "scalar mismatch", - direct: `{"name":"a"}`, - terraform: `{"name":"b"}`, - want: []string{"name"}, - }, - { - name: "missing on terraform", - direct: `{"name":"a","queue":{"enabled":true}}`, - terraform: `{"name":"a"}`, - want: []string{"queue"}, - }, - { - name: "missing on direct", - direct: `{"name":"a"}`, - terraform: `{"name":"a","max_concurrent_runs":1}`, - want: []string{"max_concurrent_runs"}, - }, - { - name: "nested slice element mismatch", - direct: `{"tasks":[{"task_key":"t","timeout_seconds":1}]}`, - terraform: `{"tasks":[{"task_key":"t","timeout_seconds":2}]}`, - want: []string{"tasks[0].timeout_seconds"}, - }, - { - name: "slice length mismatch", - direct: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - terraform: `{"tasks":[{"task_key":"a"}]}`, - want: []string{"tasks[1]"}, - }, - { - name: "number 1 vs 1.0 differ", - direct: `{"n":1}`, - terraform: `{"n":1.0}`, - want: []string{"n"}, - }, - { - name: "ignored path", - direct: `{"tasks":[{"timeout_seconds":1}]}`, - terraform: `{"tasks":[{"timeout_seconds":2}]}`, - ignore: []string{"tasks[*].timeout_seconds"}, - want: nil, - }, - { - name: "dotted map key is bracket-quoted", - direct: `{"spark_conf":{"spark.x.y":"1"}}`, - terraform: `{"spark_conf":{}}`, - want: []string{`spark_conf["spark.x.y"]`}, - }, - { - name: "dotted map key can be ignored", - direct: `{"c":{"spark_conf":{"spark.x.y":"1"}}}`, - terraform: `{"c":{"spark_conf":{}}}`, - ignore: []string{`c.spark_conf["spark.x.y"]`}, - want: nil, - }, - { - name: "tasks matched by key ignore order", - direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, - terraform: `{"tasks":[{"task_key":"b","timeout_seconds":2},{"task_key":"a","timeout_seconds":1}]}`, - want: nil, - }, - { - 
name: "tasks matched by key surface real diff at direct index", - direct: `{"tasks":[{"task_key":"a","timeout_seconds":1},{"task_key":"b","timeout_seconds":2}]}`, - terraform: `{"tasks":[{"task_key":"b","timeout_seconds":9},{"task_key":"a","timeout_seconds":1}]}`, - want: []string{"tasks[1].timeout_seconds"}, - }, - { - name: "task only on terraform reported at its index", - direct: `{"tasks":[{"task_key":"a"}]}`, - terraform: `{"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - want: []string{"tasks[1]"}, - }, - { - name: "job_clusters matched by key ignore order", - direct: `{"job_clusters":[{"job_cluster_key":"x","new_cluster":{"num_workers":1}},{"job_cluster_key":"y","new_cluster":{"num_workers":2}}]}`, - terraform: `{"job_clusters":[{"job_cluster_key":"y","new_cluster":{"num_workers":2}},{"job_cluster_key":"x","new_cluster":{"num_workers":1}}]}`, - want: nil, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - diffs, err := diffPayloads(json.RawMessage(tt.direct), json.RawMessage(tt.terraform), tt.ignore) - require.NoError(t, err) - - var paths []string - for _, d := range diffs { - paths = append(paths, d.Path) - } - assert.ElementsMatch(t, tt.want, paths) - }) - } -} diff --git a/bundle/fuzz/compare_test.go b/bundle/fuzz/compare_test.go deleted file mode 100644 index 1681e17179..0000000000 --- a/bundle/fuzz/compare_test.go +++ /dev/null @@ -1,253 +0,0 @@ -package fuzz - -import ( - "bytes" - "encoding/json" - "fmt" - "regexp" - "slices" - "strconv" - "strings" -) - -// difference is a single mismatch between the two engines' create payloads, -// located by a JSON-ish path (e.g. "tasks[0].new_cluster.num_workers"). -type difference struct { - Path string - Direct any - Terraform any -} - -func (d difference) String() string { - return fmt.Sprintf("%s: direct=%s terraform=%s", d.Path, render(d.Direct), render(d.Terraform)) -} - -// missing marks a value that is absent on one side. 
-type missing struct{} - -func render(v any) string { - if _, ok := v.(missing); ok { - return "" - } - b, err := json.Marshal(v) - if err != nil { - return fmt.Sprintf("%v", v) - } - return string(b) -} - -// diffPayloads decodes both create payloads and returns every difference whose -// normalized path is not in ignore ("[*]" stands in for any slice index, see -// normalizePath). -func diffPayloads(direct, terraform json.RawMessage, ignore []string) ([]difference, error) { - d, err := decode(direct) - if err != nil { - return nil, fmt.Errorf("decoding direct payload: %w", err) - } - tf, err := decode(terraform) - if err != nil { - return nil, fmt.Errorf("decoding terraform payload: %w", err) - } - - var diffs []difference - diffValue("", d, tf, &diffs) - - filtered := diffs[:0] - for _, diff := range diffs { - if !slices.Contains(ignore, normalizePath(diff.Path)) { - filtered = append(filtered, diff) - } - } - return filtered, nil -} - -// decode unmarshals JSON with UseNumber so large int64 values (job ids, -// spark_context_id) aren't corrupted by float64 rounding. 
-func decode(raw json.RawMessage) (any, error) { - if len(raw) == 0 { - return nil, nil - } - dec := json.NewDecoder(bytes.NewReader(raw)) - dec.UseNumber() - var v any - if err := dec.Decode(&v); err != nil { - return nil, err - } - return v, nil -} - -func diffValue(path string, a, b any, diffs *[]difference) { - switch av := a.(type) { - case map[string]any: - bv, ok := b.(map[string]any) - if !ok { - *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) - return - } - keys := unionKeys(av, bv) - for _, k := range keys { - achild, aok := av[k] - bchild, bok := bv[k] - child := joinKey(path, k) - switch { - case aok && bok: - diffValue(child, achild, bchild, diffs) - case aok: - *diffs = append(*diffs, difference{Path: child, Direct: achild, Terraform: missing{}}) - default: - *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: bchild}) - } - } - case []any: - bv, ok := b.([]any) - if !ok { - *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) - return - } - // Match keyed slices (tasks, job clusters) by identity so a different emit - // order isn't a difference; everything else is compared positionally. - if key := identityKey(av, bv); key != "" { - diffKeyedSlice(path, key, av, bv, diffs) - return - } - n := max(len(av), len(bv)) - for i := range n { - child := fmt.Sprintf("%s[%d]", path, i) - switch { - case i < len(av) && i < len(bv): - diffValue(child, av[i], bv[i], diffs) - case i < len(av): - *diffs = append(*diffs, difference{Path: child, Direct: av[i], Terraform: missing{}}) - default: - *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: bv[i]}) - } - } - default: - if !scalarEqual(a, b) { - *diffs = append(*diffs, difference{Path: path, Direct: a, Terraform: b}) - } - } -} - -// identityFields are the keys, in priority order, that uniquely identify the -// elements of order-insensitive payload slices (job tasks, shared job clusters). 
-var identityFields = []string{"task_key", "job_cluster_key"} - -// identityKey returns the field that identifies every element of both slices, or -// "" if they are not uniformly keyed objects (caller then compares positionally). -func identityKey(a, b []any) string { - for _, field := range identityFields { - if allHaveKey(a, field) && allHaveKey(b, field) { - return field - } - } - return "" -} - -func allHaveKey(s []any, field string) bool { - if len(s) == 0 { - return false - } - for _, el := range s { - m, ok := el.(map[string]any) - if !ok { - return false - } - if _, ok := m[field].(string); !ok { - return false - } - } - return true -} - -// diffKeyedSlice matches elements of a and b by key (unique within each slice for -// tasks/job clusters by API contract) and diffs each matched pair, reporting -// unmatched elements as present-on-one-side. Paths keep numeric indices so [*] -// normalization still applies. Duplicate keys would be last-one-wins. -func diffKeyedSlice(path, key string, a, b []any, diffs *[]difference) { - bByKey := make(map[string]any, len(b)) - for _, el := range b { - bByKey[el.(map[string]any)[key].(string)] = el - } - - matched := make(map[string]bool, len(a)) - for i, el := range a { - child := fmt.Sprintf("%s[%d]", path, i) - k := el.(map[string]any)[key].(string) - matched[k] = true - if bel, ok := bByKey[k]; ok { - diffValue(child, el, bel, diffs) - } else { - *diffs = append(*diffs, difference{Path: child, Direct: el, Terraform: missing{}}) - } - } - for j, el := range b { - k := el.(map[string]any)[key].(string) - if matched[k] { - continue - } - child := fmt.Sprintf("%s[%d]", path, j) - *diffs = append(*diffs, difference{Path: child, Direct: missing{}, Terraform: el}) - } -} - -// scalarEqual compares two JSON scalars. json.Number is compared by its string -// form so 1 and 1.0 don't masquerade as equal across engines. 
-func scalarEqual(a, b any) bool { - an, aok := a.(json.Number) - bn, bok := b.(json.Number) - if aok && bok { - return an.String() == bn.String() - } - return a == b -} - -func unionKeys(a, b map[string]any) []string { - seen := map[string]bool{} - var keys []string - for k := range a { - if !seen[k] { - seen[k] = true - keys = append(keys, k) - } - } - for k := range b { - if !seen[k] { - seen[k] = true - keys = append(keys, k) - } - } - slices.Sort(keys) - return keys -} - -func joinKey(path, key string) string { - // Map keys can contain dots/brackets (e.g. spark_conf keys), so render those as - // bracketed quoted segments to keep the path unambiguous. - if key == "" || strings.ContainsAny(key, `.[]"`) { - return path + "[" + strconv.Quote(key) + "]" - } - if path == "" { - return key - } - return path + "." + key -} - -// indexRe matches numeric slice indices like "[12]" but not quoted string keys -// like ["spark.x"]. -var indexRe = regexp.MustCompile(`\[\d+\]`) - -// normalizePath replaces concrete slice indices with [*] so a single ignore -// entry can cover every element of a slice. -func normalizePath(path string) string { - return indexRe.ReplaceAllString(path, "[*]") -} - -// defaultIgnorePaths lists known, intentional engine divergences. Keep it small; -// every entry is a documented difference, not a parity bug. -var defaultIgnorePaths = []string{ - // Terraform strips the deprecated "spark.databricks.delta.preview.enabled" from - // spark_conf while direct forwards it. The backend ignores it either way. 
- `tasks[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, - `job_clusters[*].new_cluster.spark_conf["spark.databricks.delta.preview.enabled"]`, -} diff --git a/bundle/fuzz/deploy_smoke_test.go b/bundle/fuzz/deploy_smoke_test.go index f6f9e5ea39..0121c7468e 100644 --- a/bundle/fuzz/deploy_smoke_test.go +++ b/bundle/fuzz/deploy_smoke_test.go @@ -1,38 +1,21 @@ package fuzz import ( - "encoding/json" "testing" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) // TestCaptureJobCreateDirect is intentionally NOT opt-in gated: a single direct // deploy is cheap, so it runs on every `task test` as a smoke test of the capture -// harness. The expensive terraform side stays opt-in via requireTerraform. +// harness and the invariants. The wider seed sweep stays opt-in via +// requireFuzzOptIn. func TestCaptureJobCreateDirect(t *testing.T) { job := generateJob(newRNG(1)) - body, err := captureJobCreate(t.Context(), t, job, "direct") + body, err := captureJobCreate(t.Context(), t, job) require.NoError(t, err) require.NotEmpty(t, body) - var payload map[string]any - require.NoError(t, json.Unmarshal(body, &payload)) - assert.Equal(t, job.Name, payload["name"]) - assert.Contains(t, payload, "tasks") -} - -func TestCaptureJobCreateTerraform(t *testing.T) { - requireTerraform(t) - job := generateJob(newRNG(1)) - - body, err := captureJobCreate(t.Context(), t, job, "terraform") - require.NoError(t, err) - require.NotEmpty(t, body) - - var payload map[string]any - require.NoError(t, json.Unmarshal(body, &payload)) - assert.Equal(t, job.Name, payload["name"]) + checkJobInvariants(t, 1, job, body) } diff --git a/bundle/fuzz/deploy_test.go b/bundle/fuzz/deploy_test.go index 3a738b9cfa..ddf8d4342b 100644 --- a/bundle/fuzz/deploy_test.go +++ b/bundle/fuzz/deploy_test.go @@ -3,6 +3,7 @@ package fuzz import ( "context" "encoding/json" + "errors" "fmt" "os" "path/filepath" @@ -19,12 +20,18 @@ const ( fakeToken = "testtoken" ) -// 
captureJobCreate deploys a bundle containing job through the given engine -// ("direct" or "terraform") and returns the create request body sent to the Jobs -// API. Both engines run the full `bundle deploy` against an in-process testserver, -// so shared mutators cancel out and the only difference in the payloads is the -// engine itself. Terraform additionally needs the env from requireTerraform. -func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, engine string) (json.RawMessage, error) { +// errInvalidConfig marks a generated config that `bundle validate` rejects. The +// caller skips on it: an invalid config can't violate an invariant, so it is not a +// bug. This is the distinction that makes the suite safe to point at a looser +// (e.g. schema-driven) generator, which will produce invalid configs by design. +var errInvalidConfig = errors.New("config did not validate") + +// captureJobCreate validates then deploys a bundle containing job via the direct +// engine against an in-process testserver, returning the create request body sent +// to the Jobs API. A validation failure is wrapped as errInvalidConfig. The +// invariant suite asserts properties of the payload; the terraform engine is not +// involved (we assert fundamental properties rather than compare engines). +func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job) (json.RawMessage, error) { rec := &recorder{} server := testserver.New(t) server.RequestCallback = rec.callback @@ -37,18 +44,24 @@ func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job, eng t.Setenv("DATABRICKS_HOST", server.URL) t.Setenv("DATABRICKS_TOKEN", fakeToken) - t.Setenv("DATABRICKS_BUNDLE_ENGINE", engine) + t.Setenv("DATABRICKS_BUNDLE_ENGINE", "direct") t.Chdir(dir) + // Validate first so an invalid config is reported as errInvalidConfig (caller + // skips) rather than a deploy failure (caller fails). 
+ if _, stderr, err := testcli.NewRunner(t, ctx, "bundle", "validate").Run(); err != nil { + return nil, fmt.Errorf("%w: %v\nstderr:\n%s", errInvalidConfig, err, stderr.String()) + } + stdout, stderr, err := testcli.NewRunner(t, ctx, "bundle", "deploy").Run() if err != nil { - return nil, fmt.Errorf("bundle deploy (engine=%s) failed: %w\nstdout:\n%s\nstderr:\n%s", - engine, err, stdout.String(), stderr.String()) + return nil, fmt.Errorf("bundle deploy failed: %w\nstdout:\n%s\nstderr:\n%s", + err, stdout.String(), stderr.String()) } body, ok := rec.find("POST", jobsCreatePath) if !ok { - return nil, fmt.Errorf("engine=%s did not POST %s during deploy", engine, jobsCreatePath) + return nil, fmt.Errorf("deploy did not POST %s", jobsCreatePath) } return body, nil } @@ -82,61 +95,18 @@ func writeJobBundle(dir, host string, job *resources.Job) error { return os.WriteFile(filepath.Join(dir, "databricks.yml"), data, 0o600) } -// fuzzOptInVars opt a run into the terraform parity suite. FUZZ_SEED(S)/OFFSET also -// tune it (see paritySeeds); FUZZ_PARITY is a no-tuning switch for `task test-fuzz`. -var fuzzOptInVars = []string{"FUZZ_PARITY", "FUZZ_SEED", "FUZZ_SEEDS", "FUZZ_SEED_OFFSET"} +// fuzzOptInVars opt a run into the invariant suite. FUZZ_SEED(S)/OFFSET also tune +// it (see invariantSeeds); FUZZ_INVARIANTS is a no-tuning switch for `task test-fuzz`. +var fuzzOptInVars = []string{"FUZZ_INVARIANTS", "FUZZ_SEED", "FUZZ_SEEDS", "FUZZ_SEED_OFFSET"} -// requireFuzzOptIn skips unless a FUZZ_* var is set. Gating on an env var rather -// than on a leftover build/ keeps a plain `task test` from running real deploys. +// requireFuzzOptIn skips unless a FUZZ_* var is set. Each seed runs a real +// in-process deploy, so gating keeps a plain `task test` fast (the single +// un-gated direct smoke test still exercises the harness on every run). 
func requireFuzzOptIn(t testing.TB) { for _, name := range fuzzOptInVars { if os.Getenv(name) != "" { return } } - t.Skip("terraform parity suite is opt-in; run `task test-fuzz` or set FUZZ_SEED= to reproduce a single seed") -} - -// requireTerraform opts in via requireFuzzOptIn, then points the terraform engine -// at the binary and provider mirror that acceptance/install_terraform.py provisions -// into /build, skipping cleanly when they are absent. -func requireTerraform(t testing.TB) { - requireFuzzOptIn(t) - - buildDir := filepath.Join(repoRoot(t), "build") - execPath := filepath.Join(buildDir, "terraform") - cfgFile := filepath.Join(buildDir, ".terraformrc") - - // Require all three together; a partial build/ would otherwise fail mid-deploy - // instead of skipping cleanly. - tfpluginsDir := filepath.Join(buildDir, "tfplugins") - for _, p := range []string{execPath, cfgFile, tfpluginsDir} { - if _, err := os.Stat(p); err != nil { - t.Skipf("terraform not fully provisioned (%s); run: python3 acceptance/install_terraform.py --targetdir build", p) - } - } - - t.Setenv("DATABRICKS_TF_EXEC_PATH", execPath) - t.Setenv("DATABRICKS_TF_CLI_CONFIG_FILE", cfgFile) - t.Setenv("TF_CLI_CONFIG_FILE", cfgFile) - // Disable terraform's checkpoint-api.hashicorp.com phone-home. See acceptance_test.go. - t.Setenv("CHECKPOINT_DISABLE", "1") -} - -// repoRoot returns the repository root by walking up from the current directory. 
-func repoRoot(t testing.TB) string { - dir, err := os.Getwd() - if err != nil { - t.Fatalf("getwd: %s", err) - } - for { - if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { - return dir - } - parent := filepath.Dir(dir) - if parent == dir { - t.Fatal("could not locate repo root (go.mod not found)") - } - dir = parent - } + t.Skip("invariant fuzz suite is opt-in; run `task test-fuzz` or set FUZZ_SEED= to reproduce a single seed") } diff --git a/bundle/fuzz/doc.go b/bundle/fuzz/doc.go index 10608ae248..59b0417096 100644 --- a/bundle/fuzz/doc.go +++ b/bundle/fuzz/doc.go @@ -1,7 +1,10 @@ -// Package fuzz compares how the terraform and direct deploy engines translate the -// same bundle resource into an API create payload, catching divergences during the -// migration off terraform. Generators are seeded so any divergence reproduces from -// the printed seed. Jobs only for now (DECO-25361). +// Package fuzz deploys randomly generated bundle resources through the direct +// engine and asserts invariants that any valid config's API create payload must +// satisfy (e.g. task keys are preserved, references resolve, a new_cluster is +// sized by autoscale or num_workers but not both). Unlike a terraform/direct +// payload comparison, an invariant has no legitimate reason to fail, so a failure +// is a real bug. Generators are seeded so any failure reproduces from the printed +// seed. Jobs only for now. // // Everything lives in _test.go files: the package is test-only and nothing in the // product imports it. This file exists only to carry the package doc. diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go index 33c0b3963e..f8d758e88a 100644 --- a/bundle/fuzz/fuzz_test.go +++ b/bundle/fuzz/fuzz_test.go @@ -2,6 +2,7 @@ package fuzz import ( "encoding/json" + "errors" "os" "strconv" "strings" @@ -11,39 +12,38 @@ import ( "github.com/stretchr/testify/require" ) -// defaultParitySeeds is how many random jobs TestJobCreateParity checks by default. 
-// Each seed runs two real deploys, so keep it modest; override with FUZZ_SEEDS. -const defaultParitySeeds = 20 +// defaultInvariantSeeds is how many random jobs TestJobInvariants checks by +// default. Each seed runs a real deploy, so keep it modest; override with +// FUZZ_SEEDS. +const defaultInvariantSeeds = 20 -// regressionSeeds are seeds that previously surfaced a divergence. They are always +// regressionSeeds are seeds that previously broke an invariant. They are always // checked (on top of the rotating nightly window, which never revisits them) so a -// fixed divergence can't silently regress. When the nightly job reports a new -// failing FUZZ_SEED, add it here in the PR that fixes the divergence. -// -// - 29: single-node task new_cluster; direct omitted num_workers while terraform -// force-sent 0. Fixed by initializeNumWorkers on task clusters (DECO-25361). -var regressionSeeds = []int64{29} +// fixed bug can't silently regress. When the nightly job reports a new failing +// FUZZ_SEED, add it here in the PR that fixes it. Empty until the first such bug. +var regressionSeeds = []int64{} -// TestJobCreateParity asserts the terraform and direct engines produce equivalent -// create payloads for many random jobs, printing the seed on divergence. -func TestJobCreateParity(t *testing.T) { - requireTerraform(t) +// TestJobInvariants asserts the engine produces a create payload satisfying the +// invariants in checkJobInvariants for many random jobs, printing the seed on +// failure. +func TestJobInvariants(t *testing.T) { + requireFuzzOptIn(t) - for _, seed := range paritySeeds(t) { + for _, seed := range invariantSeeds(t) { t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { - checkJobParity(t, seed) + checkJob(t, seed) }) } } -// paritySeeds returns the seeds TestJobCreateParity should check. +// invariantSeeds returns the seeds TestJobInvariants should check. 
// FUZZ_SEED (comma-separated) runs exactly those seeds and overrides everything, -// so a reported divergence reproduces with one command. Otherwise it runs -// regressionSeeds plus FUZZ_SEEDS seeds (default defaultParitySeeds) from +// so a reported failure reproduces with one command. Otherwise it runs +// regressionSeeds plus FUZZ_SEEDS seeds (default defaultInvariantSeeds) from // FUZZ_SEED_OFFSET; the nightly job shifts the offset each run so CI keeps // exploring new configs. -func paritySeeds(t *testing.T) []int64 { +func invariantSeeds(t *testing.T) []int64 { if v := os.Getenv("FUZZ_SEED"); v != "" { var seeds []int64 for part := range strings.SplitSeq(v, ",") { @@ -59,7 +59,7 @@ func paritySeeds(t *testing.T) []int64 { return seeds } - count := defaultParitySeeds + count := defaultInvariantSeeds if v := os.Getenv("FUZZ_SEEDS"); v != "" { n, err := strconv.Atoi(v) require.NoErrorf(t, err, "invalid FUZZ_SEEDS=%q", v) @@ -92,89 +92,63 @@ func paritySeeds(t *testing.T) []int64 { return seeds } -// TestParitySeeds verifies paritySeeds composes the regression seeds with the -// rotating window, deduplicates overlaps, and lets FUZZ_SEED override both. -func TestParitySeeds(t *testing.T) { - // Isolate from ambient FUZZ_* in the dev environment (paritySeeds treats "" as - // unset); subtests set only what they need. +// TestInvariantSeeds verifies invariantSeeds composes the regression seeds with +// the rotating window and lets FUZZ_SEED override both. +func TestInvariantSeeds(t *testing.T) { + // Isolate from ambient FUZZ_* in the dev environment (invariantSeeds treats "" + // as unset); subtests set only what they need.
t.Setenv("FUZZ_SEED", "") t.Setenv("FUZZ_SEEDS", "") t.Setenv("FUZZ_SEED_OFFSET", "") - t.Run("default includes regression seeds then window", func(t *testing.T) { + t.Run("default is regression seeds then the window", func(t *testing.T) { t.Setenv("FUZZ_SEEDS", "3") t.Setenv("FUZZ_SEED_OFFSET", "100") want := append(append([]int64{}, regressionSeeds...), 100, 101, 102) - assert.Equal(t, want, paritySeeds(t)) - }) - - t.Run("window overlapping a regression seed is deduplicated", func(t *testing.T) { - t.Setenv("FUZZ_SEEDS", "5") - t.Setenv("FUZZ_SEED_OFFSET", "27") - seeds := paritySeeds(t) - count := 0 - for _, s := range seeds { - if s == 29 { - count++ - } - } - assert.Equal(t, 1, count, "seed 29 must appear once even though it is both a regression seed and inside the window") + assert.Equal(t, want, invariantSeeds(t)) }) t.Run("FUZZ_SEED override ignores regression seeds", func(t *testing.T) { t.Setenv("FUZZ_SEED", "7, 8") - assert.Equal(t, []int64{7, 8}, paritySeeds(t)) + assert.Equal(t, []int64{7, 8}, invariantSeeds(t)) }) } -// FuzzJobCreateParity exposes the parity check to Go's native fuzzer. Each input -// runs two real deploys, so it's for ad-hoc deep runs, not the default test path. -func FuzzJobCreateParity(f *testing.F) { - requireTerraform(f) +// FuzzJobInvariants exposes the invariant check to Go's native fuzzer. Each input +// runs a real deploy, so it's for ad-hoc deep runs, not the default test path. +func FuzzJobInvariants(f *testing.F) { + requireFuzzOptIn(f) for seed := range int64(5) { f.Add(seed) } - // Seed the corpus with known past divergences. + // Seed the corpus with known past failures. for _, seed := range regressionSeeds { f.Add(seed) } f.Fuzz(func(t *testing.T, seed int64) { - checkJobParity(t, seed) + checkJob(t, seed) }) } -// checkJobParity deploys the seed's job under both engines and fails if the create -// payloads diverge. 
A deploy/capture failure is not a payload divergence, so the -// outcomes are kept distinct: -// - neither deployed: skip (the config is unacceptable to both engines). -// - one deployed: fail separately as a deploy/capture difference, not a diff. -// - both deployed: compare the captured payloads. -func checkJobParity(t *testing.T, seed int64) { +// checkJob validates and deploys the seed's job, then asserts its create payload +// satisfies the invariants. It separates the two fuzzing outcomes: +// - the config doesn't validate: skip, since invalid input can't be a bug. +// - a validated config that fails to deploy or breaks an invariant: fail (a +// config the CLI accepted must deploy and produce a sound payload). +func checkJob(t *testing.T, seed int64) { t.Helper() job := generateJob(newRNG(seed)) - ctx := t.Context() - direct, directErr := captureJobCreate(ctx, t, job, "direct") - terraform, tfErr := captureJobCreate(ctx, t, job, "terraform") - - switch { - case directErr != nil && tfErr != nil: - t.Skipf("seed %d: config did not deploy under either engine (not a parity divergence)\ndirect: %v\nterraform: %v", seed, directErr, tfErr) - case directErr != nil: - t.Fatalf("seed %d: terraform deployed but direct did not (deploy/capture difference, not a payload diff): %v", seed, directErr) - case tfErr != nil: - t.Fatalf("seed %d: direct deployed but terraform did not (deploy/capture difference, not a payload diff): %v", seed, tfErr) + payload, err := captureJobCreate(t.Context(), t, job) + if errors.Is(err, errInvalidConfig) { + t.Skipf("seed %d: config did not validate, so it can't violate an invariant: %v", seed, err) } + require.NoErrorf(t, err, "seed %d: validated config failed to deploy", seed) - diffs, err := diffPayloads(direct, terraform, defaultIgnorePaths) - require.NoErrorf(t, err, "seed %d: comparing create payloads", seed) + checkJobInvariants(t, seed, job, payload) - if len(diffs) > 0 { + if t.Failed() { jobJSON, _ := json.MarshalIndent(job, "", " 
") - t.Errorf("seed %d: terraform/direct create payloads diverge (%d differences):", seed, len(diffs)) - for _, d := range diffs { - t.Errorf(" %s", d) - } t.Logf("reproduce with: FUZZ_SEED=%d task test-fuzz\nonce fixed, add %d to regressionSeeds in bundle/fuzz/fuzz_test.go\n%s", seed, seed, jobJSON) } } diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go index 7a96c1868c..47abd7b7e9 100644 --- a/bundle/fuzz/generate_test.go +++ b/bundle/fuzz/generate_test.go @@ -28,10 +28,10 @@ var ( ) // generateJob builds a random, well-formed job config driven entirely by rng, so -// the same seed always produces the same job. It favors fields whose translation -// tends to differ between engines. +// the same seed always produces the same job. It favors fields whose +// config->payload translation is non-trivial (clusters, scheduling, references). // -// TODO(DECO-25361): generalize the harness across resource kinds. +// TODO: generalize the harness across resource kinds. func generateJob(rng *rand.Rand) *resources.Job { job := &resources.Job{} job.Name = randName(rng, "job") diff --git a/bundle/fuzz/invariants_cases_test.go b/bundle/fuzz/invariants_cases_test.go new file mode 100644 index 0000000000..ea14c45f50 --- /dev/null +++ b/bundle/fuzz/invariants_cases_test.go @@ -0,0 +1,93 @@ +package fuzz + +import ( + "encoding/json" + "testing" + + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/stretchr/testify/assert" +) + +// recordingT captures whether the invariant assertions failed, so the table below +// can check that a bad payload is rejected and a good one is accepted without a +// real deploy. 
+type recordingT struct{ failed bool } + +func (r *recordingT) Errorf(string, ...any) { r.failed = true } + +// FailNow is only reached if decodePayload errors; every case here is valid JSON, +// so panic instead of recording: reaching it means a table entry is malformed. +func (r *recordingT) FailNow() { panic("unexpected FailNow") } + +func TestCheckJobInvariants(t *testing.T) { + job := &resources.Job{ + JobSettings: jobs.JobSettings{ + Name: "j", + JobClusters: []jobs.JobCluster{ + {JobClusterKey: "shared", NewCluster: compute.ClusterSpec{}}, + }, + Tasks: []jobs.Task{ + {TaskKey: "a"}, + {TaskKey: "b"}, + }, + }, + } + + tests := []struct { + name string + payload string + wantFailed bool + }{ + { + name: "valid payload", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"num_workers":0}}],"tasks":[{"task_key":"a","job_cluster_key":"shared"},{"task_key":"b","depends_on":[{"task_key":"a"}]}]}`, + }, + { + name: "renamed job", + payload: `{"name":"other","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + wantFailed: true, + }, + { + name: "dropped task", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"}]}`, + wantFailed: true, + }, + { + name: "dangling dependency", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"},{"task_key":"b","depends_on":[{"task_key":"ghost"}]}]}`, + wantFailed: true, + }, + { + name: "dangling job cluster reference", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a","job_cluster_key":"missing"},{"task_key":"b"}]}`, + wantFailed: true, + }, + { + name: "new_cluster without explicit size is a valid single node", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"spark_version":"x"}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + }, + { + name: "single-node new_cluster with num_workers 0", + payload:
`{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"num_workers":0}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + }, + { + name: "autoscale new_cluster", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"autoscale":{"min_workers":1,"max_workers":3}}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + }, + { + name: "new_cluster sets both autoscale and num_workers", + payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"autoscale":{"min_workers":1,"max_workers":3},"num_workers":2}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, + wantFailed: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + rec := &recordingT{} + checkJobInvariants(rec, 0, job, json.RawMessage(tt.payload)) + assert.Equal(t, tt.wantFailed, rec.failed) + }) + } +} diff --git a/bundle/fuzz/invariants_test.go b/bundle/fuzz/invariants_test.go new file mode 100644 index 0000000000..055390236a --- /dev/null +++ b/bundle/fuzz/invariants_test.go @@ -0,0 +1,175 @@ +package fuzz + +import ( + "bytes" + "encoding/json" + "fmt" + + "github.com/databricks/cli/bundle/config/resources" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// checkJobInvariants asserts the properties that any valid job's create payload +// must satisfy, independent of deploy engine. Unlike a terraform/direct payload +// diff, an invariant has no legitimate reason to fail, so a failure is a real bug +// and the seed reproduces it. Each invariant is checked separately so a failure +// points at the property that broke. 
+func checkJobInvariants(t require.TestingT, seed int64, job *resources.Job, payload json.RawMessage) { + p, err := decodePayload(payload) + require.NoErrorf(t, err, "seed %d: decoding create payload", seed) + + nameMatchesConfig(t, seed, job, p) + taskKeysMatchConfig(t, seed, job, p) + dependenciesResolve(t, seed, p) + jobClusterKeysMatchConfig(t, seed, job, p) + taskClusterRefsResolve(t, seed, p) + newClustersSizedExclusively(t, seed, p) +} + +// nameMatchesConfig: the engine must not rename the job. +func nameMatchesConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { + assert.Equalf(t, job.Name, p["name"], "seed %d: payload name must match config", seed) +} + +// taskKeysMatchConfig: the payload must carry exactly the tasks from config, no +// more and no fewer, identified by task_key. +func taskKeysMatchConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { + want := make([]string, 0, len(job.Tasks)) + for _, task := range job.Tasks { + want = append(want, task.TaskKey) + } + assert.ElementsMatchf(t, want, taskKeys(p), "seed %d: payload task keys must match config", seed) +} + +// dependenciesResolve: every depends_on must point at a task in the same payload. +func dependenciesResolve(t require.TestingT, seed int64, p map[string]any) { + keys := sliceToSet(taskKeys(p)) + for _, task := range payloadTasks(p) { + for _, dep := range slice(task["depends_on"]) { + d, ok := dep.(map[string]any) + if !ok { + continue + } + assert.Containsf(t, keys, d["task_key"], + "seed %d: task %v depends on unknown task %v", seed, task["task_key"], d["task_key"]) + } + } +} + +// jobClusterKeysMatchConfig: the payload's shared job clusters must match config. 
+func jobClusterKeysMatchConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { + want := make([]string, 0, len(job.JobClusters)) + for _, jc := range job.JobClusters { + want = append(want, jc.JobClusterKey) + } + assert.ElementsMatchf(t, want, jobClusterKeys(p), "seed %d: payload job cluster keys must match config", seed) +} + +// taskClusterRefsResolve: a task referencing a shared cluster must reference one +// declared in job_clusters. +func taskClusterRefsResolve(t require.TestingT, seed int64, p map[string]any) { + keys := sliceToSet(jobClusterKeys(p)) + for _, task := range payloadTasks(p) { + ref, ok := task["job_cluster_key"].(string) + if !ok || ref == "" { + continue + } + assert.Containsf(t, keys, ref, + "seed %d: task %v references unknown job cluster %q", seed, task["task_key"], ref) + } +} + +// newClustersSizedExclusively: a new_cluster is sized either by autoscale or by a +// fixed num_workers, never both. The two are mutually exclusive cluster shapes, so +// an engine emitting both (e.g. force-sending num_workers onto an autoscale +// cluster) produces a payload the backend rejects. +func newClustersSizedExclusively(t require.TestingT, seed int64, p map[string]any) { + for _, c := range newClusters(p) { + _, hasAutoscale := c["autoscale"] + _, hasNumWorkers := c["num_workers"] + assert.Falsef(t, hasAutoscale && hasNumWorkers, + "seed %d: new_cluster must not set both autoscale and num_workers, got %v", seed, c) + } +} + +// decodePayload unmarshals the create body with UseNumber so large int64 values +// (job ids, spark_context_id) aren't corrupted by float64 rounding. +func decodePayload(raw json.RawMessage) (map[string]any, error) { + dec := json.NewDecoder(bytes.NewReader(raw)) + dec.UseNumber() + var p map[string]any + if err := dec.Decode(&p); err != nil { + return nil, fmt.Errorf("decoding payload: %w", err) + } + return p, nil +} + +// payloadTasks returns the payload's task objects. 
+func payloadTasks(p map[string]any) []map[string]any { + tasks := make([]map[string]any, 0, len(slice(p["tasks"]))) + for _, el := range slice(p["tasks"]) { + if m, ok := el.(map[string]any); ok { + tasks = append(tasks, m) + } + } + return tasks +} + +func taskKeys(p map[string]any) []string { + var keys []string + for _, task := range payloadTasks(p) { + if k, ok := task["task_key"].(string); ok { + keys = append(keys, k) + } + } + return keys +} + +func jobClusterKeys(p map[string]any) []string { + var keys []string + for _, el := range slice(p["job_clusters"]) { + jc, ok := el.(map[string]any) + if !ok { + continue + } + if k, ok := jc["job_cluster_key"].(string); ok { + keys = append(keys, k) + } + } + return keys +} + +// newClusters returns every new_cluster spec in the payload: one per task that +// defines its own cluster plus one per shared job cluster. +func newClusters(p map[string]any) []map[string]any { + var specs []map[string]any + for _, task := range payloadTasks(p) { + if c, ok := task["new_cluster"].(map[string]any); ok { + specs = append(specs, c) + } + } + for _, el := range slice(p["job_clusters"]) { + jc, ok := el.(map[string]any) + if !ok { + continue + } + if c, ok := jc["new_cluster"].(map[string]any); ok { + specs = append(specs, c) + } + } + return specs +} + +func slice(v any) []any { + s, _ := v.([]any) + return s +} + +func sliceToSet(s []string) map[string]bool { + set := make(map[string]bool, len(s)) + for _, v := range s { + set[v] = true + } + return set +} diff --git a/bundle/fuzz/recorder_test.go b/bundle/fuzz/recorder_test.go index 73620d00e1..cfabf22721 100644 --- a/bundle/fuzz/recorder_test.go +++ b/bundle/fuzz/recorder_test.go @@ -7,9 +7,9 @@ import ( "github.com/databricks/cli/libs/testserver" ) -// jobsCreatePath is the Jobs API route both engines must hit on create. 
The -// testserver registers only this version, so an engine posting to a different one -// surfaces as a capture failure ("did not POST"), not a payload diff. +// jobsCreatePath is the Jobs API route the deploy must hit on create. The +// testserver registers only this version, so posting to a different one surfaces +// as a capture failure ("did not POST"). const jobsCreatePath = "/api/2.2/jobs/create" // capturedRequest is a single mutating API request observed by the testserver. @@ -20,8 +20,7 @@ type capturedRequest struct { } // recorder collects request bodies sent to a testserver. It is safe for -// concurrent use because the SDK and terraform may issue requests from multiple -// goroutines. +// concurrent use because the deploy may issue requests from multiple goroutines. type recorder struct { mu sync.Mutex requests []capturedRequest From 83cd5ea12bbf2db691d9ea30dde68e7c3ffc2e32 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 07:58:49 +0000 Subject: [PATCH 15/24] acceptance: replace bundle/fuzz parity with schema-driven invariant fuzzing Drop the terraform/direct create-payload parity package in favor of fuzzing the existing acceptance/bundle/invariant framework, which already checks invariants across all resource types and is prepped for fuzzing via its INPUT_CONFIG_OK contract. 
- add acceptance/bin/gen_fuzz_config.py: a seeded generator that walks the bundle schema and emits a random databricks.yml for any resource type - add acceptance/bundle/invariant/fuzz: generates configs over a seed window and asserts the CLI never panics; the no-drift invariant is opt-in (FUZZ_CHECK_DRIFT) for the nightly wide-window run - point task test-fuzz and the nightly job at the new variant - remove bundle/fuzz and its parity harness --- .github/workflows/push.yml | 26 +- Taskfile.yml | 18 +- acceptance/bin/gen_fuzz_config.py | 207 +++++++++++ .../bundle/invariant/fuzz/out.test.toml | 5 + acceptance/bundle/invariant/fuzz/output.txt | 0 acceptance/bundle/invariant/fuzz/script | 62 ++++ acceptance/bundle/invariant/fuzz/test.toml | 5 + bundle/fuzz/deploy_smoke_test.go | 21 -- bundle/fuzz/deploy_test.go | 112 ------ bundle/fuzz/doc.go | 11 - bundle/fuzz/fuzz_test.go | 154 -------- bundle/fuzz/generate_invariants_test.go | 47 --- bundle/fuzz/generate_test.go | 340 ------------------ bundle/fuzz/invariants_cases_test.go | 93 ----- bundle/fuzz/invariants_test.go | 175 --------- bundle/fuzz/rand_test.go | 47 --- bundle/fuzz/recorder_test.go | 57 --- 17 files changed, 300 insertions(+), 1080 deletions(-) create mode 100755 acceptance/bin/gen_fuzz_config.py create mode 100644 acceptance/bundle/invariant/fuzz/out.test.toml create mode 100644 acceptance/bundle/invariant/fuzz/output.txt create mode 100644 acceptance/bundle/invariant/fuzz/script create mode 100644 acceptance/bundle/invariant/fuzz/test.toml delete mode 100644 bundle/fuzz/deploy_smoke_test.go delete mode 100644 bundle/fuzz/deploy_test.go delete mode 100644 bundle/fuzz/doc.go delete mode 100644 bundle/fuzz/fuzz_test.go delete mode 100644 bundle/fuzz/generate_invariants_test.go delete mode 100644 bundle/fuzz/generate_test.go delete mode 100644 bundle/fuzz/invariants_cases_test.go delete mode 100644 bundle/fuzz/invariants_test.go delete mode 100644 bundle/fuzz/rand_test.go delete mode 100644 
bundle/fuzz/recorder_test.go diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 0f0aab8f7d..3a3a57c726 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -374,9 +374,10 @@ jobs: needs: - cleanups - # A real deploy per seed across a wide rotating window: too slow for every PR, - # so nightly only and not part of test-result. (The package's un-gated smoke - # test still checks the invariants on one seed on every PR.) + # A real deploy per seed across a wide rotating window, with the no-drift + # invariant on: too slow for every PR, so nightly only and not part of + # test-result. (The committed acceptance fuzz test still checks the no-panic + # invariant on a small fixed seed window on every PR.) if: ${{ github.event_name == 'schedule' }} name: "task test-fuzz" runs-on: @@ -405,11 +406,11 @@ jobs: - name: Run tests env: # Shift the seed window each nightly run so CI explores new configs. - # offset = GITHUB_RUN_NUMBER * FUZZ_SEEDS keeps windows non-overlapping - # (GITHUB_RUN_NUMBER is monotonic). A failure prints FUZZ_SEED=. - FUZZ_SEEDS: "25" + # start = GITHUB_RUN_NUMBER * FUZZ_SEED_COUNT keeps windows non-overlapping + # (GITHUB_RUN_NUMBER is monotonic). A failure prints the failing seed. + FUZZ_SEED_COUNT: "25" run: | - export FUZZ_SEED_OFFSET=$(( GITHUB_RUN_NUMBER * FUZZ_SEEDS )) + export FUZZ_SEED_START=$(( GITHUB_RUN_NUMBER * FUZZ_SEED_COUNT )) go tool -modfile=tools/task/go.mod task test-fuzz # Excluded from test-result, so surface failures as a GitHub issue. Reuse one @@ -421,23 +422,20 @@ jobs: RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} run: | gh label create fuzz-nightly \ - --description "Nightly create-payload invariant failures" \ + --description "Nightly schema fuzz invariant failures" \ --color FBCA04 2>/dev/null || true body=$(cat <\`. + The failing seed is printed in the job log as \`reproduce with: ...\`. 
Reproduce locally with: \`\`\` - FUZZ_SEED= task test-fuzz + FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz \`\`\` - - Once fixed, add the seed to \`regressionSeeds\` in \`bundle/fuzz/fuzz_test.go\` - in the same PR so the bug can never silently regress. EOF ) diff --git a/Taskfile.yml b/Taskfile.yml index c8f53c3479..b2aaadb5eb 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -702,20 +702,20 @@ tasks: -- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/cmd/sandbox" test-fuzz: - desc: Run create-payload invariant fuzz tests (random jobs, direct engine) - # No `sources:` fingerprint: the seeds depend on FUZZ_* env vars Task can't see, - # so always run rather than no-op a repro or a shifted nightly window. - env: - # Opt this target into the invariant suite (see requireFuzzOptIn) without - # overriding the FUZZ_SEED(S)/OFFSET tuning knobs. - FUZZ_INVARIANTS: "1" + desc: Run schema fuzz invariant tests (random configs, direct engine) + # No `sources:` fingerprint: the seed window depends on FUZZ_* env vars Task + # can't see, so always run rather than no-op a repro or a shifted nightly window. cmds: - | + # Sweep a wider window than the committed acceptance run and turn on the + # no-drift invariant; a repro can narrow it with FUZZ_SEED_START/COUNT. + export FUZZ_SEED_COUNT="${FUZZ_SEED_COUNT:-200}" + export FUZZ_CHECK_DRIFT="${FUZZ_CHECK_DRIFT:-1}" {{.GO_TOOL}} gotestsum \ --format ${GOTESTSUM_FORMAT:-pkgname-and-test-fails} \ --no-summary=skipped \ - --packages ./bundle/fuzz/... \ - -- -timeout=${LOCAL_TIMEOUT:-30m} + --packages ./acceptance/... \ + -- -timeout=${LOCAL_TIMEOUT:-30m} -run "TestAccept/bundle/invariant/fuzz" # --- Integration tests --- diff --git a/acceptance/bin/gen_fuzz_config.py b/acceptance/bin/gen_fuzz_config.py new file mode 100755 index 0000000000..85909bb03f --- /dev/null +++ b/acceptance/bin/gen_fuzz_config.py @@ -0,0 +1,207 @@ +#!/usr/bin/env python3 +""" +Generate a random bundle config from the bundle JSON schema. 
+ +The generator walks the schema (`databricks bundle schema`), resolving $ref and +picking concrete branches of oneOf/anyOf, and emits a single random resource as a +databricks.yml. It is seeded so a failing run can be reproduced with the same --seed. + +This feeds the invariant tests (see acceptance/bundle/invariant/): the harness +deploys the generated config and asserts invariants such as no-drift. Configs the +CLI rejects are filtered out by the harness before invariants are checked, so the +generator is free to produce structurally-random-but-sometimes-invalid configs. +""" + +import argparse +import json +import random +import sys + +# Maximum object/array nesting depth. The schema is recursive (e.g. job tasks -> +# for_each_task -> task), so without a cap the walk would not terminate. +MAX_DEPTH = 6 + +# A string branch whose pattern matches a ${...} reference. These exist because the +# schema generator wraps every concrete field in a oneOf with interpolation-string +# alternatives (see bundle/internal/schema/main.go addInterpolationPatterns). We +# generate concrete values, not references, so these branches are skipped. +INTERPOLATION_MARKER = "\\$\\{" + + +class Generator: + def __init__(self, schema, rng, unique): + self.root = schema + self.rng = rng + self.unique = unique + + def resolve(self, schema): + # Follow $ref chains. A ref looks like "#/$defs/github.com/.../resources.Job"; + # definitions are nested under $defs by the "/"-separated path segments. + while isinstance(schema, dict) and "$ref" in schema: + cur = self.root["$defs"] + for part in schema["$ref"].split("/")[2:]: + cur = cur[part] + schema = cur + return schema + + def is_interpolation(self, branch): + return branch.get("type") == "string" and INTERPOLATION_MARKER in branch.get("pattern", "") + + def choose_branch(self, branches): + # Prefer concrete branches over the ${...} interpolation-string alternatives. 
+ concrete = [b for b in branches if not self.is_interpolation(b)] + return self.rng.choice(concrete or branches) + + def gen(self, schema, depth, name=""): + schema = self.resolve(schema) + if not isinstance(schema, dict) or not schema: + return self.gen_scalar({"type": "string"}, name) + + if "const" in schema: + return schema["const"] + if schema.get("enum"): + return self.rng.choice(schema["enum"]) + + for key in ("oneOf", "anyOf"): + if schema.get(key): + return self.gen(self.choose_branch(schema[key]), depth, name) + + t = schema.get("type") + if t == "object" or "properties" in schema or self.is_map(schema): + return self.gen_object(schema, depth) + if t == "array": + return self.gen_array(schema, depth, name) + return self.gen_scalar(schema, name) + + def is_map(self, schema): + return isinstance(schema.get("additionalProperties"), dict) and not schema.get("properties") + + def gen_object(self, schema, depth): + props = schema.get("properties", {}) + required = set(schema.get("required", [])) + result = {} + + for prop_name, prop_schema in props.items(): + # Always emit required fields; emit each optional one with a fixed 35% chance, + # and none at all once depth reaches MAX_DEPTH, to keep configs from exploding. + keep = prop_name in required or (depth < MAX_DEPTH and self.rng.random() < 0.35) + if not keep: + continue + value = self.gen(prop_schema, depth + 1, prop_name) + if value is not None: + result[prop_name] = value + + # Map type (additionalProperties schema, no fixed properties): synthesize one + # or two random keys, e.g. for a per-type resource map or string maps like tags. 
+ if self.is_map(schema): + for _ in range(self.rng.randint(1, 2)): + key = self.token() + result[key] = self.gen(schema["additionalProperties"], depth + 1, key) + + return result + + def gen_array(self, schema, depth, name): + items = schema.get("items") + if not items or depth >= MAX_DEPTH: + return [] + return [self.gen(items, depth + 1, name) for _ in range(self.rng.randint(1, 3))] + + def gen_scalar(self, schema, name): + t = schema.get("type") + if t == "boolean": + return self.rng.choice([True, False]) + if t == "integer": + return self.rng.choice([0, 1, self.rng.randint(2, 1000)]) + if t == "number": + return round(self.rng.uniform(0, 1000), 2) + # string (default) + if name in ("name", "display_name"): + return f"fuzz-{name}-{self.unique}" + return self.token() + + def token(self): + return "fuzz_" + "".join(self.rng.choice("abcdefghijklmnopqrstuvwxyz0123456789") for _ in range(8)) + + +def resource_types(schema, gen): + # resources is `oneOf[{object with one property per resource type}]`. + resources = gen.resolve(schema["properties"]["resources"]) + obj = next(b for b in resources["oneOf"] if b.get("type") == "object") + return obj["properties"] + + +def gen_config(schema, seed, unique, allowed): + rng = random.Random(seed) + gen = Generator(schema, rng, unique) + + types = resource_types(schema, gen) + candidates = [t for t in types if not allowed or t in allowed] + if not candidates: + sys.exit(f"no resource types to generate from (allowed={sorted(allowed)})") + rtype = rng.choice(sorted(candidates)) + + # Each resource type is a map ref; its element schema lives under the object + # branch's additionalProperties. 
+ map_schema = gen.resolve(types[rtype]) + obj = next(b for b in map_schema["oneOf"] if b.get("type") == "object") + element = obj["additionalProperties"] + + key = f"fuzz_{rtype}_{seed}" + instance = gen.gen(element, 0, "name") + return { + "bundle": {"name": f"fuzz-{unique}"}, + "resources": {rtype: {key: instance}}, + } + + +def to_yaml(obj, indent=0, list_item=False): + pad = " " * indent + if isinstance(obj, dict): + if not obj: + return f"{pad}{{}}\n" if not list_item else f"{pad}- {{}}\n" + out = "" + first = True + for k, v in obj.items(): + prefix = pad + "- " if list_item and first else (pad + " " if list_item else pad) + child_indent = indent + 2 if list_item else indent + 1 + if isinstance(v, (dict, list)) and v: + out += f"{prefix}{k}:\n" + to_yaml(v, child_indent) + else: + out += f"{prefix}{k}: {json.dumps(v)}\n" + first = False + return out + if isinstance(obj, list): + if not obj: + return f"{pad}[]\n" + out = "" + for item in obj: + if isinstance(item, (dict, list)): + out += to_yaml(item, indent, list_item=True) + else: + out += f"{pad}- {json.dumps(item)}\n" + return out + return f"{pad}{json.dumps(obj)}\n" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--schema", required=True, help="Path to bundle JSON schema") + parser.add_argument("--seed", type=int, required=True, help="RNG seed (for reproducibility)") + parser.add_argument("--unique", default="local", help="Unique suffix for resource names") + parser.add_argument( + "--resources", + default="", + help="Comma-separated allow-list of resource types (default: all)", + ) + args = parser.parse_args() + + with open(args.schema) as f: + schema = json.load(f) + + allowed = {r.strip() for r in args.resources.split(",") if r.strip()} + config = gen_config(schema, args.seed, args.unique, allowed) + sys.stdout.write(to_yaml(config)) + + +if __name__ == "__main__": + main() diff --git a/acceptance/bundle/invariant/fuzz/out.test.toml 
b/acceptance/bundle/invariant/fuzz/out.test.toml new file mode 100644 index 0000000000..789aa10c79 --- /dev/null +++ b/acceptance/bundle/invariant/fuzz/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = true +RequiresUnityCatalog = true +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.INPUT_CONFIG = [] diff --git a/acceptance/bundle/invariant/fuzz/output.txt b/acceptance/bundle/invariant/fuzz/output.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script new file mode 100644 index 0000000000..f7751dde58 --- /dev/null +++ b/acceptance/bundle/invariant/fuzz/script @@ -0,0 +1,62 @@ +# Invariant to test: the CLI never panics or hits an internal error on any config +# generated from the bundle schema, and a config that deploys cleanly has no drift. +# +# gen_fuzz_config.py walks the schema emitted by the CLI under test and produces a +# random-but-schema-valid config. Most invariant work is shared with the no_drift +# test; the difference is the input is generated, not a curated template. +# +# Seeds form a window [START, START+COUNT). The window is env-driven so the nightly +# job can sweep a wide, non-overlapping range (see Taskfile.yml test-fuzz) while this +# committed test stays small and deterministic. Everything is routed to LOG.* / *.json +# so output.txt stays empty regardless of the window: a violation fails via exit code, +# not via output diff, which is what lets the same test run under any seed window. +# +# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a freshly deployed random config can +# legitimately differ from the fake server's state, so the local/PR run asserts only +# the cheap no-panic invariant. The nightly job enables drift on a real workspace. + +START="${FUZZ_SEED_START:-0}" +COUNT="${FUZZ_SEED_COUNT:-5}" + +# Emit the schema from the CLI under test so the generator always matches it. 
+$CLI bundle schema > schema.json 2>LOG.schema.err +cat LOG.schema.err | contains.py '!panic' '!internal error' > /dev/null + +for ((offset = 0; offset < COUNT; offset++)); do + seed=$((START + offset)) + dir="seed-$seed" + mkdir -p "$dir" + + gen_fuzz_config.py --schema schema.json --seed "$seed" --unique "$UNIQUE_NAME-$seed" --resources "${FUZZ_RESOURCES:-}" > "$dir/databricks.yml" 2>"$dir/LOG.gen.err" + cat "$dir/LOG.gen.err" | contains.py '!Traceback' > /dev/null + + ( + cd "$dir" + + # The CLI is allowed to reject a generated config, but never to crash. + set +e + $CLI bundle validate &> LOG.validate + $CLI bundle deploy &> LOG.deploy + deploy_rc=$? + set -e + cat LOG.validate LOG.deploy | contains.py '!panic' '!internal error' > /dev/null + + # Deploy failed => config was rejected (not a bug). This is the negative of + # the no_drift test's INPUT_CONFIG_OK marker: nothing more to assert. + if [ "$deploy_rc" -ne 0 ]; then + exit 0 + fi + + if [ -n "${FUZZ_CHECK_DRIFT:-}" ]; then + $CLI bundle plan -o json > plan.json 2>LOG.plan.err + cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null + verify_no_drift.py plan.json + fi + + $CLI bundle destroy --auto-approve &> LOG.destroy + cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null + ) || { + echo "fuzz: invariant failed, reproduce with: FUZZ_SEED_START=$seed FUZZ_SEED_COUNT=1 task test-fuzz" >&2 + exit 1 + } +done diff --git a/acceptance/bundle/invariant/fuzz/test.toml b/acceptance/bundle/invariant/fuzz/test.toml new file mode 100644 index 0000000000..019d2dc649 --- /dev/null +++ b/acceptance/bundle/invariant/fuzz/test.toml @@ -0,0 +1,5 @@ +# Schema fuzzing: generate random configs from the bundle schema and assert +# invariants (see script). Unlike the curated-corpus invariant tests (no_drift, +# migrate), the fuzzer generates its own configs, so drop the inherited +# INPUT_CONFIG matrix. 
+EnvMatrix.INPUT_CONFIG = [] diff --git a/bundle/fuzz/deploy_smoke_test.go b/bundle/fuzz/deploy_smoke_test.go deleted file mode 100644 index 0121c7468e..0000000000 --- a/bundle/fuzz/deploy_smoke_test.go +++ /dev/null @@ -1,21 +0,0 @@ -package fuzz - -import ( - "testing" - - "github.com/stretchr/testify/require" -) - -// TestCaptureJobCreateDirect is intentionally NOT opt-in gated: a single direct -// deploy is cheap, so it runs on every `task test` as a smoke test of the capture -// harness and the invariants. The wider seed sweep stays opt-in via -// requireFuzzOptIn. -func TestCaptureJobCreateDirect(t *testing.T) { - job := generateJob(newRNG(1)) - - body, err := captureJobCreate(t.Context(), t, job) - require.NoError(t, err) - require.NotEmpty(t, body) - - checkJobInvariants(t, 1, job, body) -} diff --git a/bundle/fuzz/deploy_test.go b/bundle/fuzz/deploy_test.go deleted file mode 100644 index ddf8d4342b..0000000000 --- a/bundle/fuzz/deploy_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package fuzz - -import ( - "context" - "encoding/json" - "errors" - "fmt" - "os" - "path/filepath" - "testing" - - "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/cli/internal/testcli" - "github.com/databricks/cli/libs/testserver" -) - -const ( - // bundleResourceKey is the map key the generated job is registered under. - bundleResourceKey = "fuzz_job" - fakeToken = "testtoken" -) - -// errInvalidConfig marks a generated config that `bundle validate` rejects. The -// caller skips on it: an invalid config can't violate an invariant, so it is not a -// bug. This is the distinction that makes the suite safe to point at a looser -// (e.g. schema-driven) generator, which will produce invalid configs by design. -var errInvalidConfig = errors.New("config did not validate") - -// captureJobCreate validates then deploys a bundle containing job via the direct -// engine against an in-process testserver, returning the create request body sent -// to the Jobs API. 
A validation failure is wrapped as errInvalidConfig. The -// invariant suite asserts properties of the payload; the terraform engine is not -// involved (we assert fundamental properties rather than compare engines). -func captureJobCreate(ctx context.Context, t *testing.T, job *resources.Job) (json.RawMessage, error) { - rec := &recorder{} - server := testserver.New(t) - server.RequestCallback = rec.callback - testserver.AddDefaultHandlers(server) - - dir := t.TempDir() - if err := writeJobBundle(dir, server.URL, job); err != nil { - return nil, err - } - - t.Setenv("DATABRICKS_HOST", server.URL) - t.Setenv("DATABRICKS_TOKEN", fakeToken) - t.Setenv("DATABRICKS_BUNDLE_ENGINE", "direct") - t.Chdir(dir) - - // Validate first so an invalid config is reported as errInvalidConfig (caller - // skips) rather than a deploy failure (caller fails). - if _, stderr, err := testcli.NewRunner(t, ctx, "bundle", "validate").Run(); err != nil { - return nil, fmt.Errorf("%w: %v\nstderr:\n%s", errInvalidConfig, err, stderr.String()) - } - - stdout, stderr, err := testcli.NewRunner(t, ctx, "bundle", "deploy").Run() - if err != nil { - return nil, fmt.Errorf("bundle deploy failed: %w\nstdout:\n%s\nstderr:\n%s", - err, stdout.String(), stderr.String()) - } - - body, ok := rec.find("POST", jobsCreatePath) - if !ok { - return nil, fmt.Errorf("deploy did not POST %s", jobsCreatePath) - } - return body, nil -} - -// writeJobBundle writes a minimal databricks.yml for a single job. It emits JSON -// (valid YAML) to reuse the job's own marshaling, which honors ForceSendFields. 
-func writeJobBundle(dir, host string, job *resources.Job) error { - jobJSON, err := json.Marshal(job) - if err != nil { - return fmt.Errorf("marshaling job: %w", err) - } - - var jobMap map[string]any - if err := json.Unmarshal(jobJSON, &jobMap); err != nil { - return fmt.Errorf("unmarshaling job: %w", err) - } - - doc := map[string]any{ - "bundle": map[string]any{"name": "fuzz"}, - "workspace": map[string]any{"host": host}, - "resources": map[string]any{ - "jobs": map[string]any{bundleResourceKey: jobMap}, - }, - } - - data, err := json.MarshalIndent(doc, "", " ") - if err != nil { - return fmt.Errorf("marshaling bundle: %w", err) - } - - return os.WriteFile(filepath.Join(dir, "databricks.yml"), data, 0o600) -} - -// fuzzOptInVars opt a run into the invariant suite. FUZZ_SEED(S)/OFFSET also tune -// it (see invariantSeeds); FUZZ_INVARIANTS is a no-tuning switch for `task test-fuzz`. -var fuzzOptInVars = []string{"FUZZ_INVARIANTS", "FUZZ_SEED", "FUZZ_SEEDS", "FUZZ_SEED_OFFSET"} - -// requireFuzzOptIn skips unless a FUZZ_* var is set. Each seed runs a real -// in-process deploy, so gating keeps a plain `task test` fast (the single -// un-gated direct smoke test still exercises the harness on every run). -func requireFuzzOptIn(t testing.TB) { - for _, name := range fuzzOptInVars { - if os.Getenv(name) != "" { - return - } - } - t.Skip("invariant fuzz suite is opt-in; run `task test-fuzz` or set FUZZ_SEED= to reproduce a single seed") -} diff --git a/bundle/fuzz/doc.go b/bundle/fuzz/doc.go deleted file mode 100644 index 59b0417096..0000000000 --- a/bundle/fuzz/doc.go +++ /dev/null @@ -1,11 +0,0 @@ -// Package fuzz deploys randomly generated bundle resources through the direct -// engine and asserts invariants that any valid config's API create payload must -// satisfy (e.g. task keys are preserved, references resolve, a new_cluster is -// sized by autoscale or num_workers but not both). 
Unlike a terraform/direct -// payload comparison, an invariant has no legitimate reason to fail, so a failure -// is a real bug. Generators are seeded so any failure reproduces from the printed -// seed. Jobs only for now. -// -// Everything lives in _test.go files: the package is test-only and nothing in the -// product imports it. This file exists only to carry the package doc. -package fuzz diff --git a/bundle/fuzz/fuzz_test.go b/bundle/fuzz/fuzz_test.go deleted file mode 100644 index f8d758e88a..0000000000 --- a/bundle/fuzz/fuzz_test.go +++ /dev/null @@ -1,154 +0,0 @@ -package fuzz - -import ( - "encoding/json" - "errors" - "os" - "strconv" - "strings" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// defaultInvariantSeeds is how many random jobs TestJobInvariants checks by -// default. Each seed runs a real deploy, so keep it modest; override with -// FUZZ_SEEDS. -const defaultInvariantSeeds = 20 - -// regressionSeeds are seeds that previously broke an invariant. They are always -// checked (on top of the rotating nightly window, which never revisits them) so a -// fixed bug can't silently regress. When the nightly job reports a new failing -// FUZZ_SEED, add it here in the PR that fixes it. Empty until the first such bug. -var regressionSeeds = []int64{} - -// TestJobInvariants asserts the engine produces a create payload satisfying the -// invariants in checkJobInvariants for many random jobs, printing the seed on -// failure. -func TestJobInvariants(t *testing.T) { - requireFuzzOptIn(t) - - for _, seed := range invariantSeeds(t) { - t.Run("seed="+strconv.FormatInt(seed, 10), func(t *testing.T) { - checkJob(t, seed) - }) - } -} - -// invariantSeeds returns the seeds TestJobInvariants should check. -// -// FUZZ_SEED (comma-separated) runs exactly those seeds and overrides everything, -// so a reported failure reproduces with one command. 
Otherwise it runs -// regressionSeeds plus FUZZ_SEEDS seeds (default defaultInvariantSeeds) from -// FUZZ_SEED_OFFSET; the nightly job shifts the offset each run so CI keeps -// exploring new configs. -func invariantSeeds(t *testing.T) []int64 { - if v := os.Getenv("FUZZ_SEED"); v != "" { - var seeds []int64 - for part := range strings.SplitSeq(v, ",") { - part = strings.TrimSpace(part) - if part == "" { - continue - } - n, err := strconv.ParseInt(part, 10, 64) - require.NoErrorf(t, err, "invalid FUZZ_SEED entry %q", part) - seeds = append(seeds, n) - } - require.NotEmptyf(t, seeds, "FUZZ_SEED=%q contained no seeds", v) - return seeds - } - - count := defaultInvariantSeeds - if v := os.Getenv("FUZZ_SEEDS"); v != "" { - n, err := strconv.Atoi(v) - require.NoErrorf(t, err, "invalid FUZZ_SEEDS=%q", v) - require.Positivef(t, n, "FUZZ_SEEDS must be positive, got %d", n) - count = n - } - - var offset int64 - if v := os.Getenv("FUZZ_SEED_OFFSET"); v != "" { - n, err := strconv.ParseInt(v, 10, 64) - require.NoErrorf(t, err, "invalid FUZZ_SEED_OFFSET=%q", v) - offset = n - } - - seeds := make([]int64, 0, len(regressionSeeds)+count) - seen := make(map[int64]bool, len(regressionSeeds)+count) - for _, s := range regressionSeeds { - if !seen[s] { - seen[s] = true - seeds = append(seeds, s) - } - } - for i := range int64(count) { - s := offset + i - if !seen[s] { - seen[s] = true - seeds = append(seeds, s) - } - } - return seeds -} - -// TestInvariantSeeds verifies invariantSeeds composes the regression seeds with -// the rotating window, deduplicates overlaps, and lets FUZZ_SEED override both. -func TestInvariantSeeds(t *testing.T) { - // Isolate from ambient FUZZ_* in the dev environment (invariantSeeds treats "" - // as unset); subtests set only what they need. 
- t.Setenv("FUZZ_SEED", "") - t.Setenv("FUZZ_SEEDS", "") - t.Setenv("FUZZ_SEED_OFFSET", "") - - t.Run("default is regression seeds then the window", func(t *testing.T) { - t.Setenv("FUZZ_SEEDS", "3") - t.Setenv("FUZZ_SEED_OFFSET", "100") - want := append(append([]int64{}, regressionSeeds...), 100, 101, 102) - assert.Equal(t, want, invariantSeeds(t)) - }) - - t.Run("FUZZ_SEED override ignores regression seeds", func(t *testing.T) { - t.Setenv("FUZZ_SEED", "7, 8") - assert.Equal(t, []int64{7, 8}, invariantSeeds(t)) - }) -} - -// FuzzJobInvariants exposes the invariant check to Go's native fuzzer. Each input -// runs a real deploy, so it's for ad-hoc deep runs, not the default test path. -func FuzzJobInvariants(f *testing.F) { - requireFuzzOptIn(f) - for seed := range int64(5) { - f.Add(seed) - } - // Seed the corpus with known past failures. - for _, seed := range regressionSeeds { - f.Add(seed) - } - f.Fuzz(func(t *testing.T, seed int64) { - checkJob(t, seed) - }) -} - -// checkJob validates and deploys the seed's job, then asserts its create payload -// satisfies the invariants. It separates the two fuzzing outcomes: -// - the config doesn't validate: skip, since invalid input can't be a bug. -// - a validated config that fails to deploy or breaks an invariant: fail (a -// config the CLI accepted must deploy and produce a sound payload). 
-func checkJob(t *testing.T, seed int64) { - t.Helper() - job := generateJob(newRNG(seed)) - - payload, err := captureJobCreate(t.Context(), t, job) - if errors.Is(err, errInvalidConfig) { - t.Skipf("seed %d: config did not validate, so it can't violate an invariant: %v", seed, err) - } - require.NoErrorf(t, err, "seed %d: validated config failed to deploy", seed) - - checkJobInvariants(t, seed, job, payload) - - if t.Failed() { - jobJSON, _ := json.MarshalIndent(job, "", " ") - t.Logf("reproduce with: FUZZ_SEED=%d task test-fuzz\nonce fixed, add %d to regressionSeeds in bundle/fuzz/fuzz_test.go\n%s", seed, seed, jobJSON) - } -} diff --git a/bundle/fuzz/generate_invariants_test.go b/bundle/fuzz/generate_invariants_test.go deleted file mode 100644 index 9ca3b5cc93..0000000000 --- a/bundle/fuzz/generate_invariants_test.go +++ /dev/null @@ -1,47 +0,0 @@ -package fuzz - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestGenerateJobIsDeterministic(t *testing.T) { - a := generateJob(newRNG(42)) - b := generateJob(newRNG(42)) - assert.Equal(t, a, b, "same seed must produce identical job") -} - -func TestGenerateJobIsWellFormed(t *testing.T) { - for seed := range int64(200) { - job := generateJob(newRNG(seed)) - require.NotEmptyf(t, job.Name, "seed %d: job must have a name", seed) - require.NotEmptyf(t, job.Tasks, "seed %d: job must have at least one task", seed) - - clusterKeys := map[string]bool{} - for _, jc := range job.JobClusters { - clusterKeys[jc.JobClusterKey] = true - } - - taskKeys := map[string]bool{} - for _, task := range job.Tasks { - require.NotEmptyf(t, task.TaskKey, "seed %d: task must have a key", seed) - taskKeys[task.TaskKey] = true - - // A task referencing a job cluster must reference one we generated. 
- if task.JobClusterKey != "" { - assert.Containsf(t, clusterKeys, task.JobClusterKey, - "seed %d: task %q references unknown job cluster %q", seed, task.TaskKey, task.JobClusterKey) - } - } - - // Every dependency must point at a task that exists in this job. - for _, task := range job.Tasks { - for _, dep := range task.DependsOn { - assert.Containsf(t, taskKeys, dep.TaskKey, - "seed %d: task %q depends on unknown task %q", seed, task.TaskKey, dep.TaskKey) - } - } - } -} diff --git a/bundle/fuzz/generate_test.go b/bundle/fuzz/generate_test.go deleted file mode 100644 index 47abd7b7e9..0000000000 --- a/bundle/fuzz/generate_test.go +++ /dev/null @@ -1,340 +0,0 @@ -package fuzz - -import ( - "fmt" - "math/rand/v2" - "strconv" - - "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/jobs" -) - -// Value pools are intentionally small and valid-looking: the goal is to exercise -// config->payload translation across many field combinations, not to stress the -// API with invalid values the testserver would reject. -var ( - sparkVersions = []string{"13.3.x-scala2.12", "14.3.x-scala2.12", "15.4.x-scala2.12", "16.4.x-scala2.12"} - nodeTypeIDs = []string{"i3.xlarge", "m5.large", "r5.xlarge", "Standard_DS3_v2"} - timezones = []string{"UTC", "America/Los_Angeles", "Europe/Amsterdam"} - cronExprs = []string{"0 0 12 * * ?", "0 15 10 ? 
* MON-FRI", "0 0/30 * * * ?"} - pauseStatuses = []jobs.PauseStatus{jobs.PauseStatusPaused, jobs.PauseStatusUnpaused} - performance = []jobs.PerformanceTarget{jobs.PerformanceTargetPerformanceOptimized, jobs.PerformanceTargetStandard} - timeUnits = []string{"HOURS", "DAYS", "WEEKS"} - healthMetrics = []string{"RUN_DURATION_SECONDS", "STREAMING_BACKLOG_BYTES", "STREAMING_BACKLOG_RECORDS"} - conditionOps = []string{"EQUAL_TO", "NOT_EQUAL", "GREATER_THAN", "LESS_THAN_OR_EQUAL"} - runIfs = []string{"ALL_SUCCESS", "AT_LEAST_ONE_SUCCESS", "NONE_FAILED", "ALL_DONE"} - gitProviders = []jobs.GitProvider{jobs.GitProviderGitHub, jobs.GitProviderGitLab, jobs.GitProviderAzureDevOpsServices} -) - -// generateJob builds a random, well-formed job config driven entirely by rng, so -// the same seed always produces the same job. It favors fields whose -// config->payload translation is non-trivial (clusters, scheduling, references). -// -// TODO: generalize the harness across resource kinds. -func generateJob(rng *rand.Rand) *resources.Job { - job := &resources.Job{} - job.Name = randName(rng, "job") - - if chance(rng, 0.5) { - job.Description = randSentence(rng) - } - if chance(rng, 0.4) { - job.MaxConcurrentRuns = rng.IntN(10) + 1 - } - if chance(rng, 0.4) { - job.TimeoutSeconds = rng.IntN(7200) - } - if chance(rng, 0.3) { - job.PerformanceTarget = oneOf(rng, performance) - } - if chance(rng, 0.5) { - job.Tags = randTags(rng) - } - if chance(rng, 0.3) { - job.GitSource = randGitSource(rng) - } - - randScheduling(rng, job) - - if chance(rng, 0.3) { - job.EmailNotifications = randEmailNotifications(rng) - } - if chance(rng, 0.2) { - job.WebhookNotifications = randWebhookNotifications(rng) - } - if chance(rng, 0.3) { - job.NotificationSettings = &jobs.JobNotificationSettings{ - NoAlertForCanceledRuns: chance(rng, 0.5), - NoAlertForSkippedRuns: chance(rng, 0.5), - } - } - if chance(rng, 0.3) { - job.Health = randHealth(rng) - } - if chance(rng, 0.3) { - job.Parameters = 
randParameters(rng) - } - if chance(rng, 0.3) { - job.Queue = &jobs.QueueSettings{Enabled: chance(rng, 0.5)} - } - - // Generate shared job clusters first so tasks can reference them by key. - var jobClusterKeys []string - if chance(rng, 0.5) { - n := rng.IntN(2) + 1 - for i := range n { - key := fmt.Sprintf("cluster_%d", i) - jobClusterKeys = append(jobClusterKeys, key) - job.JobClusters = append(job.JobClusters, jobs.JobCluster{ - JobClusterKey: key, - NewCluster: randClusterSpec(rng), - }) - } - } - - nTasks := rng.IntN(3) + 1 - var taskKeys []string - for i := range nTasks { - task := randTask(rng, i, jobClusterKeys) - // Randomly chain dependencies onto previously generated tasks. - if len(taskKeys) > 0 && chance(rng, 0.4) { - dep := taskKeys[rng.IntN(len(taskKeys))] - task.DependsOn = []jobs.TaskDependency{{TaskKey: dep}} - if chance(rng, 0.5) { - task.RunIf = jobs.RunIf(oneOf(rng, runIfs)) - } - } - taskKeys = append(taskKeys, task.TaskKey) - job.Tasks = append(job.Tasks, task) - } - - return job -} - -// randScheduling sets at most one of schedule/trigger/continuous, which are -// mutually exclusive ways to launch a job. 
-func randScheduling(rng *rand.Rand, job *resources.Job) { - switch rng.IntN(5) { - case 0: - job.Schedule = &jobs.CronSchedule{ - QuartzCronExpression: oneOf(rng, cronExprs), - TimezoneId: oneOf(rng, timezones), - PauseStatus: oneOf(rng, pauseStatuses), - } - case 1: - job.Trigger = &jobs.TriggerSettings{ - PauseStatus: oneOf(rng, pauseStatuses), - Periodic: &jobs.PeriodicTriggerConfiguration{ - Interval: rng.IntN(12) + 1, - Unit: jobs.PeriodicTriggerConfigurationTimeUnit(oneOf(rng, timeUnits)), - }, - } - case 2: - job.Trigger = &jobs.TriggerSettings{ - PauseStatus: oneOf(rng, pauseStatuses), - FileArrival: &jobs.FileArrivalTriggerConfiguration{ - Url: "s3://" + randWord(rng) + "/" + randWord(rng), - }, - } - case 3: - job.Continuous = &jobs.Continuous{PauseStatus: oneOf(rng, pauseStatuses)} - default: - // no scheduling - } -} - -func randTask(rng *rand.Rand, idx int, jobClusterKeys []string) jobs.Task { - task := jobs.Task{TaskKey: fmt.Sprintf("task_%d", idx)} - - // Use absolute workspace paths so deploy never depends on local files. - // condition_task needs no compute, handled separately below. 
- needsCompute := true - switch rng.IntN(4) { - case 0: - task.NotebookTask = &jobs.NotebookTask{ - NotebookPath: "/Workspace/Users/test/" + randName(rng, "nb"), - Source: jobs.SourceWorkspace, - } - case 1: - task.SparkPythonTask = &jobs.SparkPythonTask{ - PythonFile: "/Workspace/Users/test/" + randName(rng, "main") + ".py", - Source: jobs.SourceWorkspace, - } - case 2: - task.PythonWheelTask = &jobs.PythonWheelTask{ - PackageName: randName(rng, "pkg"), - EntryPoint: "main", - } - case 3: - task.ConditionTask = &jobs.ConditionTask{ - Left: randWord(rng), - Op: jobs.ConditionTaskOp(oneOf(rng, conditionOps)), - Right: randWord(rng), - } - needsCompute = false - } - - if needsCompute { - assignCompute(rng, &task, jobClusterKeys) - if chance(rng, 0.4) { - task.Libraries = randLibraries(rng) - } - } - - if chance(rng, 0.3) { - task.TimeoutSeconds = rng.IntN(3600) - } - if chance(rng, 0.3) { - task.MaxRetries = rng.IntN(5) - task.MinRetryIntervalMillis = rng.IntN(60000) - task.RetryOnTimeout = chance(rng, 0.5) - } - return task -} - -// assignCompute attaches exactly one compute source: a shared job cluster (when -// available), a new cluster, or an existing cluster id. 
-func assignCompute(rng *rand.Rand, task *jobs.Task, jobClusterKeys []string) { - const ( - computeNew = iota - computeExisting - computeShared - ) - options := []int{computeNew, computeExisting} - if len(jobClusterKeys) > 0 { - options = append(options, computeShared) - } - switch oneOf(rng, options) { - case computeNew: - spec := randClusterSpec(rng) - task.NewCluster = &spec - case computeExisting: - task.ExistingClusterId = randName(rng, "cluster") - case computeShared: - task.JobClusterKey = oneOf(rng, jobClusterKeys) - } -} - -func randClusterSpec(rng *rand.Rand) compute.ClusterSpec { - spec := compute.ClusterSpec{ - SparkVersion: oneOf(rng, sparkVersions), - NodeTypeId: oneOf(rng, nodeTypeIDs), - } - if chance(rng, 0.5) { - spec.NumWorkers = rng.IntN(8) - } else { - spec.Autoscale = &compute.AutoScale{ - MinWorkers: 1, - MaxWorkers: rng.IntN(8) + 2, - } - } - if chance(rng, 0.4) { - spec.SparkConf = map[string]string{ - "spark.databricks.delta.preview.enabled": "true", - "spark.speculation": strconv.FormatBool(chance(rng, 0.5)), - } - } - if chance(rng, 0.3) { - spec.CustomTags = randTags(rng) - } - if chance(rng, 0.3) { - spec.SparkEnvVars = map[string]string{"PYSPARK_PYTHON": "/databricks/python3/bin/python3"} - } - if chance(rng, 0.3) { - spec.DriverNodeTypeId = oneOf(rng, nodeTypeIDs) - } - return spec -} - -func randGitSource(rng *rand.Rand) *jobs.GitSource { - src := &jobs.GitSource{ - GitProvider: oneOf(rng, gitProviders), - GitUrl: "https://example.com/" + randWord(rng) + "/" + randWord(rng) + ".git", - } - switch rng.IntN(3) { - case 0: - src.GitBranch = oneOf(rng, []string{"main", "develop", "release"}) - case 1: - src.GitTag = "v" + fmt.Sprintf("%d.%d.0", rng.IntN(5), rng.IntN(10)) - case 2: - src.GitCommit = fmt.Sprintf("%040x", rng.Int64()) - } - return src -} - -func randEmailNotifications(rng *rand.Rand) *jobs.JobEmailNotifications { - email := randWord(rng) + "@example.com" - n := &jobs.JobEmailNotifications{NoAlertForSkippedRuns: chance(rng, 
0.5)} - if chance(rng, 0.6) { - n.OnFailure = []string{email} - } - if chance(rng, 0.4) { - n.OnSuccess = []string{email} - } - if chance(rng, 0.3) { - n.OnStart = []string{email} - } - return n -} - -func randWebhookNotifications(rng *rand.Rand) *jobs.WebhookNotifications { - hook := []jobs.Webhook{{Id: randName(rng, "hook")}} - n := &jobs.WebhookNotifications{} - if chance(rng, 0.6) { - n.OnFailure = hook - } - if chance(rng, 0.4) { - n.OnSuccess = hook - } - return n -} - -func randHealth(rng *rand.Rand) *jobs.JobsHealthRules { - return &jobs.JobsHealthRules{ - Rules: []jobs.JobsHealthRule{ - { - Metric: jobs.JobsHealthMetric(oneOf(rng, healthMetrics)), - Op: jobs.JobsHealthOperatorGreaterThan, - Value: int64(rng.IntN(3600) + 1), - }, - }, - } -} - -func randLibraries(rng *rand.Rand) []compute.Library { - n := rng.IntN(2) + 1 - libs := make([]compute.Library, 0, n) - for range n { - switch rng.IntN(3) { - case 0: - libs = append(libs, compute.Library{Pypi: &compute.PythonPyPiLibrary{Package: randWord(rng)}}) - case 1: - libs = append(libs, compute.Library{Maven: &compute.MavenLibrary{Coordinates: "org.example:" + randWord(rng) + ":1.0.0"}}) - case 2: - libs = append(libs, compute.Library{Whl: "/Workspace/Users/test/" + randName(rng, "lib") + ".whl"}) - } - } - return libs -} - -func randParameters(rng *rand.Rand) []jobs.JobParameterDefinition { - n := rng.IntN(3) + 1 - params := make([]jobs.JobParameterDefinition, 0, n) - for i := range n { - params = append(params, jobs.JobParameterDefinition{ - Name: fmt.Sprintf("param_%d", i), - Default: randWord(rng), - }) - } - return params -} - -func randTags(rng *rand.Rand) map[string]string { - n := rng.IntN(3) + 1 - tags := make(map[string]string, n) - for i := range n { - tags[fmt.Sprintf("tag_%d", i)] = randWord(rng) - } - return tags -} diff --git a/bundle/fuzz/invariants_cases_test.go b/bundle/fuzz/invariants_cases_test.go deleted file mode 100644 index ea14c45f50..0000000000 --- 
a/bundle/fuzz/invariants_cases_test.go +++ /dev/null @@ -1,93 +0,0 @@ -package fuzz - -import ( - "encoding/json" - "testing" - - "github.com/databricks/cli/bundle/config/resources" - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/jobs" - "github.com/stretchr/testify/assert" -) - -// recordingT captures whether the invariant assertions failed, so the table below -// can check that a bad payload is rejected and a good one is accepted without a -// real deploy. -type recordingT struct{ failed bool } - -func (r *recordingT) Errorf(string, ...any) { r.failed = true } - -// FailNow is only reached if decodePayload errors; every case here is valid JSON, -// so record and stop the goroutine the way require would. -func (r *recordingT) FailNow() { panic("unexpected FailNow") } - -func TestCheckJobInvariants(t *testing.T) { - job := &resources.Job{ - JobSettings: jobs.JobSettings{ - Name: "j", - JobClusters: []jobs.JobCluster{ - {JobClusterKey: "shared", NewCluster: compute.ClusterSpec{}}, - }, - Tasks: []jobs.Task{ - {TaskKey: "a"}, - {TaskKey: "b"}, - }, - }, - } - - tests := []struct { - name string - payload string - wantFailed bool - }{ - { - name: "valid payload", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"num_workers":0}}],"tasks":[{"task_key":"a","job_cluster_key":"shared"},{"task_key":"b","depends_on":[{"task_key":"a"}]}]}`, - }, - { - name: "renamed job", - payload: `{"name":"other","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - wantFailed: true, - }, - { - name: "dropped task", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"}]}`, - wantFailed: true, - }, - { - name: "dangling dependency", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a"},{"task_key":"b","depends_on":[{"task_key":"ghost"}]}]}`, - wantFailed: true, - }, - { - 
name: "dangling job cluster reference", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared"}],"tasks":[{"task_key":"a","job_cluster_key":"missing"},{"task_key":"b"}]}`, - wantFailed: true, - }, - { - name: "new_cluster without explicit size is a valid single node", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"spark_version":"x"}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - }, - { - name: "single-node new_cluster with num_workers 0", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"num_workers":0}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - }, - { - name: "autoscale new_cluster", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"autoscale":{"min_workers":1,"max_workers":3}}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - }, - { - name: "new_cluster sets both autoscale and num_workers", - payload: `{"name":"j","job_clusters":[{"job_cluster_key":"shared","new_cluster":{"autoscale":{"min_workers":1,"max_workers":3},"num_workers":2}}],"tasks":[{"task_key":"a"},{"task_key":"b"}]}`, - wantFailed: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - rec := &recordingT{} - checkJobInvariants(rec, 0, job, json.RawMessage(tt.payload)) - assert.Equal(t, tt.wantFailed, rec.failed) - }) - } -} diff --git a/bundle/fuzz/invariants_test.go b/bundle/fuzz/invariants_test.go deleted file mode 100644 index 055390236a..0000000000 --- a/bundle/fuzz/invariants_test.go +++ /dev/null @@ -1,175 +0,0 @@ -package fuzz - -import ( - "bytes" - "encoding/json" - "fmt" - - "github.com/databricks/cli/bundle/config/resources" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -// checkJobInvariants asserts the properties that any valid job's create payload -// must satisfy, independent of deploy engine. 
Unlike a terraform/direct payload -// diff, an invariant has no legitimate reason to fail, so a failure is a real bug -// and the seed reproduces it. Each invariant is checked separately so a failure -// points at the property that broke. -func checkJobInvariants(t require.TestingT, seed int64, job *resources.Job, payload json.RawMessage) { - p, err := decodePayload(payload) - require.NoErrorf(t, err, "seed %d: decoding create payload", seed) - - nameMatchesConfig(t, seed, job, p) - taskKeysMatchConfig(t, seed, job, p) - dependenciesResolve(t, seed, p) - jobClusterKeysMatchConfig(t, seed, job, p) - taskClusterRefsResolve(t, seed, p) - newClustersSizedExclusively(t, seed, p) -} - -// nameMatchesConfig: the engine must not rename the job. -func nameMatchesConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { - assert.Equalf(t, job.Name, p["name"], "seed %d: payload name must match config", seed) -} - -// taskKeysMatchConfig: the payload must carry exactly the tasks from config, no -// more and no fewer, identified by task_key. -func taskKeysMatchConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { - want := make([]string, 0, len(job.Tasks)) - for _, task := range job.Tasks { - want = append(want, task.TaskKey) - } - assert.ElementsMatchf(t, want, taskKeys(p), "seed %d: payload task keys must match config", seed) -} - -// dependenciesResolve: every depends_on must point at a task in the same payload. -func dependenciesResolve(t require.TestingT, seed int64, p map[string]any) { - keys := sliceToSet(taskKeys(p)) - for _, task := range payloadTasks(p) { - for _, dep := range slice(task["depends_on"]) { - d, ok := dep.(map[string]any) - if !ok { - continue - } - assert.Containsf(t, keys, d["task_key"], - "seed %d: task %v depends on unknown task %v", seed, task["task_key"], d["task_key"]) - } - } -} - -// jobClusterKeysMatchConfig: the payload's shared job clusters must match config. 
-func jobClusterKeysMatchConfig(t require.TestingT, seed int64, job *resources.Job, p map[string]any) { - want := make([]string, 0, len(job.JobClusters)) - for _, jc := range job.JobClusters { - want = append(want, jc.JobClusterKey) - } - assert.ElementsMatchf(t, want, jobClusterKeys(p), "seed %d: payload job cluster keys must match config", seed) -} - -// taskClusterRefsResolve: a task referencing a shared cluster must reference one -// declared in job_clusters. -func taskClusterRefsResolve(t require.TestingT, seed int64, p map[string]any) { - keys := sliceToSet(jobClusterKeys(p)) - for _, task := range payloadTasks(p) { - ref, ok := task["job_cluster_key"].(string) - if !ok || ref == "" { - continue - } - assert.Containsf(t, keys, ref, - "seed %d: task %v references unknown job cluster %q", seed, task["task_key"], ref) - } -} - -// newClustersSizedExclusively: a new_cluster is sized either by autoscale or by a -// fixed num_workers, never both. The two are mutually exclusive cluster shapes, so -// an engine emitting both (e.g. force-sending num_workers onto an autoscale -// cluster) produces a payload the backend rejects. -func newClustersSizedExclusively(t require.TestingT, seed int64, p map[string]any) { - for _, c := range newClusters(p) { - _, hasAutoscale := c["autoscale"] - _, hasNumWorkers := c["num_workers"] - assert.Falsef(t, hasAutoscale && hasNumWorkers, - "seed %d: new_cluster must not set both autoscale and num_workers, got %v", seed, c) - } -} - -// decodePayload unmarshals the create body with UseNumber so large int64 values -// (job ids, spark_context_id) aren't corrupted by float64 rounding. -func decodePayload(raw json.RawMessage) (map[string]any, error) { - dec := json.NewDecoder(bytes.NewReader(raw)) - dec.UseNumber() - var p map[string]any - if err := dec.Decode(&p); err != nil { - return nil, fmt.Errorf("decoding payload: %w", err) - } - return p, nil -} - -// payloadTasks returns the payload's task objects. 
-func payloadTasks(p map[string]any) []map[string]any { - tasks := make([]map[string]any, 0, len(slice(p["tasks"]))) - for _, el := range slice(p["tasks"]) { - if m, ok := el.(map[string]any); ok { - tasks = append(tasks, m) - } - } - return tasks -} - -func taskKeys(p map[string]any) []string { - var keys []string - for _, task := range payloadTasks(p) { - if k, ok := task["task_key"].(string); ok { - keys = append(keys, k) - } - } - return keys -} - -func jobClusterKeys(p map[string]any) []string { - var keys []string - for _, el := range slice(p["job_clusters"]) { - jc, ok := el.(map[string]any) - if !ok { - continue - } - if k, ok := jc["job_cluster_key"].(string); ok { - keys = append(keys, k) - } - } - return keys -} - -// newClusters returns every new_cluster spec in the payload: one per task that -// defines its own cluster plus one per shared job cluster. -func newClusters(p map[string]any) []map[string]any { - var specs []map[string]any - for _, task := range payloadTasks(p) { - if c, ok := task["new_cluster"].(map[string]any); ok { - specs = append(specs, c) - } - } - for _, el := range slice(p["job_clusters"]) { - jc, ok := el.(map[string]any) - if !ok { - continue - } - if c, ok := jc["new_cluster"].(map[string]any); ok { - specs = append(specs, c) - } - } - return specs -} - -func slice(v any) []any { - s, _ := v.([]any) - return s -} - -func sliceToSet(s []string) map[string]bool { - set := make(map[string]bool, len(s)) - for _, v := range s { - set[v] = true - } - return set -} diff --git a/bundle/fuzz/rand_test.go b/bundle/fuzz/rand_test.go deleted file mode 100644 index 529e4da115..0000000000 --- a/bundle/fuzz/rand_test.go +++ /dev/null @@ -1,47 +0,0 @@ -package fuzz - -import ( - "fmt" - "math/rand/v2" - "strings" -) - -var words = []string{ - "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", - "india", "juliet", "kilo", "lima", "mike", "november", "oscar", "papa", -} - -// newRNG returns a deterministic RNG for the given 
seed, so any job the fuzzer -// flags can be regenerated from the printed seed alone. -func newRNG(seed int64) *rand.Rand { - return rand.New(rand.NewPCG(uint64(seed), 0)) -} - -// chance returns true with probability p (0..1). -func chance(rng *rand.Rand, p float64) bool { - return rng.Float64() < p -} - -// oneOf returns a random element of s. s must be non-empty. -func oneOf[T any](rng *rand.Rand, s []T) T { - return s[rng.IntN(len(s))] -} - -func randWord(rng *rand.Rand) string { - return oneOf(rng, words) -} - -// randName returns a deterministic-but-varied identifier with the given prefix, -// e.g. "job_alpha_4271". -func randName(rng *rand.Rand, prefix string) string { - return fmt.Sprintf("%s_%s_%d", prefix, randWord(rng), rng.IntN(10000)) -} - -func randSentence(rng *rand.Rand) string { - n := rng.IntN(4) + 2 - parts := make([]string, 0, n) - for range n { - parts = append(parts, randWord(rng)) - } - return strings.Join(parts, " ") -} diff --git a/bundle/fuzz/recorder_test.go b/bundle/fuzz/recorder_test.go deleted file mode 100644 index cfabf22721..0000000000 --- a/bundle/fuzz/recorder_test.go +++ /dev/null @@ -1,57 +0,0 @@ -package fuzz - -import ( - "encoding/json" - "sync" - - "github.com/databricks/cli/libs/testserver" -) - -// jobsCreatePath is the Jobs API route the deploy must hit on create. The -// testserver registers only this version, so posting to a different one surfaces -// as a capture failure ("did not POST"). -const jobsCreatePath = "/api/2.2/jobs/create" - -// capturedRequest is a single mutating API request observed by the testserver. -type capturedRequest struct { - Method string - Path string - Body json.RawMessage -} - -// recorder collects request bodies sent to a testserver. It is safe for -// concurrent use because the deploy may issue requests from multiple goroutines. 
-type recorder struct { - mu sync.Mutex - requests []capturedRequest -} - -func (r *recorder) callback(req *testserver.Request) { - r.mu.Lock() - defer r.mu.Unlock() - - var body json.RawMessage - if json.Valid(req.Body) { - // Copy: testserver reuses the underlying buffer across requests. - body = append(json.RawMessage(nil), req.Body...) - } - - r.requests = append(r.requests, capturedRequest{ - Method: req.Method, - Path: req.URL.Path, - Body: body, - }) -} - -// find returns the body of the first recorded request matching method and path. -func (r *recorder) find(method, path string) (json.RawMessage, bool) { - r.mu.Lock() - defer r.mu.Unlock() - - for _, req := range r.requests { - if req.Method == method && req.Path == path { - return req.Body, true - } - } - return nil, false -} From e7271d83d9f95f7e056eb4901aa0cbf5e41bcde5 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 08:52:47 +0000 Subject: [PATCH 16/24] acceptance/fuzz: clarify comments and tidy schema fuzz harness - Correct misleading comments: the nightly test-fuzz job runs the same local harness against the fake server (wider seed window + drift on), not a real workspace. - Run config generation inside the per-seed subshell so a generator crash also prints the "reproduce with" hint. - Document the schema-driven fuzz subdir in the invariant README, including that a failure is a real CLI bug and how to reproduce it. - Drop the unused name hint in gen_config (objects ignore it). 
--- .github/workflows/push.yml | 2 +- acceptance/bin/gen_fuzz_config.py | 2 +- acceptance/bundle/invariant/README.md | 6 ++++++ acceptance/bundle/invariant/fuzz/script | 10 ++++++---- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 3a3a57c726..57ee29d75a 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -374,7 +374,7 @@ jobs: needs: - cleanups - # A real deploy per seed across a wide rotating window, with the no-drift + # Sweeps a wide rotating seed window against the fake server with the no-drift # invariant on: too slow for every PR, so nightly only and not part of # test-result. (The committed acceptance fuzz test still checks the no-panic # invariant on a small fixed seed window on every PR.) diff --git a/acceptance/bin/gen_fuzz_config.py b/acceptance/bin/gen_fuzz_config.py index 85909bb03f..f016690725 100755 --- a/acceptance/bin/gen_fuzz_config.py +++ b/acceptance/bin/gen_fuzz_config.py @@ -147,7 +147,7 @@ def gen_config(schema, seed, unique, allowed): element = obj["additionalProperties"] key = f"fuzz_{rtype}_{seed}" - instance = gen.gen(element, 0, "name") + instance = gen.gen(element, 0) return { "bundle": {"name": f"fuzz-{unique}"}, "resources": {rtype: {key: instance}}, diff --git a/acceptance/bundle/invariant/README.md b/acceptance/bundle/invariant/README.md index 184d3f541c..12b87902dd 100644 --- a/acceptance/bundle/invariant/README.md +++ b/acceptance/bundle/invariant/README.md @@ -4,3 +4,9 @@ no_drift test checks that there are no actions planned after successful deploy. test will dump full JSON plan to the output. In order to add a new test, add a config to configs/ and include it in test.toml. + +The fuzz/ test is different: instead of a curated config it generates random configs +from the live `databricks bundle schema` (see fuzz/script). 
Because the schema is read +from the CLI under test, an unrelated change to a resource struct can shift a seed onto +a new config. A failure there is a real CLI bug (a panic, internal error, or drift), not +test flakiness; reproduce it with `FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz`. diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script index f7751dde58..ce12ddce64 100644 --- a/acceptance/bundle/invariant/fuzz/script +++ b/acceptance/bundle/invariant/fuzz/script @@ -13,7 +13,8 @@ # # Drift checking is opt-in (FUZZ_CHECK_DRIFT): a freshly deployed random config can # legitimately differ from the fake server's state, so the local/PR run asserts only -# the cheap no-panic invariant. The nightly job enables drift on a real workspace. +# the cheap no-panic invariant. The nightly job runs this same harness against the +# fake server with a wider seed window and drift on (see Taskfile.yml test-fuzz). START="${FUZZ_SEED_START:-0}" COUNT="${FUZZ_SEED_COUNT:-5}" @@ -27,12 +28,13 @@ for ((offset = 0; offset < COUNT; offset++)); do dir="seed-$seed" mkdir -p "$dir" - gen_fuzz_config.py --schema schema.json --seed "$seed" --unique "$UNIQUE_NAME-$seed" --resources "${FUZZ_RESOURCES:-}" > "$dir/databricks.yml" 2>"$dir/LOG.gen.err" - cat "$dir/LOG.gen.err" | contains.py '!Traceback' > /dev/null - + # Run inside the subshell so a generator crash also prints the repro hint below. ( cd "$dir" + gen_fuzz_config.py --schema ../schema.json --seed "$seed" --unique "$UNIQUE_NAME-$seed" --resources "${FUZZ_RESOURCES:-}" > databricks.yml 2>LOG.gen.err + cat LOG.gen.err | contains.py '!Traceback' > /dev/null + # The CLI is allowed to reject a generated config, but never to crash. 
set +e $CLI bundle validate &> LOG.validate From 02e64113c08d3118740b4e934aece3922f486da3 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 09:11:28 +0000 Subject: [PATCH 17/24] acceptance/fuzz: shorten and tighten comments Make the comments across the schema fuzz harness more concise while keeping the non-obvious "why" context. --- .github/workflows/push.yml | 18 ++++----- Taskfile.yml | 8 ++-- acceptance/bin/gen_fuzz_config.py | 46 +++++++++++----------- acceptance/bundle/invariant/README.md | 10 ++--- acceptance/bundle/invariant/fuzz/script | 29 ++++++-------- acceptance/bundle/invariant/fuzz/test.toml | 6 +-- 6 files changed, 53 insertions(+), 64 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 57ee29d75a..2b1d3501af 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -374,10 +374,9 @@ jobs: needs: - cleanups - # Sweeps a wide rotating seed window against the fake server with the no-drift - # invariant on: too slow for every PR, so nightly only and not part of - # test-result. (The committed acceptance fuzz test still checks the no-panic - # invariant on a small fixed seed window on every PR.) + # Wide rotating seed window with drift checking on: too slow for every PR, so + # nightly only and not part of test-result. The committed acceptance test still + # checks the no-panic invariant on a small fixed window per PR. if: ${{ github.event_name == 'schedule' }} name: "task test-fuzz" runs-on: @@ -391,7 +390,7 @@ jobs: permissions: id-token: write contents: read - # Needed by the failure-reporting step below to open/comment a tracking issue. + # Failure-reporting step opens/comments a tracking issue. issues: write steps: @@ -405,16 +404,15 @@ jobs: - name: Run tests env: - # Shift the seed window each nightly run so CI explores new configs. - # start = GITHUB_RUN_NUMBER * FUZZ_SEED_COUNT keeps windows non-overlapping - # (GITHUB_RUN_NUMBER is monotonic). 
A failure prints the failing seed. + # start = monotonic GITHUB_RUN_NUMBER * COUNT keeps each nightly window + # non-overlapping, so CI explores new configs every run. FUZZ_SEED_COUNT: "25" run: | export FUZZ_SEED_START=$(( GITHUB_RUN_NUMBER * FUZZ_SEED_COUNT )) go tool -modfile=tools/task/go.mod task test-fuzz - # Excluded from test-result, so surface failures as a GitHub issue. Reuse one - # open issue (deduped by label) so a recurring failure doesn't spam nightly. + # Not in test-result, so surface failures as an issue. Reuse one open issue + # (deduped by label) so a recurring failure doesn't spam nightly. - name: Report failure if: ${{ failure() }} env: diff --git a/Taskfile.yml b/Taskfile.yml index b2aaadb5eb..a143378838 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -703,12 +703,12 @@ tasks: test-fuzz: desc: Run schema fuzz invariant tests (random configs, direct engine) - # No `sources:` fingerprint: the seed window depends on FUZZ_* env vars Task - # can't see, so always run rather than no-op a repro or a shifted nightly window. + # No `sources:` fingerprint: the window depends on FUZZ_* env vars Task can't + # see, so always run rather than no-op a repro or shifted nightly window. cmds: - | - # Sweep a wider window than the committed acceptance run and turn on the - # no-drift invariant; a repro can narrow it with FUZZ_SEED_START/COUNT. + # Wider window than the committed run, with drift checking on; a repro can + # narrow it via FUZZ_SEED_START/COUNT. export FUZZ_SEED_COUNT="${FUZZ_SEED_COUNT:-200}" export FUZZ_CHECK_DRIFT="${FUZZ_CHECK_DRIFT:-1}" {{.GO_TOOL}} gotestsum \ diff --git a/acceptance/bin/gen_fuzz_config.py b/acceptance/bin/gen_fuzz_config.py index f016690725..1c3f53d046 100755 --- a/acceptance/bin/gen_fuzz_config.py +++ b/acceptance/bin/gen_fuzz_config.py @@ -2,14 +2,13 @@ """ Generate a random bundle config from the bundle JSON schema. 
-The generator walks the schema (`databricks bundle schema`), resolving $ref and -picking concrete branches of oneOf/anyOf, and emits a single random resource as a -databricks.yml. It is seeded so a failing run can be reproduced with the same --seed. - -This feeds the invariant tests (see acceptance/bundle/invariant/): the harness -deploys the generated config and asserts invariants such as no-drift. Configs the -CLI rejects are filtered out by the harness before invariants are checked, so the -generator is free to produce structurally-random-but-sometimes-invalid configs. +Walks the schema (`databricks bundle schema`), resolving $ref and picking concrete +branches of oneOf/anyOf, and emits one random resource as a databricks.yml. Seeded +so a failing run reproduces with the same --seed. + +Feeds the invariant tests (see acceptance/bundle/invariant/). The harness filters out +configs the CLI rejects, so the generator may emit structurally-random-but-sometimes- +invalid configs. """ import argparse @@ -17,14 +16,13 @@ import random import sys -# Maximum object/array nesting depth. The schema is recursive (e.g. job tasks -> -# for_each_task -> task), so without a cap the walk would not terminate. +# Cap nesting depth: the schema is recursive (e.g. task -> for_each_task -> task), +# so without a cap the walk would not terminate. MAX_DEPTH = 6 -# A string branch whose pattern matches a ${...} reference. These exist because the -# schema generator wraps every concrete field in a oneOf with interpolation-string -# alternatives (see bundle/internal/schema/main.go addInterpolationPatterns). We -# generate concrete values, not references, so these branches are skipped. +# Matches the ${...} interpolation-string branches the schema wraps every concrete +# field in (see bundle/internal/schema/main.go addInterpolationPatterns). We emit +# concrete values, so these branches are skipped. 
INTERPOLATION_MARKER = "\\$\\{" @@ -35,8 +33,8 @@ def __init__(self, schema, rng, unique): self.unique = unique def resolve(self, schema): - # Follow $ref chains. A ref looks like "#/$defs/github.com/.../resources.Job"; - # definitions are nested under $defs by the "/"-separated path segments. + # Follow $ref chains, e.g. "#/$defs/github.com/.../resources.Job", nested + # under $defs by "/"-separated path segments. while isinstance(schema, dict) and "$ref" in schema: cur = self.root["$defs"] for part in schema["$ref"].split("/")[2:]: @@ -48,7 +46,7 @@ def is_interpolation(self, branch): return branch.get("type") == "string" and INTERPOLATION_MARKER in branch.get("pattern", "") def choose_branch(self, branches): - # Prefer concrete branches over the ${...} interpolation-string alternatives. + # Prefer concrete branches over the ${...} alternatives. concrete = [b for b in branches if not self.is_interpolation(b)] return self.rng.choice(concrete or branches) @@ -82,8 +80,8 @@ def gen_object(self, schema, depth): result = {} for prop_name, prop_schema in props.items(): - # Always emit required fields; emit optional ones with decreasing - # probability as we go deeper to keep configs from exploding. + # Always emit required fields; emit optional ones less often as we go + # deeper to keep configs from exploding. keep = prop_name in required or (depth < MAX_DEPTH and self.rng.random() < 0.35) if not keep: continue @@ -91,8 +89,8 @@ def gen_object(self, schema, depth): if value is not None: result[prop_name] = value - # Map type (additionalProperties schema, no fixed properties): synthesize a - # few random keys, e.g. resources. or string maps like tags. + # Map type (additionalProperties, no fixed properties): synthesize a few + # random keys, e.g. resources. or string maps like tags. 
if self.is_map(schema): for _ in range(self.rng.randint(1, 2)): key = self.token() @@ -124,7 +122,7 @@ def token(self): def resource_types(schema, gen): - # resources is `oneOf[{object with one property per resource type}]`. + # resources is oneOf[{ object with one property per resource type }]. resources = gen.resolve(schema["properties"]["resources"]) obj = next(b for b in resources["oneOf"] if b.get("type") == "object") return obj["properties"] @@ -140,8 +138,8 @@ def gen_config(schema, seed, unique, allowed): sys.exit(f"no resource types to generate from (allowed={sorted(allowed)})") rtype = rng.choice(sorted(candidates)) - # Each resource type is a map ref; its element schema lives under the object - # branch's additionalProperties. + # Each resource type is a map ref; the element schema is the object branch's + # additionalProperties. map_schema = gen.resolve(types[rtype]) obj = next(b for b in map_schema["oneOf"] if b.get("type") == "object") element = obj["additionalProperties"] diff --git a/acceptance/bundle/invariant/README.md b/acceptance/bundle/invariant/README.md index 12b87902dd..defcafcf35 100644 --- a/acceptance/bundle/invariant/README.md +++ b/acceptance/bundle/invariant/README.md @@ -5,8 +5,8 @@ test will dump full JSON plan to the output. In order to add a new test, add a config to configs/ and include it in test.toml. -The fuzz/ test is different: instead of a curated config it generates random configs -from the live `databricks bundle schema` (see fuzz/script). Because the schema is read -from the CLI under test, an unrelated change to a resource struct can shift a seed onto -a new config. A failure there is a real CLI bug (a panic, internal error, or drift), not -test flakiness; reproduce it with `FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz`. +The fuzz/ test instead generates random configs from the live `databricks bundle +schema` (see fuzz/script). 
Since the schema comes from the CLI under test, an unrelated +struct change can shift a seed onto a new config. A failure is a real CLI bug (panic, +internal error, or drift), not flakiness; reproduce with +`FUZZ_SEED_START=<seed> FUZZ_SEED_COUNT=1 task test-fuzz`. diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script index ce12ddce64..84994b9f66 100644 --- a/acceptance/bundle/invariant/fuzz/script +++ b/acceptance/bundle/invariant/fuzz/script @@ -1,20 +1,16 @@ -# Invariant to test: the CLI never panics or hits an internal error on any config -# generated from the bundle schema, and a config that deploys cleanly has no drift. +# Invariant: the CLI never panics or hits an internal error on any config generated +# from the bundle schema, and a config that deploys cleanly has no drift. # -# gen_fuzz_config.py walks the schema emitted by the CLI under test and produces a -# random-but-schema-valid config. Most invariant work is shared with the no_drift -# test; the difference is the input is generated, not a curated template. +# gen_fuzz_config.py walks the schema emitted by the CLI under test to produce a +# random schema-valid config; the rest is shared with the no_drift test. # -# Seeds form a window [START, START+COUNT). The window is env-driven so the nightly -# job can sweep a wide, non-overlapping range (see Taskfile.yml test-fuzz) while this -# committed test stays small and deterministic. Everything is routed to LOG.* / *.json -# so output.txt stays empty regardless of the window: a violation fails via exit code, -# not via output diff, which is what lets the same test run under any seed window. +# Seeds form a window [START, START+COUNT), env-driven so the nightly job can sweep a +# wide non-overlapping range while this committed test stays small. All output goes to +# LOG.* / *.json so output.txt stays empty: a violation fails via exit code, not diff, +# which lets the same test run under any seed window.
# -# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a freshly deployed random config can -# legitimately differ from the fake server's state, so the local/PR run asserts only -# the cheap no-panic invariant. The nightly job runs this same harness against the -# fake server with a wider seed window and drift on (see Taskfile.yml test-fuzz). +# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a fresh random config can legitimately +# differ from the fake server's state, so the PR run only asserts no-panic. START="${FUZZ_SEED_START:-0}" COUNT="${FUZZ_SEED_COUNT:-5}" @@ -28,7 +24,7 @@ for ((offset = 0; offset < COUNT; offset++)); do dir="seed-$seed" mkdir -p "$dir" - # Run inside the subshell so a generator crash also prints the repro hint below. + # Subshell so a generator crash also prints the repro hint below. ( cd "$dir" @@ -43,8 +39,7 @@ for ((offset = 0; offset < COUNT; offset++)); do set -e cat LOG.validate LOG.deploy | contains.py '!panic' '!internal error' > /dev/null - # Deploy failed => config was rejected (not a bug). This is the negative of - # the no_drift test's INPUT_CONFIG_OK marker: nothing more to assert. + # Deploy failed => config was rejected, not a bug; nothing more to assert. if [ "$deploy_rc" -ne 0 ]; then exit 0 fi diff --git a/acceptance/bundle/invariant/fuzz/test.toml b/acceptance/bundle/invariant/fuzz/test.toml index 019d2dc649..caed93c23e 100644 --- a/acceptance/bundle/invariant/fuzz/test.toml +++ b/acceptance/bundle/invariant/fuzz/test.toml @@ -1,5 +1,3 @@ -# Schema fuzzing: generate random configs from the bundle schema and assert -# invariants (see script). Unlike the curated-corpus invariant tests (no_drift, -# migrate), the fuzzer generates its own configs, so drop the inherited -# INPUT_CONFIG matrix. +# Schema fuzzing (see script). Unlike the curated invariant tests, the fuzzer +# generates its own configs, so drop the inherited INPUT_CONFIG matrix. 
EnvMatrix.INPUT_CONFIG = [] From fdc22a132a5c7e7ae56eb1740fd7ae80ee0f0046 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 13:44:29 +0000 Subject: [PATCH 18/24] acceptance/fuzz: report nightly failures on the PR instead of an issue Mirror the integration-test flow: comment on the PR that introduced the failing commit rather than opening/deduping a tracking issue. --- .github/workflows/push.yml | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 2b1d3501af..6a7ec2bfff 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -390,8 +390,8 @@ jobs: permissions: id-token: write contents: read - # Failure-reporting step opens/comments a tracking issue. - issues: write + # Failure-reporting step comments on the PR that introduced the failing commit. + pull-requests: write steps: - name: Checkout repository and submodules @@ -411,20 +411,17 @@ jobs: export FUZZ_SEED_START=$(( GITHUB_RUN_NUMBER * FUZZ_SEED_COUNT )) go tool -modfile=tools/task/go.mod task test-fuzz - # Not in test-result, so surface failures as an issue. Reuse one open issue - # (deduped by label) so a recurring failure doesn't spam nightly. + # Not in test-result, so surface failures by commenting on the PR that + # introduced the commit under test. - name: Report failure if: ${{ failure() }} env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + COMMIT: ${{ github.sha }} run: | - gh label create fuzz-nightly \ - --description "Nightly schema fuzz invariant failures" \ - --color FBCA04 2>/dev/null || true - body=$(cat <&2 fi # This job groups the result of all the above test jobs. 
From fdae30df603149b12ee9fb8a6bdf296d0906db90 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 13:44:31 +0000 Subject: [PATCH 19/24] acceptance/fuzz: reuse the no_drift invariant check instead of duplicating it Extract the no_drift deploy/drift/destroy body into a shared no_drift.sh sourced by both the no_drift test and the fuzzer, so the invariant lives in one place and other invariant tests can be fuzzed the same way. --- acceptance/bundle/invariant/README.md | 3 +- acceptance/bundle/invariant/fuzz/script | 77 +++++++++++++-------- acceptance/bundle/invariant/no_drift.sh | 55 +++++++++++++++ acceptance/bundle/invariant/no_drift/script | 38 +--------- 4 files changed, 106 insertions(+), 67 deletions(-) create mode 100644 acceptance/bundle/invariant/no_drift.sh diff --git a/acceptance/bundle/invariant/README.md b/acceptance/bundle/invariant/README.md index defcafcf35..a3b305f4ef 100644 --- a/acceptance/bundle/invariant/README.md +++ b/acceptance/bundle/invariant/README.md @@ -6,7 +6,8 @@ test will dump full JSON plan to the output. In order to add a new test, add a config to configs/ and include it in test.toml. The fuzz/ test instead generates random configs from the live `databricks bundle -schema` (see fuzz/script). Since the schema comes from the CLI under test, an unrelated +schema` (see fuzz/script) and runs each one through the same no_drift.sh check the +no_drift test uses. Since the schema comes from the CLI under test, an unrelated struct change can shift a seed onto a new config. A failure is a real CLI bug (panic, internal error, or drift), not flakiness; reproduce with `FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz`. diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script index 84994b9f66..1af93647eb 100644 --- a/acceptance/bundle/invariant/fuzz/script +++ b/acceptance/bundle/invariant/fuzz/script @@ -2,19 +2,36 @@ # from the bundle schema, and a config that deploys cleanly has no drift. 
# # gen_fuzz_config.py walks the schema emitted by the CLI under test to produce a -# random schema-valid config; the rest is shared with the no_drift test. +# random schema-valid config; the no-drift / no-panic checks are the shared +# ../no_drift.sh body, the same one the no_drift test runs. Reusing it keeps the +# deploy/drift/destroy assertions in one place and lets other invariant tests be +# fuzzed the same way. # # Seeds form a window [START, START+COUNT), env-driven so the nightly job can sweep a # wide non-overlapping range while this committed test stays small. All output goes to -# LOG.* / *.json so output.txt stays empty: a violation fails via exit code, not diff, -# which lets the same test run under any seed window. +# LOG.* so output.txt stays empty: a violation fails via exit code, not diff, which +# lets the same test run under any seed window. # -# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a fresh random config can legitimately -# differ from the fake server's state, so the PR run only asserts no-panic. +# The CLI is free to reject a generated config; that is not a bug. ../no_drift.sh +# prints INPUT_CONFIG_OK once a config deploys cleanly, so a non-zero result before +# that marker (with no panic) means the config was rejected and is skipped, while a +# panic anywhere or a failure after the marker (drift, destroy) is a real CLI bug. +# +# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a fresh random config can deploy yet +# legitimately differ from the fake server's state, so the committed run only asserts +# no-panic and tells ../no_drift.sh to skip its drift assertion. START="${FUZZ_SEED_START:-0}" COUNT="${FUZZ_SEED_COUNT:-5}" +if [ -z "${FUZZ_CHECK_DRIFT:-}" ]; then + export SKIP_DRIFT_CHECK=1 +fi + +# no_drift.sh deploys via readplanarg, which reads READPLAN; the fuzzer doesn't use +# the saved-plan matrix, so deploy once without it (and satisfy the script's set -u). 
+export READPLAN="" + # Emit the schema from the CLI under test so the generator always matches it. $CLI bundle schema > schema.json 2>LOG.schema.err cat LOG.schema.err | contains.py '!panic' '!internal error' > /dev/null @@ -24,36 +41,36 @@ for ((offset = 0; offset < COUNT; offset++)); do dir="seed-$seed" mkdir -p "$dir" - # Subshell so a generator crash also prints the repro hint below. + # Subshell so a generator crash or shared-check failure is contained per seed. + set +e ( cd "$dir" - gen_fuzz_config.py --schema ../schema.json --seed "$seed" --unique "$UNIQUE_NAME-$seed" --resources "${FUZZ_RESOURCES:-}" > databricks.yml 2>LOG.gen.err cat LOG.gen.err | contains.py '!Traceback' > /dev/null + source "$TESTDIR/../no_drift.sh" + ) > "$dir/LOG.check" 2>&1 + rc=$? + set -e + + if [ "$rc" -eq 0 ]; then + continue + fi + + bug="" + + # A panic or internal error is a bug even when the CLI then rejects the config. + if ! cat "$dir"/LOG.validate "$dir"/LOG.deploy 2>/dev/null | contains.py '!panic' '!internal error' > /dev/null; then + bug=1 + fi + + # Failing after INPUT_CONFIG_OK means the config deployed but drifted (or destroy + # failed); failing before it with no panic just means the config was rejected. + if grep -q INPUT_CONFIG_OK "$dir/LOG.check"; then + bug=1 + fi - # The CLI is allowed to reject a generated config, but never to crash. - set +e - $CLI bundle validate &> LOG.validate - $CLI bundle deploy &> LOG.deploy - deploy_rc=$? - set -e - cat LOG.validate LOG.deploy | contains.py '!panic' '!internal error' > /dev/null - - # Deploy failed => config was rejected, not a bug; nothing more to assert. 
- if [ "$deploy_rc" -ne 0 ]; then - exit 0 - fi - - if [ -n "${FUZZ_CHECK_DRIFT:-}" ]; then - $CLI bundle plan -o json > plan.json 2>LOG.plan.err - cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null - verify_no_drift.py plan.json - fi - - $CLI bundle destroy --auto-approve &> LOG.destroy - cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null - ) || { + if [ -n "$bug" ]; then echo "fuzz: invariant failed, reproduce with: FUZZ_SEED_START=$seed FUZZ_SEED_COUNT=1 task test-fuzz" >&2 exit 1 - } + fi done diff --git a/acceptance/bundle/invariant/no_drift.sh b/acceptance/bundle/invariant/no_drift.sh new file mode 100644 index 0000000000..662b27e19f --- /dev/null +++ b/acceptance/bundle/invariant/no_drift.sh @@ -0,0 +1,55 @@ +# Shared invariant body: given a databricks.yml in the current directory, deploy it +# and assert there is no drift afterwards, with no panics / internal errors along +# the way. Sourced by no_drift/script (curated configs) and fuzz/script (random +# schema-generated configs) so the deploy/drift/destroy logic lives in one place. + +# We redirect output rather than record it because some configs that are being tested may produce warnings +trace $CLI bundle validate &> LOG.validate + +cat LOG.validate | contains.py '!panic' '!internal error' > /dev/null + +cleanup() { + # Only destroy what we deployed. A curated config always deploys, but a random + # fuzzed config may be rejected, and destroying nothing just makes extra API + # calls (which fail the local fake server on unstubbed URLs). + if [ -z "${deployed:-}" ]; then + return + fi + + trace $CLI bundle destroy --auto-approve &> LOG.destroy + cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null + + # Run cleanup script if present. The fuzzer has no named INPUT_CONFIG, so guard + # the lookup against the script's `set -u`. 
+ CLEANUP_SCRIPT="$TESTDIR/../configs/${INPUT_CONFIG:-}-cleanup.sh" + if [ -f "$CLEANUP_SCRIPT" ]; then + source "$CLEANUP_SCRIPT" &> LOG.cleanup + fi +} + +trap cleanup EXIT + +$CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err +cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null + +trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy +cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null +deployed=1 + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +# Drift is the whole point for the curated no_drift configs, but a random fuzzed +# config can deploy yet legitimately differ from the fake server's state, so the +# fuzzer sets SKIP_DRIFT_CHECK on runs where only the no-panic invariant is asserted. +if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then + # Check both text and JSON plan for no changes + # Note, expect that there maybe more than one resource unchanged + $CLI bundle plan -o json > LOG.planjson 2>LOG.planjson.err + cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null + verify_no_drift.py LOG.planjson + + $CLI bundle plan 2>LOG.plan.err | contains.py '!panic' '!internal error' 'Plan: 0 to add, 0 to change, 0 to delete' > LOG.plan + cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null +fi diff --git a/acceptance/bundle/invariant/no_drift/script b/acceptance/bundle/invariant/no_drift/script index 95ecd7cbfd..1edd686409 100644 --- a/acceptance/bundle/invariant/no_drift/script +++ b/acceptance/bundle/invariant/no_drift/script @@ -14,39 +14,5 @@ envsubst < $TESTDIR/../configs/$INPUT_CONFIG > databricks.yml cp databricks.yml LOG.config -# We redirect output rather than record it because some configs that are being tested may produce warnings -trace $CLI bundle validate &> LOG.validate - -cat LOG.validate | contains.py '!panic' '!internal error' > /dev/null - 
-cleanup() { - trace $CLI bundle destroy --auto-approve &> LOG.destroy - cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null - - # Run cleanup script if present - CLEANUP_SCRIPT="$TESTDIR/../configs/$INPUT_CONFIG-cleanup.sh" - if [ -f "$CLEANUP_SCRIPT" ]; then - source "$CLEANUP_SCRIPT" &> LOG.cleanup - fi -} - -trap cleanup EXIT - -$CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err -cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null - -trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy -cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null - -# Special message to fuzzer that generated config was fine. -# Any failures after this point will be considered as "bug detected" by fuzzer. -echo INPUT_CONFIG_OK - -# Check both text and JSON plan for no changes -# Note, expect that there maybe more than one resource unchanged -$CLI bundle plan -o json > LOG.planjson 2>LOG.planjson.err -cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null -verify_no_drift.py LOG.planjson - -$CLI bundle plan 2>LOG.plan.err | contains.py '!panic' '!internal error' 'Plan: 0 to add, 0 to change, 0 to delete' > LOG.plan -cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null +# Deploy and assert no drift. Shared with the fuzz invariant test. +source "$TESTDIR/../no_drift.sh" From c4f74c9f14181e5efed779ad262572d665d22553 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Mon, 29 Jun 2026 14:29:49 +0000 Subject: [PATCH 20/24] acceptance/fuzz: skip INPUT_CONFIG_OK marker when deploy is rejected A rejected config never deploys, so emitting the marker made the fuzzer read the re-plan's "needs create" as drift. 
--- acceptance/bundle/invariant/no_drift.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/acceptance/bundle/invariant/no_drift.sh b/acceptance/bundle/invariant/no_drift.sh index 662b27e19f..9b3746a5ac 100644 --- a/acceptance/bundle/invariant/no_drift.sh +++ b/acceptance/bundle/invariant/no_drift.sh @@ -33,7 +33,15 @@ $CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy +deploy_rc=$? cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null + +# A rejected config didn't deploy, so skip the INPUT_CONFIG_OK marker; otherwise +# the fuzzer reads the re-plan's "needs create" as drift. Curated tests run under +# `bash -e` and already aborted above, so this only fires in the fuzzer subshell. +if [ "$deploy_rc" -ne 0 ]; then + return "$deploy_rc" +fi deployed=1 # Special message to fuzzer that generated config was fine. From 314f4eefe60f30c062f51f5f34ca6ff2faf7f5c4 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 30 Jun 2026 08:27:33 +0000 Subject: [PATCH 21/24] acceptance/fuzz: propagate drift failures so the fuzzer detects them The fuzzer runs the shared no_drift.sh body with errexit off and classifies each seed from the captured exit code. The drift block ended with a no-panic check that reset $? to 0, so a config that deployed cleanly but drifted was silently treated as a pass. Accumulate the drift assertions into drift_rc and return it instead. The curated no_drift test (errexit on) is unaffected. Also make verify_no_drift.py fail cleanly on empty/unparseable plan output (when bundle plan itself failed) instead of crashing with a traceback, and tighten the fuzz harness comments. 
--- acceptance/bin/gen_fuzz_config.py | 19 ++++++--------- acceptance/bin/verify_no_drift.py | 23 ++++++++++-------- acceptance/bundle/invariant/fuzz/script | 32 +++++++++---------------- acceptance/bundle/invariant/no_drift.sh | 15 +++++++----- 4 files changed, 40 insertions(+), 49 deletions(-) diff --git a/acceptance/bin/gen_fuzz_config.py b/acceptance/bin/gen_fuzz_config.py index 1c3f53d046..672f466641 100755 --- a/acceptance/bin/gen_fuzz_config.py +++ b/acceptance/bin/gen_fuzz_config.py @@ -2,13 +2,10 @@ """ Generate a random bundle config from the bundle JSON schema. -Walks the schema (`databricks bundle schema`), resolving $ref and picking concrete -branches of oneOf/anyOf, and emits one random resource as a databricks.yml. Seeded -so a failing run reproduces with the same --seed. - -Feeds the invariant tests (see acceptance/bundle/invariant/). The harness filters out -configs the CLI rejects, so the generator may emit structurally-random-but-sometimes- -invalid configs. +Walks `databricks bundle schema` (resolving $ref, picking concrete oneOf/anyOf +branches) and emits one random resource as databricks.yml, seeded by --seed. Feeds the +invariant tests; the harness filters out configs the CLI rejects, so output may be +structurally-random but sometimes invalid. """ import argparse @@ -16,13 +13,11 @@ import random import sys -# Cap nesting depth: the schema is recursive (e.g. task -> for_each_task -> task), -# so without a cap the walk would not terminate. +# The schema is recursive (e.g. task -> for_each_task -> task); cap the walk. MAX_DEPTH = 6 -# Matches the ${...} interpolation-string branches the schema wraps every concrete -# field in (see bundle/internal/schema/main.go addInterpolationPatterns). We emit -# concrete values, so these branches are skipped. +# The ${...} interpolation branch the schema wraps every field in (see +# bundle/internal/schema/main.go addInterpolationPatterns); we emit concrete values. 
INTERPOLATION_MARKER = "\\$\\{" diff --git a/acceptance/bin/verify_no_drift.py b/acceptance/bin/verify_no_drift.py index 9b272c1ce7..19d6ed28b8 100755 --- a/acceptance/bin/verify_no_drift.py +++ b/acceptance/bin/verify_no_drift.py @@ -11,18 +11,21 @@ def check_plan(path): with open(path) as fobj: raw = fobj.read() - changes_detected = 0 - + # Empty or unparseable output means `bundle plan` itself failed; report that + # cleanly instead of crashing with a traceback. + if not raw.strip(): + sys.exit(f"{path}: empty plan output (bundle plan failed)") try: data = json.loads(raw) - for key, value in data["plan"].items(): - action = value.get("action") - if action != "skip": - print(f"Unexpected {action=} for {key}") - changes_detected += 1 - except Exception: - print(raw, flush=True) - raise + except json.JSONDecodeError as e: + sys.exit(f"{path}: invalid plan JSON: {e}\n{raw}") + + changes_detected = 0 + for key, value in data["plan"].items(): + action = value.get("action") + if action != "skip": + print(f"Unexpected {action=} for {key}") + changes_detected += 1 if changes_detected: print(raw, flush=True) diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script index 1af93647eb..634200c03b 100644 --- a/acceptance/bundle/invariant/fuzz/script +++ b/acceptance/bundle/invariant/fuzz/script @@ -1,25 +1,16 @@ -# Invariant: the CLI never panics or hits an internal error on any config generated -# from the bundle schema, and a config that deploys cleanly has no drift. +# Invariant: the CLI never panics on a schema-generated config, and a config that +# deploys cleanly has no drift. gen_fuzz_config.py produces a random schema-valid +# config; ../no_drift.sh (shared with the no_drift test) does the deploy/drift/destroy +# checks. Output goes to LOG.* so a violation fails via exit code, not diff, letting +# the same test run under any seed window [START, START+COUNT). 
# -# gen_fuzz_config.py walks the schema emitted by the CLI under test to produce a -# random schema-valid config; the no-drift / no-panic checks are the shared -# ../no_drift.sh body, the same one the no_drift test runs. Reusing it keeps the -# deploy/drift/destroy assertions in one place and lets other invariant tests be -# fuzzed the same way. +# A rejected config is not a bug: ../no_drift.sh prints INPUT_CONFIG_OK once a config +# deploys, so a non-zero result before that marker (no panic) is just a rejection, +# while a panic anywhere or a failure after it (drift, destroy) is a real bug. # -# Seeds form a window [START, START+COUNT), env-driven so the nightly job can sweep a -# wide non-overlapping range while this committed test stays small. All output goes to -# LOG.* so output.txt stays empty: a violation fails via exit code, not diff, which -# lets the same test run under any seed window. -# -# The CLI is free to reject a generated config; that is not a bug. ../no_drift.sh -# prints INPUT_CONFIG_OK once a config deploys cleanly, so a non-zero result before -# that marker (with no panic) means the config was rejected and is skipped, while a -# panic anywhere or a failure after the marker (drift, destroy) is a real CLI bug. -# -# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a fresh random config can deploy yet +# Drift checking is opt-in (FUZZ_CHECK_DRIFT): a random config can deploy yet # legitimately differ from the fake server's state, so the committed run only asserts -# no-panic and tells ../no_drift.sh to skip its drift assertion. +# no-panic and skips the drift assertion. START="${FUZZ_SEED_START:-0}" COUNT="${FUZZ_SEED_COUNT:-5}" @@ -28,8 +19,7 @@ if [ -z "${FUZZ_CHECK_DRIFT:-}" ]; then export SKIP_DRIFT_CHECK=1 fi -# no_drift.sh deploys via readplanarg, which reads READPLAN; the fuzzer doesn't use -# the saved-plan matrix, so deploy once without it (and satisfy the script's set -u). 
+# no_drift.sh reads READPLAN via readplanarg; the fuzzer skips the saved-plan matrix. export READPLAN="" # Emit the schema from the CLI under test so the generator always matches it. diff --git a/acceptance/bundle/invariant/no_drift.sh b/acceptance/bundle/invariant/no_drift.sh index 9b3746a5ac..df0bc319aa 100644 --- a/acceptance/bundle/invariant/no_drift.sh +++ b/acceptance/bundle/invariant/no_drift.sh @@ -52,12 +52,15 @@ echo INPUT_CONFIG_OK # config can deploy yet legitimately differ from the fake server's state, so the # fuzzer sets SKIP_DRIFT_CHECK on runs where only the no-panic invariant is asserted. if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then - # Check both text and JSON plan for no changes - # Note, expect that there maybe more than one resource unchanged + # Check both text and JSON plan for no changes (may be >1 unchanged resource). + # The fuzzer runs this with errexit off and reads the return code, so accumulate + # failures into drift_rc instead of letting the trailing no-panic check reset $?. 
+ drift_rc=0 $CLI bundle plan -o json > LOG.planjson 2>LOG.planjson.err - cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null - verify_no_drift.py LOG.planjson + cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null || drift_rc=1 + verify_no_drift.py LOG.planjson || drift_rc=1 - $CLI bundle plan 2>LOG.plan.err | contains.py '!panic' '!internal error' 'Plan: 0 to add, 0 to change, 0 to delete' > LOG.plan - cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null + $CLI bundle plan 2>LOG.plan.err | contains.py '!panic' '!internal error' 'Plan: 0 to add, 0 to change, 0 to delete' > LOG.plan || drift_rc=1 + cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null || drift_rc=1 + return "$drift_rc" fi From a223206d3d1a52d7a2558ce8fd7dccdae60e6d10 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 30 Jun 2026 09:58:44 +0000 Subject: [PATCH 22/24] acceptance/fuzz: make the fuzzer run any invariant, not just no_drift Extract the migrate invariant body into a shared migrate.sh (mirroring no_drift.sh) and have the fuzzer source ../$FUZZ_INVARIANT.sh so it can exercise any invariant. Wire up FUZZ_INVARIANT=[no_drift, migrate] so the schema fuzzer now also stress-tests the Terraform->direct migration on random configs. The fuzzer's panic scan now globs LOG.* rather than naming LOG.validate/LOG.deploy, since different bodies write different logs. 
--- acceptance/bundle/invariant/README.md | 11 ++- .../bundle/invariant/fuzz/out.test.toml | 1 + acceptance/bundle/invariant/fuzz/script | 22 +++--- acceptance/bundle/invariant/fuzz/test.toml | 4 + acceptance/bundle/invariant/migrate.sh | 73 +++++++++++++++++++ acceptance/bundle/invariant/migrate/script | 39 +--------- 6 files changed, 100 insertions(+), 50 deletions(-) create mode 100644 acceptance/bundle/invariant/migrate.sh diff --git a/acceptance/bundle/invariant/README.md b/acceptance/bundle/invariant/README.md index a3b305f4ef..80cc095b9e 100644 --- a/acceptance/bundle/invariant/README.md +++ b/acceptance/bundle/invariant/README.md @@ -6,8 +6,11 @@ test will dump full JSON plan to the output. In order to add a new test, add a config to configs/ and include it in test.toml. The fuzz/ test instead generates random configs from the live `databricks bundle -schema` (see fuzz/script) and runs each one through the same no_drift.sh check the -no_drift test uses. Since the schema comes from the CLI under test, an unrelated -struct change can shift a seed onto a new config. A failure is a real CLI bug (panic, -internal error, or drift), not flakiness; reproduce with +schema` (see fuzz/script) and runs each one through a shared invariant body. The body +is selected by `FUZZ_INVARIANT` (matrixed in fuzz/test.toml) and is the same +`.sh` the matching curated test sources, so the fuzzer can exercise any +invariant: `no_drift.sh` (deploy + no drift) and `migrate.sh` (Terraform deploy + +migrate to direct + no drift) today. Since the schema comes from the CLI under test, +an unrelated struct change can shift a seed onto a new config. A failure is a real CLI +bug (panic, internal error, or drift), not flakiness; reproduce with `FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz`. 
diff --git a/acceptance/bundle/invariant/fuzz/out.test.toml b/acceptance/bundle/invariant/fuzz/out.test.toml index 789aa10c79..aa67f82bc2 100644 --- a/acceptance/bundle/invariant/fuzz/out.test.toml +++ b/acceptance/bundle/invariant/fuzz/out.test.toml @@ -2,4 +2,5 @@ Local = true Cloud = true RequiresUnityCatalog = true EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.FUZZ_INVARIANT = ["no_drift", "migrate"] EnvMatrix.INPUT_CONFIG = [] diff --git a/acceptance/bundle/invariant/fuzz/script b/acceptance/bundle/invariant/fuzz/script index 634200c03b..1d881073ab 100644 --- a/acceptance/bundle/invariant/fuzz/script +++ b/acceptance/bundle/invariant/fuzz/script @@ -1,12 +1,13 @@ # Invariant: the CLI never panics on a schema-generated config, and a config that # deploys cleanly has no drift. gen_fuzz_config.py produces a random schema-valid -# config; ../no_drift.sh (shared with the no_drift test) does the deploy/drift/destroy -# checks. Output goes to LOG.* so a violation fails via exit code, not diff, letting -# the same test run under any seed window [START, START+COUNT). +# config; the invariant body ../$FUZZ_INVARIANT.sh (shared with the matching curated +# invariant test, e.g. no_drift or migrate) does the deploy/drift/destroy checks. +# Output goes to LOG.* so a violation fails via exit code, not diff, letting the same +# test run under any seed window [START, START+COUNT). # -# A rejected config is not a bug: ../no_drift.sh prints INPUT_CONFIG_OK once a config -# deploys, so a non-zero result before that marker (no panic) is just a rejection, -# while a panic anywhere or a failure after it (drift, destroy) is a real bug. +# A rejected config is not a bug: every invariant body prints INPUT_CONFIG_OK once a +# config deploys, so a non-zero result before that marker (no panic) is just a +# rejection, while a panic anywhere or a failure after it (drift, destroy) is a real bug. 
# # Drift checking is opt-in (FUZZ_CHECK_DRIFT): a random config can deploy yet # legitimately differ from the fake server's state, so the committed run only asserts @@ -37,7 +38,7 @@ for ((offset = 0; offset < COUNT; offset++)); do cd "$dir" gen_fuzz_config.py --schema ../schema.json --seed "$seed" --unique "$UNIQUE_NAME-$seed" --resources "${FUZZ_RESOURCES:-}" > databricks.yml 2>LOG.gen.err cat LOG.gen.err | contains.py '!Traceback' > /dev/null - source "$TESTDIR/../no_drift.sh" + source "$TESTDIR/../${FUZZ_INVARIANT:-no_drift}.sh" ) > "$dir/LOG.check" 2>&1 rc=$? set -e @@ -48,8 +49,11 @@ for ((offset = 0; offset < COUNT; offset++)); do bug="" - # A panic or internal error is a bug even when the CLI then rejects the config. - if ! cat "$dir"/LOG.validate "$dir"/LOG.deploy 2>/dev/null | contains.py '!panic' '!internal error' > /dev/null; then + # A panic or internal error anywhere is a bug even when the CLI then rejects the + # config. Invariant bodies write different LOG.* files (no_drift has LOG.validate, + # migrate has LOG.migrate), so scan whatever this run produced rather than naming + # specific files -- a missing name would otherwise fail the pipe under pipefail. + if ! cat "$dir"/LOG.* 2>/dev/null | contains.py '!panic' '!internal error' > /dev/null; then bug=1 fi diff --git a/acceptance/bundle/invariant/fuzz/test.toml b/acceptance/bundle/invariant/fuzz/test.toml index caed93c23e..ef0f7b4438 100644 --- a/acceptance/bundle/invariant/fuzz/test.toml +++ b/acceptance/bundle/invariant/fuzz/test.toml @@ -1,3 +1,7 @@ # Schema fuzzing (see script). Unlike the curated invariant tests, the fuzzer # generates its own configs, so drop the inherited INPUT_CONFIG matrix. EnvMatrix.INPUT_CONFIG = [] + +# Fuzz each invariant body in ../.sh. no_drift runs on the direct engine; +# migrate ignores it and starts from a Terraform deployment (see migrate.sh). 
+EnvMatrix.FUZZ_INVARIANT = ["no_drift", "migrate"] diff --git a/acceptance/bundle/invariant/migrate.sh b/acceptance/bundle/invariant/migrate.sh new file mode 100644 index 0000000000..00f3948fc4 --- /dev/null +++ b/acceptance/bundle/invariant/migrate.sh @@ -0,0 +1,73 @@ +# Shared invariant body: given a databricks.yml in the current directory, deploy it +# with Terraform, migrate the deployment to the direct engine, and assert there is no +# drift afterwards, with no panics / internal errors along the way. Sourced by +# migrate/script (curated configs) and fuzz/script (random schema-generated configs) +# so the deploy/migrate/drift logic lives in one place. + +# migrate always starts from a Terraform deployment, so drop any engine the caller +# selected (the fuzzer runs the invariant matrix with DATABRICKS_BUNDLE_ENGINE=direct). +unset DATABRICKS_BUNDLE_ENGINE + +cleanup() { + # Only destroy what we deployed. A curated config always deploys, but a random + # fuzzed config may be rejected, and destroying nothing just makes extra API + # calls (which fail the local fake server on unstubbed URLs). + if [ -z "${deployed:-}" ]; then + return + fi + + trace $CLI bundle destroy --auto-approve &> LOG.destroy + cat LOG.destroy | contains.py '!panic:' '!internal error' > /dev/null + + # Run cleanup script if present. The fuzzer has no named INPUT_CONFIG, so guard + # the lookup against the script's `set -u`. + CLEANUP_SCRIPT="$TESTDIR/../configs/${INPUT_CONFIG:-}-cleanup.sh" + if [ -f "$CLEANUP_SCRIPT" ]; then + source "$CLEANUP_SCRIPT" &> LOG.cleanup + fi +} + +trap cleanup EXIT + +trace DATABRICKS_BUNDLE_ENGINE=terraform $CLI bundle deploy &> LOG.deploy +deploy_rc=$? +cat LOG.deploy | contains.py '!panic:' '!internal error' > /dev/null + +# A rejected config didn't deploy, so skip the INPUT_CONFIG_OK marker; otherwise the +# fuzzer reads the failing migrate/drift below as a bug. 
Curated tests run under +# `bash -e` and already aborted above, so this only fires in the fuzzer subshell. +if [ "$deploy_rc" -ne 0 ]; then + return "$deploy_rc" +fi +deployed=1 + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +MIGRATE_ARGS="" +# The terraform provider sorts depends_on entries alphabetically by task_key on Read +# (see terraform-provider-databricks PR #3000). Since depends_on uses TypeList +# (order-sensitive), terraform plan reports positional drift when the bundle config +# specifies depends_on in a different order than the provider's sorted state. +# This is a false positive -- the logical dependencies are identical. +if [[ "${INPUT_CONFIG:-}" == "job_with_depends_on.yml.tmpl" ]]; then + MIGRATE_ARGS="--noplancheck" +fi + +trace $CLI bundle deployment migrate $MIGRATE_ARGS &> LOG.migrate + +cat LOG.migrate | contains.py '!panic:' '!internal error' > /dev/null + +# Drift is the whole point for the curated migrate configs, but a random fuzzed +# config can migrate yet legitimately differ from the fake server's state, so the +# fuzzer sets SKIP_DRIFT_CHECK on runs where only the no-panic invariant is asserted. +if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then + # The fuzzer runs this with errexit off and reads the return code, so accumulate + # failures into drift_rc instead of letting the trailing no-panic check reset $?. 
+ drift_rc=0 + $CLI bundle plan -o json > plan.json 2>plan.json.err + cat plan.json.err | contains.py '!panic:' '!internal error' > /dev/null || drift_rc=1 + verify_no_drift.py plan.json || drift_rc=1 + return "$drift_rc" +fi diff --git a/acceptance/bundle/invariant/migrate/script b/acceptance/bundle/invariant/migrate/script index 78f45faa7d..78eb0630e1 100644 --- a/acceptance/bundle/invariant/migrate/script +++ b/acceptance/bundle/invariant/migrate/script @@ -1,8 +1,6 @@ # Invariant to test: migrate is successful, no drift after deploy # Additional checks: no internal errors / panics in any commands -unset DATABRICKS_BUNDLE_ENGINE - # Copy data files to test directory cp -r "$TESTDIR/../data/." . &> LOG.cp @@ -16,38 +14,5 @@ envsubst < $TESTDIR/../configs/$INPUT_CONFIG > databricks.yml cp databricks.yml LOG.config -cleanup() { - trace $CLI bundle destroy --auto-approve &> LOG.destroy - cat LOG.destroy | contains.py '!panic:' '!internal error' > /dev/null - - # Run cleanup script if present - CLEANUP_SCRIPT="$TESTDIR/../configs/$INPUT_CONFIG-cleanup.sh" - if [ -f "$CLEANUP_SCRIPT" ]; then - source "$CLEANUP_SCRIPT" &> LOG.cleanup - fi -} - -trap cleanup EXIT - -trace DATABRICKS_BUNDLE_ENGINE=terraform $CLI bundle deploy &> LOG.deploy -cat LOG.deploy | contains.py '!panic:' '!internal error' > /dev/null - -echo INPUT_CONFIG_OK - -MIGRATE_ARGS="" -# The terraform provider sorts depends_on entries alphabetically by task_key on Read -# (see terraform-provider-databricks PR #3000). Since depends_on uses TypeList -# (order-sensitive), terraform plan reports positional drift when the bundle config -# specifies depends_on in a different order than the provider's sorted state. -# This is a false positive -- the logical dependencies are identical. 
-if [[ "$INPUT_CONFIG" == "job_with_depends_on.yml.tmpl" ]]; then - MIGRATE_ARGS="--noplancheck" -fi - -trace $CLI bundle deployment migrate $MIGRATE_ARGS &> LOG.migrate - -cat LOG.migrate | contains.py '!panic:' '!internal error' > /dev/null - -$CLI bundle plan -o json > plan.json 2>plan.json.err -cat plan.json.err | contains.py '!panic:' '!internal error' > /dev/null -verify_no_drift.py plan.json +# Migrate then assert no drift. Shared with the fuzz invariant test. +source "$TESTDIR/../migrate.sh" From 71ffdc7b2f23e7d0d1849f6e2a1bc155a592429c Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 30 Jun 2026 12:42:11 +0000 Subject: [PATCH 23/24] testserver: round-trip catalog create payload fields CatalogsCreate only echoed a subset of the create request, so a re-read returned null for connection_name, managed_encryption_settings, and custom_max_retention_hours. Because connection_name is recreate_on_changes (immutable), the schema fuzzer's no_drift invariant saw a perpetual recreate; the others showed as update drift. Persist these fields on create so the re-read matches the deployed config. Also clamp the fuzzer's custom_max_retention_hours to UC-valid values (0 or 168-720 hours) so generated catalog configs deploy. --- acceptance/bin/gen_fuzz_config.py | 4 ++++ libs/testserver/catalogs.go | 34 ++++++++++++++++++------------- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/acceptance/bin/gen_fuzz_config.py b/acceptance/bin/gen_fuzz_config.py index 672f466641..1fa03f0fe2 100755 --- a/acceptance/bin/gen_fuzz_config.py +++ b/acceptance/bin/gen_fuzz_config.py @@ -104,6 +104,10 @@ def gen_scalar(self, schema, name): if t == "boolean": return self.rng.choice([True, False]) if t == "integer": + # The field is in hours, but UC validates it as a window of 0 or 7-30 + # days; only 0 or 168-720 (hours) are accepted. 
+ if name == "custom_max_retention_hours": + return self.rng.choice([0, self.rng.randint(168, 720)]) return self.rng.choice([0, 1, self.rng.randint(2, 1000)]) if t == "number": return round(self.rng.uniform(0, 1000), 2) diff --git a/libs/testserver/catalogs.go b/libs/testserver/catalogs.go index 859721ee73..f76566cb15 100644 --- a/libs/testserver/catalogs.go +++ b/libs/testserver/catalogs.go @@ -20,20 +20,26 @@ func (s *FakeWorkspace) CatalogsCreate(req Request) Response { } catalogInfo := catalog.CatalogInfo{ - Name: createRequest.Name, - Comment: createRequest.Comment, - StorageRoot: createRequest.StorageRoot, - ProviderName: createRequest.ProviderName, - ShareName: createRequest.ShareName, - Options: createRequest.Options, - Properties: createRequest.Properties, - FullName: createRequest.Name, - CreatedAt: nowMilli(), - CreatedBy: s.CurrentUser().UserName, - UpdatedBy: s.CurrentUser().UserName, - MetastoreId: nextUUID(), - Owner: s.CurrentUser().UserName, - CatalogType: catalog.CatalogTypeManagedCatalog, + Name: createRequest.Name, + Comment: createRequest.Comment, + // Round-trip the remaining create-request fields so a re-read matches the + // deployed config. Dropping them made connection_name (recreate_on_changes) + // re-plan as a perpetual recreate and managed_encryption_settings as drift. 
+ StorageRoot: createRequest.StorageRoot, + ProviderName: createRequest.ProviderName, + ShareName: createRequest.ShareName, + ConnectionName: createRequest.ConnectionName, + ManagedEncryptionSettings: createRequest.ManagedEncryptionSettings, + CustomMaxRetentionHours: createRequest.CustomMaxRetentionHours, + Options: createRequest.Options, + Properties: createRequest.Properties, + FullName: createRequest.Name, + CreatedAt: nowMilli(), + CreatedBy: s.CurrentUser().UserName, + UpdatedBy: s.CurrentUser().UserName, + MetastoreId: nextUUID(), + Owner: s.CurrentUser().UserName, + CatalogType: catalog.CatalogTypeManagedCatalog, } catalogInfo.UpdatedAt = catalogInfo.CreatedAt From 898efb795ce2ba5dbdf2c33d9dad6199e9b33879 Mon Sep 17 00:00:00 2001 From: Rada Kamysheva Date: Tue, 30 Jun 2026 14:00:50 +0000 Subject: [PATCH 24/24] acceptance/fuzz: add redeploy, canonical, update, destroy_recreate invariants Broaden the fuzz invariant matrix beyond no_drift/migrate with four more schema-driven invariant bodies, each selectable via FUZZ_INVARIANT and following the existing INPUT_CONFIG_OK / SKIP_DRIFT_CHECK contract: - redeploy.sh: deploy twice; the second deploy must be a clean no-op, which exercises the write path twice and catches create handlers that don't round-trip their inputs. - canonical.sh: `bundle validate -o json` must be byte-identical across two runs; guards against nondeterministic serialization. Cloud-independent, so it always runs (not gated behind SKIP_DRIFT_CHECK). - update.sh: edit a comment/description and assert the redeploy is an in-place update (not a recreate) that converges with no drift. Configs without an editable field are skipped before the marker (treated as a rejection). - destroy_recreate.sh: deploy then destroy; a re-plan must want to create everything again, proving destroy left no orphaned state. 
Add two stdlib-only helpers: edit_fuzz_config.py (flips one comment/description scalar via a line match, no YAML dependency) and verify_plan_action.py (asserts a plan shows the expected action, mirroring bundle/deployplan/action.go). --- acceptance/bin/edit_fuzz_config.py | 57 ++++++++++++ acceptance/bin/verify_plan_action.py | 64 +++++++++++++ acceptance/bundle/invariant/README.md | 15 +++- acceptance/bundle/invariant/canonical.sh | 30 +++++++ .../bundle/invariant/destroy_recreate.sh | 76 ++++++++++++++++ .../bundle/invariant/fuzz/out.test.toml | 9 +- acceptance/bundle/invariant/fuzz/test.toml | 7 +- acceptance/bundle/invariant/redeploy.sh | 77 ++++++++++++++++ acceptance/bundle/invariant/update.sh | 90 +++++++++++++++++++ 9 files changed, 418 insertions(+), 7 deletions(-) create mode 100755 acceptance/bin/edit_fuzz_config.py create mode 100755 acceptance/bin/verify_plan_action.py create mode 100644 acceptance/bundle/invariant/canonical.sh create mode 100644 acceptance/bundle/invariant/destroy_recreate.sh create mode 100644 acceptance/bundle/invariant/redeploy.sh create mode 100644 acceptance/bundle/invariant/update.sh diff --git a/acceptance/bin/edit_fuzz_config.py b/acceptance/bin/edit_fuzz_config.py new file mode 100755 index 0000000000..ef7eefe605 --- /dev/null +++ b/acceptance/bin/edit_fuzz_config.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +""" +Edit one updatable field in a generated databricks.yml in place, for the `update` +invariant. It targets a `comment` or `description` scalar -- plain string fields the +update API accepts across resource types -- so a redeploy issues an in-place update +rather than a recreate. + +gen_fuzz_config.py emits every scalar on its own line as `key: "value"`, so a line +match is enough and avoids a YAML dependency.
+ + edit_fuzz_config.py PATH edit in place; exit 1 if no editable field + edit_fuzz_config.py PATH --detect exit 0 if an editable field exists, else 1 +""" + +import argparse +import re +import sys + +# Allow an optional "- " so a comment/description that is the first key of a list-item +# dict still matches; the captured prefix is preserved verbatim on rewrite. +FIELD_RE = re.compile(r'^(\s*(?:- )?)(comment|description): (".*")\s*$') + +NEW_VALUE = '"fuzz_edited_value"' + + +def find_line(lines): + for i, line in enumerate(lines): + m = FIELD_RE.match(line) + if m: + return i, m + return -1, None + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("path") + parser.add_argument("--detect", action="store_true", help="only check, don't edit") + args = parser.parse_args() + + with open(args.path) as f: + lines = f.readlines() + + i, m = find_line(lines) + if m is None: + sys.exit(1) + if args.detect: + return + + prefix, key, _ = m.groups() + lines[i] = f"{prefix}{key}: {NEW_VALUE}\n" + with open(args.path, "w") as f: + f.writelines(lines) + sys.stderr.write(f"edited {key} at line {i + 1}\n") + + +if __name__ == "__main__": + main() diff --git a/acceptance/bin/verify_plan_action.py b/acceptance/bin/verify_plan_action.py new file mode 100755 index 0000000000..3b706066ba --- /dev/null +++ b/acceptance/bin/verify_plan_action.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Check that a `bundle plan -o json` shows an expected action, for invariants beyond +no-drift. + + verify_plan_action.py PATH update every changed resource is an in-place update + (not a recreate) and at least one changed + verify_plan_action.py PATH create every resource is a create (e.g. a re-plan + after destroy must recreate everything) + +Action vocabulary mirrors bundle/deployplan/action.go. +""" + +import json +import sys + +# update_id/resize keep the resource (no recreate), so they count as in-place updates. 
+ALLOWED = { + "update": {"update", "update_id", "resize"}, + "create": {"create"}, +} +# After a destroy, a "skip" means the resource survived (orphaned state), so skip is +# only tolerated for the update check, where unrelated siblings may be unchanged. +SKIP_OK = {"update": True, "create": False} + + +def main(): + path, expected = sys.argv[1], sys.argv[2] + allowed = ALLOWED[expected] + skip_ok = SKIP_OK[expected] + + with open(path) as fobj: + raw = fobj.read() + + if not raw.strip(): + sys.exit(f"{path}: empty plan output (bundle plan failed)") + try: + data = json.loads(raw) + except json.JSONDecodeError as e: + sys.exit(f"{path}: invalid plan JSON: {e}\n{raw}") + + matched = 0 + bad = 0 + for key, value in data["plan"].items(): + action = value.get("action") + if action == "skip" and skip_ok: + continue + if action in allowed: + matched += 1 + else: + print(f"Unexpected {action=} for {key} (expected {expected})") + bad += 1 + + if matched == 0: + print(f"plan shows no {expected} action; expected at least one") + bad += 1 + + if bad: + print(raw, flush=True) + sys.exit(10) + + +if __name__ == "__main__": + main() diff --git a/acceptance/bundle/invariant/README.md b/acceptance/bundle/invariant/README.md index 80cc095b9e..92eabd5ef9 100644 --- a/acceptance/bundle/invariant/README.md +++ b/acceptance/bundle/invariant/README.md @@ -7,10 +7,17 @@ In order to add a new test, add a config to configs/ and include it in test.toml The fuzz/ test instead generates random configs from the live `databricks bundle schema` (see fuzz/script) and runs each one through a shared invariant body. The body -is selected by `FUZZ_INVARIANT` (matrixed in fuzz/test.toml) and is the same -`.sh` the matching curated test sources, so the fuzzer can exercise any -invariant: `no_drift.sh` (deploy + no drift) and `migrate.sh` (Terraform deploy + -migrate to direct + no drift) today. 
Since the schema comes from the CLI under test, +is selected by `FUZZ_INVARIANT` (matrixed in fuzz/test.toml) and is a `$FUZZ_INVARIANT.sh` +body, so the fuzzer can exercise any invariant: + +- `no_drift.sh` -- deploy, then no drift +- `migrate.sh` -- Terraform deploy, migrate to direct, then no drift +- `redeploy.sh` -- deploy twice; the second deploy must be a no-op +- `canonical.sh` -- `validate -o json` must be byte-identical across two runs +- `update.sh` -- edit a comment/description; the redeploy must update in place (not recreate) +- `destroy_recreate.sh` -- deploy then destroy; a re-plan must recreate everything + +`no_drift.sh` and `migrate.sh` are also sourced by their matching curated tests. Since the schema comes from the CLI under test, an unrelated struct change can shift a seed onto a new config. A failure is a real CLI bug (panic, internal error, or drift), not flakiness; reproduce with `FUZZ_SEED_START= FUZZ_SEED_COUNT=1 task test-fuzz`. diff --git a/acceptance/bundle/invariant/canonical.sh b/acceptance/bundle/invariant/canonical.sh new file mode 100644 index 0000000000..1cb9920d8f --- /dev/null +++ b/acceptance/bundle/invariant/canonical.sh @@ -0,0 +1,30 @@ +# Shared invariant body: given a databricks.yml in the current directory, assert that +# `bundle validate -o json` is deterministic -- two runs on the same config must +# produce byte-identical output. Catches nondeterministic map ordering or other +# unstable serialization in config loading/resolution. There is no deploy, so no +# cleanup/destroy and no cloud state. Sourced by fuzz/script (random configs). + +$CLI bundle validate -o json > validate1.json 2>LOG.validate1.err +validate_rc=$? +cat LOG.validate1.err | contains.py '!panic' '!internal error' > /dev/null + +# A rejected config didn't validate; that's not a bug, just an invalid fuzz config, so +# skip the INPUT_CONFIG_OK marker. Curated tests run under `bash -e` and already +# aborted above, so this only fires in the fuzzer subshell.
+if [ "$validate_rc" -ne 0 ]; then + return "$validate_rc" +fi + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +$CLI bundle validate -o json > validate2.json 2>LOG.validate2.err +cat LOG.validate2.err | contains.py '!panic' '!internal error' > /dev/null + +# Determinism is cloud-independent and cheap, so unlike drift it always runs (no +# SKIP_DRIFT_CHECK gate): identical input must yield identical output regardless of the +# seed window. A diff here is a real bug, not a fake-server limitation. +diff_rc=0 +diff validate1.json validate2.json > LOG.validate.diff || diff_rc=1 +return "$diff_rc" diff --git a/acceptance/bundle/invariant/destroy_recreate.sh b/acceptance/bundle/invariant/destroy_recreate.sh new file mode 100644 index 0000000000..6e805f33a5 --- /dev/null +++ b/acceptance/bundle/invariant/destroy_recreate.sh @@ -0,0 +1,76 @@ +# Shared invariant body: given a databricks.yml in the current directory, deploy it, +# destroy it, and assert a re-plan wants to CREATE every resource again -- proving the +# destroy cleared all tracked state with nothing orphaned. A resource that destroy +# forgets to remove from state shows up here as a "skip" (still considered present), +# which is a bug. Sourced by fuzz/script (random configs). + +# We redirect output rather than record it because some configs that are being tested may produce warnings +trace $CLI bundle validate &> LOG.validate + +cat LOG.validate | contains.py '!panic' '!internal error' > /dev/null + +cleanup() { + # Only destroy what we deployed. The body destroys on the happy path and clears + # `deployed`, so this trap only fires when deploy or destroy failed partway. 
+ if [ -z "${deployed:-}" ]; then + return + fi + + trace $CLI bundle destroy --auto-approve &> LOG.destroy_cleanup + cat LOG.destroy_cleanup | contains.py '!panic' '!internal error' > /dev/null + + # Run cleanup script if present. The fuzzer has no named INPUT_CONFIG, so guard + # the lookup against the script's `set -u`. + CLEANUP_SCRIPT="$TESTDIR/../configs/${INPUT_CONFIG:-}-cleanup.sh" + if [ -f "$CLEANUP_SCRIPT" ]; then + source "$CLEANUP_SCRIPT" &> LOG.cleanup + fi +} + +trap cleanup EXIT + +$CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err +cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null + +trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy +deploy_rc=$? +cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null + +# A rejected config didn't deploy, so skip the INPUT_CONFIG_OK marker; otherwise the +# fuzzer reads the destroy/recreate below as a bug. Curated tests run under `bash -e` +# and already aborted above, so this only fires in the fuzzer subshell. +if [ "$deploy_rc" -ne 0 ]; then + return "$deploy_rc" +fi +deployed=1 + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +# Destroy unconditionally so any panic lands in LOG.destroy for the harness post-scan; +# whether the destroy was complete (re-plan recreates everything) is gated below. +trace $CLI bundle destroy --auto-approve &> LOG.destroy +destroy_rc=$? +cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null + +# On a clean destroy nothing remains, so stop the trap from destroying again (which +# would just make unstubbed API calls against the fake server). 
+if [ "$destroy_rc" -eq 0 ]; then + deployed="" +fi + +# A random fuzzed config can deploy yet legitimately leave fake-server state that the +# re-plan reads differently, so the fuzzer sets SKIP_DRIFT_CHECK on runs where only the +# no-panic invariant is asserted. +if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then + # The fuzzer runs this with errexit off and reads the return code, so accumulate + # failures into recreate_rc instead of letting the trailing no-panic check reset $?. + recreate_rc=0 + [ "$destroy_rc" -eq 0 ] || recreate_rc=1 + + $CLI bundle plan -o json > LOG.recreate_plan.json 2>LOG.recreate_plan.err + cat LOG.recreate_plan.err | contains.py '!panic' '!internal error' > /dev/null || recreate_rc=1 + verify_plan_action.py LOG.recreate_plan.json create || recreate_rc=1 + return "$recreate_rc" +fi diff --git a/acceptance/bundle/invariant/fuzz/out.test.toml b/acceptance/bundle/invariant/fuzz/out.test.toml index aa67f82bc2..611343d30c 100644 --- a/acceptance/bundle/invariant/fuzz/out.test.toml +++ b/acceptance/bundle/invariant/fuzz/out.test.toml @@ -2,5 +2,12 @@ Local = true Cloud = true RequiresUnityCatalog = true EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] -EnvMatrix.FUZZ_INVARIANT = ["no_drift", "migrate"] +EnvMatrix.FUZZ_INVARIANT = [ + "no_drift", + "migrate", + "redeploy", + "canonical", + "update", + "destroy_recreate" +] EnvMatrix.INPUT_CONFIG = [] diff --git a/acceptance/bundle/invariant/fuzz/test.toml b/acceptance/bundle/invariant/fuzz/test.toml index ef0f7b4438..4ca0c1adee 100644 --- a/acceptance/bundle/invariant/fuzz/test.toml +++ b/acceptance/bundle/invariant/fuzz/test.toml @@ -3,5 +3,8 @@ EnvMatrix.INPUT_CONFIG = [] # Fuzz each invariant body in ../.sh. no_drift runs on the direct engine; -# migrate ignores it and starts from a Terraform deployment (see migrate.sh). -EnvMatrix.FUZZ_INVARIANT = ["no_drift", "migrate"] +# migrate ignores it and starts from a Terraform deployment (see migrate.sh). 
The +# others deploy on the direct engine and check a different property: redeploy is a +# no-op, canonical is determinism of `validate -o json`, update edits a field and +# expects an in-place update, destroy_recreate expects a re-plan to recreate everything. +EnvMatrix.FUZZ_INVARIANT = ["no_drift", "migrate", "redeploy", "canonical", "update", "destroy_recreate"] diff --git a/acceptance/bundle/invariant/redeploy.sh b/acceptance/bundle/invariant/redeploy.sh new file mode 100644 index 0000000000..3fe561f1d3 --- /dev/null +++ b/acceptance/bundle/invariant/redeploy.sh @@ -0,0 +1,77 @@ +# Shared invariant body: given a databricks.yml in the current directory, deploy it, +# then deploy it a SECOND time, and assert the redeploy is a clean no-op (no drift) +# with no panics / internal errors along the way. The distinguishing check vs no_drift +# is the second deploy: a create handler that doesn't round-trip its inputs (or a +# mutator that re-derives a field) surfaces here as a redeploy that wants to change or +# recreate an already-deployed resource. Sourced by fuzz/script (random configs). + +# We redirect output rather than record it because some configs that are being tested may produce warnings +trace $CLI bundle validate &> LOG.validate + +cat LOG.validate | contains.py '!panic' '!internal error' > /dev/null + +cleanup() { + # Only destroy what we deployed. A curated config always deploys, but a random + # fuzzed config may be rejected, and destroying nothing just makes extra API + # calls (which fail the local fake server on unstubbed URLs). + if [ -z "${deployed:-}" ]; then + return + fi + + trace $CLI bundle destroy --auto-approve &> LOG.destroy + cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null + + # Run cleanup script if present. The fuzzer has no named INPUT_CONFIG, so guard + # the lookup against the script's `set -u`. 
+ CLEANUP_SCRIPT="$TESTDIR/../configs/${INPUT_CONFIG:-}-cleanup.sh" + if [ -f "$CLEANUP_SCRIPT" ]; then + source "$CLEANUP_SCRIPT" &> LOG.cleanup + fi +} + +trap cleanup EXIT + +$CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err +cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null + +trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy +deploy_rc=$? +cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null + +# A rejected config didn't deploy, so skip the INPUT_CONFIG_OK marker; otherwise the +# fuzzer reads the redeploy/drift below as a bug. Curated tests run under `bash -e` +# and already aborted above, so this only fires in the fuzzer subshell. +if [ "$deploy_rc" -ne 0 ]; then + return "$deploy_rc" +fi +deployed=1 + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +# Deploy again on the same config. Run it unconditionally so any panic lands in +# LOG.redeploy for the harness post-scan; whether it converges (success + no drift) is +# part of the drift-class check, gated below. +trace $CLI bundle deploy &> LOG.redeploy +redeploy_rc=$? +cat LOG.redeploy | contains.py '!panic' '!internal error' > /dev/null + +# A random fuzzed config can deploy yet legitimately fail to redeploy or differ from +# the fake server's state, so the fuzzer sets SKIP_DRIFT_CHECK on runs where only the +# no-panic invariant is asserted. +if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then + # The fuzzer runs this with errexit off and reads the return code, so accumulate + # failures into drift_rc instead of letting the trailing no-panic check reset $?. + drift_rc=0 + [ "$redeploy_rc" -eq 0 ] || drift_rc=1 + + # Check both text and JSON plan for no changes (may be >1 unchanged resource). 
+ $CLI bundle plan -o json > LOG.planjson 2>LOG.planjson.err + cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null || drift_rc=1 + verify_no_drift.py LOG.planjson || drift_rc=1 + + $CLI bundle plan 2>LOG.plan.err | contains.py '!panic' '!internal error' 'Plan: 0 to add, 0 to change, 0 to delete' > LOG.plan || drift_rc=1 + cat LOG.plan.err | contains.py '!panic' '!internal error' > /dev/null || drift_rc=1 + return "$drift_rc" +fi diff --git a/acceptance/bundle/invariant/update.sh b/acceptance/bundle/invariant/update.sh new file mode 100644 index 0000000000..531115b561 --- /dev/null +++ b/acceptance/bundle/invariant/update.sh @@ -0,0 +1,90 @@ +# Shared invariant body: given a databricks.yml in the current directory, deploy it, +# edit one updatable field (a comment/description), and assert the redeploy issues an +# in-place update -- not a recreate -- and leaves no drift. This exercises the update +# (PATCH) path that create-only deploys never touch; a resource whose update path is +# missing or buggy shows up here as a recreate, a spurious unrelated change, or drift. +# Sourced by fuzz/script (random configs). + +# The update invariant only applies to configs with an editable comment/description +# field. A random config without one isn't a bug, so skip it before deploying (no +# INPUT_CONFIG_OK marker, so the fuzzer treats it as a rejection). +if ! edit_fuzz_config.py databricks.yml --detect 2>LOG.detect.err; then + return 0 +fi + +# We redirect output rather than record it because some configs that are being tested may produce warnings +trace $CLI bundle validate &> LOG.validate + +cat LOG.validate | contains.py '!panic' '!internal error' > /dev/null + +cleanup() { + # Only destroy what we deployed. A curated config always deploys, but a random + # fuzzed config may be rejected, and destroying nothing just makes extra API + # calls (which fail the local fake server on unstubbed URLs). 
+ if [ -z "${deployed:-}" ]; then + return + fi + + trace $CLI bundle destroy --auto-approve &> LOG.destroy + cat LOG.destroy | contains.py '!panic' '!internal error' > /dev/null + + # Run cleanup script if present. The fuzzer has no named INPUT_CONFIG, so guard + # the lookup against the script's `set -u`. + CLEANUP_SCRIPT="$TESTDIR/../configs/${INPUT_CONFIG:-}-cleanup.sh" + if [ -f "$CLEANUP_SCRIPT" ]; then + source "$CLEANUP_SCRIPT" &> LOG.cleanup + fi +} + +trap cleanup EXIT + +$CLI bundle plan -o json > plan.json 2>LOG.plan_initial.err +cat LOG.plan_initial.err | contains.py '!panic' '!internal error' > /dev/null + +trace $CLI bundle deploy $(readplanarg plan.json) &> LOG.deploy +deploy_rc=$? +cat LOG.deploy | contains.py '!panic' '!internal error' > /dev/null + +# A rejected config didn't deploy, so skip the INPUT_CONFIG_OK marker; otherwise the +# fuzzer reads the update/drift below as a bug. Curated tests run under `bash -e` and +# already aborted above, so this only fires in the fuzzer subshell. +if [ "$deploy_rc" -ne 0 ]; then + return "$deploy_rc" +fi +deployed=1 + +# Special message to fuzzer that generated config was fine. +# Any failures after this point will be considered as "bug detected" by fuzzer. +echo INPUT_CONFIG_OK + +# Change the comment/description and re-plan: this plan must show an in-place update. +edit_fuzz_config.py databricks.yml 2>LOG.edit.err +cat LOG.edit.err | contains.py '!Traceback' > /dev/null + +$CLI bundle plan -o json > LOG.update_plan.json 2>LOG.update_plan.err +cat LOG.update_plan.err | contains.py '!panic' '!internal error' > /dev/null + +# Apply the edit. Run it unconditionally so any panic lands in LOG.redeploy for the +# harness post-scan; whether the update is in-place and converges is gated below. +trace $CLI bundle deploy &> LOG.redeploy +redeploy_rc=$? 
+cat LOG.redeploy | contains.py '!panic' '!internal error' > /dev/null + +# A random fuzzed config can deploy yet legitimately differ from the fake server's +# state on update, so the fuzzer sets SKIP_DRIFT_CHECK on runs where only the no-panic +# invariant is asserted. +if [ -z "${SKIP_DRIFT_CHECK:-}" ]; then + # The fuzzer runs this with errexit off and reads the return code, so accumulate + # failures into update_rc instead of letting the trailing no-panic check reset $?. + update_rc=0 + [ "$redeploy_rc" -eq 0 ] || update_rc=1 + + # The edit must update in place, not recreate. + verify_plan_action.py LOG.update_plan.json update || update_rc=1 + + # And the applied update must converge: a re-plan shows no further changes. + $CLI bundle plan -o json > LOG.planjson 2>LOG.planjson.err + cat LOG.planjson.err | contains.py '!panic' '!internal error' > /dev/null || update_rc=1 + verify_no_drift.py LOG.planjson || update_rc=1 + return "$update_rc" +fi