From 36c3feb8d995a592dc45aed71950cd10243d8f08 Mon Sep 17 00:00:00 2001 From: Charles Lowell <10964656+chlowell@users.noreply.github.com> Date: Fri, 20 Mar 2026 07:33:22 -0700 Subject: [PATCH 1/2] `run --output-dir` groups files by timestamp --- README.md | 1 + cmd/waza/cmd_run.go | 21 ++++-- cmd/waza/cmd_run_test.go | 88 ++++++++++++++++++++----- site/src/content/docs/reference/cli.mdx | 2 +- 4 files changed, 87 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 632b2f4c..f81c874d 100644 --- a/README.md +++ b/README.md @@ -252,6 +252,7 @@ Run an evaluation benchmark from a spec file. |------|-------|-------------| | `--context-dir ` | | Fixture directory (default: `./fixtures` relative to spec) | | `--output ` | `-o` | Save results to JSON | +| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. | | `--verbose` | `-v` | Detailed progress output | | `--transcript-dir ` | | Save per-task transcript JSON files | | `--task ` | | Filter tasks by name/ID pattern (repeatable) | diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go index ff440c86..88898ce3 100644 --- a/cmd/waza/cmd_run.go +++ b/cmd/waza/cmd_run.go @@ -98,7 +98,7 @@ You can also specify a skill name to run its eval: cmd.Flags().StringVar(&contextDir, "context-dir", "", "Context directory for fixtures (default: ./fixtures relative to spec)") cmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output JSON file for results") - cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output (mutually exclusive with --output)") + cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output; each run creates a UTC-timestamped subdirectory. 
Mutually exclusive with --output.") cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output with detailed progress") cmd.Flags().StringVar(&transcriptDir, "transcript-dir", "", "Directory to save per-task transcript JSON files") cmd.Flags().StringArrayVar(&taskFilters, "task", nil, "Filter tasks by name/ID glob pattern (can be repeated).") @@ -1488,10 +1488,17 @@ func saveSummary(summary *models.MultiSkillSummary, path string) error { } // writeOutputDir writes results to a structured directory hierarchy. -// For multi-skill runs: {outputDir}/{skillName}/{modelName}.json -// For single-skill runs: {outputDir}/{modelName}.json +// Each run creates a timestamped subdirectory to avoid overwriting previous results. +// For multi-skill runs: {outputDir}/{timestamp}/{skillName}/{modelName}.json +// For single-skill runs: {outputDir}/{timestamp}/{modelName}.json func writeOutputDir(dir string, results []skillRunResult) error { - if err := os.MkdirAll(dir, 0755); err != nil { + return writeOutputDirAt(dir, results, time.Now()) +} + +// writeOutputDirAt is the testable core of writeOutputDir, accepting a timestamp. 
+func writeOutputDirAt(dir string, results []skillRunResult, now time.Time) error { + runDir := filepath.Join(dir, now.UTC().Format("2006-01-02T150405.000")) + if err := os.MkdirAll(runDir, 0755); err != nil { return fmt.Errorf("create output directory: %w", err) } @@ -1506,16 +1513,16 @@ func writeOutputDir(dir string, results []skillRunResult) error { var outPath string if multiSkill { // Multi-skill: create skill subdirectory - skillDir := filepath.Join(dir, sanitizePathSegment(skillResult.skillName)) + skillDir := filepath.Join(runDir, sanitizePathSegment(skillResult.skillName)) if err := os.MkdirAll(skillDir, 0755); err != nil { return fmt.Errorf("create skill directory %s: %w", skillDir, err) } modelFile := sanitizePathSegment(mr.modelID) + ".json" outPath = filepath.Join(skillDir, modelFile) } else { - // Single-skill: write directly to output dir + // Single-skill: write directly to run dir modelFile := sanitizePathSegment(mr.modelID) + ".json" - outPath = filepath.Join(dir, modelFile) + outPath = filepath.Join(runDir, modelFile) } if err := saveOutcome(mr.outcome, outPath); err != nil { diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go index 16841f4e..ea0f6f8b 100644 --- a/cmd/waza/cmd_run_test.go +++ b/cmd/waza/cmd_run_test.go @@ -1928,16 +1928,30 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) { err := cmd.Execute() require.NoError(t, err) - // Verify output directory was created with a result JSON file + // Find a timestamped subdirectory (e.g. 
2025-06-15T103045.000) entries, err := os.ReadDir(outDir) require.NoError(t, err) - require.NotEmpty(t, entries, "expected output files in --output-dir") - // Find and validate the JSON result file - var found bool + var runDir string for _, e := range entries { + if e.IsDir() && len(e.Name()) >= len("2006-01-02T150405") { + if _, terr := time.Parse("2006-01-02T150405", e.Name()[:len("2006-01-02T150405")]); terr == nil { + runDir = filepath.Join(outDir, e.Name()) + break + } + } + } + require.NotEmpty(t, runDir, "expected a timestamped subdirectory in --output-dir") + + // Find and validate the JSON result file inside the timestamped subdirectory + runEntries, err := os.ReadDir(runDir) + require.NoError(t, err) + require.NotEmpty(t, runEntries, "expected output files in run subdirectory") + + var found bool + for _, e := range runEntries { if filepath.Ext(e.Name()) == ".json" { - data, err := os.ReadFile(filepath.Join(outDir, e.Name())) + data, err := os.ReadFile(filepath.Join(runDir, e.Name())) require.NoError(t, err) var outcome models.EvaluationOutcome require.NoError(t, json.Unmarshal(data, &outcome)) @@ -1945,11 +1959,12 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) { found = true } } - assert.True(t, found, "expected at least one .json result in output dir") + assert.True(t, found, "expected at least one .json result in run subdirectory") } func TestWriteOutputDir_SingleSkill(t *testing.T) { dir := t.TempDir() + fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC) // Single skill with single model results := []skillRunResult{ @@ -1969,11 +1984,14 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) { }, } - err := writeOutputDir(dir, results) + err := writeOutputDirAt(dir, results, fixedTime) require.NoError(t, err) - // Single-skill mode: files written directly to output dir - resultPath := filepath.Join(dir, "gpt-4o.json") + // Single-skill mode: files written inside timestamped run directory + runDir := filepath.Join(dir, 
"2025-06-15T103045.000") + assert.DirExists(t, runDir) + + resultPath := filepath.Join(runDir, "gpt-4o.json") assert.FileExists(t, resultPath) // Verify JSON content @@ -1987,6 +2005,7 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) { func TestWriteOutputDir_MultiSkill(t *testing.T) { dir := t.TempDir() + fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC) // Multi-skill with multiple models results := []skillRunResult{ @@ -2020,12 +2039,13 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) { }, } - err := writeOutputDir(dir, results) + err := writeOutputDirAt(dir, results, fixedTime) require.NoError(t, err) - // Multi-skill mode: subdirectories created per skill - explainerDir := filepath.Join(dir, "code-explainer") - reviewerDir := filepath.Join(dir, "code-reviewer") + // Multi-skill mode: timestamped run dir with skill subdirectories + runDir := filepath.Join(dir, "2025-06-15T103045.000") + explainerDir := filepath.Join(runDir, "code-explainer") + reviewerDir := filepath.Join(runDir, "code-reviewer") assert.DirExists(t, explainerDir) assert.DirExists(t, reviewerDir) @@ -2046,6 +2066,7 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) { func TestWriteOutputDir_SanitizesPaths(t *testing.T) { dir := t.TempDir() + fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC) // Skill and model names with special chars // Multi-skill to test subdirectory creation @@ -2074,18 +2095,51 @@ func TestWriteOutputDir_SanitizesPaths(t *testing.T) { }, } - err := writeOutputDir(dir, results) + err := writeOutputDirAt(dir, results, fixedTime) require.NoError(t, err) - // Paths should be sanitized - explainerDir := filepath.Join(dir, "code-explainer") - reviewerDir := filepath.Join(dir, "code-reviewer") + // Paths should be sanitized, inside timestamped run dir + runDir := filepath.Join(dir, "2025-06-15T103045.000") + explainerDir := filepath.Join(runDir, "code-explainer") + reviewerDir := filepath.Join(runDir, "code-reviewer") assert.DirExists(t, explainerDir) 
assert.DirExists(t, reviewerDir) assert.FileExists(t, filepath.Join(explainerDir, "gpt-4o-latest.json")) assert.FileExists(t, filepath.Join(reviewerDir, "claude-sonnet.json")) } +func TestWriteOutputDir_RepeatRunsDoNotCollide(t *testing.T) { + dir := t.TempDir() + + results := []skillRunResult{ + { + skillName: "code-explainer", + outcomes: []modelResult{ + { + modelID: "gpt-4o", + outcome: &models.EvaluationOutcome{ + Digest: models.OutcomeDigest{TotalTests: 3, Succeeded: 2}, + }, + }, + }, + }, + } + + t1 := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC) + t2 := time.Date(2025, 6, 15, 11, 0, 0, 0, time.UTC) + + require.NoError(t, writeOutputDirAt(dir, results, t1)) + require.NoError(t, writeOutputDirAt(dir, results, t2)) + + // Both timestamped subdirectories should exist independently + entries, err := os.ReadDir(dir) + require.NoError(t, err) + require.Len(t, entries, 2, "expected two timestamped subdirectories") + + assert.FileExists(t, filepath.Join(dir, "2025-06-15T103045.000", "gpt-4o.json")) + assert.FileExists(t, filepath.Join(dir, "2025-06-15T110000.000", "gpt-4o.json")) +} + // --------------------------------------------------------------------------- // .waza.yaml config defaults // --------------------------------------------------------------------------- diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx index f395ce3a..ee73af66 100644 --- a/site/src/content/docs/reference/cli.mdx +++ b/site/src/content/docs/reference/cli.mdx @@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name] |------|-------|------|---------|-------------| | `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path | | `--output` | `-o` | string | | Save results JSON to file | -| `--output-dir` | `-d` | string | | Save output artifacts to directory | +| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). 
Mutually exclusive with `--output`. | | `--verbose` | `-v` | bool | false | Detailed progress output | | `--parallel` | | bool | false | Run tasks concurrently | | `--workers` | `-w` | int | 4 | Number of concurrent workers | From ab613eb68a50a9ed6e3fa85f6220dbfc3da59838 Mon Sep 17 00:00:00 2001 From: Charles Lowell <10964656+chlowell@users.noreply.github.com> Date: Fri, 20 Mar 2026 10:17:58 -0700 Subject: [PATCH 2/2] tweak the docs --- README.md | 2 +- site/src/content/docs/reference/cli.mdx | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f81c874d..9fd6d1d8 100644 --- a/README.md +++ b/README.md @@ -252,7 +252,7 @@ Run an evaluation benchmark from a spec file. |------|-------|-------------| | `--context-dir ` | | Fixture directory (default: `./fixtures` relative to spec) | | `--output ` | `-o` | Save results to JSON | -| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. | +| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. 
| | `--verbose` | `-v` | Detailed progress output | | `--transcript-dir ` | | Save per-task transcript JSON files | | `--task ` | | Filter tasks by name/ID pattern (repeatable) | diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx index ee73af66..0fd56f1d 100644 --- a/site/src/content/docs/reference/cli.mdx +++ b/site/src/content/docs/reference/cli.mdx @@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name] |------|-------|------|---------|-------------| | `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path | | `--output` | `-o` | string | | Save results JSON to file | -| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. | +| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. | | `--verbose` | `-v` | bool | false | Detailed progress output | | `--parallel` | | bool | false | Run tasks concurrently | | `--workers` | `-w` | int | 4 | Number of concurrent workers |