From 36c3feb8d995a592dc45aed71950cd10243d8f08 Mon Sep 17 00:00:00 2001
From: Charles Lowell <10964656+chlowell@users.noreply.github.com>
Date: Fri, 20 Mar 2026 07:33:22 -0700
Subject: [PATCH 1/2] `run --output-dir` groups files by timestamp
---
README.md | 1 +
cmd/waza/cmd_run.go | 21 ++++--
cmd/waza/cmd_run_test.go | 88 ++++++++++++++++++++-----
site/src/content/docs/reference/cli.mdx | 2 +-
4 files changed, 87 insertions(+), 25 deletions(-)
diff --git a/README.md b/README.md
index 632b2f4c..f81c874d 100644
--- a/README.md
+++ b/README.md
@@ -252,6 +252,7 @@ Run an evaluation benchmark from a spec file.
|------|-------|-------------|
| `--context-dir ` | | Fixture directory (default: `./fixtures` relative to spec) |
| `--output ` | `-o` | Save results to JSON |
+| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
| `--verbose` | `-v` | Detailed progress output |
| `--transcript-dir ` | | Save per-task transcript JSON files |
| `--task ` | | Filter tasks by name/ID pattern (repeatable) |
diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go
index ff440c86..88898ce3 100644
--- a/cmd/waza/cmd_run.go
+++ b/cmd/waza/cmd_run.go
@@ -98,7 +98,7 @@ You can also specify a skill name to run its eval:
cmd.Flags().StringVar(&contextDir, "context-dir", "", "Context directory for fixtures (default: ./fixtures relative to spec)")
cmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output JSON file for results")
- cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output (mutually exclusive with --output)")
+ cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output; each run creates a UTC-timestamped subdirectory. Mutually exclusive with --output.")
cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output with detailed progress")
cmd.Flags().StringVar(&transcriptDir, "transcript-dir", "", "Directory to save per-task transcript JSON files")
cmd.Flags().StringArrayVar(&taskFilters, "task", nil, "Filter tasks by name/ID glob pattern (can be repeated).")
@@ -1488,10 +1488,17 @@ func saveSummary(summary *models.MultiSkillSummary, path string) error {
}
// writeOutputDir writes results to a structured directory hierarchy.
-// For multi-skill runs: {outputDir}/{skillName}/{modelName}.json
-// For single-skill runs: {outputDir}/{modelName}.json
+// Each run creates a UTC-timestamped subdirectory (millisecond resolution) so earlier results are not overwritten.
+// For multi-skill runs: {outputDir}/{timestamp}/{skillName}/{modelName}.json
+// For single-skill runs: {outputDir}/{timestamp}/{modelName}.json
func writeOutputDir(dir string, results []skillRunResult) error {
- if err := os.MkdirAll(dir, 0755); err != nil {
+ return writeOutputDirAt(dir, results, time.Now())
+}
+
+// writeOutputDirAt is the testable core of writeOutputDir, accepting a timestamp.
+func writeOutputDirAt(dir string, results []skillRunResult, now time.Time) error {
+ runDir := filepath.Join(dir, now.UTC().Format("2006-01-02T150405.000"))
+ if err := os.MkdirAll(runDir, 0755); err != nil {
return fmt.Errorf("create output directory: %w", err)
}
@@ -1506,16 +1513,16 @@ func writeOutputDir(dir string, results []skillRunResult) error {
var outPath string
if multiSkill {
// Multi-skill: create skill subdirectory
- skillDir := filepath.Join(dir, sanitizePathSegment(skillResult.skillName))
+ skillDir := filepath.Join(runDir, sanitizePathSegment(skillResult.skillName))
if err := os.MkdirAll(skillDir, 0755); err != nil {
return fmt.Errorf("create skill directory %s: %w", skillDir, err)
}
modelFile := sanitizePathSegment(mr.modelID) + ".json"
outPath = filepath.Join(skillDir, modelFile)
} else {
- // Single-skill: write directly to output dir
+ // Single-skill: write directly to run dir
modelFile := sanitizePathSegment(mr.modelID) + ".json"
- outPath = filepath.Join(dir, modelFile)
+ outPath = filepath.Join(runDir, modelFile)
}
if err := saveOutcome(mr.outcome, outPath); err != nil {
diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go
index 16841f4e..ea0f6f8b 100644
--- a/cmd/waza/cmd_run_test.go
+++ b/cmd/waza/cmd_run_test.go
@@ -1928,16 +1928,30 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) {
err := cmd.Execute()
require.NoError(t, err)
- // Verify output directory was created with a result JSON file
+ // Find a timestamped subdirectory (e.g. 2025-06-15T103045.000)
entries, err := os.ReadDir(outDir)
require.NoError(t, err)
- require.NotEmpty(t, entries, "expected output files in --output-dir")
- // Find and validate the JSON result file
- var found bool
+ var runDir string
for _, e := range entries {
+ if e.IsDir() && len(e.Name()) >= len("2006-01-02T150405") {
+ if _, terr := time.Parse("2006-01-02T150405", e.Name()[:len("2006-01-02T150405")]); terr == nil {
+ runDir = filepath.Join(outDir, e.Name())
+ break
+ }
+ }
+ }
+ require.NotEmpty(t, runDir, "expected a timestamped subdirectory in --output-dir")
+
+ // Find and validate the JSON result file inside the timestamped subdirectory
+ runEntries, err := os.ReadDir(runDir)
+ require.NoError(t, err)
+ require.NotEmpty(t, runEntries, "expected output files in run subdirectory")
+
+ var found bool
+ for _, e := range runEntries {
if filepath.Ext(e.Name()) == ".json" {
- data, err := os.ReadFile(filepath.Join(outDir, e.Name()))
+ data, err := os.ReadFile(filepath.Join(runDir, e.Name()))
require.NoError(t, err)
var outcome models.EvaluationOutcome
require.NoError(t, json.Unmarshal(data, &outcome))
@@ -1945,11 +1959,12 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) {
found = true
}
}
- assert.True(t, found, "expected at least one .json result in output dir")
+ assert.True(t, found, "expected at least one .json result in run subdirectory")
}
func TestWriteOutputDir_SingleSkill(t *testing.T) {
dir := t.TempDir()
+ fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
// Single skill with single model
results := []skillRunResult{
@@ -1969,11 +1984,14 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {
},
}
- err := writeOutputDir(dir, results)
+ err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)
- // Single-skill mode: files written directly to output dir
- resultPath := filepath.Join(dir, "gpt-4o.json")
+ // Single-skill mode: files written inside timestamped run directory
+ runDir := filepath.Join(dir, "2025-06-15T103045.000")
+ assert.DirExists(t, runDir)
+
+ resultPath := filepath.Join(runDir, "gpt-4o.json")
assert.FileExists(t, resultPath)
// Verify JSON content
@@ -1987,6 +2005,7 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {
func TestWriteOutputDir_MultiSkill(t *testing.T) {
dir := t.TempDir()
+ fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
// Multi-skill with multiple models
results := []skillRunResult{
@@ -2020,12 +2039,13 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {
},
}
- err := writeOutputDir(dir, results)
+ err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)
- // Multi-skill mode: subdirectories created per skill
- explainerDir := filepath.Join(dir, "code-explainer")
- reviewerDir := filepath.Join(dir, "code-reviewer")
+ // Multi-skill mode: timestamped run dir with skill subdirectories
+ runDir := filepath.Join(dir, "2025-06-15T103045.000")
+ explainerDir := filepath.Join(runDir, "code-explainer")
+ reviewerDir := filepath.Join(runDir, "code-reviewer")
assert.DirExists(t, explainerDir)
assert.DirExists(t, reviewerDir)
@@ -2046,6 +2066,7 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {
func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
dir := t.TempDir()
+ fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
// Skill and model names with special chars
// Multi-skill to test subdirectory creation
@@ -2074,18 +2095,51 @@ func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
},
}
- err := writeOutputDir(dir, results)
+ err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)
- // Paths should be sanitized
- explainerDir := filepath.Join(dir, "code-explainer")
- reviewerDir := filepath.Join(dir, "code-reviewer")
+ // Paths should be sanitized, inside timestamped run dir
+ runDir := filepath.Join(dir, "2025-06-15T103045.000")
+ explainerDir := filepath.Join(runDir, "code-explainer")
+ reviewerDir := filepath.Join(runDir, "code-reviewer")
assert.DirExists(t, explainerDir)
assert.DirExists(t, reviewerDir)
assert.FileExists(t, filepath.Join(explainerDir, "gpt-4o-latest.json"))
assert.FileExists(t, filepath.Join(reviewerDir, "claude-sonnet.json"))
}
+func TestWriteOutputDir_RepeatRunsDoNotCollide(t *testing.T) {
+ dir := t.TempDir()
+
+ results := []skillRunResult{
+ {
+ skillName: "code-explainer",
+ outcomes: []modelResult{
+ {
+ modelID: "gpt-4o",
+ outcome: &models.EvaluationOutcome{
+ Digest: models.OutcomeDigest{TotalTests: 3, Succeeded: 2},
+ },
+ },
+ },
+ },
+ }
+
+ t1 := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
+ t2 := time.Date(2025, 6, 15, 11, 0, 0, 0, time.UTC)
+
+ require.NoError(t, writeOutputDirAt(dir, results, t1))
+ require.NoError(t, writeOutputDirAt(dir, results, t2))
+
+ // Both timestamped subdirectories should exist independently
+ entries, err := os.ReadDir(dir)
+ require.NoError(t, err)
+ require.Len(t, entries, 2, "expected two timestamped subdirectories")
+
+ assert.FileExists(t, filepath.Join(dir, "2025-06-15T103045.000", "gpt-4o.json"))
+ assert.FileExists(t, filepath.Join(dir, "2025-06-15T110000.000", "gpt-4o.json"))
+}
+
// ---------------------------------------------------------------------------
// .waza.yaml config defaults
// ---------------------------------------------------------------------------
diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx
index f395ce3a..ee73af66 100644
--- a/site/src/content/docs/reference/cli.mdx
+++ b/site/src/content/docs/reference/cli.mdx
@@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name]
|------|-------|------|---------|-------------|
| `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path |
| `--output` | `-o` | string | | Save results JSON to file |
-| `--output-dir` | `-d` | string | | Save output artifacts to directory |
+| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
| `--verbose` | `-v` | bool | false | Detailed progress output |
| `--parallel` | | bool | false | Run tasks concurrently |
| `--workers` | `-w` | int | 4 | Number of concurrent workers |
From ab613eb68a50a9ed6e3fa85f6220dbfc3da59838 Mon Sep 17 00:00:00 2001
From: Charles Lowell <10964656+chlowell@users.noreply.github.com>
Date: Fri, 20 Mar 2026 10:17:58 -0700
Subject: [PATCH 2/2] tweak the docs
---
README.md | 2 +-
site/src/content/docs/reference/cli.mdx | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index f81c874d..9fd6d1d8 100644
--- a/README.md
+++ b/README.md
@@ -252,7 +252,7 @@ Run an evaluation benchmark from a spec file.
|------|-------|-------------|
| `--context-dir ` | | Fixture directory (default: `./fixtures` relative to spec) |
| `--output ` | `-o` | Save results to JSON |
-| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
+| `--output-dir ` | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. |
| `--verbose` | `-v` | Detailed progress output |
| `--transcript-dir ` | | Save per-task transcript JSON files |
| `--task ` | | Filter tasks by name/ID pattern (repeatable) |
diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx
index ee73af66..0fd56f1d 100644
--- a/site/src/content/docs/reference/cli.mdx
+++ b/site/src/content/docs/reference/cli.mdx
@@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name]
|------|-------|------|---------|-------------|
| `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path |
| `--output` | `-o` | string | | Save results JSON to file |
-| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
+| `--output-dir` | | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. |
| `--verbose` | `-v` | bool | false | Detailed progress output |
| `--parallel` | | bool | false | Run tasks concurrently |
| `--workers` | `-w` | int | 4 | Number of concurrent workers |