From 36c3feb8d995a592dc45aed71950cd10243d8f08 Mon Sep 17 00:00:00 2001
From: Charles Lowell <10964656+chlowell@users.noreply.github.com>
Date: Fri, 20 Mar 2026 07:33:22 -0700
Subject: [PATCH 1/2] `run --output-dir` groups files by timestamp

---
 README.md                               |  1 +
 cmd/waza/cmd_run.go                     | 21 ++++--
 cmd/waza/cmd_run_test.go                | 88 ++++++++++++++++++++-----
 site/src/content/docs/reference/cli.mdx |  2 +-
 4 files changed, 87 insertions(+), 25 deletions(-)
diff --git a/README.md b/README.md
index 632b2f4c..f81c874d 100644
--- a/README.md
+++ b/README.md
@@ -252,6 +252,7 @@ Run an evaluation benchmark from a spec file.
 |------|-------|-------------|
 | `--context-dir <dir>` | | Fixture directory (default: `./fixtures` relative to spec) |
 | `--output <file>` | `-o` | Save results to JSON |
+| `--output-dir <dir>` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `<dir>/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
 | `--verbose` | `-v` | Detailed progress output |
 | `--transcript-dir <dir>` | | Save per-task transcript JSON files |
 | `--task <glob>` | | Filter tasks by name/ID pattern (repeatable) |
diff --git a/cmd/waza/cmd_run.go b/cmd/waza/cmd_run.go
index ff440c86..88898ce3 100644
--- a/cmd/waza/cmd_run.go
+++ b/cmd/waza/cmd_run.go
@@ -98,7 +98,7 @@ You can also specify a skill name to run its eval:
 
 	cmd.Flags().StringVar(&contextDir, "context-dir", "", "Context directory for fixtures (default: ./fixtures relative to spec)")
 	cmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output JSON file for results")
-	cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output (mutually exclusive with --output)")
+	cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output; each run creates a UTC-timestamped subdirectory. Mutually exclusive with --output.")
 	cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output with detailed progress")
 	cmd.Flags().StringVar(&transcriptDir, "transcript-dir", "", "Directory to save per-task transcript JSON files")
 	cmd.Flags().StringArrayVar(&taskFilters, "task", nil, "Filter tasks by name/ID glob pattern (can be repeated).")
@@ -1488,10 +1488,17 @@ func saveSummary(summary *models.MultiSkillSummary, path string) error {
 }
 
 // writeOutputDir writes results to a structured directory hierarchy.
-// For multi-skill runs: {outputDir}/{skillName}/{modelName}.json
-// For single-skill runs: {outputDir}/{modelName}.json
+// Each run writes into its own UTC-timestamped subdirectory (millisecond resolution) so earlier results are not overwritten.
+// For multi-skill runs: {outputDir}/{timestamp}/{skillName}/{modelName}.json
+// For single-skill runs: {outputDir}/{timestamp}/{modelName}.json
 func writeOutputDir(dir string, results []skillRunResult) error {
-	if err := os.MkdirAll(dir, 0755); err != nil {
+	return writeOutputDirAt(dir, results, time.Now())
+}
+
+// writeOutputDirAt implements writeOutputDir with an injectable clock: now is converted to UTC and names the run directory, so tests can assert exact paths.
+func writeOutputDirAt(dir string, results []skillRunResult, now time.Time) error {
+	runDir := filepath.Join(dir, now.UTC().Format("2006-01-02T150405.000"))
+	if err := os.MkdirAll(runDir, 0755); err != nil {
 		return fmt.Errorf("create output directory: %w", err)
 	}
 
@@ -1506,16 +1513,16 @@ func writeOutputDir(dir string, results []skillRunResult) error {
 			var outPath string
 			if multiSkill {
 				// Multi-skill: create skill subdirectory
-				skillDir := filepath.Join(dir, sanitizePathSegment(skillResult.skillName))
+				skillDir := filepath.Join(runDir, sanitizePathSegment(skillResult.skillName))
 				if err := os.MkdirAll(skillDir, 0755); err != nil {
 					return fmt.Errorf("create skill directory %s: %w", skillDir, err)
 				}
 				modelFile := sanitizePathSegment(mr.modelID) + ".json"
 				outPath = filepath.Join(skillDir, modelFile)
 			} else {
-				// Single-skill: write directly to output dir
+				// Single-skill: write directly to run dir
 				modelFile := sanitizePathSegment(mr.modelID) + ".json"
-				outPath = filepath.Join(dir, modelFile)
+				outPath = filepath.Join(runDir, modelFile)
 			}
 
 			if err := saveOutcome(mr.outcome, outPath); err != nil {
diff --git a/cmd/waza/cmd_run_test.go b/cmd/waza/cmd_run_test.go
index 16841f4e..ea0f6f8b 100644
--- a/cmd/waza/cmd_run_test.go
+++ b/cmd/waza/cmd_run_test.go
@@ -1928,16 +1928,30 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) {
 	err := cmd.Execute()
 	require.NoError(t, err)
 
-	// Verify output directory was created with a result JSON file
+	// Find a timestamped subdirectory (e.g. 2025-06-15T103045.000)
 	entries, err := os.ReadDir(outDir)
 	require.NoError(t, err)
-	require.NotEmpty(t, entries, "expected output files in --output-dir")
 
-	// Find and validate the JSON result file
-	var found bool
+	var runDir string
 	for _, e := range entries {
+		if e.IsDir() && len(e.Name()) >= len("2006-01-02T150405") {
+			if _, terr := time.Parse("2006-01-02T150405", e.Name()[:len("2006-01-02T150405")]); terr == nil {
+				runDir = filepath.Join(outDir, e.Name())
+				break
+			}
+		}
+	}
+	require.NotEmpty(t, runDir, "expected a timestamped subdirectory in --output-dir")
+
+	// Find and validate the JSON result file inside the timestamped subdirectory
+	runEntries, err := os.ReadDir(runDir)
+	require.NoError(t, err)
+	require.NotEmpty(t, runEntries, "expected output files in run subdirectory")
+
+	var found bool
+	for _, e := range runEntries {
 		if filepath.Ext(e.Name()) == ".json" {
-			data, err := os.ReadFile(filepath.Join(outDir, e.Name()))
+			data, err := os.ReadFile(filepath.Join(runDir, e.Name()))
 			require.NoError(t, err)
 			var outcome models.EvaluationOutcome
 			require.NoError(t, json.Unmarshal(data, &outcome))
@@ -1945,11 +1959,12 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) {
 			found = true
 		}
 	}
-	assert.True(t, found, "expected at least one .json result in output dir")
+	assert.True(t, found, "expected at least one .json result in run subdirectory")
 }
 
 func TestWriteOutputDir_SingleSkill(t *testing.T) {
 	dir := t.TempDir()
+	fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
 
 	// Single skill with single model
 	results := []skillRunResult{
@@ -1969,11 +1984,14 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {
 		},
 	}
 
-	err := writeOutputDir(dir, results)
+	err := writeOutputDirAt(dir, results, fixedTime)
 	require.NoError(t, err)
 
-	// Single-skill mode: files written directly to output dir
-	resultPath := filepath.Join(dir, "gpt-4o.json")
+	// Single-skill mode: files written inside timestamped run directory
+	runDir := filepath.Join(dir, "2025-06-15T103045.000")
+	assert.DirExists(t, runDir)
+
+	resultPath := filepath.Join(runDir, "gpt-4o.json")
 	assert.FileExists(t, resultPath)
 
 	// Verify JSON content
@@ -1987,6 +2005,7 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {
 
 func TestWriteOutputDir_MultiSkill(t *testing.T) {
 	dir := t.TempDir()
+	fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
 
 	// Multi-skill with multiple models
 	results := []skillRunResult{
@@ -2020,12 +2039,13 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {
 		},
 	}
 
-	err := writeOutputDir(dir, results)
+	err := writeOutputDirAt(dir, results, fixedTime)
 	require.NoError(t, err)
 
-	// Multi-skill mode: subdirectories created per skill
-	explainerDir := filepath.Join(dir, "code-explainer")
-	reviewerDir := filepath.Join(dir, "code-reviewer")
+	// Multi-skill mode: timestamped run dir with skill subdirectories
+	runDir := filepath.Join(dir, "2025-06-15T103045.000")
+	explainerDir := filepath.Join(runDir, "code-explainer")
+	reviewerDir := filepath.Join(runDir, "code-reviewer")
 
 	assert.DirExists(t, explainerDir)
 	assert.DirExists(t, reviewerDir)
@@ -2046,6 +2066,7 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {
 
 func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
 	dir := t.TempDir()
+	fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
 
 	// Skill and model names with special chars
 	// Multi-skill to test subdirectory creation
@@ -2074,18 +2095,51 @@ func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
 		},
 	}
 
-	err := writeOutputDir(dir, results)
+	err := writeOutputDirAt(dir, results, fixedTime)
 	require.NoError(t, err)
 
-	// Paths should be sanitized
-	explainerDir := filepath.Join(dir, "code-explainer")
-	reviewerDir := filepath.Join(dir, "code-reviewer")
+	// Paths should be sanitized, inside timestamped run dir
+	runDir := filepath.Join(dir, "2025-06-15T103045.000")
+	explainerDir := filepath.Join(runDir, "code-explainer")
+	reviewerDir := filepath.Join(runDir, "code-reviewer")
 	assert.DirExists(t, explainerDir)
 	assert.DirExists(t, reviewerDir)
 	assert.FileExists(t, filepath.Join(explainerDir, "gpt-4o-latest.json"))
 	assert.FileExists(t, filepath.Join(reviewerDir, "claude-sonnet.json"))
 }
 
+func TestWriteOutputDir_RepeatRunsDoNotCollide(t *testing.T) {
+	dir := t.TempDir()
+
+	results := []skillRunResult{
+		{
+			skillName: "code-explainer",
+			outcomes: []modelResult{
+				{
+					modelID: "gpt-4o",
+					outcome: &models.EvaluationOutcome{
+						Digest: models.OutcomeDigest{TotalTests: 3, Succeeded: 2},
+					},
+				},
+			},
+		},
+	}
+
+	t1 := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
+	t2 := time.Date(2025, 6, 15, 11, 0, 0, 0, time.UTC)
+
+	require.NoError(t, writeOutputDirAt(dir, results, t1))
+	require.NoError(t, writeOutputDirAt(dir, results, t2))
+
+	// Both timestamped subdirectories should exist independently
+	entries, err := os.ReadDir(dir)
+	require.NoError(t, err)
+	require.Len(t, entries, 2, "expected two timestamped subdirectories")
+
+	assert.FileExists(t, filepath.Join(dir, "2025-06-15T103045.000", "gpt-4o.json"))
+	assert.FileExists(t, filepath.Join(dir, "2025-06-15T110000.000", "gpt-4o.json"))
+}
+
 // ---------------------------------------------------------------------------
 // .waza.yaml config defaults
 // ---------------------------------------------------------------------------
diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx
index f395ce3a..ee73af66 100644
--- a/site/src/content/docs/reference/cli.mdx
+++ b/site/src/content/docs/reference/cli.mdx
@@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name]
 |------|-------|------|---------|-------------|
 | `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path |
 | `--output` | `-o` | string | | Save results JSON to file |
-| `--output-dir` | `-d` | string | | Save output artifacts to directory |
+| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `<output-dir>/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
 | `--verbose` | `-v` | bool | false | Detailed progress output |
 | `--parallel` | | bool | false | Run tasks concurrently |
 | `--workers` | `-w` | int | 4 | Number of concurrent workers |

From ab613eb68a50a9ed6e3fa85f6220dbfc3da59838 Mon Sep 17 00:00:00 2001
From: Charles Lowell <10964656+chlowell@users.noreply.github.com>
Date: Fri, 20 Mar 2026 10:17:58 -0700
Subject: [PATCH 2/2] docs: drop example path from `--output-dir` description

---
 README.md                               | 2 +-
 site/src/content/docs/reference/cli.mdx | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f81c874d..9fd6d1d8 100644
--- a/README.md
+++ b/README.md
@@ -252,7 +252,7 @@ Run an evaluation benchmark from a spec file.
 |------|-------|-------------|
 | `--context-dir <dir>` | | Fixture directory (default: `./fixtures` relative to spec) |
 | `--output <file>` | `-o` | Save results to JSON |
-| `--output-dir <dir>` | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `<dir>/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
+| `--output-dir <dir>` | | Directory for structured output; each run creates a UTC-timestamped subdirectory of `<dir>`. Mutually exclusive with `--output`. |
 | `--verbose` | `-v` | Detailed progress output |
 | `--transcript-dir <dir>` | | Save per-task transcript JSON files |
 | `--task <glob>` | | Filter tasks by name/ID pattern (repeatable) |
diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx
index ee73af66..0fd56f1d 100644
--- a/site/src/content/docs/reference/cli.mdx
+++ b/site/src/content/docs/reference/cli.mdx
@@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name]
 |------|-------|------|---------|-------------|
 | `--context-dir` | `-c` | string | `./fixtures` | Fixtures directory path |
 | `--output` | `-o` | string | | Save results JSON to file |
-| `--output-dir` | `-d` | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory (e.g., `<output-dir>/2025-06-15T103045.123/model.json`). Mutually exclusive with `--output`. |
+| `--output-dir` | | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. |
 | `--verbose` | `-v` | bool | false | Detailed progress output |
 | `--parallel` | | bool | false | Run tasks concurrently |
 | `--workers` | `-w` | int | 4 | Number of concurrent workers |