Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ Run an evaluation benchmark from a spec file.
|------|-------|-------------|
| `--context-dir <dir>` | | Fixture directory (default: `./fixtures` relative to spec) |
| `--output <file>` | `-o` | Save results to JSON |
| `--output-dir <dir>` | | Directory for structured output; each run creates a UTC-timestamped subdirectory of `<dir>`. Mutually exclusive with `--output`. |
| `--verbose` | `-v` | Detailed progress output |
| `--transcript-dir <dir>` | | Save per-task transcript JSON files |
| `--task <glob>` | | Filter tasks by name/ID pattern (repeatable) |
Expand Down
21 changes: 14 additions & 7 deletions cmd/waza/cmd_run.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ You can also specify a skill name to run its eval:

cmd.Flags().StringVar(&contextDir, "context-dir", "", "Context directory for fixtures (default: ./fixtures relative to spec)")
cmd.Flags().StringVarP(&outputPath, "output", "o", "", "Output JSON file for results")
cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output (mutually exclusive with --output)")
cmd.Flags().StringVar(&outputDir, "output-dir", "", "Directory for structured output; each run creates a UTC-timestamped subdirectory. Mutually exclusive with --output.")
cmd.Flags().BoolVarP(&verbose, "verbose", "v", false, "Verbose output with detailed progress")
cmd.Flags().StringVar(&transcriptDir, "transcript-dir", "", "Directory to save per-task transcript JSON files")
cmd.Flags().StringArrayVar(&taskFilters, "task", nil, "Filter tasks by name/ID glob pattern (can be repeated).")
Expand Down Expand Up @@ -1488,10 +1488,17 @@ func saveSummary(summary *models.MultiSkillSummary, path string) error {
}

// writeOutputDir writes results to a structured directory hierarchy.
// For multi-skill runs: {outputDir}/{skillName}/{modelName}.json
// For single-skill runs: {outputDir}/{modelName}.json
// Each run creates a UTC-timestamped subdirectory (millisecond resolution, e.g. 2025-06-15T103045.000) so that results from previous runs are not overwritten.
// For multi-skill runs: {outputDir}/{timestamp}/{skillName}/{modelName}.json
// For single-skill runs: {outputDir}/{timestamp}/{modelName}.json
func writeOutputDir(dir string, results []skillRunResult) error {
if err := os.MkdirAll(dir, 0755); err != nil {
return writeOutputDirAt(dir, results, time.Now())
}

// writeOutputDirAt is the testable core of writeOutputDir, accepting a timestamp.
func writeOutputDirAt(dir string, results []skillRunResult, now time.Time) error {
runDir := filepath.Join(dir, now.UTC().Format("2006-01-02T150405.000"))
if err := os.MkdirAll(runDir, 0755); err != nil {
return fmt.Errorf("create output directory: %w", err)
}

Expand All @@ -1506,16 +1513,16 @@ func writeOutputDir(dir string, results []skillRunResult) error {
var outPath string
if multiSkill {
// Multi-skill: create skill subdirectory
skillDir := filepath.Join(dir, sanitizePathSegment(skillResult.skillName))
skillDir := filepath.Join(runDir, sanitizePathSegment(skillResult.skillName))
if err := os.MkdirAll(skillDir, 0755); err != nil {
return fmt.Errorf("create skill directory %s: %w", skillDir, err)
}
modelFile := sanitizePathSegment(mr.modelID) + ".json"
outPath = filepath.Join(skillDir, modelFile)
} else {
// Single-skill: write directly to output dir
// Single-skill: write directly to run dir
modelFile := sanitizePathSegment(mr.modelID) + ".json"
outPath = filepath.Join(dir, modelFile)
outPath = filepath.Join(runDir, modelFile)
}

if err := saveOutcome(mr.outcome, outPath); err != nil {
Expand Down
88 changes: 71 additions & 17 deletions cmd/waza/cmd_run_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1928,28 +1928,43 @@ func TestRunCommand_OutputDirSingleSkill(t *testing.T) {
err := cmd.Execute()
require.NoError(t, err)

// Verify output directory was created with a result JSON file
// Find a timestamped subdirectory (e.g. 2025-06-15T103045.000)
entries, err := os.ReadDir(outDir)
require.NoError(t, err)
require.NotEmpty(t, entries, "expected output files in --output-dir")

// Find and validate the JSON result file
var found bool
var runDir string
for _, e := range entries {
if e.IsDir() && len(e.Name()) >= len("2006-01-02T150405") {
if _, terr := time.Parse("2006-01-02T150405", e.Name()[:len("2006-01-02T150405")]); terr == nil {
runDir = filepath.Join(outDir, e.Name())
break
}
}
}
require.NotEmpty(t, runDir, "expected a timestamped subdirectory in --output-dir")

// Find and validate the JSON result file inside the timestamped subdirectory
runEntries, err := os.ReadDir(runDir)
require.NoError(t, err)
require.NotEmpty(t, runEntries, "expected output files in run subdirectory")

var found bool
for _, e := range runEntries {
if filepath.Ext(e.Name()) == ".json" {
data, err := os.ReadFile(filepath.Join(outDir, e.Name()))
data, err := os.ReadFile(filepath.Join(runDir, e.Name()))
require.NoError(t, err)
var outcome models.EvaluationOutcome
require.NoError(t, json.Unmarshal(data, &outcome))
assert.Equal(t, "test-eval", outcome.BenchName)
found = true
}
}
assert.True(t, found, "expected at least one .json result in output dir")
assert.True(t, found, "expected at least one .json result in run subdirectory")
}

func TestWriteOutputDir_SingleSkill(t *testing.T) {
dir := t.TempDir()
fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)

// Single skill with single model
results := []skillRunResult{
Expand All @@ -1969,11 +1984,14 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {
},
}

err := writeOutputDir(dir, results)
err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)

// Single-skill mode: files written directly to output dir
resultPath := filepath.Join(dir, "gpt-4o.json")
// Single-skill mode: files written inside timestamped run directory
runDir := filepath.Join(dir, "2025-06-15T103045.000")
assert.DirExists(t, runDir)

resultPath := filepath.Join(runDir, "gpt-4o.json")
assert.FileExists(t, resultPath)

// Verify JSON content
Expand All @@ -1987,6 +2005,7 @@ func TestWriteOutputDir_SingleSkill(t *testing.T) {

func TestWriteOutputDir_MultiSkill(t *testing.T) {
dir := t.TempDir()
fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)

// Multi-skill with multiple models
results := []skillRunResult{
Expand Down Expand Up @@ -2020,12 +2039,13 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {
},
}

err := writeOutputDir(dir, results)
err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)

// Multi-skill mode: subdirectories created per skill
explainerDir := filepath.Join(dir, "code-explainer")
reviewerDir := filepath.Join(dir, "code-reviewer")
// Multi-skill mode: timestamped run dir with skill subdirectories
runDir := filepath.Join(dir, "2025-06-15T103045.000")
explainerDir := filepath.Join(runDir, "code-explainer")
reviewerDir := filepath.Join(runDir, "code-reviewer")

assert.DirExists(t, explainerDir)
assert.DirExists(t, reviewerDir)
Expand All @@ -2046,6 +2066,7 @@ func TestWriteOutputDir_MultiSkill(t *testing.T) {

func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
dir := t.TempDir()
fixedTime := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)

// Skill and model names with special chars
// Multi-skill to test subdirectory creation
Expand Down Expand Up @@ -2074,18 +2095,51 @@ func TestWriteOutputDir_SanitizesPaths(t *testing.T) {
},
}

err := writeOutputDir(dir, results)
err := writeOutputDirAt(dir, results, fixedTime)
require.NoError(t, err)

// Paths should be sanitized
explainerDir := filepath.Join(dir, "code-explainer")
reviewerDir := filepath.Join(dir, "code-reviewer")
// Paths should be sanitized, inside timestamped run dir
runDir := filepath.Join(dir, "2025-06-15T103045.000")
explainerDir := filepath.Join(runDir, "code-explainer")
reviewerDir := filepath.Join(runDir, "code-reviewer")
assert.DirExists(t, explainerDir)
assert.DirExists(t, reviewerDir)
assert.FileExists(t, filepath.Join(explainerDir, "gpt-4o-latest.json"))
assert.FileExists(t, filepath.Join(reviewerDir, "claude-sonnet.json"))
}

func TestWriteOutputDir_RepeatRunsDoNotCollide(t *testing.T) {
dir := t.TempDir()

results := []skillRunResult{
{
skillName: "code-explainer",
outcomes: []modelResult{
{
modelID: "gpt-4o",
outcome: &models.EvaluationOutcome{
Digest: models.OutcomeDigest{TotalTests: 3, Succeeded: 2},
},
},
},
},
}

t1 := time.Date(2025, 6, 15, 10, 30, 45, 0, time.UTC)
t2 := time.Date(2025, 6, 15, 11, 0, 0, 0, time.UTC)

require.NoError(t, writeOutputDirAt(dir, results, t1))
require.NoError(t, writeOutputDirAt(dir, results, t2))

// Both timestamped subdirectories should exist independently
entries, err := os.ReadDir(dir)
require.NoError(t, err)
require.Len(t, entries, 2, "expected two timestamped subdirectories")

assert.FileExists(t, filepath.Join(dir, "2025-06-15T103045.000", "gpt-4o.json"))
assert.FileExists(t, filepath.Join(dir, "2025-06-15T110000.000", "gpt-4o.json"))
}

// ---------------------------------------------------------------------------
// .waza.yaml config defaults
// ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion site/src/content/docs/reference/cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ waza run [eval.yaml | skill-name]
|------|-------|------|---------|-------------|
| `--context-dir` | | string | `./fixtures` | Fixtures directory path (relative to the spec file) |
| `--output` | `-o` | string | | Save results JSON to file |
| `--output-dir` | `-d` | string | | Save output artifacts to directory |
| `--output-dir` | | string | | Directory for structured output; each run creates a UTC-timestamped subdirectory of the specified directory. Mutually exclusive with `--output`. |
| `--verbose` | `-v` | bool | false | Detailed progress output |
| `--parallel` | | bool | false | Run tasks concurrently |
| `--workers` | `-w` | int | 4 | Number of concurrent workers |
Expand Down
Loading