diff --git a/README.md b/README.md index af494dc1..c10d84a7 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,9 @@ waza run examples/code-explainer/eval.yaml --context-dir examples/code-explainer # Compare results across models waza compare results-gpt4.json results-sonnet.json +# Generate eval coverage grid +waza coverage --format markdown + # Count tokens in skill files waza tokens count skills/ @@ -301,6 +304,17 @@ Compare results from multiple evaluation runs side by side — per-task score de |------|-------|-------------| | `--format ` | `-f` | Output format: `table` or `json` (default: `table`) | +### `waza coverage [root]` + +Generate a skill-to-eval coverage grid for the skills found under `[root]` (default: current directory), showing which skills are fully covered, partially covered, or missing evals. + +**Note**: Full coverage requires tasks (via `tasks:` or `tasks_from:`) and at least 2 distinct grader types. The coverage percentage reflects only fully covered skills. + +| Flag | Short | Description | +|------|-------|-------------| +| `--format <format>` | `-f` | Output format: `text`, `markdown`, or `json` (default: `text`) | +| `--path <dir>` | | Additional directory to scan for skills/evals (repeatable) | + ### `waza cache clear` Clear all cached evaluation results to force re-execution on the next run. 
diff --git a/cmd/waza/cmd_coverage.go b/cmd/waza/cmd_coverage.go new file mode 100644 index 00000000..9ec2c387 --- /dev/null +++ b/cmd/waza/cmd_coverage.go @@ -0,0 +1,425 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "io/fs" + "os" + "path/filepath" + "sort" + "strings" + "text/tabwriter" + + "github.com/microsoft/waza/internal/models" + "github.com/microsoft/waza/internal/skill" + "github.com/spf13/cobra" + "gopkg.in/yaml.v3" +) + +type coverageSkillRow struct { + Skill string `json:"skill"` + Tasks int `json:"tasks"` + Graders []string `json:"graders"` + Coverage string `json:"coverage"` +} + +type coverageReport struct { + TotalSkills int `json:"total_skills"` + Covered int `json:"covered"` + Partial int `json:"partial"` + Uncovered int `json:"uncovered"` + CoveragePct float64 `json:"coverage_pct"` + Skills []coverageSkillRow `json:"skills"` +} + +type evalSpecLite struct { + Skill string `yaml:"skill"` + Tasks []string `yaml:"tasks"` + TasksFrom string `yaml:"tasks_from,omitempty"` + Graders []models.GraderConfig `yaml:"graders"` +} + +func newCoverageCommand() *cobra.Command { + var outputFormat string + var searchPaths []string + + cmd := &cobra.Command{ + Use: "coverage [root]", + Short: "Generate an eval coverage grid for discovered skills", + Long: `Generate an eval coverage grid showing which skills have eval coverage. + +By default, this command scans: + - skills/ and .github/skills for SKILL.md files + - evals/ and skill directories for eval.yaml/eval.yml files + +Use --path to add additional directories to scan for eval and skill files.`, + Args: cobra.MaximumNArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + root := "." 
+ if len(args) > 0 { + root = args[0] + } + + report, err := buildCoverageReport(root, searchPaths) + if err != nil { + return err + } + + switch outputFormat { + case "text": + renderCoverageText(cmd.OutOrStdout(), report) + case "markdown": + renderCoverageMarkdown(cmd.OutOrStdout(), report) + case "json": + if err := renderCoverageJSON(cmd.OutOrStdout(), report); err != nil { + return err + } + default: + return fmt.Errorf("unsupported format %q: must be text, markdown, or json", outputFormat) + } + return nil + }, + } + + cmd.Flags().StringVarP(&outputFormat, "format", "f", "text", "Output format: text, markdown, or json") + cmd.Flags().StringArrayVar(&searchPaths, "path", nil, "Additional directories to scan for skills/evals (repeatable)") + return cmd +} + +func buildCoverageReport(root string, discoverPaths []string) (*coverageReport, error) { + absRoot, err := filepath.Abs(root) + if err != nil { + return nil, fmt.Errorf("resolving root path: %w", err) + } + info, err := os.Stat(absRoot) + if err != nil { + return nil, fmt.Errorf("invalid root path %q: %w", root, err) + } + if !info.IsDir() { + return nil, fmt.Errorf("root path %q is not a directory", root) + } + + skillPaths, err := discoverSkillFiles(absRoot, discoverPaths) + if err != nil { + return nil, err + } + if len(skillPaths) == 0 { + return nil, fmt.Errorf("no SKILL.md files found under %s", absRoot) + } + + evalBySkill := make(map[string][]string) + tasksBySkill := make(map[string]int) + gradersBySkill := make(map[string]map[string]struct{}) + var parseFailures []string + + evalPaths, err := discoverEvalFiles(absRoot, skillPaths, discoverPaths) + if err != nil { + return nil, err + } + + for _, evalPath := range evalPaths { + spec, parseErr := parseEvalSpec(evalPath) + if parseErr != nil { + parseFailures = append(parseFailures, fmt.Sprintf("%s (%v)", evalPath, parseErr)) + continue + } + skillName := strings.TrimSpace(spec.Skill) + if skillName == "" { + skillName = 
inferSkillNameFromEvalPath(evalPath) + } + if skillName == "" { + continue + } + evalBySkill[skillName] = append(evalBySkill[skillName], evalPath) + taskCount := len(spec.Tasks) + if taskCount == 0 && spec.TasksFrom != "" { + taskCount = 1 // tasks_from references an external file; count as having tasks + } + tasksBySkill[skillName] += taskCount + if _, ok := gradersBySkill[skillName]; !ok { + gradersBySkill[skillName] = make(map[string]struct{}) + } + for _, g := range spec.Graders { + kind := strings.TrimSpace(string(g.Kind)) + if kind != "" { + gradersBySkill[skillName][kind] = struct{}{} + } + } + } + if len(parseFailures) > 0 { + sort.Strings(parseFailures) + fmt.Fprintf(os.Stderr, "warning: failed to parse %d eval file(s): %s\n", len(parseFailures), strings.Join(parseFailures, "; ")) + } + + skillNames := make([]string, 0, len(skillPaths)) + for name := range skillPaths { + skillNames = append(skillNames, name) + } + sort.Strings(skillNames) + + report := &coverageReport{ + TotalSkills: len(skillNames), + Skills: make([]coverageSkillRow, 0, len(skillNames)), + } + + for _, name := range skillNames { + graderSet := gradersBySkill[name] + graders := sortedKeys(graderSet) + tasks := tasksBySkill[name] + hasEval := len(evalBySkill[name]) > 0 + + coverage := "❌ None" + switch { + case !hasEval: + report.Uncovered++ + case tasks > 0 && len(graders) >= 2: + // Full: eval spec has tasks and multiple grader types + coverage = "✅ Full" + report.Covered++ + default: + coverage = "⚠️ Partial" + report.Partial++ + } + + report.Skills = append(report.Skills, coverageSkillRow{ + Skill: name, + Tasks: tasks, + Graders: graders, + Coverage: coverage, + }) + } + + if report.TotalSkills > 0 { + report.CoveragePct = float64(report.Covered) * 100 / float64(report.TotalSkills) + } + return report, nil +} + +func discoverSkillFiles(root string, discoverPaths []string) (map[string]string, error) { + searchRoots := []string{ + filepath.Join(root, "skills"), + filepath.Join(root, 
".github", "skills"), + } + for _, p := range discoverPaths { + searchRoots = append(searchRoots, resolvePath(root, p)) + } + + found := make(map[string]string) + seenPaths := make(map[string]struct{}) + + for _, sr := range searchRoots { + if !isDir(sr) { + continue + } + err := filepath.WalkDir(sr, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return fmt.Errorf("error walking %s: %w", path, err) + } + if d.IsDir() { + name := d.Name() + if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" { + return fs.SkipDir + } + return nil + } + if d.Name() != "SKILL.md" { + return nil + } + absPath, absErr := filepath.Abs(path) + if absErr != nil { + absPath = filepath.Clean(path) + } + if _, ok := seenPaths[absPath]; ok { + return nil + } + seenPaths[absPath] = struct{}{} + skillName := parseSkillName(absPath) + if skillName == "" { + skillName = filepath.Base(filepath.Dir(absPath)) + } + if _, exists := found[skillName]; !exists { + found[skillName] = absPath + } + return nil + }) + if err != nil { + return nil, fmt.Errorf("walking skill directory %s: %w", sr, err) + } + } + + return found, nil +} + +func discoverEvalFiles(root string, skillPaths map[string]string, discoverPaths []string) ([]string, error) { + searchRoots := []string{filepath.Join(root, "evals")} + for _, p := range discoverPaths { + searchRoots = append(searchRoots, resolvePath(root, p)) + } + + candidates := make(map[string]struct{}) + + for _, evalRoot := range searchRoots { + if !isDir(evalRoot) { + continue + } + if err := filepath.WalkDir(evalRoot, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return fmt.Errorf("error walking %s: %w", path, err) + } + if d.IsDir() { + name := d.Name() + if strings.HasPrefix(name, ".") || name == "node_modules" || name == "vendor" { + return fs.SkipDir + } + return nil + } + if d.Name() == "eval.yaml" || d.Name() == "eval.yml" { + absPath, absErr := filepath.Abs(path) + if absErr != nil { + 
absPath = filepath.Clean(path) + } + candidates[absPath] = struct{}{} + } + return nil + }); err != nil { + return nil, fmt.Errorf("walking eval directory %s: %w", evalRoot, err) + } + } + + for _, skillPath := range skillPaths { + skillDir := filepath.Dir(skillPath) + for _, rel := range []string{ + "eval.yaml", "eval.yml", + filepath.Join("evals", "eval.yaml"), filepath.Join("evals", "eval.yml"), + filepath.Join("tests", "eval.yaml"), filepath.Join("tests", "eval.yml"), + } { + p := filepath.Join(skillDir, rel) + if isFile(p) { + absPath, absErr := filepath.Abs(p) + if absErr != nil { + absPath = filepath.Clean(p) + } + candidates[absPath] = struct{}{} + } + } + } + + evalPaths := make([]string, 0, len(candidates)) + for path := range candidates { + evalPaths = append(evalPaths, path) + } + sort.Strings(evalPaths) + return evalPaths, nil +} + +func parseEvalSpec(evalPath string) (*evalSpecLite, error) { + data, err := os.ReadFile(evalPath) + if err != nil { + return nil, err + } + var spec evalSpecLite + if err := yaml.Unmarshal(data, &spec); err != nil { + return nil, err + } + return &spec, nil +} + +func parseSkillName(path string) string { + data, err := os.ReadFile(path) + if err != nil { + return "" + } + var sk skill.Skill + if err := sk.UnmarshalText(data); err != nil { + return "" + } + return strings.TrimSpace(sk.Frontmatter.Name) +} + +func inferSkillNameFromEvalPath(evalPath string) string { + parent := filepath.Base(filepath.Dir(evalPath)) + switch parent { + case "evals", "tests": + return filepath.Base(filepath.Dir(filepath.Dir(evalPath))) + default: + return parent + } +} + +func renderCoverageText(w io.Writer, report *coverageReport) { + fmt.Fprintln(w, "📊 Eval Coverage Grid") //nolint:errcheck + fmt.Fprintf(w, "Coverage: %.1f%% (%d/%d fully covered)\n\n", report.CoveragePct, report.Covered, report.TotalSkills) //nolint:errcheck + + // Use placeholders for emoji to avoid tabwriter alignment issues + var buf strings.Builder + tw := 
tabwriter.NewWriter(&buf, 0, 0, 2, ' ', 0) + fmt.Fprintln(tw, "Skill\tTasks\tGraders\tCoverage") //nolint:errcheck + fmt.Fprintln(tw, "-----\t-----\t-------\t--------") //nolint:errcheck + for _, row := range report.Skills { + graders := "—" + if len(row.Graders) > 0 { + graders = strings.Join(row.Graders, ", ") + } + coverage := row.Coverage + coverage = strings.Replace(coverage, "✅", "{CHECK}", 1) + coverage = strings.Replace(coverage, "⚠️", "{WARN}", 1) + coverage = strings.Replace(coverage, "❌", "{CROSS}", 1) + fmt.Fprintf(tw, "%s\t%d\t%s\t%s\n", row.Skill, row.Tasks, graders, coverage) //nolint:errcheck + } + _ = tw.Flush() + + result := buf.String() + result = strings.ReplaceAll(result, "{CHECK}", "✅") + result = strings.ReplaceAll(result, "{WARN}", "⚠️") + result = strings.ReplaceAll(result, "{CROSS}", "❌") + fmt.Fprint(w, result) //nolint:errcheck +} + +func renderCoverageMarkdown(w io.Writer, report *coverageReport) { + fmt.Fprintln(w, "📊 Eval Coverage Grid") //nolint:errcheck + fmt.Fprintln(w, "| Skill | Tasks | Graders | Coverage |") //nolint:errcheck + fmt.Fprintln(w, "|-------|-------|---------|----------|") //nolint:errcheck + for _, row := range report.Skills { + graders := "—" + if len(row.Graders) > 0 { + graders = strings.Join(row.Graders, ", ") + } + fmt.Fprintf(w, "| %s | %d | %s | %s |\n", row.Skill, row.Tasks, graders, row.Coverage) //nolint:errcheck + } +} + +func renderCoverageJSON(w io.Writer, report *coverageReport) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(report) +} + +func sortedKeys(set map[string]struct{}) []string { + if len(set) == 0 { + return nil + } + keys := make([]string, 0, len(set)) + for k := range set { + keys = append(keys, k) + } + sort.Strings(keys) + return keys +} + +func resolvePath(root, p string) string { + if filepath.IsAbs(p) { + return p + } + return filepath.Join(root, p) +} + +func isDir(path string) bool { + info, err := os.Stat(path) + return err == nil && info.IsDir() 
+} + +func isFile(path string) bool { + info, err := os.Stat(path) + return err == nil && !info.IsDir() +} diff --git a/cmd/waza/cmd_coverage_test.go b/cmd/waza/cmd_coverage_test.go new file mode 100644 index 00000000..bfa8c732 --- /dev/null +++ b/cmd/waza/cmd_coverage_test.go @@ -0,0 +1,222 @@ +package main + +import ( + "bytes" + "encoding/json" + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestBuildCoverageReport_NoEvals(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "alpha"), "alpha") + writeSkill(t, root, filepath.Join("skills", "beta"), "beta") + + report, err := buildCoverageReport(root, nil) + require.NoError(t, err) + + assert.Equal(t, 2, report.TotalSkills) + assert.Equal(t, 0, report.Covered) + assert.Equal(t, 0, report.Partial) + assert.Equal(t, 2, report.Uncovered) + assert.Equal(t, "❌ None", report.Skills[0].Coverage) + assert.Equal(t, "❌ None", report.Skills[1].Coverage) +} + +func TestBuildCoverageReport_PartialAndFull(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "partial-skill"), "partial-skill") + writeSkill(t, root, filepath.Join(".github", "skills", "full-skill"), "full-skill") + + writeEval(t, root, filepath.Join("evals", "partial-skill", "eval.yaml"), ` +skill: partial-skill +tasks: + - tasks/*.yaml +graders: + - type: prompt + name: judge +`) + writeEval(t, root, filepath.Join("custom", "full-skill", "eval.yaml"), ` +skill: full-skill +tasks: + - tasks/a.yaml + - tasks/b.yaml +graders: + - type: prompt + name: judge + - type: file + name: files +`) + + report, err := buildCoverageReport(root, []string{"custom"}) + require.NoError(t, err) + + assert.Equal(t, 2, report.TotalSkills) + assert.Equal(t, 1, report.Covered) + assert.Equal(t, 1, report.Partial) + assert.Equal(t, 0, report.Uncovered) + assert.InDelta(t, 50.0, report.CoveragePct, 0.1) + + rows := map[string]coverageSkillRow{} + 
for _, row := range report.Skills { + rows[row.Skill] = row + } + + assert.Equal(t, "⚠️ Partial", rows["partial-skill"].Coverage) + assert.Equal(t, 1, rows["partial-skill"].Tasks) + assert.Equal(t, []string{"prompt"}, rows["partial-skill"].Graders) + + assert.Equal(t, "✅ Full", rows["full-skill"].Coverage) + assert.Equal(t, 2, rows["full-skill"].Tasks) + assert.Equal(t, []string{"file", "prompt"}, rows["full-skill"].Graders) +} + +func TestBuildCoverageReport_IncludesEvalYML(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "alpha"), "alpha") + writeEval(t, root, filepath.Join("evals", "alpha", "eval.yml"), ` +skill: alpha +tasks: + - tasks/*.yaml +graders: + - type: prompt + name: judge + - type: file + name: files +`) + + report, err := buildCoverageReport(root, nil) + require.NoError(t, err) + require.Len(t, report.Skills, 1) + assert.Equal(t, "✅ Full", report.Skills[0].Coverage) +} + +func TestBuildCoverageReport_WarnsOnParseErrors(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "alpha"), "alpha") + writeEval(t, root, filepath.Join("evals", "alpha", "eval.yaml"), "skill: [bad") + + report, err := buildCoverageReport(root, nil) + require.NoError(t, err, "parse failures should warn, not error") + require.NotNil(t, report) + assert.Equal(t, 1, report.TotalSkills) + assert.Equal(t, 0, report.Covered) +} + +func TestRenderCoverageMarkdown(t *testing.T) { + report := &coverageReport{ + TotalSkills: 2, + Skills: []coverageSkillRow{ + {Skill: "alpha", Tasks: 1, Graders: []string{"prompt"}, Coverage: "⚠️ Partial"}, + {Skill: "beta", Tasks: 2, Graders: []string{"file", "prompt"}, Coverage: "✅ Full"}, + }, + } + + var buf bytes.Buffer + renderCoverageMarkdown(&buf, report) + out := buf.String() + + assert.Contains(t, out, "📊 Eval Coverage Grid") + assert.Contains(t, out, "| Skill | Tasks | Graders | Coverage |") + assert.Contains(t, out, "| alpha | 1 | prompt | ⚠️ Partial |") + assert.Contains(t, out, 
"| beta | 2 | file, prompt | ✅ Full |") +} + +func TestRenderCoverageJSON(t *testing.T) { + report := &coverageReport{ + TotalSkills: 1, + Covered: 1, + Partial: 0, + Uncovered: 0, + CoveragePct: 100, + Skills: []coverageSkillRow{ + {Skill: "alpha", Tasks: 2, Graders: []string{"file", "prompt"}, Coverage: "✅ Full"}, + }, + } + + var buf bytes.Buffer + require.NoError(t, renderCoverageJSON(&buf, report)) + + var decoded map[string]any + require.NoError(t, json.Unmarshal(buf.Bytes(), &decoded)) + assert.Equal(t, float64(1), decoded["total_skills"]) + assert.Contains(t, buf.String(), "\n \"total_skills\"") +} + +func TestCoverageCommand_UnsupportedFormat(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "alpha"), "alpha") + + cmd := newCoverageCommand() + cmd.SetOut(new(bytes.Buffer)) + cmd.SetErr(new(bytes.Buffer)) + cmd.SetArgs([]string{root, "--format", "xml"}) + + err := cmd.Execute() + require.Error(t, err) + assert.Contains(t, err.Error(), `unsupported format "xml"`) +} + +func TestRootCommand_HasCoverageSubcommand(t *testing.T) { + root := newRootCommand() + found := false + for _, c := range root.Commands() { + if c.Name() == "coverage" { + found = true + break + } + } + assert.True(t, found, "root command should have 'coverage' subcommand") +} + +func TestBuildCoverageReport_RejectsFilePath(t *testing.T) { + f := filepath.Join(t.TempDir(), "notadir.txt") + require.NoError(t, os.WriteFile(f, []byte("hello"), 0o644)) + + _, err := buildCoverageReport(f, nil) + require.Error(t, err) + assert.Contains(t, err.Error(), "is not a directory") +} + +func TestBuildCoverageReport_TasksFromCountsAsTasks(t *testing.T) { + root := t.TempDir() + writeSkill(t, root, filepath.Join("skills", "from-skill"), "from-skill") + writeEval(t, root, filepath.Join("evals", "from-skill", "eval.yaml"), ` +skill: from-skill +tasks_from: tasks/ +graders: + - type: prompt + name: judge + - type: diff + name: snapshot +`) + + report, err := 
buildCoverageReport(root, nil) + require.NoError(t, err) + require.Len(t, report.Skills, 1) + assert.Equal(t, "✅ Full", report.Skills[0].Coverage) + assert.Equal(t, 1, report.Skills[0].Tasks) +} + +func writeSkill(t *testing.T, root, relDir, skillName string) { + t.Helper() + dir := filepath.Join(root, relDir) + require.NoError(t, os.MkdirAll(dir, 0o755)) + content := `--- +name: ` + skillName + ` +description: "test skill" +--- +` + require.NoError(t, os.WriteFile(filepath.Join(dir, "SKILL.md"), []byte(content), 0o644)) +} + +func writeEval(t *testing.T, root, relPath, content string) { + t.Helper() + absPath := filepath.Join(root, relPath) + require.NoError(t, os.MkdirAll(filepath.Dir(absPath), 0o755)) + require.NoError(t, os.WriteFile(absPath, []byte(content), 0o644)) +} diff --git a/cmd/waza/root.go b/cmd/waza/root.go index a53d1475..ac2b6e29 100644 --- a/cmd/waza/root.go +++ b/cmd/waza/root.go @@ -40,6 +40,7 @@ performance against predefined test cases.`, cmd.AddCommand(newInitCommand()) cmd.AddCommand(tokens.NewCommand()) cmd.AddCommand(newCompareCommand()) + cmd.AddCommand(newCoverageCommand()) cmd.AddCommand(dev.NewCommand()) cmd.AddCommand(newMetadataCommand(cmd)) cmd.AddCommand(newCheckCommand()) diff --git a/site/src/content/docs/reference/cli.mdx b/site/src/content/docs/reference/cli.mdx index eb348951..5fd81a83 100644 --- a/site/src/content/docs/reference/cli.mdx +++ b/site/src/content/docs/reference/cli.mdx @@ -295,6 +295,44 @@ waza compare gpt4.json sonnet.json opus.json waza compare results-*.json --format json ``` +## waza coverage + +Generate an eval coverage grid for discovered skills. 
+ +```bash +waza coverage [root] +``` + +### Arguments + +| Argument | Description | +|----------|-------------| +| `[root]` | Root directory to scan (default: current directory) | + +### Flags + +| Flag | Description | +|------|-------------| +| `-f, --format` | Output format: `text` (default), `markdown`, `json` | +| `--path` | Additional directories to scan for skills/evals (repeatable) | + +### Coverage Levels + +- **Full**: Skill has an `eval.yaml`/`eval.yml` with tasks (via `tasks:` or `tasks_from:`) and at least 2 distinct grader types. +- **Partial**: Skill has an `eval.yaml`/`eval.yml` but fewer than 2 distinct grader types or no tasks. +- **None**: No `eval.yaml`/`eval.yml` found for the skill (the skill is missing evals; shown as `❌ None` in the grid). + +**Note**: The reported coverage percentage reflects only fully covered skills (`Fully Covered / Total Skills`). + +### Examples + +```bash +waza coverage +waza coverage --format markdown +waza coverage --format json +waza coverage --path custom-evals --path plugins +``` + ## waza suggest Generate suggested eval artifacts from a skill's `SKILL.md` using an LLM.