diff --git a/README.md b/README.md index 632b2f4c..38fb615a 100644 --- a/README.md +++ b/README.md @@ -897,6 +897,57 @@ config: max_attempts: 3 # Retry failed graders up to 3 times (default: 1, no retries) ``` +### Git Resources + +Task inputs can reference git repositories as resources, checked out at a specific commit. This is useful for testing against real codebases without manually preparing fixture directories. + +```yaml +# Task YAML +inputs: + prompt: "Fix the bug in server.go" + workdir: my-repo # agent starts inside this subdirectory + files: + # Existing resource types still work: + - path: helpers/utils.js # file from context_dir + - content: "package main\n..." # inline content + + repos: + # Git resource — checkout a commit from a local repo + - type: worktree # required (currently only worktree is supported) + source: /path/to/local/repo # required for worktree strategy + commit: abc123def + dest: my-repo # optional: subdirectory in workspace +``` + +**`workdir`** (optional): A relative path within the workspace to use as the agent's working directory. When a git resource is checked out into a subdirectory via `dest`, set `workdir` to that subdirectory so the agent starts inside the repo. Must not escape the workspace root. + + **Strategy support:** + +| Strategy | Use Case | Mechanism | +|---|---|---| +| `worktree` | Already inside the target repo; very cheap, no network | `git worktree add` | + +**Fields:** + +| Field | Required | Description | +|---|---|---| +| `type` | Yes | Currently only `worktree` | +| `source` | Yes | Local folder where the git repository resides | +| `commit` | No | Commit SHA, branch, or tag. Defaults to HEAD | +| `dest` | No | Subdirectory name in workspace. 
Omit to use workspace root | + +**Examples:** + +```yaml +# Worktree strategy — cheap checkout from local repo +- type: worktree + source: /path/to/local/repo + commit: feature-branch + dest: feature +``` + +Worktrees are removed automatically with `git worktree remove` when the execution engine shuts down. + When a grader fails, waza will retry the task execution up to `max_attempts` times. The evaluation outcome includes an `attempts` field showing how many executions were needed to pass. This is useful for handling transient failures in external services or non-deterministic grader behavior. **Output:** JSON results include `attempts` per task showing the number of executions performed. diff --git a/cmd/waza/cmd_new_task_test.go b/cmd/waza/cmd_new_task_test.go index c5620585..d03c7c12 100644 --- a/cmd/waza/cmd_new_task_test.go +++ b/cmd/waza/cmd_new_task_test.go @@ -272,6 +272,7 @@ func TestNewTaskFromPromptCommand_EndToEndCreatesTaskFile(t *testing.T) { require.NoError(t, err) expected := &models.TestCase{ + Path: outputPath, DisplayName: "auto-generated", TestID: "auto-generated", Tags: []string{"auto-generated"}, diff --git a/internal/execution/copilot.go b/internal/execution/copilot.go index 18f7e26b..b05e9b4b 100644 --- a/internal/execution/copilot.go +++ b/internal/execution/copilot.go @@ -25,8 +25,14 @@ type CopilotEngine struct { startOnce sync.Once - workspacesMu sync.Mutex - workspaces []string // workspaces to clean up at Shutdown + // resourcesMu protects workspaces and gitResources + resourcesMu sync.Mutex + // workspaces are temp folders - each test run gets a unique one, and it's removed at Shutdown. + workspaces []string + // gitResources that will be cleaned up at Shutdown. + // NOTE: in some cases there is some bookkeeping information (like with git worktrees) so cleanup + // must be called before the workspace is deleted. 
+ gitResources []GitResource // sessions maps session IDs to copilotSessions sessions map[string]CopilotSession @@ -137,7 +143,7 @@ func (e *CopilotEngine) Execute(ctx context.Context, req *ExecutionRequest) (*Ex start := time.Now() - workspaceDir, err := e.setupWorkspace(req.Resources) + workspaceDir, err := e.setupWorkspace(ctx, req.Resources, req.GitResources) if err != nil { return nil, err @@ -289,23 +295,32 @@ func (e *CopilotEngine) doShutdown(ctx context.Context) error { return fmt.Errorf("failed to stop client: %w", err) } - // remove the workspace folders - should be safe now that all the copilot sessions are shut down - // and the tests are complete. - workspaces := func() []string { - e.workspacesMu.Lock() - defer e.workspacesMu.Unlock() + workspaces, gitResources := func() ([]string, []GitResource) { + e.resourcesMu.Lock() + defer e.resourcesMu.Unlock() + worktrees := e.gitResources + e.gitResources = nil + workspaces := e.workspaces e.workspaces = nil - return workspaces + + return workspaces, worktrees }() + // Clean up worktrees before removing workspaces (worktrees may be inside workspace dirs) + for _, wt := range gitResources { + if err := wt.Cleanup(ctx); err != nil { + slog.Warn("failed to cleanup git resource", "error", err) + } + } + + // remove the workspace folders - should be safe now that all the copilot sessions are shut down + // and the tests are complete. for _, ws := range workspaces { - if ws != "" { - if err := os.RemoveAll(ws); err != nil { - // errors here probably indicate some issue with our code continuing to lock files - // even after tests have completed... - slog.Warn("failed to cleanup stale workspace", "path", ws, "error", err) - } + if err := os.RemoveAll(ws); err != nil { + // errors here probably indicate some issue with our code continuing to lock files + // even after tests have completed... 
+ slog.Warn("failed to cleanup stale workspace", "path", ws, "error", err) } } @@ -376,22 +391,32 @@ func (*CopilotEngine) getSkillDirs(cwd string, req *ExecutionRequest) []string { return skillDirs } -func (e *CopilotEngine) setupWorkspace(resources []ResourceFile) (string, error) { +func (e *CopilotEngine) setupWorkspace(ctx context.Context, resources []ResourceFile, gitResources []models.GitResource) (string, error) { workspaceDir, err := os.MkdirTemp("", "waza-*") if err != nil { return "", fmt.Errorf("failed to create temp workspace: %w", err) } - e.workspacesMu.Lock() + e.resourcesMu.Lock() e.workspaces = append(e.workspaces, workspaceDir) - e.workspacesMu.Unlock() + e.resourcesMu.Unlock() // Write resource files to workspace if err := setupWorkspaceResources(workspaceDir, resources); err != nil { return "", fmt.Errorf("failed to setup resources at workspace %s: %w", workspaceDir, err) } + wts, err := CloneGitResources(ctx, gitResources, workspaceDir) + if err != nil { + return "", err + } + if len(wts) > 0 { + e.resourcesMu.Lock() + e.gitResources = append(e.gitResources, wts...) + e.resourcesMu.Unlock() + } + return workspaceDir, nil } diff --git a/internal/execution/engine.go b/internal/execution/engine.go index 4e2d29cf..7cb97cae 100644 --- a/internal/execution/engine.go +++ b/internal/execution/engine.go @@ -2,6 +2,8 @@ package execution import ( "context" + "fmt" + "path/filepath" "strings" "time" @@ -30,10 +32,11 @@ type AgentEngine interface { // ExecutionRequest represents a test execution request type ExecutionRequest struct { - ModelID string - Message string - Context map[string]any - Resources []ResourceFile + ModelID string + Message string + Context map[string]any + Resources []ResourceFile + GitResources []models.GitResource SessionID string SkillName string @@ -54,6 +57,24 @@ type ResourceFile struct { Content []byte } +// ResolveWorkDir returns the effective working directory for the agent session. 
// ResolveWorkDir returns the effective working directory for the agent session.
//
// If workDir is empty, the workspace root is returned unchanged. Otherwise
// workDir is joined onto workspaceDir and the result is checked to make sure
// it still lies inside the workspace; a path that climbs out of it (for
// example "../../etc" or "a/../../outside") is rejected with an error.
//
// The check is purely lexical: symlinks are not resolved and the directory is
// not required to exist.
func ResolveWorkDir(workspaceDir, workDir string) (string, error) {
	if workDir == "" {
		return workspaceDir, nil
	}

	resolved := filepath.Join(workspaceDir, workDir)

	rel, err := filepath.Rel(workspaceDir, resolved)
	if err != nil {
		return "", fmt.Errorf("workdir %q escapes the workspace", workDir)
	}

	// Only a leading ".." path *element* means the path left the workspace.
	// A plain string-prefix test would also reject legitimate directory names
	// that merely start with two dots, such as "..cache" or "...".
	if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return "", fmt.Errorf("workdir %q escapes the workspace", workDir)
	}

	return resolved, nil
}
@@ -172,21 +173,28 @@ func TestCopilotEngine_Shutdown_Idempotent(t *testing.T) { } func TestCopilotEngine_Shutdown_CleansWorkspace(t *testing.T) { + ctrl := gomock.NewController(t) + gr := NewMockGitResource(ctrl) + gr.EXPECT().Cleanup(gomock.Any()) + engine := NewCopilotEngineBuilder("test-model", nil).Build() // Simulate a workspace existing (without running the full SDK) - tmpDir := t.TempDir() - engine.workspacesMu.Lock() - engine.workspaces = append(engine.workspaces, tmpDir) - engine.workspacesMu.Unlock() + tmpWorkspaceDir := t.TempDir() + + engine.resourcesMu.Lock() + engine.workspaces = append(engine.workspaces, tmpWorkspaceDir) + engine.gitResources = append(engine.gitResources, gr) + engine.resourcesMu.Unlock() err := engine.Shutdown(context.Background()) require.NoError(t, err) // After shutdown, workspace should be cleared - engine.workspacesMu.Lock() - defer engine.workspacesMu.Unlock() + engine.resourcesMu.Lock() + defer engine.resourcesMu.Unlock() require.Empty(t, engine.workspaces) + require.Empty(t, engine.gitResources) } func TestCopilotEngine_Shutdown_WithCancelledContext(t *testing.T) { diff --git a/internal/execution/execution_mocks.go b/internal/execution/execution_mocks.go new file mode 100644 index 00000000..0d2b71f0 --- /dev/null +++ b/internal/execution/execution_mocks.go @@ -0,0 +1,55 @@ +// Code generated by MockGen. DO NOT EDIT. +// Source: github.com/microsoft/waza/internal/execution (interfaces: GitResource) +// +// Generated by this command: +// +// mockgen -package execution -destination execution_mocks.go . GitResource +// + +// Package execution is a generated GoMock package. +package execution + +import ( + context "context" + reflect "reflect" + + gomock "go.uber.org/mock/gomock" +) + +// MockGitResource is a mock of GitResource interface. 
+type MockGitResource struct { + ctrl *gomock.Controller + recorder *MockGitResourceMockRecorder + isgomock struct{} +} + +// MockGitResourceMockRecorder is the mock recorder for MockGitResource. +type MockGitResourceMockRecorder struct { + mock *MockGitResource +} + +// NewMockGitResource creates a new mock instance. +func NewMockGitResource(ctrl *gomock.Controller) *MockGitResource { + mock := &MockGitResource{ctrl: ctrl} + mock.recorder = &MockGitResourceMockRecorder{mock} + return mock +} + +// EXPECT returns an object that allows the caller to indicate expected use. +func (m *MockGitResource) EXPECT() *MockGitResourceMockRecorder { + return m.recorder +} + +// Cleanup mocks base method. +func (m *MockGitResource) Cleanup(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Cleanup", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// Cleanup indicates an expected call of Cleanup. +func (mr *MockGitResourceMockRecorder) Cleanup(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Cleanup", reflect.TypeOf((*MockGitResource)(nil).Cleanup), ctx) +} diff --git a/internal/execution/generate.go b/internal/execution/generate.go index b0179c86..c8d8ab2b 100644 --- a/internal/execution/generate.go +++ b/internal/execution/generate.go @@ -1,3 +1,4 @@ package execution //go:generate go tool mockgen -package execution -destination copilot_client_wrapper_mocks_test.go . CopilotSession,CopilotClient +//go:generate go tool mockgen -package execution -destination execution_mocks.go . GitResource diff --git a/internal/execution/git.go b/internal/execution/git.go new file mode 100644 index 00000000..c62a63c8 --- /dev/null +++ b/internal/execution/git.go @@ -0,0 +1,123 @@ +package execution + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + + "github.com/microsoft/waza/internal/models" +) + +// GitWorkTree tracks a worktree created during task execution for cleanup. 
+type GitWorkTree struct { + WorktreePath string + RepoDir string // the repo from which the worktree was created +} + +func (gwt GitWorkTree) Cleanup(ctx context.Context) error { + return gitWorktreeRemove(ctx, gwt) +} + +// CloneGitResource checks out a git resource into the workspace directory. +// It returns a GitWorktreeInfo for cleanup. +func CloneGitResource(ctx context.Context, gitRes models.GitResource, workspaceDir string) (GitResource, error) { + switch gitRes.Type { + case models.GitTypeWorktree: + targetDir := workspaceDir + + if gitRes.RelativeDest != "" { + targetDir = filepath.Join(workspaceDir, gitRes.RelativeDest) + } + + // Only worktree is supported; Validate() already enforces this. + return gitWorkTreeAdd(ctx, gitRes.Commit, gitRes.Source, targetDir) + default: + return nil, fmt.Errorf("invalid repo type %q", gitRes.Type) + } +} + +type GitResource interface { + Cleanup(ctx context.Context) error +} + +// CloneGitResources materializes all git resources into the workspace. +func CloneGitResources(ctx context.Context, gitResources []models.GitResource, workspaceDir string) (createdResources []GitResource, err error) { + defer func() { + if err != nil { + // we've got to unroll all the worktrees we created - partial creation isn't acceptable + for _, res := range createdResources { + // best effort. + _ = res.Cleanup(ctx) + } + + createdResources = nil + } + }() + + for i := range gitResources { + gr, err := CloneGitResource(ctx, gitResources[i], workspaceDir) + if err != nil { + return createdResources, fmt.Errorf("failed to clone git resource: %w", err) + } + if gr != nil { + createdResources = append(createdResources, gr) + } + } + + return createdResources, nil +} + +// gitWorkTreeAdd runs 'git worktree add', creating a git worktree (an incredibly cheap copy) of a local repo to +// another local path on disk. Note, this requires a local clone of a git repo to work. 
+func gitWorkTreeAdd(ctx context.Context, commit string, repoDir string, targetDir string) (*GitWorkTree, error) { + args := []string{"worktree", "add"} + + if commit != "" { + // git worktree add + args = append(args, targetDir, commit) + } else { + // git worktree add --detach (uses HEAD) + args = append(args, "--detach", targetDir) + } + + if _, err := runGitCommand(ctx, repoDir, args...); err != nil { + return nil, fmt.Errorf("git worktree add failed: %w", err) + } + + return &GitWorkTree{ + WorktreePath: targetDir, + RepoDir: repoDir, + }, nil +} + +// gitWorktreeRemove removes a worktree from a git repo, using 'git worktree remove'. +func gitWorktreeRemove(ctx context.Context, wt GitWorkTree) error { + _, err := runGitCommand(ctx, wt.RepoDir, "worktree", "remove", "--force", wt.WorktreePath) + + if err != nil { + return fmt.Errorf("git worktree remove %q failed: %w", wt.WorktreePath, err) + } + + return nil +} + +// runGitCommand executes a git command in the specified directory and returns stdout. +func runGitCommand(ctx context.Context, dir string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = dir + + // From the docs: if set to false, git will not prompt on the terminal, like for credentials. 
+ cmd.Env = append(os.Environ(), "GIT_TERMINAL_PROMPT=0") + + out, err := cmd.CombinedOutput() + outStr := strings.TrimSpace(string(out)) + + if err != nil { + return "", fmt.Errorf("%w: %s", err, outStr) + } + + return outStr, nil +} diff --git a/internal/execution/git_test.go b/internal/execution/git_test.go new file mode 100644 index 00000000..0a8f0a37 --- /dev/null +++ b/internal/execution/git_test.go @@ -0,0 +1,224 @@ +package execution + +import ( + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/microsoft/waza/internal/models" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCloneGitResource_WorktreeType(t *testing.T) { + repoDir, commitSHA := mustCreateRepo(t) + workspaceDir := t.TempDir() + destName := "wt-test" + + gitRes := &models.GitResource{ + Commit: commitSHA, + Type: models.GitTypeWorktree, + Source: repoDir, + RelativeDest: destName, + } + + ctx := context.Background() + res, err := CloneGitResource(ctx, *gitRes, workspaceDir) + require.NoError(t, err) + require.NotNil(t, res) + + // Verify the worktree file exists + targetDir := filepath.Join(workspaceDir, destName) + content, err := os.ReadFile(filepath.Join(targetDir, "hello.txt")) + require.NoError(t, err, "reading file in worktree") + assert.Equal(t, "hello world", string(content)) + + // Cleanup + err = res.Cleanup(context.Background()) + require.NoError(t, err) + + // Verify worktree dir is removed + _, err = os.Stat(targetDir) + assert.True(t, os.IsNotExist(err), "worktree directory should have been removed") +} + +func TestCloneGitResource_WorktreeDetachHEAD(t *testing.T) { + repoDir, _ := mustCreateRepo(t) + workspaceDir := t.TempDir() + destName := "wt-detach" + + gitRes := &models.GitResource{ + Type: models.GitTypeWorktree, + Source: repoDir, + RelativeDest: destName, + } + + ctx := context.Background() + res, err := CloneGitResource(ctx, *gitRes, workspaceDir) + require.NoError(t, err, "CloneGitResource 
(detach)") + + targetDir := filepath.Join(workspaceDir, destName) + _, err = os.Stat(filepath.Join(targetDir, "hello.txt")) + require.NoError(t, err, "expected hello.txt in worktree") + + // Cleanup + err = res.Cleanup(context.Background()) + require.NoError(t, err) +} + +func TestCloneGitResource_UnsupportedType(t *testing.T) { + _, commitSHA := mustCreateRepo(t) + workspaceDir := t.TempDir() + + gitRes := &models.GitResource{ + Commit: commitSHA, + Type: "clone", + Source: "/tmp/repo", + RelativeDest: "clone-test", + } + + ctx := context.Background() + _, err := CloneGitResource(ctx, *gitRes, workspaceDir) + require.Error(t, err, "expected unsupported type to be rejected") + require.Contains(t, err.Error(), "invalid repo type") +} + +func TestCloneGitResource_SourceDoesNotExist(t *testing.T) { + workspaceDir := t.TempDir() + missingDir := filepath.Join(t.TempDir(), "missing-repo") + + gitRes := &models.GitResource{ + Type: models.GitTypeWorktree, + Source: missingDir, + RelativeDest: "wt-test", + } + + _, err := CloneGitResource(context.Background(), *gitRes, workspaceDir) + require.Error(t, err) + assert.Contains(t, err.Error(), "no such file or directory") +} + +func TestCloneGitResource_SourceIsNotDirectory(t *testing.T) { + workspaceDir := t.TempDir() + notDir := filepath.Join(t.TempDir(), "repo.txt") + require.NoError(t, os.WriteFile(notDir, []byte("not a dir"), 0o644)) + + gitRes := &models.GitResource{ + Type: models.GitTypeWorktree, + Source: notDir, + RelativeDest: "wt-test", + } + + _, err := CloneGitResource(context.Background(), *gitRes, workspaceDir) + require.Error(t, err) + assert.Contains(t, err.Error(), "not a directory") +} + +func TestCloneGitResource_SourceIsNotGitRepo(t *testing.T) { + workspaceDir := t.TempDir() + nonRepoDir := t.TempDir() + + gitRes := &models.GitResource{ + Type: models.GitTypeWorktree, + Source: nonRepoDir, + RelativeDest: "wt-test", + } + + _, err := CloneGitResource(context.Background(), *gitRes, workspaceDir) + 
require.Error(t, err) + assert.Contains(t, err.Error(), "not a git repository") +} + +func TestResolveWorkDir(t *testing.T) { + tests := []struct { + name string + workDir string + want string + wantErr bool + }{ + { + name: "empty returns workspace root", + workDir: "", + want: "/workspace", + }, + { + name: "subdirectory", + workDir: "my-repo", + want: "/workspace/my-repo", + }, + { + name: "nested subdirectory", + workDir: "repos/my-repo", + want: "/workspace/repos/my-repo", + }, + { + name: "traversal rejected", + workDir: "../../etc", + wantErr: true, + }, + { + name: "dot-dot in middle rejected", + workDir: "a/../../outside", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := ResolveWorkDir("/workspace", tt.workDir) + if tt.wantErr { + require.Error(t, err, "expected error") + return + } + require.NoError(t, err) + assert.Equal(t, tt.want, got) + }) + } +} + +func TestCreateGitResources(t *testing.T) { + workspaceDir := t.TempDir() + repoDir, _ := mustCreateRepo(t) + + resources := []models.GitResource{ + {Commit: "", Type: models.GitTypeWorktree, Source: repoDir, RelativeDest: "dest"}, + + // will fail since we already have a worktree at 'dest' + {Commit: "", Type: models.GitTypeWorktree, Source: repoDir, RelativeDest: "dest"}, + } + + createdResources, err := CloneGitResources(context.Background(), resources, workspaceDir) + require.Error(t, err) + require.Empty(t, createdResources) + + require.NoDirExists(t, filepath.Join(workspaceDir, "dest")) +} + +// mustCreateRepo creates a repo with a single commit, with 'test.txt' in the root (contents: "hello world") +func mustCreateRepo(t *testing.T) (repoDir string, headCommitSHA string) { + repoDir = t.TempDir() + + _, err := runGitCommand(context.Background(), repoDir, "init") + require.NoError(t, err) + + err = os.WriteFile(filepath.Join(repoDir, "hello.txt"), []byte("hello world"), 0644) + require.NoError(t, err) + + _, err = 
runGitCommand(context.Background(), repoDir, "add", "hello.txt") + require.NoError(t, err) + + _, err = runGitCommand(context.Background(), repoDir, + "-c", "user.name=waza", + "-c", "user.email=waza", + "commit", + "-m", "first and only file", "hello.txt") + require.NoError(t, err) + + // Get commit SHA + output, err := runGitCommand(context.Background(), repoDir, "rev-parse", "HEAD") + require.NoError(t, err) + + return repoDir, strings.TrimSpace(output) +} diff --git a/internal/execution/mock.go b/internal/execution/mock.go index 482fe143..47bd5c45 100644 --- a/internal/execution/mock.go +++ b/internal/execution/mock.go @@ -3,6 +3,7 @@ package execution import ( "context" "fmt" + "log/slog" "os" "sync" "sync/atomic" @@ -14,10 +15,11 @@ import ( // MockEngine is a simple mock implementation for testing type MockEngine struct { - modelID string - workspace string - mtx *sync.Mutex - initCalled atomic.Bool + modelID string + workspace string + gitResources []GitResource + mtx *sync.Mutex + initCalled atomic.Bool } // NewMockEngine creates a new mock engine @@ -64,6 +66,13 @@ func (m *MockEngine) Execute(ctx context.Context, req *ExecutionRequest) (*Execu return nil, fmt.Errorf("failed to setup mock workspace resources: %w", err) } + // Materialize git resources + wts, err := CloneGitResources(ctx, req.GitResources, m.workspace) + if err != nil { + return nil, fmt.Errorf("failed to materialize git resource in mock workspace: %w", err) + } + m.gitResources = append(m.gitResources, wts...) 
// GitType defines how a git resource is acquired.
type GitType string

// GitTypeWorktree acquires the resource with `git worktree add`, a cheap
// checkout that works from a local repository that's already on disk.
const GitTypeWorktree GitType = "worktree"

// AllGitStrategies lists every supported GitType as a plain string.
func AllGitStrategies() []string {
	strategies := []string{string(GitTypeWorktree)}
	return strategies
}

// GitResource specifies a git repository at a particular commit as a task input.
type GitResource struct {
	// Commit is the commit-ish the checkout starts at.
	// An empty string defaults to HEAD.
	Commit string `yaml:"commit,omitempty" json:"commit,omitempty"`

	// Type selects the acquisition mechanism.
	Type GitType `yaml:"type" json:"type"`

	// Source varies, depending on the type.
	//   - For 'worktree', Source is the folder where the git repository resides.
	//     NOTE(review): the README lists source as required for worktree —
	//     confirm whether an empty value is meant to be accepted.
	Source string `yaml:"source" json:"source"`

	// RelativeDest is the destination, relative to the workspace created for
	// the testcase run. When empty the checkout is created at the workspace
	// itself instead of in a subdirectory.
	RelativeDest string `yaml:"dest,omitempty" json:"dest,omitempty"`
}

// ResourceRef points to a file path and/or inline content.
type ResourceRef struct {
	Location string `yaml:"path,omitempty" json:"location,omitempty"`
	Body     string `yaml:"content,omitempty" json:"body,omitempty"`
}

// Validate reports an error when neither a path nor inline content is set.
// Setting both is accepted.
func (r *ResourceRef) Validate() error {
	hasPath, hasContent := r.Location != "", r.Body != ""
	if hasPath || hasContent {
		return nil
	}

	return fmt.Errorf("resource must specify one of: path or content")
}
diff --git a/internal/models/testcase_test.go b/internal/models/testcase_test.go index 520239ae..7e7f1350 100644 --- a/internal/models/testcase_test.go +++ b/internal/models/testcase_test.go @@ -4,8 +4,104 @@ import ( "os" "path/filepath" "testing" + + "github.com/stretchr/testify/require" ) +func TestLoadTestCase_GitResource(t *testing.T) { + tests := []struct { + name string + yaml string + wantErr bool + wantGitRes bool + wantStrat GitType + }{ + { + name: "git resource with worktree strategy", + yaml: mustReadTestFile(t, "git-resources-task-example.yaml"), + wantGitRes: true, + wantStrat: GitTypeWorktree, + }, + { + name: "no git resource - path only", + yaml: mustReadTestFile(t, "file-resources-task-example.yaml"), + wantGitRes: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + p := filepath.Join(dir, "tc.yaml") + if err := os.WriteFile(p, []byte(tt.yaml), 0o644); err != nil { + t.Fatalf("write file: %v", err) + } + + tc, err := LoadTestCase(p) + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + if err != nil { + t.Fatalf("LoadTestCase: %v", err) + } + + hasGit := false + for _, repo := range tc.Stimulus.Repos { + if repo.Type != "" { + hasGit = true + if repo.Type != tt.wantStrat { + t.Errorf("Type = %q, want %q", repo.Type, tt.wantStrat) + } + } + } + if hasGit != tt.wantGitRes { + t.Errorf("hasGit = %v, want %v", hasGit, tt.wantGitRes) + } + }) + } +} + +func TestResourceRef_Validate(t *testing.T) { + tests := []struct { + name string + ref ResourceRef + wantErr bool + }{ + { + name: "valid path", + ref: ResourceRef{Location: "file.txt"}, + }, + { + name: "valid content", + ref: ResourceRef{Body: "inline"}, + }, + { + name: "empty resource", + ref: ResourceRef{}, + wantErr: true, + }, + { + name: "path and content", + ref: ResourceRef{Location: "f.txt", Body: "inline"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + err := 
tt.ref.Validate() + if tt.wantErr && err == nil { + t.Error("expected error, got nil") + } + if !tt.wantErr && err != nil { + t.Errorf("unexpected error: %v", err) + } + }) + } +} + func TestLoadTestCase_ShouldTriggerField(t *testing.T) { tests := []struct { name string @@ -14,36 +110,20 @@ func TestLoadTestCase_ShouldTriggerField(t *testing.T) { wantVal bool }{ { - name: "should_trigger true", - yaml: `id: tc-trigger-true -name: Trigger True -inputs: - prompt: "test prompt" -expected: - should_trigger: true -`, + name: "should_trigger true", + yaml: mustReadTestFile(t, "trigger-true-task-example.yaml"), wantNil: false, wantVal: true, }, { - name: "should_trigger false", - yaml: `id: tc-trigger-false -name: Trigger False -inputs: - prompt: "test prompt" -expected: - should_trigger: false -`, + name: "should_trigger false", + yaml: mustReadTestFile(t, "trigger-false-task-example.yaml"), wantNil: false, wantVal: false, }, { - name: "should_trigger omitted", - yaml: `id: tc-trigger-omit -name: Trigger Omitted -inputs: - prompt: "test prompt" -`, + name: "should_trigger omitted", + yaml: mustReadTestFile(t, "trigger-omit-task-example.yaml"), wantNil: true, }, } @@ -77,3 +157,11 @@ inputs: }) } } + +// path is a path within 'testdata' +func mustReadTestFile(t *testing.T, path string) string { + buff, err := os.ReadFile(filepath.Join("testdata", path)) + require.NoError(t, err) + + return string(buff) +} diff --git a/internal/models/testdata/file-resources-task-example.yaml b/internal/models/testdata/file-resources-task-example.yaml new file mode 100644 index 00000000..f72e242b --- /dev/null +++ b/internal/models/testdata/file-resources-task-example.yaml @@ -0,0 +1,6 @@ +id: tc-path +name: Path Only +inputs: + prompt: "test prompt" + files: + - path: helpers.js diff --git a/internal/models/testdata/git-resources-task-example.yaml b/internal/models/testdata/git-resources-task-example.yaml new file mode 100644 index 00000000..a0f45d20 --- /dev/null +++ 
b/internal/models/testdata/git-resources-task-example.yaml @@ -0,0 +1,9 @@ +id: tc-git-wt +name: Git Worktree +inputs: + prompt: "test prompt" + repos: + - type: worktree + source: /tmp/repo + commit: main + dest: wt-di diff --git a/internal/models/testdata/trigger-false-task-example.yaml b/internal/models/testdata/trigger-false-task-example.yaml new file mode 100644 index 00000000..ba207e54 --- /dev/null +++ b/internal/models/testdata/trigger-false-task-example.yaml @@ -0,0 +1,6 @@ +id: tc-trigger-false +name: Trigger False +inputs: + prompt: "test prompt" +expected: + should_trigger: false diff --git a/internal/models/testdata/trigger-omit-task-example.yaml b/internal/models/testdata/trigger-omit-task-example.yaml new file mode 100644 index 00000000..16c4ccb0 --- /dev/null +++ b/internal/models/testdata/trigger-omit-task-example.yaml @@ -0,0 +1,4 @@ +id: tc-trigger-omit +name: Trigger Omitted +inputs: + prompt: "test prompt" diff --git a/internal/models/testdata/trigger-true-task-example.yaml b/internal/models/testdata/trigger-true-task-example.yaml new file mode 100644 index 00000000..e7d33925 --- /dev/null +++ b/internal/models/testdata/trigger-true-task-example.yaml @@ -0,0 +1,6 @@ +id: tc-trigger-true +name: Trigger True +inputs: + prompt: "test prompt" +expected: + should_trigger: true diff --git a/internal/orchestration/filter.go b/internal/orchestration/filter.go index 42dd8911..b6349fa0 100644 --- a/internal/orchestration/filter.go +++ b/internal/orchestration/filter.go @@ -3,8 +3,6 @@ package orchestration import ( "fmt" "path/filepath" - - "github.com/microsoft/waza/internal/models" ) // FilterTestCases returns the subset of testCases based on whether it matches tags or task display name, or task id glob patterns. @@ -13,12 +11,12 @@ import ( // // If taskPatterns and tagPatterns are specified the result is the intersection of the matches between them. // If both taskPatterns and tagPatterns are empty, all test cases are returned. 
-func FilterTestCases(testCases []*models.TestCase, taskPatterns []string, tagPatterns []string) ([]*models.TestCase, error) { +func FilterTestCases(testCases []*ExecutableTestCase, taskPatterns []string, tagPatterns []string) ([]*ExecutableTestCase, error) { if len(taskPatterns) == 0 && len(tagPatterns) == 0 { return testCases, nil } - var matched []*models.TestCase + var matched []*ExecutableTestCase for _, tc := range testCases { taskNameMatch, err := matchesTaskOrDisplayName(tc, taskPatterns) @@ -42,7 +40,7 @@ func FilterTestCases(testCases []*models.TestCase, taskPatterns []string, tagPat } // matchesTaskOrDisplayName reports whether a test case's DisplayName or TestID matches any pattern. -func matchesTaskOrDisplayName(tc *models.TestCase, patterns []string) (bool, error) { +func matchesTaskOrDisplayName(tc *ExecutableTestCase, patterns []string) (bool, error) { if len(patterns) == 0 { return true, nil } @@ -71,7 +69,7 @@ func matchesTaskOrDisplayName(tc *models.TestCase, patterns []string) (bool, err return false, nil } -func matchesTags(tc *models.TestCase, patterns []string) (bool, error) { +func matchesTags(tc *ExecutableTestCase, patterns []string) (bool, error) { if len(patterns) == 0 { return true, nil } diff --git a/internal/orchestration/filter_test.go b/internal/orchestration/filter_test.go index 117cd83c..83259bb9 100644 --- a/internal/orchestration/filter_test.go +++ b/internal/orchestration/filter_test.go @@ -8,12 +8,12 @@ import ( "github.com/stretchr/testify/require" ) -func sampleCases() []*models.TestCase { - return []*models.TestCase{ - {TestID: "tc-001", DisplayName: "Create a REST API", Tags: []string{"fast", "red"}}, - {TestID: "tc-002", DisplayName: "Fix login bug", Tags: []string{"fast", "blue"}}, - {TestID: "tc-003", DisplayName: "Create a CLI tool", Tags: []string{"medium", "green"}}, - {TestID: "tc-004", DisplayName: "Optimize SQL query", Tags: []string{"slow", "chartreuse"}}, +func sampleCases() []*ExecutableTestCase { + return 
[]*ExecutableTestCase{ + {TestCase: &models.TestCase{TestID: "tc-001", DisplayName: "Create a REST API", Tags: []string{"fast", "red"}}}, + {TestCase: &models.TestCase{TestID: "tc-002", DisplayName: "Fix login bug", Tags: []string{"fast", "blue"}}}, + {TestCase: &models.TestCase{TestID: "tc-003", DisplayName: "Create a CLI tool", Tags: []string{"medium", "green"}}}, + {TestCase: &models.TestCase{TestID: "tc-004", DisplayName: "Optimize SQL query", Tags: []string{"slow", "chartreuse"}}}, } } @@ -156,7 +156,7 @@ func TestFilterTestCases_TagsAndTasks_Intersection(t *testing.T) { } } -func testCaseIDs(testCases []*models.TestCase) []string { +func testCaseIDs(testCases []*ExecutableTestCase) []string { var ids []string for _, tc := range testCases { ids = append(ids, tc.TestID) diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index b61636fb..6b7a4732 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -2,9 +2,11 @@ package orchestration import ( "context" + "errors" "fmt" "math" "os" + "os/exec" "path/filepath" "sort" "strings" @@ -217,7 +219,7 @@ func (r *TestRunner) runNormalBenchmark(ctx context.Context) (*models.Evaluation } // Load test cases - testCases, err := r.loadTestCases() + testCases, err := r.loadTestCases(ctx) if err != nil { return nil, fmt.Errorf("failed to load test cases: %w", err) } @@ -473,16 +475,38 @@ func (r *TestRunner) printSkillImpactReport(withSkills, withoutSkills *models.Ev fmt.Println("════════════════════════════════════════════════════════════════") } -func (r *TestRunner) loadTestCases() ([]*models.TestCase, error) { +func (r *TestRunner) loadTestCases(ctx context.Context) ([]*ExecutableTestCase, error) { spec := r.cfg.Spec() + var testCases []*models.TestCase + // CSV dataset path: generate tasks from CSV rows - if spec.TasksFrom != "" { - return r.loadTestCasesFromCSV() + testCases, err := func() ([]*models.TestCase, error) { + if spec.TasksFrom != "" { + return 
r.loadTestCasesFromCSV() + } + return r.loadTestCasesFromFiles() + }() + + if err != nil { + return nil, err + } + + var executableTestCases []*ExecutableTestCase + var errs []error + + for _, tc := range testCases { + etc, err := NewExecutableTestCase(ctx, tc, r.cfg.FixtureDir()) + + if err != nil { + errs = append(errs, err) + continue + } + + executableTestCases = append(executableTestCases, etc) } - // Fall through to existing Tasks []string behavior - return r.loadTestCasesFromFiles() + return executableTestCases, errors.Join(errs...) } // loadTestCasesFromCSV generates in-memory TestCases from CSV rows. @@ -681,7 +705,7 @@ func (r *TestRunner) validateRequiredSkills() error { return nil } -func (r *TestRunner) runSequential(ctx context.Context, testCases []*models.TestCase) []models.TestOutcome { +func (r *TestRunner) runSequential(ctx context.Context, testCases []*ExecutableTestCase) []models.TestOutcome { outcomes := make([]models.TestOutcome, 0, len(testCases)) spec := r.cfg.Spec() @@ -731,8 +755,9 @@ func (r *TestRunner) runSequential(ctx context.Context, testCases []*models.Test }) taskStart := time.Now() + outcome, wasCached := r.runTest(ctx, tc, i+1, len(testCases)) - r.writeTaskTranscript(tc, outcome, taskStart) + r.writeTaskTranscript(tc.TestCase, outcome, taskStart) outcomes = append(outcomes, outcome) // Run after_task hooks @@ -766,7 +791,7 @@ func (r *TestRunner) runSequential(ctx context.Context, testCases []*models.Test return outcomes } -func (r *TestRunner) runConcurrent(ctx context.Context, testCases []*models.TestCase) []models.TestOutcome { +func (r *TestRunner) runConcurrent(ctx context.Context, testCases []*ExecutableTestCase) []models.TestOutcome { // Simple concurrent implementation spec := r.cfg.Spec() workers := spec.Config.Workers @@ -786,7 +811,7 @@ func (r *TestRunner) runConcurrent(ctx context.Context, testCases []*models.Test for i, tc := range testCases { wg.Add(1) - go func(idx int, test *models.TestCase) { + go func(idx int, 
test *ExecutableTestCase) { defer wg.Done() semaphore <- struct{}{} @@ -822,7 +847,7 @@ func (r *TestRunner) runConcurrent(ctx context.Context, testCases []*models.Test taskStart := time.Now() outcome, wasCached := r.runTest(ctx, test, idx+1, len(testCases)) - r.writeTaskTranscript(test, outcome, taskStart) + r.writeTaskTranscript(test.TestCase, outcome, taskStart) resultChan <- result{index: idx, outcome: outcome} // Run after_task hooks @@ -867,12 +892,12 @@ func (r *TestRunner) runConcurrent(ctx context.Context, testCases []*models.Test return results } -func (r *TestRunner) runTest(ctx context.Context, tc *models.TestCase, testNum, totalTests int) (models.TestOutcome, bool) { +func (r *TestRunner) runTest(ctx context.Context, tc *ExecutableTestCase, testNum, totalTests int) (models.TestOutcome, bool) { spec := r.cfg.Spec() // Check cache if enabled if r.cache != nil { - cacheKey, err := cache.CacheKey(spec, tc, r.cfg.FixtureDir()) + cacheKey, err := cache.CacheKey(spec, tc.TestCase, r.cfg.FixtureDir()) if err == nil { if cachedOutcome, found := r.cache.Get(cacheKey); found { // Return cached outcome with cached flag @@ -904,7 +929,7 @@ func (r *TestRunner) writeTaskTranscript(tc *models.TestCase, outcome models.Tes } } -func (r *TestRunner) runTestUncached(ctx context.Context, tc *models.TestCase, testNum, totalTests int) models.TestOutcome { +func (r *TestRunner) runTestUncached(ctx context.Context, tc *ExecutableTestCase, testNum, totalTests int) models.TestOutcome { spec := r.cfg.Spec() runsPerTest := spec.Config.TrialsPerTask maxAttempts := spec.Config.MaxAttempts @@ -926,7 +951,8 @@ func (r *TestRunner) runTestUncached(ctx context.Context, tc *models.TestCase, t var run models.RunResult for attempt := 1; attempt <= maxAttempts; attempt++ { - run = r.executeRun(ctx, tc, runNum) + run = r.executeRun(ctx, tc, runNum) // NOTE: make sure the outer 'run' var is assigned to! 
+ run.Attempts = attempt // If all graders passed or this is an infrastructure error, stop retrying @@ -999,10 +1025,9 @@ func overallStatus(runs []models.RunResult) models.Status { return status } -func (r *TestRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum int) models.RunResult { +func (r *TestRunner) executeRun(ctx context.Context, tc *ExecutableTestCase, runNum int) models.RunResult { startTime := time.Now() - // Prepare execution request req := r.buildExecutionRequest(tc) // Emit agent prompt event before execution @@ -1040,14 +1065,14 @@ func (r *TestRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum } // Build validation context - vCtx := r.buildGraderContext(tc, resp) + vCtx := r.buildGraderContext(tc.TestCase, resp) var gradersResults map[string]models.GraderResults if r.skipGraders { gradersResults = make(map[string]models.GraderResults) } else { var err error - gradersResults, err = r.runGraders(ctx, tc, vCtx) + gradersResults, err = r.runGraders(ctx, tc.TestCase, vCtx) if err != nil { return models.RunResult{ @@ -1117,10 +1142,8 @@ func (r *TestRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum } } -func (r *TestRunner) buildExecutionRequest(tc *models.TestCase) *execution.ExecutionRequest { - // Load resource files - resources := r.loadResources(tc) - +func (r *TestRunner) buildExecutionRequest(tc *ExecutableTestCase) *execution.ExecutionRequest { + // Load resource files (file and inline resources) spec := r.cfg.Spec() timeout := spec.Config.TimeoutSec if tc.TimeoutSec != nil { @@ -1131,25 +1154,61 @@ func (r *TestRunner) buildExecutionRequest(tc *models.TestCase) *execution.Execu resolvedSkillPaths := utils.ResolvePaths(spec.Config.SkillPaths, r.cfg.SpecDir()) return &execution.ExecutionRequest{ - Message: tc.Stimulus.Message, - Context: tc.Stimulus.Metadata, - Resources: resources, - SkillName: spec.SkillName, - SkillPaths: resolvedSkillPaths, - Timeout: time.Duration(timeout) * time.Second, + 
Message: tc.Stimulus.Message, + Context: tc.Stimulus.Metadata, + Resources: tc.ResourceFiles, + GitResources: tc.GitResources, + SkillName: spec.SkillName, + SkillPaths: resolvedSkillPaths, + Timeout: time.Duration(timeout) * time.Second, + } +} + +// ExecutableTestCase is a test case that has done basic checks to ensure that file system +// paths that are referenced exist. +type ExecutableTestCase struct { + *models.TestCase + + // ResourceFiles are files that will be copied into the workspace + ResourceFiles []execution.ResourceFile + + // GitResources are git repos that will be cloned/created into the workspace + GitResources []models.GitResource +} + +func NewExecutableTestCase(ctx context.Context, tc *models.TestCase, fallbackFixtureDir string) (*ExecutableTestCase, error) { + resourceFiles, gitResources, err := loadResources(ctx, tc, fallbackFixtureDir) + + if err != nil { + return nil, err } + + return &ExecutableTestCase{ + TestCase: tc, + ResourceFiles: resourceFiles, + GitResources: gitResources, + }, nil } -func (r *TestRunner) loadResources(tc *models.TestCase) []execution.ResourceFile { +func loadResources(ctx context.Context, tc *models.TestCase, defaultFixtureDir string) ([]execution.ResourceFile, []models.GitResource, error) { var resources []execution.ResourceFile + var gitResources []models.GitResource // Determine fixture directory (for loading resource files) - fixtureDir := r.cfg.FixtureDir() + fixtureDir := defaultFixtureDir + if tc.ContextRoot != "" { fixtureDir = tc.ContextRoot } + var errs []error + for _, ref := range tc.Stimulus.Resources { + if err := ref.Validate(); err != nil { + errs = append(errs, fmt.Errorf("invalid resource: %w", err)) + continue + } + if ref.Body != "" { // Inline content resources = append(resources, execution.ResourceFile{ @@ -1159,40 +1218,33 @@ func (r *TestRunner) loadResources(tc *models.TestCase) []execution.ResourceFile } else if ref.Location != "" && fixtureDir != "" { // Load from file - validate path to 
prevent directory traversal if filepath.IsAbs(ref.Location) { - fmt.Fprintf(os.Stderr, "Warning: absolute resource path %q rejected\n", ref.Location) - continue - } - - cleanPath := filepath.Clean(ref.Location) - if strings.Contains(cleanPath, "..") { - fmt.Fprintf(os.Stderr, "Warning: resource path %q contains '..' and is rejected\n", ref.Location) + errs = append(errs, fmt.Errorf("resource path %q cannot be absolute", ref.Location)) continue } - fullPath := filepath.Join(fixtureDir, cleanPath) + fullPath := filepath.Join(fixtureDir, filepath.Clean(ref.Location)) // Ensure the resolved path is still within fixtureDir absFixtureDir, err := filepath.Abs(fixtureDir) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: failed to get absolute path for fixture dir: %v\n", err) + errs = append(errs, fmt.Errorf("failed to get absolute path for fixture dir %q: %v", fixtureDir, err)) continue } absFullPath, err := filepath.Abs(fullPath) if err != nil { - fmt.Fprintf(os.Stderr, "Warning: failed to get absolute path for resource: %v\n", err) + errs = append(errs, fmt.Errorf("failed to get absolute path for resource %q: %w", fullPath, err)) continue } if !strings.HasPrefix(absFullPath, absFixtureDir+string(filepath.Separator)) { - fmt.Fprintf(os.Stderr, "Warning: resource path %q escapes fixture directory\n", ref.Location) + errs = append(errs, fmt.Errorf("resource path %q escapes fixture directory", ref.Location)) continue } content, err := os.ReadFile(fullPath) if err != nil { - // Log error but continue - let the test fail if resource is critical - fmt.Fprintf(os.Stderr, "Warning: failed to load resource file %s: %v\n", fullPath, err) + errs = append(errs, fmt.Errorf("failed to load resource file %s: %w", fullPath, err)) continue } resources = append(resources, execution.ResourceFile{ @@ -1202,7 +1254,22 @@ func (r *TestRunner) loadResources(tc *models.TestCase) []execution.ResourceFile } } - return resources + for _, repo := range tc.Stimulus.Repos { + switch repo.Type { + 
case models.GitTypeWorktree: + sourceDir := filepath.Join(filepath.Dir(tc.Path), filepath.FromSlash(repo.Source)) + + if err := validateGitSourceDir(ctx, sourceDir); err != nil { + errs = append(errs, fmt.Errorf("%q source dir is invalid: %w", repo.Source, err)) + } else { + gitResources = append(gitResources, repo) + } + default: + errs = append(errs, fmt.Errorf("%q is not a valid repo type", repo.Type)) + } + } + + return resources, gitResources, errors.Join(errs...) } func (r *TestRunner) buildGraderContext(tc *models.TestCase, resp *execution.ExecutionResponse) *graders.Context { @@ -1270,3 +1337,27 @@ func (r *TestRunner) resolveGroup() string { return "" } } + +func validateGitSourceDir(ctx context.Context, source string) error { + stat, err := os.Stat(source) + if err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("source path %q does not exist", source) + } + return fmt.Errorf("unable to inspect source path %q: %w", source, err) + } + + if !stat.IsDir() { + return fmt.Errorf("source path %q is not a directory", source) + } + + // even though it says 'work-tree', it's not just worktrees - it'll work in any non-bare git repo. 
+ cmd := exec.CommandContext(ctx, "git", "rev-parse", "--is-inside-work-tree") + cmd.Dir = source + + if _, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("source path %q is not a git repository: %w", source, err) + } + + return nil +} diff --git a/internal/orchestration/runner_orchestration_test.go b/internal/orchestration/runner_orchestration_test.go index 9cf3fb4e..9ab2f248 100644 --- a/internal/orchestration/runner_orchestration_test.go +++ b/internal/orchestration/runner_orchestration_test.go @@ -347,7 +347,6 @@ func TestLoadResources_PathValidation(t *testing.T) { spec := &models.BenchmarkSpec{} cfg := config.NewBenchmarkConfig(spec, config.WithFixtureDir(fixtureDir)) - runner := NewTestRunner(cfg, nil) testCase := &models.TestCase{ Stimulus: models.TestStimulus{ @@ -361,7 +360,13 @@ func TestLoadResources_PathValidation(t *testing.T) { }, } - resources := runner.loadResources(testCase) + // One of those interesting functions that returns an error and a result... + resources, gitResources, err := loadResources(context.Background(), testCase, cfg.FixtureDir()) + + require.Contains(t, err.Error(), "missing.txt: no such file or directory") + require.Contains(t, err.Error(), "absolute.txt\" cannot be absolute") + + require.Empty(t, gitResources) require.Len(t, resources, 2) assert.Equal(t, "inline.txt", resources[0].Path) assert.Equal(t, []byte("inline"), resources[0].Content) @@ -485,10 +490,13 @@ func TestRunTest_CacheHitAndTranscriptWrite(t *testing.T) { }, } - err := runner.engine.Initialize(context.Background()) + etc, err := NewExecutableTestCase(context.Background(), testCase, cfg.FixtureDir()) + require.NoError(t, err) + + err = runner.engine.Initialize(context.Background()) require.NoError(t, err) - outcome, wasCached := runner.runTest(context.Background(), testCase, 1, 1) + outcome, wasCached := runner.runTest(context.Background(), etc, 1, 1) assert.False(t, wasCached) runner.writeTaskTranscript(testCase, outcome, time.Now()) @@ -496,7 
+504,7 @@ func TestRunTest_CacheHitAndTranscriptWrite(t *testing.T) { require.NoError(t, err) require.NotEmpty(t, entries) - cachedOutcome, wasCached := runner.runTest(context.Background(), testCase, 1, 1) + cachedOutcome, wasCached := runner.runTest(context.Background(), etc, 1, 1) assert.True(t, wasCached) assert.Equal(t, outcome.TestID, cachedOutcome.TestID) assert.Equal(t, outcome.Status, cachedOutcome.Status) diff --git a/internal/orchestration/runner_test.go b/internal/orchestration/runner_test.go index 1f48e4c8..c6106234 100644 --- a/internal/orchestration/runner_test.go +++ b/internal/orchestration/runner_test.go @@ -1,6 +1,7 @@ package orchestration import ( + "context" "os" "path/filepath" "testing" @@ -89,8 +90,11 @@ func TestBuildExecutionRequest_SkillPaths(t *testing.T) { // Create runner (engine can be nil for this test) runner := NewTestRunner(cfg, nil) + etc, err := NewExecutableTestCase(context.Background(), tc, cfg.FixtureDir()) + require.NoError(t, err) + // Build execution request - req := runner.buildExecutionRequest(tc) + req := runner.buildExecutionRequest(etc) // Verify skill paths require.NotNil(t, req, "execution request should not be nil") @@ -136,8 +140,11 @@ func TestBuildExecutionRequest_BasicFields(t *testing.T) { }, } + etc, err := NewExecutableTestCase(context.Background(), tc, cfg.FixtureDir()) + require.NoError(t, err) + runner := NewTestRunner(cfg, nil) - req := runner.buildExecutionRequest(tc) + req := runner.buildExecutionRequest(etc) // Verify basic fields assert.Equal(t, "Hello world", req.Message) @@ -174,7 +181,11 @@ func TestBuildExecutionRequest_TimeoutOverride(t *testing.T) { } runner := NewTestRunner(cfg, nil) - req := runner.buildExecutionRequest(tc) + + etc, err := NewExecutableTestCase(context.Background(), tc, cfg.FixtureDir()) + require.NoError(t, err) + + req := runner.buildExecutionRequest(etc) // Verify timeout is overridden assert.Equal(t, float64(300), req.Timeout.Seconds(), "test case timeout should override 
spec timeout") diff --git a/schemas/task.schema.json b/schemas/task.schema.json index c02e4840..699e27d5 100644 --- a/schemas/task.schema.json +++ b/schemas/task.schema.json @@ -81,6 +81,13 @@ }, "description": "File references or inline content provided to the agent." }, + "repos": { + "type": "array", + "items": { + "$ref": "#/$defs/gitResource" + }, + "description": "Git repositories provided to the agent workspace." + }, "environment": { "type": "object", "additionalProperties": { @@ -92,7 +99,7 @@ }, "resourceRef": { "type": "object", - "description": "A file reference by path or inline content.", + "description": "A file resource reference by path and/or inline content.", "properties": { "path": { "type": "string", @@ -102,7 +109,19 @@ "type": "string", "description": "Inline file content." } - } + }, + "anyOf": [ + { + "required": [ + "path" + ] + }, + { + "required": [ + "content" + ] + } + ] }, "expected": { "type": "object", @@ -1064,6 +1083,53 @@ } } } + }, + "gitResource": { + "type": "object", + "description": "A git repository checked out at a specific commit.", + "required": [ + "type" + ], + "properties": { + "commit": { + "type": "string", + "description": "Git commit SHA, branch, or tag to check out. Defaults to HEAD." + }, + "type": { + "type": "string", + "enum": [ + "worktree" + ], + "description": "Git resource type. Currently only 'worktree' is supported." + }, + "source": { + "type": "string", + "description": "Type-specific source location. For 'worktree', this is the folder where the git repository resides." + }, + "dest": { + "type": "string", + "description": "Subdirectory name within the workspace. If omitted, the git repo becomes the workspace root." + } + }, + "allOf": [ + { + "if": { + "properties": { + "type": { + "const": "worktree" + } + }, + "required": [ + "type" + ] + }, + "then": { + "required": [ + "source" + ] + } + } + ] } } }