Siddhant-K-code · Siddhant-K-code · May 17, 2026 · May 17, 2026
diff --git a/README.md b/README.md
@@ -185,6 +185,7 @@ print(f"Replay with: agent-strace replay {meta.session_id}")
 | `inflation` | Token inflation across model versions |
 | `curve` | Personal cost-efficiency curve |
 | `a2a-tree` | Cross-agent trace correlation (A2A protocol) |
+| `mcp` | MCP server — expose traces as queryable tools for a debugging agent |
 
 ```
 agent-strace setup [--redact] [--global]        Generate Claude Code hooks config
@@ -1204,6 +1205,75 @@ agent-strace export <session-id> --format otlp > trace.json
 | event_id | span ID |
 | parent_id | parent span ID |
 
+## Debug with MCP
+
+`agent-strace mcp` starts an MCP server that exposes your session store as queryable tools. Any MCP-compatible client (Claude Code, Cursor, VS Code Copilot) can then query traces conversationally — the debugging agent reads its own execution history and surfaces what went wrong.
+
+```bash
+agent-strace mcp
+```
+
+**Claude Code config** (`.claude/settings.json`):
+
+```json
+{
+  "mcpServers": {
+    "agent-trace": {
+      "command": "agent-strace",
+      "args": ["mcp"]
+    }
+  }
+}
+```
+
+**Cursor config** (`.cursor/mcp.json`):
+
+```json
+{
+  "mcpServers": {
+    "agent-trace": {
+      "command": "agent-strace",
+      "args": ["mcp"]
+    }
+  }
+}
+```
+
+Once connected, you can ask the debugging agent questions like:
+
+> - "Look at the most recent session and tell me why it called bash three times in a row."
+> - "Which files did the agent write in session abc123 that it didn't write in def456?"
+> - "Find all sessions where the agent hit an error after calling npm test."
+
+### MCP tools
+
+| Tool | Description |
+|---|---|
+| `list_sessions` | List captured sessions with metadata (timestamp, tool calls, cost, tokens) |
+| `get_session` | Full event stream for a session, with optional event type filter |
+| `search_events` | Filter events by tool name, file path, exit code, or error flag across sessions |
+| `get_session_summary` | Plain-English phase breakdown — what the agent did, files touched, retries |
+| `diff_sessions` | Compare two sessions: tool call delta, file overlap, cost delta, error delta |
+
+### Example interactions
+
+```
+# List recent sessions
+list_sessions(limit=5)
+
+# Get all errors from a session
+search_events(session_id="abc123", has_error=true)
+
+# Find all sessions where the agent wrote to package-lock.json
+search_events(file_path="package-lock.json")
+
+# Compare two sessions after changing AGENTS.md
+diff_sessions(session_a="before_change", session_b="after_change")
+
+# Get a plain-English summary of what went wrong
+get_session_summary(session_id="abc123")
+```
+
 ## How it works
 
 ### Claude Code hooks

diff --git a/examples/ci/agent-eval.yml b/examples/ci/agent-eval.yml
@@ -0,0 +1,100 @@
+# Agent eval CI workflow
+#
+# Runs eval scorers on every PR that touches agent config files.
+# Fails the PR if any scorer drops below its threshold.
+# Posts a score summary as a PR comment.
+#
+# Prerequisites:
+#   1. Capture at least one session: agent-strace record -- <your-agent-command>
+#   2. Save a baseline: agent-strace eval ci --save-baseline .agent-traces/baselines/main.json
+#   3. Commit .agent-evals.yaml and .agent-traces/baselines/main.json to the repo
+
+name: Agent eval
+
+on:
+  pull_request:
+    paths:
+      - "AGENTS.md"
+      - "CLAUDE.md"
+      - ".claude/**"
+      - ".agent-evals.yaml"
+      - ".agent-traces/datasets/**"
+
+jobs:
+  eval:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install agent-strace
+        run: pip install agent-strace
+
+      # Score the latest session in the dataset against all configured scorers.
+      # Exits 1 if any scorer is below threshold or regresses vs baseline.
+      - name: Run eval
+        env:
+          # Required only if using the llm_judge scorer.
+          # Remove if using heuristic scorers only (no_errors, cost_under, etc.)
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+        run: |
+          agent-strace eval ci \
+            --baseline .agent-traces/baselines/main.json \
+            --tolerance 0.05 \
+            --github-summary
+
+      # Post the Markdown summary as a PR comment so reviewers see the score delta (job token needs `pull-requests: write` or `issues: write`).
+      - name: Post eval summary
+        if: always()
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const summaryPath = '.agent-traces/eval-summary.md';
+            if (!fs.existsSync(summaryPath)) {
+              console.log('No eval summary found — skipping comment.');
+              return;
+            }
+            const summary = fs.readFileSync(summaryPath, 'utf8');
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: summary,
+            });
+
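+  # Optional: update the baseline on every merge to main and commit it back so future PRs compare against it.
+  # NOTE: this job is always skipped unless a `push` trigger for `main` is added under `on:` (only `pull_request` is configured above).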
+  update-baseline:
+    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install agent-strace
+        run: pip install agent-strace
+
+      - name: Save new baseline
+        run: |
+          mkdir -p .agent-traces/baselines
+          agent-strace eval ci \
+            --save-baseline .agent-traces/baselines/main.json
+
+      - name: Commit updated baseline
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "github-actions[bot]@users.noreply.github.com"
+          git add .agent-traces/baselines/main.json
+          git diff --staged --quiet || git commit -m "chore: update eval baseline [skip ci]"
+          git push
diff --git a/src/agent_trace/cli.py b/src/agent_trace/cli.py
@@ -23,6 +23,7 @@
 from .hooks import hook_main
 from .http_proxy import HTTPProxyServer
 from .a2a import cmd_a2a_tree
+from .mcp_server import cmd_mcp
 from .annotate import cmd_annotate
 from .drift import cmd_drift
 from .langfuse_export import cmd_export_scores
@@ -747,6 +748,18 @@ def build_parser() -> argparse.ArgumentParser:
     p_standup.add_argument("--no-llm", action="store_true", dest="no_llm",
                            help="structured output only, no LLM narrative (default)")
 
+    # mcp (MCP server — expose traces as queryable tools)
+    p_mcp = sub.add_parser(
+        "mcp",
+        help="start an MCP server that exposes session traces as queryable tools",
+    )
+    p_mcp.add_argument(
+        "--transport",
+        choices=["stdio"],
+        default="stdio",
+        help="transport protocol (default: stdio)",
+    )
+
     # diff --semantic and --eval-config flags (extend existing diff parser)
     p_diff.add_argument("--semantic", action="store_true",
                         help="semantic outcome-level diff (files, cost, errors)")
@@ -806,6 +819,7 @@ def main() -> None:
         "oncall": cmd_oncall,
         "freshness": cmd_freshness,
         "standup": cmd_standup,
+        "mcp": cmd_mcp,
     }
 
     handler = handlers.get(args.command)