kuitos · kuitos · Apr 30, 2026 · Apr 30, 2026
diff --git a/README.md b/README.md
@@ -268,12 +268,20 @@ Supported memory types:
 # Run tests
 bun test
 
+# Run the focused memory-effect task eval harness
+bun test test/evals/task-eval.test.ts
+
+# Print a memory-effect eval report
+bun run eval:memory
+
 # Build published artifacts
 bun run build
 
 # Release: push to main triggers semantic-release → npm publish
 ```
 
+Task eval fixtures can be defined either in code (synthetic cases) or as file-backed replay cases in `test/evals/cases/*.json`.
+
 ## 📄 License
 
 [MIT](LICENSE) © [kuitos](https://github.com/kuitos)
diff --git a/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md b/docs/superpowers/specs/2026-04-27-memory-task-eval-design.md
@@ -0,0 +1,60 @@
+# Memory Task Eval Design
+
+## Goal
+Add a lightweight offline evaluation layer that compares memory-on and memory-off plugin behavior for realistic task-shaped inputs, using deterministic rule checks by default and leaving an interface for future judge implementations.
+
+## Scope
+- In scope:
+  - Synthetic task fixtures shaped like real plugin messages
+  - A reusable harness that seeds memories, runs plugin hooks, and captures system prompts
+  - Rule-based checks for expected inclusions and exclusions in memory-on and memory-off runs
+  - A judge interface that can support future optional LLM scoring
+- Out of scope:
+  - Live model invocation
+  - Production telemetry
+  - Large benchmark datasets
+
+## Approach Options
+1. Extend `test/index.test.ts`
+   - Lowest setup cost, but poor reuse and weak structure once cases grow.
+2. Add a dedicated `test/evals/` harness
+   - Recommended. Provides a typed case schema, reusable execution path, and clean future extensions.
+3. Build a standalone CLI benchmark
+   - Overkill for the first version and unnecessary for CI.
+
+## Recommended Design
+Create a dedicated task-eval layer under `test/evals/`:
+
+- `fixtures.ts`
+  - Declares the synthetic case schema and a small initial case set.
+- `harness.ts`
+  - Creates a temp git repo, seeds memories, runs `MemoryPlugin`, feeds messages through `messages.transform`, then through `system.transform`, and returns both memory-on and memory-off outputs.
+- `judges.ts`
+  - Exposes a rule-based judge for CI and a generic judge interface for future implementations.
+- `task-eval.test.ts`
+  - Runs the fixture set through the harness and asserts pass/fail with helpful diagnostics.
+
+## Data Flow
+1. A fixture defines memories, messages, and expected checks.
+2. The harness creates a temp repo and seeds memory files with `saveMemory()`.
+3. The harness runs the plugin once with memory enabled and once with `OPENCODE_MEMORY_IGNORE=1`.
+4. The judge compares the resulting system prompts against the fixture's expected inclusions and exclusions.
+5. The test reports the first failing expectation with both prompts attached for debugging.
+
+## Error Handling
+- Missing or malformed fixtures should fail fast with descriptive assertion messages.
+- The harness should restore the previous value of `OPENCODE_MEMORY_IGNORE` after each run to avoid cross-test leakage.
+- Temp repos should always be cleaned up in `afterEach`.
+
+## Testing Strategy
+- Follow TDD: add task-eval tests first, verify failure, then implement the harness.
+- Start with a small synthetic suite covering:
+  - preference recall in memory-on mode
+  - memory-off suppression
+  - interaction between tool-reference filtering and recently completed tools
+- Run targeted tests first, then the broader suite if needed.
+
+## Future Extensions
+- Add adapters that convert sanitized real transcripts into the same fixture schema.
+- Add a non-default judge implementation that scores generated assistant answers with an external LLM.
+- Add lightweight summary reporting if the case set grows enough to justify aggregate metrics.
diff --git a/package.json b/package.json
@@ -19,6 +19,7 @@
     "dist"
   ],
   "scripts": {
+    "eval:memory": "bun test/evals/run.ts",
     "build": "tsc -p tsconfig.json",
     "prepack": "npm run build"
   },

diff --git a/src/recall.ts b/src/recall.ts
@@ -17,9 +17,79 @@ const MAX_MEMORY_LINES = 200
 const MAX_MEMORY_BYTES = 4096
 
 const encoder = new TextEncoder()
+const QUERY_STOP_WORDS = new Set([ // lowercase English function words that tokenizeQuery filters out of query tokens
+  "the",
+  "and",
+  "for",
+  "with",
+  "this",
+  "that",
+  "what",
+  "when",
+  "where",
+  "which",
+  "who",
+  "why",
+  "how",
+  "should",
+  "would",
+  "could",
+  "please",
+  "about",
+  "again",
+  "into",
+  "from",
+  "have",
+  "know",
+  "need",
+  "only",
+  "over",
+  "tell",
+  "than",
+  "then",
+  "them",
+  "they",
+  "will",
+  "your",
+  "you",
+  "are",
+  "can",
+  "did",
+  "has",
+  "her",
+  "him",
+  "his",
+  "its",
+  "not",
+  "our",
+  "out",
+  "she",
+  "was",
+  "were",
+  "all",
+  "any",
+  "but",
+  "get",
+  "had",
+  "in",
+  "is",
+  "it",
+  "of",
+  "on",
+  "or",
+  "to",
+])
 
 function tokenizeQuery(query: string): string[] {
-  return [...new Set(query.toLowerCase().split(/\s+/).map((token) => token.trim()).filter((token) => token.length >= 2))]
+  return [ // de-duplicated query terms (Set keeps first-seen order)
+    ...new Set(
+      query
+        .toLowerCase()
+        .split(/[^a-z0-9_]+/) // split on runs of anything other than a-z, 0-9 or "_"; NOTE(review): non-ASCII letters act as separators, so non-English queries produce no terms - confirm intended
+        .map((token) => token.trim()) // NOTE(review): looks redundant - tokens from the split above cannot contain whitespace
+        .filter((token) => token.length >= 2 && !QUERY_STOP_WORDS.has(token)), // keep tokens of 2+ characters that are not stop words
+    ),
+  ]
 }
 
 function readMemoryContent(filePath: string): string {
@@ -121,7 +191,8 @@ export function recallRelevantMemories(
     }
   }).filter(({ header, content }) => !isToolReferenceMemory(header, content, recentTools))
 
-  if (terms.length > 0 && scored.some((s) => s.score > 0)) {
+  if (terms.length > 0) {
+    if (!scored.some((s) => s.score > 0)) return []
     scored.sort((a, b) => b.score - a.score || b.header.mtimeMs - a.header.mtimeMs)
   } else {
     scored.sort((a, b) => b.header.mtimeMs - a.header.mtimeMs)