From 0bdf8f6b9cfc7b8cb9dc58fb2114fec5d2837573 Mon Sep 17 00:00:00 2001 From: cjus Date: Thu, 14 May 2026 19:39:13 -0600 Subject: [PATCH] let skills run agentic + gate on integrations via requires: skills are no longer tool-less single-shot turns. claude tiers get the full claude_code preset (agent/task still denied); ollama tier routes through the same runToolLoop the regular ollama turn uses when integrations are wired. budget is per-skill max_turns frontmatter (1-10, default 1), bounded by the existing cost cap + loop detector + three-tier policy. self-tool entry filtered from the catalog the body sees (direct recursion impossible); indirect cycles bounded by max_turns and the shared loop detector. new requires: frontmatter gates a skill on named integrations being loaded at boot. bare string or array. missing deps -> skill skipped with skills.load_error, absent from /help and telegram autocomplete. loadSkillsSync gains a 4th arg loadedIntegrationNames, sourced from integration result.sources filtered to toolCount > 0 (probe-failed integrations do not satisfy the gate). docs (features, usage, architecture, glossary, roadmap) updated to match. stale scheduler PLAN.md at repo root deleted (shipped in bff8ca4; canonical story now lives in usage + architecture). --- PLAN.md | 339 ---------------------------------------- docs/ARCHITECTURE.md | 23 +-- docs/FEATURES.md | 2 +- docs/GLOSSARY.md | 2 +- docs/ROADMAP.md | 8 +- docs/USAGE.md | 30 ++-- src/agent.ts | 5 +- src/commands.ts | 306 +++++++++++++++++++++++++++++++----- src/main.ts | 46 +++++- src/ollama.ts | 4 +- src/skill-tools.test.ts | 151 +++++++++++++++--- src/skill-tools.ts | 9 +- src/skills.test.ts | 204 ++++++++++++++++++++++++ src/skills.ts | 74 ++++++++- 14 files changed, 775 insertions(+), 428 deletions(-) delete mode 100644 PLAN.md diff --git a/PLAN.md b/PLAN.md deleted file mode 100644 index ce611c4..0000000 --- a/PLAN.md +++ /dev/null @@ -1,339 +0,0 @@ -# PLAN: scheduled tasks (one-off + periodic) - -Generalize `daily-report.ts` into a first-class scheduler that runs operator-authored prompt tasks on a timer. Reuse Solrac's existing primitives (turn queue, audit log, allowlist gate, cost caps) so safety machinery is automatic and the new module stays small. - -This plan lives at the repo root (`solrac/PLAN.md`); `solrac-dev/CLAUDE.md` and `docs/ROADMAP.md#oq12-background-worker-mode` are the prior thinking it operationalizes. - ---- - -## 1. Goal - -An operator drops a markdown file into `./tasks//TASK.md`, restarts Solrac, and the prompt body fires on the configured schedule into a configured chat. One-off (`at: `) and periodic (`every: `, `daily_at: HH:MM`) cases are both supported. - -Out of scope for v1: cron expressions, timezones, hot-reload, audit-only (no-Telegram-output) runs, peer-agent fan-out. - -## 2. Why this shape - -The user's instinct (markdown + frontmatter + prompt body) maps 1:1 onto an existing precedent: - -- `src/skills.ts` already loads operator-authored markdown files with YAML-ish frontmatter and a prompt-template body. The schema parser, error handling, name-collision rules, and load-soft semantics are all reusable patterns. -- `src/daily-report.ts` is already the `setInterval` + meta-key-idempotency primitive. Today it hardcodes one job (yesterday's spend). The generalization is straightforward. -- `main.ts` (web transport, lines 856–871) already synthesizes `Update` shapes to push into the existing turn queue. Scheduled tasks can do the same — no new dispatch path. - -The result: the scheduler is "load files like skills, fire like daily-report, dispatch like the web transport." Three known patterns composed, no new architectural surface. - -## 3. File format - -```markdown ---- -name: morning-digest -description: Weekday morning Notion ticket digest. -schedule: daily_at 09:00 # OR `every 1h` OR `at 2026-05-15T13:00:00Z` -chat_id: 123456789 # default: first allowlist entry -# engine: ollama # primary | secondary | ollama - # default: inherits config.defaultEngine (ollama on this deploy) -catch_up: true # default: true for periodic; ignored for one-off -enabled: true # default: true; lets operator pause without deleting -# max_cost_usd: 0.10 # optional per-task cap (Claude tiers only — Ollama is free) -boot_catch_up_jitter_s: 30 # optional; stagger boot fires by 0..N seconds ---- - -You are running as the morning digest. Open Notion (mcp__solrac__notion-search) -and list any "In progress" tickets with no update in the last 48h. Reply with a -short bullet list. If there are none, reply "All clear." -``` - -A task with no `engine:` line on this deploy runs against Ollama for free. Operators escalate individual digests to Anthropic by setting `engine: secondary` (Opus) or `engine: primary` (Sonnet) — same shape as a user typing `!` or `@` in chat. - -### Where TASK.md files live - -Operator-authored, one subdirectory per task: - -``` -/ -└── tasks/ ← $SOLRAC_TASKS_DIR (default: ./tasks) - ├── morning-digest/ - │ └── TASK.md - ├── weekly-pr-review/ - │ └── TASK.md - └── one-off-2026-05-15/ - └── TASK.md -``` - -Same shape as `./skills//SKILL.md` and `./integrations//index.ts`. Path is launch-cwd-relative, override via `SOLRAC_TASKS_DIR=/abs/or/rel/path`. Loader walks one level deep — a flat layout (`tasks/morning-digest.md`) is rejected for symmetry with skills (the per-task subdir is reserved for future companion files: examples, fixtures, fixtures-replay tests). - -### Schedule grammar (one of, not all) - -| Form | Meaning | Example | -|------|---------|---------| -| `every ` | Periodic interval from `last_run_at` | `every 1h`, `every 30m`, `every 24h` | -| `daily_at HH:MM` | Anchored daily fire (UTC) | `daily_at 09:00` | -| `at ` | Single fire at absolute time (one-off) | `at 2026-05-15T13:00:00Z` | - -Duration suffixes: `s`, `m`, `h`, `d`. Reject anything else at parse time. No cron in v1 — keeps the parser ~30 LOC and avoids `cron-parser` as a dep (anti-goal: no extra dependencies). - -### Validation - -- `name` matches `/^[a-z0-9_]{1,32}$/` (same regex as skills). -- `engine`: optional. Defaults to `config.defaultEngine` (operator's deploy uses `ollama`, so unprefixed tasks run free on local inference). When set explicitly: ∈ `{primary, secondary, ollama}`. If `ollama`: boot fails loud when `OLLAMA_ENABLED=false` AND when `config.defaultEngine !== "ollama"` (PR-B removed the `>` prefix; Ollama is reachable only when it's the default engine, so a task that asks for it on a Claude-default deploy is unreachable and rejected at parse). -- `chat_id`: integer; default `config.allowlistBootstrap[0]`. NOT auto-allowlisted — `from.id` for synthetic updates is the operator (already allowlisted), not `chat_id`. -- `every `: minimum 5 minutes for Claude tiers (cost guard). Ollama tasks may go as low as 1 minute (`MIN_OLLAMA_INTERVAL_MIN=1`) since inference is free, but 1-min Ollama still pins the GPU — document the trade-off in `examples/tasks/README.md`. -- One-off `at`: parses ISO8601; rejects timezone-naive strings. -- `max_cost_usd`: optional positive number; **Claude tiers only** — silently ignored for `engine: ollama` (no cost to cap). Defaults to unset (per-chat hourly cap is the only gate). When set, the runner aborts the turn before tool calls if `audit.cost_usd` for this fire exceeds the cap. Wires through the existing `PreToolUse` hook alongside the per-chat / global guards. -- `boot_catch_up_jitter_s`: optional non-negative integer; default `0`. Scheduler delays the boot catch-up fire by `random(0, jitter_s)` seconds so 12 daily tasks don't pile up simultaneously on restart. -- Reject unknown frontmatter keys (matches skills behavior — typos shouldn't silently skip). - -## 4. Module layout - -``` -src/scheduler.ts NEW — loader + schedule parser + tick loop -src/scheduler.test.ts NEW — parser + nextRunAt + catch-up semantics -examples/tasks/ NEW — README + minimal sample TASK.md -src/db.ts MODIFIED — scheduled_tasks table + idempotent ALTER -src/config.ts MODIFIED — SOLRAC_TASKS_ENABLED + SOLRAC_TASKS_DIR -src/main.ts MODIFIED — load + start + wire shutdown -src/lifecycle.ts MODIFIED — add scheduler.stop() step -src/commands.ts MODIFIED (Phase 2) — `/tasks` command -docs/USAGE.md MODIFIED — operator-facing TASK.md tutorial -docs/CONFIG.md MODIFIED — env vars + minimum-interval rule -docs/ARCHITECTURE.md MODIFIED — scheduler section + dep graph entry -``` - -Dependency direction (extends `docs/ARCHITECTURE.md#module-map`): - -``` -scheduler → db + queue + telegram + log + config -``` - -No cycles. Scheduler sits at the same layer as `daily-report`. - -## 5. Concrete data model - -### Schema additions (idempotent ALTER, mirrors existing migrations in `db.ts`) - -```sql -CREATE TABLE IF NOT EXISTS scheduled_tasks ( - name TEXT PRIMARY KEY, - last_run_at INTEGER, - last_audit_id INTEGER, - last_status TEXT, -- 'ok' | 'denied' | 'error' | NULL - one_off_consumed INTEGER NOT NULL DEFAULT 0, - source_path TEXT NOT NULL, - source_hash TEXT NOT NULL, -- SHA-256 of TASK.md; helps `/tasks status` - updated_at INTEGER NOT NULL -); - --- Distinguish scheduled fires from user/system rows in the audit log. -ALTER TABLE audit ADD COLUMN origin TEXT NOT NULL DEFAULT 'user'; -- 'user' | 'scheduled' | 'system' -ALTER TABLE audit ADD COLUMN task_name TEXT; -- NULL except origin='scheduled' -CREATE INDEX IF NOT EXISTS idx_audit_task ON audit(task_name); -``` - -The existing `audit.model = 'engine:tier:modelId'` format stays unchanged — `origin` is orthogonal. Cost cap queries (`sumChatCostSince`, `sumCostSince`) ignore `origin`, so scheduled spend rolls into the same per-chat hourly cap. Intentional — a runaway task and a runaway user are equally dangerous to the bill. - -### Synthetic `Update` shape (per fire) - -```ts -const update: Update = { - update_id: nextSyntheticId(), // negative range, scheduler-local counter - message: { - message_id: nextSyntheticId(), - date: Math.floor(Date.now() / 1000), - chat: { id: task.chat_id, type: "private", first_name: "scheduler" }, - from: { id: operatorFromId, is_bot: false, first_name: "scheduler" }, - text: schedulerWirePrefix(task.engine) + task.body, - }, -}; -``` - -`schedulerWirePrefix` maps `engine` to the existing prefix grammar relative to `config.defaultEngine`: - -| Task `engine` | Deploy default = `ollama` | Deploy default = `primary` | Deploy default = `secondary` | -|---|---|---|---| -| (omitted) | no prefix → ollama | no prefix → primary | no prefix → secondary | -| `ollama` | no prefix → ollama | rejected at parse | rejected at parse | -| `primary` | `@` | no prefix | `@` | -| `secondary` | `!` | `!` | no prefix | - -Reuses the engine dispatch already in `makeRunTurn`; nothing new to maintain. - -`origin: 'scheduled'`, `task_name`, and (if set) `max_cost_usd` are passed alongside the prompt as runner-side context (a small extension to `AuditInsert` and to the `PreToolUse` hook factory) so the audit row records them and the per-task cap is enforced. Avoids leaking scheduler concepts into the prefix grammar. - -**Verified (open-question #4):** `audit.update_id` is a plain `INTEGER` column with no UNIQUE constraint, no JOINs, and no indexes referencing it (`db.ts:107` and `idx_audit_*` confirm). Setting it to `NULL` for scheduled fires is safe. The scheduler does NOT call `db.claimUpdate(update_id)` — `handled_updates.update_id` IS PRIMARY KEY (`db.ts:83`), so a synthetic id colliding with a future Telegram poll offset would silently dedupe a real user message. The web transport already follows this pattern (synthetic Updates flow through `queue.enqueue` directly, never touching `claimUpdate`) and the scheduler does the same. - -## 6. Scheduler primitive - -```ts -// src/scheduler.ts (sketch) - -export interface ScheduleSpec { - kind: "every" | "daily_at" | "at"; - ms?: number; // for "every" - hourUtc?: number; // for "daily_at" - minuteUtc?: number; // for "daily_at" - atMs?: number; // for "at" -} - -export interface Task { - name: string; - description: string; - body: string; - chatId: number; - engine: "primary" | "secondary" | "ollama"; - spec: ScheduleSpec; - catchUp: boolean; - enabled: boolean; - sourcePath: string; - sourceHash: string; -} - -// Pure: given spec + last_run_at + now, returns the next-fire ms or null -// (one-off + already consumed). -export function nextRunAt(spec: ScheduleSpec, lastRunAt: number | null, now: number): number | null; - -export function startScheduler(deps: SchedulerDeps): { stop: () => void }; -``` - -The driver: - -```ts -function tick() { - const now = Date.now(); - for (const t of registry.all) { - if (!t.enabled) continue; - const state = db.getTaskState(t.name); - if (state?.one_off_consumed) continue; - const due = nextRunAt(t.spec, state?.last_run_at ?? null, now); - if (due === null) continue; - if (now < due) continue; - void fireTask(t, now); // synthesize Update → queue.enqueue - db.markTaskFired(t.name, now, t.spec.kind === "at" ? 1 : 0); - } -} -const timer = setInterval(tick, 60_000); -tick(); // boot fire (catch-up) — matches daily-report -return { stop: () => clearInterval(timer) }; -``` - -Single shared 60s tick. Tasks finer than 1 minute are out of scope (and rejected by the 5-min minimum); we don't need millisecond precision. - -### Catch-up rules - -- `every `, `last_run_at` is `(now - duration)` or older → fire once. Don't fire N times for N missed intervals. -- `daily_at HH:MM`, today's anchor has passed AND `last_run_at < today's anchor` → fire once. -- `at `, `at_ms < now` AND `!one_off_consumed` AND `catch_up=true` → fire once, then mark consumed. -- `at `, `at_ms < now` AND `catch_up=false` → mark consumed without firing. (Operator wanted "9am Tuesday" and Solrac was down at 9am — don't fire at 4pm.) - -`catch_up: true` is the default for periodic, `false` for one-off. Documented. - -## 7. Wiring (main.ts) - -Mirror the skills wiring already in `main.ts`. Pseudo-diff: - -```diff -+ const taskRegistry = config.tasksEnabled -+ ? loadTasksSync(config.tasksDir, ...) -+ : EMPTY_TASK_REGISTRY; -+ logTaskLoadResult(config.tasksDir, taskRegistry); - ... - installShutdown({ tracker, db, pidPath, pollAbort, server, webServer, scheduler }); -+ const scheduler = config.tasksEnabled && taskRegistry.size() > 0 -+ ? startScheduler({ -+ db, registry: taskRegistry, queue, -+ operatorFromId: config.allowlistBootstrap[0]!, -+ defaultEngine: config.defaultEngine, -+ }) -+ : null; -``` - -Lifecycle stops the scheduler BEFORE `pollAbort.abort()` so no new fires land mid-shutdown. In-flight task turns drain through the existing tracker. - -## 8. Auditability and operator visibility - -Phase 1 (must ship): - -- Every fire writes one `audit` row with `origin='scheduled'`, `task_name=`. Existing cost cap, error_message, status columns all populate as for a user turn. -- `scheduled_tasks` table tracks `last_run_at`, `last_status`, `last_audit_id`, `one_off_consumed`. Operator queries it directly via sqlite3 if needed. - -Phase 2 (follow-up): - -- `/tasks` slash command — list loaded tasks with last/next fire and status. Reuse `commands.ts` dispatcher. -- `/tasks run ` — manual trigger (synthesizes a fire). Useful for debugging without waiting for the next anchor. - -## 9. Safety analysis - -| Risk | Mitigation | -|------|------------| -| Misconfigured `every: 1m` task → cost runaway | 5-min minimum at parse. Per-chat hourly cap is the eventual safety net. | -| Task fires while user is mid-conversation in the same chat | KeyedMutex serializes per-chat — task waits behind user message (correct behavior; documented). | -| Scheduled task uses tools that prompt for confirmation → no operator at the keyboard at 3am | Confirmation broker times out at 60s and fail-closes (already true for tier-3 tools). Task body should be written for tools that auto-allow OR tier-3 tools the operator pre-trusts. Document in `examples/tasks/README.md`. | -| Fires during shutdown drain | Scheduler stops first; `pollAbort` aborts after; tracker drains in-flight turns. Same shape as poll-loop shutdown. | -| Synthetic `update_id` collides with Telegram poll id space | Use a negative-int counter (web transport already does this). `handled_updates` is bypassed for scheduled fires (no dedupe needed — scheduler is the only source of these ids). | -| Operator deletes TASK.md mid-run | Scheduler is boot-loaded only. Edits/deletes take effect on next restart. Matches skills/integrations contract. | -| Loop detector trips on a task that hits the same tool repeatedly across many fires | Loop detector is per-turn, not per-task. Each fire starts fresh. Correct. | - -## 10. Test surface - -Co-located, `bun:test`, no mocking framework — same as the rest of Solrac. - -- `scheduler.test.ts`: - - Parser: every shape valid + invalid path produces a clear error. - - `nextRunAt` for `every`, `daily_at`, `at` × {never run, just ran, missed window}. - - `catch_up` true/false behavior on one-off. - - 5-min minimum enforced. -- `db.test.ts` additions: round-trip on `scheduled_tasks` insert/update/get; idempotent ALTER. -- `commands.test.ts` (Phase 2): parse `/tasks` and `/tasks run `. - -Smoke test: a flood-style harness with two synthetic tasks (one `every 5s`, one `at `) confirms (a) periodic fires advance, (b) one-off marks consumed, (c) drain on signal stops the loop. Add as `test/smokes/scheduler.ts`. - -## 11. Phasing - -### Phase 1 — minimum viable scheduler (1–2 days) - -- [ ] `db.ts`: schema additions + idempotent ALTERs + new prepared statements (`getTaskState`, `markTaskFired`, `setTaskOneOffConsumed`). -- [ ] `config.ts`: `SOLRAC_TASKS_ENABLED` (default `false`), `SOLRAC_TASKS_DIR` (default `./tasks`). -- [ ] `scheduler.ts`: parser, registry, tick loop, synthetic-Update construction. -- [ ] `main.ts`: load registry, start scheduler, pass to lifecycle. -- [ ] `lifecycle.ts`: stop scheduler before `pollAbort.abort()`. -- [ ] `examples/tasks/`: README + a `morning-digest` sample. -- [ ] Tests: `scheduler.test.ts`, additions to `db.test.ts`. -- [ ] Docs: USAGE.md tutorial, CONFIG.md env vars, ARCHITECTURE.md scheduler section. - -### Phase 2 — operator visibility (~half day) - -- [ ] `/tasks` command (list, status). -- [ ] `/tasks run ` manual trigger. -- [ ] `commands.test.ts` additions. -- [ ] Update `BOT_COMMAND_REGISTRY` so Telegram autocomplete shows it. - -### Phase 3 — deferred - -- cron expressions (probably never; `every` + `daily_at` covers 90% of real cases). -- Timezones (operator can use `daily_at` in UTC and do the math). -- `notify: false` (audit-only). Requires a "no Telegram output" mode in the runners; not free. -- Hot-reload via FS watch — explicit anti-goal today; matches skills/integrations. - -## 12. Documentation hits - -- `docs/USAGE.md` — new "Scheduled tasks" section: TASK.md format, schedule grammar, examples. -- `docs/CONFIG.md` — `SOLRAC_TASKS_ENABLED`, `SOLRAC_TASKS_DIR`, 5-min minimum interval rule. -- `docs/ARCHITECTURE.md` — module map entry; scheduler section under "Lifecycle"; mention in "Concurrency model" that scheduled fires share the same queue. -- `docs/ROADMAP.md` — close OQ#12 once Phase 1 ships. -- `CLAUDE.md` (under `solrac-dev/`) — add a "Scheduled tasks" learning if the implementation surfaces a gotcha. - -## 13. Resolved decisions - -All five Phase-1 questions resolved (operator, 2026-05-10): - -1. **Result delivery** → stream into the configured Telegram chat. Same UX as a user-typed turn; `notify: false` (audit-only) deferred to Phase 3. -2. **`engine: ollama` post-PR-B** → fail loud at parse when `defaultEngine !== "ollama"`. Validation is in §3 above. -3. **Per-task max cost cap** → `max_cost_usd` frontmatter field, optional. Wires through the existing `PreToolUse` hook as a fourth pre-flight check (after global, per-chat, loop). Aborts the turn when this fire's cumulative `cost_usd` exceeds the cap. Silently ignored when `engine: ollama` (no cost to cap on this deploy's default path). Tested in `scheduler.test.ts` + a dedicated row in `policy.test.ts`. -4. **`audit.update_id` collision** → verified safe: no UNIQUE, no JOIN, no index. Scheduled fires write `NULL` `update_id` and bypass `handled_updates`. See §5. -5. **Boot-catch-up jitter** → `boot_catch_up_jitter_s` frontmatter field (per-task, default `0`). Scheduler defers the catch-up fire by `random(0, jitter_s)` ms. Tasks the operator wants to fire deterministically leave it at `0`; daily digests that all anchor at 09:00 set it to e.g. `300` so they spread across 5 minutes. - -## 14. Open follow-ups - -None blocking Phase 1. Tracked for later: - -- Phase 3 candidates listed in §11 (cron, timezones, `notify: false`, hot-reload). -- Operator UX: should `/tasks` show the next-fire time in the operator's local timezone or UTC? Defer until the command lands. diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index 1708c1e..68090d2 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -241,35 +241,40 @@ For `queue_full`: `INSERT INTO audit … status='error', error_message='queue_fu `skills.ts` adds a filesystem-discovered command surface on top of the `commands.ts` dispatcher. Skills are operator-authored `SKILL.md` files under `$SOLRAC_SKILLS_DIR//` discovered ONCE at boot (no hot-reload, matching the boot-once config story). Disabled by default (`SOLRAC_SKILLS_ENABLED=false`). **Boot sequence (in `main.ts`).** -1. Load skills sync: `loadSkillsSync(config.skillsDir, BUILT_IN_NAMES)` — fail-soft. Each malformed `SKILL.md` adds an error to the result; valid ones populate the registry. Missing directory → empty registry + one error (NOT a crash). -2. Telegram autocomplete: `setMyCommands([...BOT_COMMAND_REGISTRY, ...skillsToBotCommands(registry.all)])` — built-ins first, skills after. +1. Load skills sync: `loadSkillsSync(config.skillsDir, BUILT_IN_NAMES, config.defaultEngine, loadedIntegrationNames)` — fail-soft. Each malformed `SKILL.md` adds an error to the result; valid ones populate the registry. Missing directory → empty registry + one error (NOT a crash). `loadedIntegrationNames` is the set of integrations that registered ≥1 tool (a probe-failed integration with `toolCount === 0` does NOT count); it gates skills that declare `requires:` (see below). +2. Telegram autocomplete: `setMyCommands([...BOT_COMMAND_REGISTRY, ...skillsToBotCommands(registry.all)])` — built-ins first, skills after. Skills filtered by `requires:` are absent from this payload, so operators never see autocomplete suggestions that fail at use-time. 3. Plumb the registry into `commandDeps` and `parseCommand`. **Parser hook.** After the `KNOWN_COMMANDS` check misses, `parseCommand` looks up the name in the registry. A hit returns `{ kind: "skill", skill, args }`. A miss falls through to the existing unknown-vs-passthrough logic (`/` → unknown, `:` → passthrough). Built-ins always win — even if a buggy registry returned a skill named `clear`, the built-in arm fires first. -**Frontmatter schema (4 fields).** +**Frontmatter schema (2 required + 4 optional).** - `name` — required, matches `[a-z0-9_]{1,32}`, must NOT collide with built-in names (rejected at load time). - `description` — required, ≤256 chars (used in `setMyCommands` payload + `/help` rendering, and as the tool description when `tool: true`). - `tier` — optional, `primary` | `secondary` | `ollama`. Defaults to `SOLRAC_DEFAULT_ENGINE` so an Ollama-default deploy gets free skills automatically. Explicit `tier: ollama` is rejected when the deploy default isn't ollama (PR-B removed the `>` prefix). +- `max_turns` — optional, integer in `[1, 10]`, default `1`. Model-turn budget for the skill body. Doubles as the SDK `maxTurns` on Claude tiers and as `runToolLoop`'s `maxIterations` on the Ollama tier — the operator gets one knob that constrains both paths uniformly. - `tool` — optional boolean, default false. When true, exposes the skill as a callable MCP tool to the Ollama agent (Phase 1 restriction: `tool: true` requires `tier: ollama`). +- `requires` — optional, bare string or string array (entries match `[a-z][a-z0-9_-]{0,31}`). Integration dependencies. When any name is absent from `loadedIntegrationNames` at boot, the loader skips the skill with a non-fatal `skills.load_error` and the registry never sees it. `/help` and Telegram autocomplete are filtered by the same registry, so the operator never gets advertised a skill that would fail at use-time. Empty / omitted → unconditional load (preserves back-compat for pre-`requires:` skills). -The body is a prompt template; `{{args}}` is the only placeholder and is replaced literally with the user's text after the command name (or with the agent-supplied `args` argument when called as a tool). The frontmatter parser is a homemade YAML subset (~70 LOC in `skills.ts`) — handles `key: scalar`, `key: [a, b, c]`, quoted strings, integers, booleans. Adding `js-yaml` for a 4-key schema was disproportionate. +The body is a prompt template; `{{args}}` is the only placeholder and is replaced literally with the user's text after the command name (or with the agent-supplied `args` argument when called as a tool). The frontmatter parser is a homemade YAML subset in `skills.ts` — handles `key: scalar`, `key: [a, b, c]`, quoted strings, integers, booleans. Adding `js-yaml` for a 6-key schema was disproportionate. **Skill execution.** The path forks on `tier`: -- **Claude tiers (`primary` / `secondary`).** `runSkill` in `commands.ts`. Mirrors `runCompactTurn`'s structural pattern: pre-flight cost cap (chat + global; cap-rejected skills cost $0), `query()` with `maxTurns: 1`, no `resume`, `disallowedTools` deny-list, `canUseTool: deny-all`. Audit row tagged `claude:::skill:`. -- **Ollama tier.** `runOllamaSkill` (and the bare `runSkillBare` helper) in `commands.ts`. One-shot `/api/chat` (`stream: false`), no history, no SOLRAC.md overlay, no tool loop, no streaming stub. Audit row tagged `ollama::skill:` with `cost_usd: 0`. Pre-flight cap is skipped (a chat throttled by Claude burn shouldn't lose access to free local inference). +- **Claude tiers (`primary` / `secondary`).** `runSkill` in `commands.ts`. Pre-flight cost cap (chat + global; cap-rejected skills cost $0), then `query()` with `maxTurns: skill.maxTurns`, no `resume` (fresh isolated turn), `tools: { type: "preset", preset: "claude_code" }`, `disallowedTools: ["Agent","Task"]` (sub-agents off; belt-and-suspenders with `policy.ts::SUBAGENT_DENY_TOOLS`). The interactive `canUseTool` factory + `PreToolUse` / `PostToolUse` / `PostToolUseFailure` hooks come from `deps.createCanUseTool` / `policy.ts` — same instances `runAgent` uses, so cost cap, loop detector, and the Telegram-confirm UX behave identically inside a skill. When integrations are loaded, `deps.mcpServer` is wired so the body sees `mcp__solrac__` tools too. Audit row tagged `claude:::skill:`; mid-turn cap or loop denials get promoted into `error_message` as `policy_deny:: …`. +- **Ollama tier.** `runOllamaSkill` (and the bare `runSkillBare` helper) in `commands.ts`. The helper dispatches on whether `OllamaSkillDeps` has `tools + toolTiers + broker` wired: + - **Tools wired** → `runSkillBareWithTools` routes the body through the same `runToolLoop` driver that `runOllamaTurnWithTools` uses. `maxIterations = skill.maxTurns`, fresh loop detector, full `mcp__solrac__*` + `skills__*` catalog with the skill's own `skills__` entry filtered out (recursion guard — see below). No history, no SOLRAC.md overlay, no streaming stub. + - **Tools absent** → fall through to a single-shot `/api/chat` (`stream: false`). Preserves pure text-transform skills (no `requires:`, `max_turns: 1`) at minimum latency. + Either way: audit row tagged `ollama::skill:` with `cost_usd: 0`. Pre-flight Claude cap is skipped (a chat throttled by Claude burn shouldn't lose access to free local inference). -Reply for both: model output verbatim, HTML-escaped, truncated to ≈3,500 chars (Telegram per-message ceiling minus headroom). Skills are tool-less across both engines — the Claude path denies every tool call via `canUseTool`; the Ollama path posts to `/api/chat` with no `tools` field (a load-bearing invariant for skills-as-tools recursion safety; see below). +Reply for both: model output verbatim, HTML-escaped, truncated to ≈3,500 chars (Telegram per-message ceiling minus headroom). The Ollama path's `runOllamaSkill` wraps the call in `skillToolCtx.run({chatId, fromId, updateId, parentAuditId}, ...)` so any nested `skills__*` invocation inherits the chat context for its own audit row. -**Skills as tools (Phase 1: Ollama-only).** A skill with `tool: true` is also exposed as a callable MCP tool to the Ollama agent (`skill-tools.ts::buildSkillTools`). The model sees it in its tool catalog as `mcp__solrac__skills__` (wire format on Ollama: `skills__`) with the operator-authored description. Tool dispatch: +**Skills as tools (Phase 1: Ollama-only).** Distinct axis from "skills using tools" above — *that* is shipped on both tiers. *This* is whether the Ollama agent can call a skill **by name** as a tool entry in its catalog. A skill with `tool: true` is exposed as a callable MCP tool to the Ollama agent (`skill-tools.ts::buildSkillTools`). The model sees it in its tool catalog as `mcp__solrac__skills__` (wire format on Ollama: `skills__`) with the operator-authored description. Tool dispatch: 1. **Catalog merge.** At boot, eligible skills (`tool: true && tier: ollama`) become `SdkMcpToolDefinition` entries with input schema `{ args: string }`. They're merged into `integrationTools` and `integrationToolTiers` (all `auto`-allow) before `ollamaDeps` is constructed. 2. **Per-turn context propagation.** `runOllamaTurnWithTools` wraps the loop in `skillToolCtx.run({chatId, fromId, updateId, parentAuditId}, () => runToolLoop(...))`. The skill handler reads the store via `AsyncLocalStorage.getStore()` — needed because the SDK tool-handler signature `(args, extra) => ...` leaves no slot for chat context, and concurrent turns require race-free context (the queue runs N chats in parallel). ALS is the standard Node primitive for this. 3. **Handler.** Reads ALS context, calls `runSkillBare`, writes a fresh audit row with `origin='tool_call'` so operators can distinguish agent-driven invocations from operator-typed `/` calls (`origin='user'`). Returns the model's text as the tool result; the parent Ollama turn composes its final user-facing reply on top. 4. **Permission tier.** Auto-allow. Cost cap is the backstop (Phase 1 ollama skills are free; Phase 2 unlocks Claude-tier skills with a per-skill cost cap). -**Recursion safety (load-bearing).** A skill called as a tool MUST NOT call any tool itself — otherwise an Ollama agent could trigger `skills__foo` which calls `skills__foo` infinitely. `runSkillBare` posts to `/api/chat` with no `tools` field, so the sub-call has no tool surface. `skill-tools.test.ts` asserts the outgoing fetch body has no `tools` key — a regression breaks CI before production. +**Recursion safety (load-bearing).** A skill body must not be able to call itself. Direct recursion is prevented by filtering the skill's own `skills__` entry out of the MCP catalog `runSkillBareWithTools` hands to `runToolLoop` (see `commands.ts::runSkillBareWithTools`). Indirect recursion (A → `skills__B` → `skills__A`) is bounded by two backstops in series: `skill.maxTurns` caps the tool-loop iterations for each invocation, and the shared loop detector (`policy.ts::createLoopDetector`, fresh per invocation, threshold 3 identical `(tool, input)` calls) trips before a deep cycle materializes. `skill-tools.test.ts` asserts the self-filter; a regression breaks CI before production. **Phase 2 deferred.** Cross-engine tool calls (Ollama agent → Sonnet skill) would land via the same SDK MCP server already used for integrations. Phase 1's ollama-tier restriction sidesteps the cost-escalation question (a misbehaving Ollama agent calling a `tier: primary` skill 100× would burn real $$$). When Phase 2 lands, expect a per-skill cost cap and a `confirm`-tier option for Claude-backed tool calls. diff --git a/docs/FEATURES.md b/docs/FEATURES.md index 9c6a99c..4efb428 100644 --- a/docs/FEATURES.md +++ b/docs/FEATURES.md @@ -12,7 +12,7 @@ The complete feature list, grouped by theme. See [../README.md](../README.md) fo - **Customizable persona via `SOUL.md` + `SOLRAC.md`** — two operator-editable markdown files at the launch directory. `SOUL.md` (voice, stance, safety) ships with the package and is read once at boot. `SOLRAC.md` (operator overlay: who runs it, channel posture, project context) is re-read every turn so live edits land on the next message without a restart. See [USAGE.md#customizing-solrac-soulmd-and-solracmd](./USAGE.md#customizing-solrac-soulmd-and-solracmd). - **Slash commands** — `/help`, `/status`, `/context`, `/clear`, `/compact` give the operator visibility and control over conversation context, spend, and session state without leaving Telegram. Both `/cmd` and `:cmd` invoke the same handler (`:` avoids Telegram's auto-link on bold text). -- **Operator-defined skills** — drop a `SKILL.md` into `$SOLRAC_SKILLS_DIR//` and that filename becomes a slash command on the next boot. Tool-less single-turn prompts with `{{args}}` templating; tier defaults to `SOLRAC_DEFAULT_ENGINE` (free on Ollama deploys); cost-capped under the existing per-chat hourly budget. Optional `tool: true` frontmatter exposes the skill as a callable MCP tool to the local Ollama agent (Phase 1: `tier: ollama` only) so natural-language requests can route through your prompts. Off by default; enable with `SOLRAC_SKILLS_ENABLED=true`. +- **Operator-defined skills** — drop a `SKILL.md` into `$SOLRAC_SKILLS_DIR//` and that filename becomes a slash command on the next boot. `{{args}}` templating; per-skill `max_turns` (1–10) so a single-shot text transform stays bounded while an agentic skill (e.g. `notion_search` → `notion_create_page`) gets headroom; the body runs with the same Claude Code tool preset (Claude tiers) or integrations MCP catalog (Ollama tier) as a normal turn, under the same three-tier policy, cost cap, and loop detector. Optional `requires:` frontmatter gates a skill on named integrations being loaded at boot — missing deps → skill skipped, never appears in `/help` or autocomplete. Optional `tool: true` exposes the skill as a callable MCP tool to the local Ollama agent (Phase 1: `tier: ollama` only) so natural-language requests can route through your prompts. Off by default; enable with `SOLRAC_SKILLS_ENABLED=true`. - **Scheduled tasks** — drop a `TASK.md` into `$SOLRAC_TASKS_DIR//` and the prompt fires on its configured schedule (`every 1h`, `daily_at 09:00`, `at 2026-05-15T13:00:00Z`) into a configured chat. Engine inheritance (defaults to `config.defaultEngine`), per-task `max_cost_usd`, boot catch-up jitter; fires synthesize updates through the same turn queue so all existing safety machinery applies. `/tasks` lists loaded tasks with last + next fire; `/tasks run ` triggers on demand. Off by default; enable with `SOLRAC_TASKS_ENABLED=true`. See [USAGE.md#scheduled-tasks](./USAGE.md#scheduled-tasks). ## Transport diff --git a/docs/GLOSSARY.md b/docs/GLOSSARY.md index a65a505..ac79690 100644 --- a/docs/GLOSSARY.md +++ b/docs/GLOSSARY.md @@ -78,7 +78,7 @@ Terms that recur across Solrac's codebase and docs. Alphabetical. **skill** — User-level Claude Code skill in `.claude/skills//SKILL.md`. Available to the agent via the SDK's preset systemPrompt + tool routing. v1 doesn't enumerate skills explicitly in the systemPrompt — that's [OQ#11](./ROADMAP.md#oq11-skill-router). -**Solrac skill (operator-defined)** — Distinct from the Claude Code skill above. A `SKILL.md` file under `$SOLRAC_SKILLS_DIR//` that defines a Telegram slash command (`/`) without code changes. Loaded ONCE at boot by `skills.ts::loadSkillsSync`; runs as a single tool-less turn (`runSkill` in `commands.ts`). Tier defaults to `SOLRAC_DEFAULT_ENGINE` so an Ollama-default deploy gets free skills automatically. Optional `tool: true` frontmatter additionally exposes the skill as a callable MCP tool to the Ollama agent — see **skill tool**. Disabled by default (`SOLRAC_SKILLS_ENABLED=false`). See [USAGE.md#skills-operator-defined-commands](./USAGE.md#skills-operator-defined-commands). +**Solrac skill (operator-defined)** — Distinct from the Claude Code skill above. A `SKILL.md` file under `$SOLRAC_SKILLS_DIR//` that defines a Telegram slash command (`/`) without code changes. Loaded ONCE at boot by `skills.ts::loadSkillsSync`; runs via `runSkill` (Claude tiers) or `runOllamaSkill` (Ollama) in `commands.ts`. The body sees the same tool surface a normal turn does (Claude Code preset on Claude tiers; the integrations MCP catalog via `runToolLoop` on Ollama when tools are wired) — bounded by the per-skill `max_turns` frontmatter (default 1, max 10) and constrained by the same three-tier policy + cost cap + loop detector + `canUseTool` confirm UX as a regular turn. Tier defaults to `SOLRAC_DEFAULT_ENGINE` so an Ollama-default deploy gets free skills automatically. Optional `requires:` frontmatter gates the skill on named integrations being loaded at boot (missing deps → silently absent from `/help` + autocomplete). Optional `tool: true` additionally exposes the skill as a callable MCP tool to the Ollama agent — see **skill tool**. Disabled by default (`SOLRAC_SKILLS_ENABLED=false`). See [USAGE.md#skills-operator-defined-commands](./USAGE.md#skills-operator-defined-commands). **skill tool** — A Solrac skill with `tool: true` frontmatter, exposed to the Ollama agent's tool catalog as `mcp__solrac__skills__` (wire format on Ollama: `skills__`). The model decides when to call it from natural language; the tool description is `skill.description`; input schema is `{ args: string }`. Phase 1 restriction: requires `tier: ollama` (free, no cross-engine cost surprises). Auto-allow permission tier; cost cap is the backstop. Built by `skill-tools.ts::buildSkillTools`. Per-turn context (chatId, fromId, updateId, parentAuditId) propagates via `node:async_hooks::AsyncLocalStorage` (`skillToolCtx`) — the SDK tool-handler signature `(args, extra)` leaves no slot for chat context, and concurrent turns require race-free isolation. Audit row tagged `origin='tool_call'` to distinguish from operator-typed slash invocations. diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 79c0acf..782563d 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -316,9 +316,13 @@ Trade-off: every token in systemPrompt ships on every turn. If the registry is 5 **Status:** Phase 1 shipped (Ollama-only). See `src/skill-tools.ts` and [USAGE.md#skills-as-tools-phase-1-ollama-only](./USAGE.md#skills-as-tools-phase-1-ollama-only). -A `SKILL.md` with `tool: true` frontmatter is exposed to the Ollama agent's tool catalog as `mcp__solrac__skills__`. The model decides when to call from natural language; the description is `skill.description`; the schema is `{ args: string }`. Auto-allow tier; cost cap is the backstop. Phase 1 restricted to `tier: ollama` skills (free) to sidestep the cost-escalation question. Audit row tagged `origin='tool_call'`. +Two distinct axes — kept separate because they have different cost-exposure shapes: -**Phase 2 (deferred).** Expose to Claude tiers via the existing `solrac` MCP server. Lift the `tier: ollama` restriction; add per-skill cost cap; consider `confirm`-tier gating on Claude-backed tool calls so a runaway Ollama agent can't burn $$$ silently. +1. **Skills *using* tools (shipped on both tiers).** A skill body — Claude or Ollama — runs with the same tool surface a regular turn does (Claude Code preset on Claude; integrations MCP catalog on Ollama). Bounded by per-skill `max_turns` frontmatter (1–10, default 1) and the same three-tier policy + cost cap + loop detector. Pure text-transform skills stay cheap with `max_turns: 1`; agentic skills (`/log` chaining `notion_search` → `notion_create_page`) declare what they need. + +2. **Skills *callable as* tools by the agent (Phase 1: Ollama-only).** A `SKILL.md` with `tool: true` frontmatter is exposed to the Ollama agent's tool catalog as `mcp__solrac__skills__`. The model decides when to call from natural language; the description is `skill.description`; the schema is `{ args: string }`. Auto-allow tier; cost cap is the backstop. Phase 1 restricted to `tier: ollama` skills (free) to sidestep the cost-escalation question (a misbehaving Ollama agent calling a `tier: primary` skill 100× would burn real $$$). Audit row tagged `origin='tool_call'`. + +**Phase 2 (deferred) — axis 2 expansion.** Expose tool-callable skills to Claude tiers via the existing `solrac` MCP server. Lift the `tier: ollama` restriction on `tool: true`; add a per-skill `max_cost_usd` cap separate from the chat-level cap; consider `confirm`-tier gating on Claude-backed tool-callable skills so the operator approves each cross-engine escalation. **Phase 3 (deferred).** Streamed skill output (currently the agent waits for the full skill reply before continuing); per-skill telemetry surface in `/status` or a dedicated `/skills` slash command. diff --git a/docs/USAGE.md b/docs/USAGE.md index 556c662..60ec238 100644 --- a/docs/USAGE.md +++ b/docs/USAGE.md @@ -340,7 +340,9 @@ The directory path comes from `SOLRAC_SKILLS_DIR` (default `./skills`, resolved name: summarize # required, [a-z0-9_]{1,32}, must not collide with built-in commands description: Summarize the URL or pasted text in 3 bullets. # required, ≤256 chars tier: primary # optional, primary|secondary|ollama, default = SOLRAC_DEFAULT_ENGINE +max_turns: 1 # optional, integer in [1,10], default 1. Model-turn budget for the skill body. Pure text-transforms want 1; agentic skills that chain tool calls (e.g. `notion_search` → `notion_create_page`) need headroom. Doubles as `maxIterations` for the Ollama tool loop. tool: false # optional, default false. When true, also expose this skill as a callable MCP tool to the Ollama agent (Phase 1: requires tier: ollama). +requires: notion # optional, integration deps. Bare string OR array (`requires: [notion, gmail]`). When any name is missing from the loaded integrations at boot, the skill is skipped with a `skills.load_error` warn — it never appears in `/help` or Telegram autocomplete. Omit for unconditional load. --- You are a concise summarizer. Produce exactly 3 bullets, no preamble. @@ -348,29 +350,34 @@ Input: {{args}} ``` -Three frontmatter fields, plus a body. The body is a Claude prompt — `{{args}}` is the only placeholder and gets replaced with whatever the user typed after the command (e.g. `/summarize ` → `args = ""`). The substitution is literal text-for-text; no escaping, no nested templating. +Two required fields (`name`, `description`), four optional (`tier`, `max_turns`, `tool`, `requires`), plus a body. The body is a prompt template — `{{args}}` is the only placeholder and gets replaced with whatever the user typed after the command (e.g. `/summarize ` → `args = ""`). The substitution is literal text-for-text; no escaping, no nested templating. The frontmatter parser supports a YAML *subset*: `key: scalar`, `key: [a, b, c]` string arrays, single- or double-quoted strings, integers, booleans. Multi-line strings, anchors, and nested maps are NOT supported and produce a clear error pointing at the offending line. ### What skills can do -Skills are tool-less in v1 — `Bash`, `Edit`, `Write`, `WebFetch`, `WebSearch`, `Agent`, `Task`, and `NotebookEdit` are explicitly disallowed; `canUseTool` denies everything else by default. Skills run a single Claude turn (`maxTurns: 1`, no `resume`) and reply with the model's text output verbatim (HTML-escaped). +Skills run with the full tool surface their tier provides, bounded by `max_turns` (default 1): -This means skills are best for: +- **Claude tiers (`primary` / `secondary`)** — the body sees the same Claude Code tool preset a normal turn does (`Bash`, `Read`, `Edit`, `Write`, `WebFetch`, `WebSearch`, plus every `mcp__solrac__*` integration tool). `Agent` and `Task` stay denied at the SDK + policy layers — no sub-agents from inside a skill. +- **Ollama tier** — when the deploy has integrations + Ollama tools enabled, the body routes through the same `runToolLoop` driver as a regular Ollama turn and sees the full MCP catalog (minus its own `skills__` entry — see "Skills as tools" below). Without integrations / tools, the path falls back to a single-shot `/api/chat` round trip. -- **Text transformations** (summarize, translate, rephrase, format). -- **Templated prompts** the operator wants to invoke quickly without retyping. -- **Quick lookups** that don't require fetching anything (Claude's training data only). +Every tool call (both tiers) flows through the same three-tier policy (auto-allow / auto-deny / Telegram-confirm), the same `PreToolUse` cost-cap + loop-detector hooks, and the same `canUseTool` interactive confirm UX as a normal turn. A skill body that calls `Bash(rm -rf /)` gets denied identically — there's no skill-specific bypass. + +`max_turns` is the per-skill model-turn budget. A pure text-transform (summarize, translate) wants `max_turns: 1`. An agentic skill that chains tool calls (e.g. `/log` doing `notion_search` → `notion_create_page` → return URL) needs a few more; the bound caps runaway behavior the same way the SDK's `maxTurns` does for a regular turn. Hard ceiling is 10; the cost cap is the ultimate backstop on Claude tiers, `OLLAMA_MAX_TOOL_ITERATIONS` on Ollama. -If you need tool use, file an issue — that's a v1.1 conversation. +This means skills are good for: + +- **Text transformations** (summarize, translate, rephrase, format) — `max_turns: 1`, no `requires:`. +- **Integration-backed actions** (append a Notion row, send a Gmail draft, fetch a URL and summarize) — `max_turns: 3–5`, `requires: notion` (or whatever). +- **Templated prompts** the operator wants to invoke quickly without retyping. **Tier inherits the deploy default.** When `tier:` is omitted, the skill runs on whatever `SOLRAC_DEFAULT_ENGINE` resolves to (`ollama`, `primary`, or `secondary`). Override per-skill with an explicit `tier:` value. `tier: ollama` is rejected at load if `SOLRAC_DEFAULT_ENGINE != ollama` (PR-B removed the `>` prefix; Ollama is reachable only as the deploy default). ### Cost & caps -A Claude-tier skill (`primary` or `secondary`) costs a real Claude turn. The audit row is tagged `claude:::skill:` so cost rolls up under the existing per-chat hourly cap (`HOURLY_COST_CAP_USD`) and the global cap. The pre-flight cap check fires *before* the SDK call — a cap-rejected skill costs $0. +A Claude-tier skill (`primary` or `secondary`) costs real Claude turns — up to `skill.maxTurns` of them. The audit row is tagged `claude:::skill:` so cost rolls up under the existing per-chat hourly cap (`HOURLY_COST_CAP_USD`) and the global cap. The pre-flight cap check fires *before* the SDK call — a cap-rejected skill costs $0. Mid-turn cap exhaustion is caught by the `PreToolUse` hook (same path as a normal turn) and stamped into the audit row as `policy_deny:cost_cap_exceeded: …`. -An Ollama-tier skill is free. The audit row is tagged `ollama::skill:` with `cost_usd = 0`; the per-chat hourly cap pre-flight is skipped (a chat throttled by Claude burn shouldn't lose access to local inference). Ollama skills run a single non-streaming `/api/chat` round trip — no history, no SOLRAC.md overlay, no tool loop, no streaming stub. Tool use is unavailable on this path even if `OLLAMA_TOOLS_ENABLED=true`; skills are tool-less by design across both engines. +An Ollama-tier skill is free. The audit row is tagged `ollama::skill:` with `cost_usd = 0`; the per-chat hourly cap pre-flight is skipped (a chat throttled by Claude burn shouldn't lose access to local inference). When integrations + Ollama tools are enabled the skill body routes through the same `runToolLoop` a regular Ollama turn uses, capped at `skill.maxTurns` iterations and constrained by the shared loop detector. Without those wired (e.g. `OLLAMA_TOOLS_ENABLED=false` or no integrations loaded), the body falls back to a single non-streaming `/api/chat` round trip — no history, no SOLRAC.md overlay, no tool loop, no streaming stub. Either way, no Claude burn. ### Failure modes @@ -379,6 +386,7 @@ An Ollama-tier skill is free. The audit row is tagged `ollama::skill:` entry is filtered out of the catalog the body sees, so direct recursion (`/foo` → `skills__foo`) is structurally impossible. Indirect cycles (A → `skills__B` → `skills__A`) are bounded by `skill.maxTurns` plus the shared loop detector (third identical `(tool, input)` in a turn → deny). A test (`skill-tools.test.ts`) asserts the self-filter; a regression breaks CI. Audit visibility: every tool-called skill writes its own `audit` row tagged `origin='tool_call'` and `model='ollama::skill:'`. Operator-typed `/` invocations stay tagged `origin='user'`, so the two surfaces are distinguishable in the audit log: @@ -423,7 +431,7 @@ sqlite3 data/solrac.sqlite "SELECT started_at, origin, model, status FROM audit Description quality matters: the model's natural-language → tool routing depends entirely on `skill.description`. Bad descriptions → wrong tool fires or misses. Write descriptions as if you're describing a tool to a model. -Latency: a tool-called skill is one extra `/api/chat` round trip mid-loop. With `OLLAMA_MAX_TOOL_ITERATIONS=8` and `OLLAMA_TIMEOUT_MS=60000`, two skill calls per turn is roughly the practical ceiling on a busy turn before timeout risk. +Latency: a tool-called skill costs at least one extra `/api/chat` round trip mid-loop, and more if the skill body itself loops over tools (bounded by `skill.maxTurns`). With `OLLAMA_MAX_TOOL_ITERATIONS=8` and `OLLAMA_TIMEOUT_MS=60000`, two skill calls per turn is roughly the practical ceiling on a busy turn before timeout risk; setting a generous `max_turns` on the skill multiplies that. Use `max_turns: 1` for fire-and-return skills (text transforms); bump it only when the skill genuinely needs to chain calls. Example: `skills/tldr/SKILL.md` ships with `tool: true`. Type `summarize this: ` to your Ollama deploy and watch the audit log — you'll see two rows: the Ollama parent turn (`origin: user`, `model: ollama:`) plus the skill tool call (`origin: tool_call`, `model: ollama::skill:tldr`). diff --git a/src/agent.ts b/src/agent.ts index c808359..13bee93 100644 --- a/src/agent.ts +++ b/src/agent.ts @@ -106,7 +106,10 @@ import type { SessionStore, SessionTier } from "./session.ts"; import { mdToTelegramHtml } from "./markdown.ts"; import { htmlEscapeText, type TelegramClient } from "./telegram.ts"; -const LOOP_THRESHOLD = 3; +// Exported so the skill runner in commands.ts uses the same threshold as +// runAgent — diverging values would make loop-detection behavior depend on +// invocation surface, which is surprising. +export const LOOP_THRESHOLD = 3; const TELEGRAM_TEXT_MAX = 3800; const EDIT_THROTTLE_MS = 1500; diff --git a/src/commands.ts b/src/commands.ts index 9deb214..53eb65d 100644 --- a/src/commands.ts +++ b/src/commands.ts @@ -60,26 +60,50 @@ * - policy.ts::createCostCapGuard — `/compact` pre-flight check */ -import { query, type Options } from "@anthropic-ai/claude-agent-sdk"; +import { + query, + type CanUseTool, + type McpSdkServerConfigWithInstance, + type Options, + type SdkMcpToolDefinition, +} from "@anthropic-ai/claude-agent-sdk"; import type { Message } from "@grammyjs/types"; import { mkdir } from "node:fs/promises"; import { join } from "node:path"; -import { sanitizedSubprocessEnv } from "./agent.ts"; +import { LOOP_THRESHOLD, sanitizedSubprocessEnv } from "./agent.ts"; import type { Allowlist } from "./allowlist.ts"; import type { ChatHistoryRow, SolracDb } from "./db.ts"; +import type { IntegrationTier } from "./integrations.ts"; import { log } from "./log.ts"; import { mdToTelegramHtml } from "./markdown.ts"; +import { buildToolCapabilityNote } from "./ollama.ts"; +import { mcpToOllamaTools, runToolLoop } from "./ollama-tools.ts"; import { + createLoopDetector, + createPostToolUseHook, + createPreToolUseHook, truncateAuditPrompt, + type ConfirmationBroker, + type ConfirmHandle, type CostCapGuard, type GlobalCostCapGuard, + type PolicyDenyEvent, } from "./policy.ts"; import type { SessionStore, SessionTier } from "./session.ts"; +import { skillToolCtx } from "./skill-tools.ts"; import { renderSkillTemplate, type Skill, type SkillRegistry, } from "./skills.ts"; + +// Mirror of `skill-tools.ts::SKILL_TOOL_PREFIX` — duplicated rather than +// imported to keep the cyclic surface between commands.ts and skill-tools.ts +// minimal. `skillToolCtx` (the AsyncLocalStorage instance) is imported above; +// it's safe because the import is only dereferenced inside functions, never +// at module load time. If you rename `SKILL_TOOL_PREFIX` in skill-tools.ts, +// rename it here too; `skill-tools.test.ts` covers the prefix invariant. +const SKILL_TOOL_PREFIX = "skills__"; import { nextRunAt, type SchedulerHandle, @@ -303,9 +327,13 @@ export function parseCommand(text: string, deps: ParseCommandDeps): ParseCommand // --------------------------------------------------------------------------- // Subset of OllamaRunDeps the skill path needs. Skills don't reuse runOllamaTurn -// (no history, no SOLRAC.md overlay, no streaming stub, no tool loop) — they -// hit `/api/chat` once with stream:false and return the model's text. Only the -// connection params + soul are required. +// because they don't carry history or SOLRAC.md overlays and have no streaming +// stub — but with PR-skills-tools they DO route through the same tool loop +// (`runToolLoop`) when tool deps are wired, so the skill body can call +// `mcp__solrac__*` / `skills__*` tools end-to-end. When tool deps are absent +// or `tools` is empty, `runSkillBare` falls through to the single-shot +// /api/chat path (preserving back-compat for pure text-transform skills +// like `tldr`). export interface OllamaSkillDeps { url: string; model: string; @@ -316,6 +344,15 @@ export interface OllamaSkillDeps { soul: string; // Injectable for tests; production passes `globalThis.fetch`. fetch?: typeof fetch; + // PR-skills-tools — when all three are wired, runSkillBare routes the + // skill body through `runToolLoop` so the model can call MCP tools the + // same way `runOllamaTurnWithTools` does. The skill's own MCP tool entry + // (`skills__`) is filtered out of the catalog at dispatch time to + // prevent direct recursion; indirect recursion (skill A → skills__B → + // skills__A) is bounded by `runToolLoop`'s `maxIterations`. + tools?: ReadonlyArray>; + toolTiers?: ReadonlyMap; + broker?: Pick; } export interface RunCommandDeps { @@ -356,6 +393,20 @@ export interface RunCommandDeps { // registry is empty or absent. taskRegistry?: TaskRegistry; triggerScheduledTask?: (name: string) => TriggerNowResult; + // Skills-as-agents (PR-skills-tools). Optional so legacy callers / tests + // that don't drive tool-using skills keep working — when omitted, skill + // tool calls fall through to `defaultAllowAll` (no interactive UX, no + // confirm prompts). Production wires both, mirroring `runAgent`. + createCanUseTool?: (args: { + chatId: number; + auditId: number; + pendingHandles: Map; + }) => CanUseTool; + // In-process MCP server hosting operator + blessed integrations. Skills + // see the same `mcp__solrac__` surface as a normal turn when this is + // set; `null` means integrations are disabled and skills run without an + // MCP catalog (built-in SDK tools like Read/Bash still apply). + mcpServer?: McpSdkServerConfigWithInstance | null; } const COMPACT_SOURCE_LIMIT = 50; @@ -1178,22 +1229,23 @@ async function runUnknown( // session-id drop), and the duplication is contained. A v1.1 refactor that // extracts `runOneShotClaudeTurn` is fine when a third caller arrives. -const SKILL_DISALLOWED_TOOLS = [ - "Agent", - "Task", - "Bash", - "Write", - "Edit", - "NotebookEdit", - "WebFetch", - "WebSearch", -]; +// Agentic skills can use the full Claude Code tool preset; sub-agents stay +// off at the SDK layer (belt-and-suspenders with policy.ts::SUBAGENT_DENY_TOOLS). +const SKILL_DISALLOWED_TOOLS = ["Agent", "Task"]; // Cap on the model's reply text we forward to Telegram. Telegram messages cap // at 4096 chars; we leave headroom for formatting overhead. If the model // produces more, the tail is dropped with a `…` marker. const SKILL_REPLY_MAX = 3500; +// Fallback canUseTool when `deps.createCanUseTool` is absent (tests, dev +// harnesses). Mirrors `agent.ts::defaultAllowAll`; logged with a `skill` +// prefix so audit-log greps can tell the surfaces apart. +const skillDefaultAllowAll: CanUseTool = async (toolName) => { + log.info("skill.tool_allow_all", { toolName }); + return { behavior: "allow" }; +}; + async function runSkill( deps: RunCommandDeps, msg: Message, @@ -1238,23 +1290,70 @@ async function runSkill( return; } - // 2. Render the prompt template and run a one-shot, tool-less turn. + // 2. Insert audit row up-front so the PreToolUse hook (which fires + // mid-turn for every tool call) has a real auditId to reference. + // Mirrors runAgent's ordering — same reason. + const auditId = deps.db.insertAudit({ + chatId: msg.chat.id, + fromId: msg.from!.id, + updateId, + prompt: truncateAuditPrompt(msg.text ?? ""), + startedAt, + model: engineModelTag, + }); + + // 3. Render the prompt template and run the skill turn with full tool + // surface (was tool-less in v1 — see docs/USAGE.md#skills). const prompt = renderSkillTemplate(skill.body, args); const cwd = join(deps.dataDir, "workspaces", String(msg.chat.id)); await mkdir(cwd, { recursive: true }); + // Per-turn state for the hook trio. Same shape as runAgent (agent.ts:273-310). + // `loopDetector` and `pendingHandles` are created fresh per skill invocation + // so prior turns can't influence loop counts or stale confirm handles. + const loopDetector = createLoopDetector({ threshold: LOOP_THRESHOLD }); + const pendingHandles = new Map(); + const policyDeny: { event: PolicyDenyEvent | null } = { event: null }; + + const canUseTool: CanUseTool = + deps.createCanUseTool?.({ + chatId: msg.chat.id, + auditId, + pendingHandles, + }) ?? skillDefaultAllowAll; + + const preToolUseHook = createPreToolUseHook({ + chatId: msg.chat.id, + getAuditId: () => auditId, + costGuard: deps.costGuard, + globalCostGuard: deps.globalCostGuard, + loopDetector, + onPolicyDeny: (event) => { + policyDeny.event = event; + }, + }); + const postToolUseHook = createPostToolUseHook({ pendingHandles }); + const options: Options = { cwd, model: modelId, - maxTurns: 1, + maxTurns: skill.maxTurns, permissionMode: "default", tools: { type: "preset", preset: "claude_code" }, + // Belt-and-suspenders: policy.ts also denies Agent/Task; the SDK-level + // disallow keeps them out of the model's tool catalog entirely so the + // model can't try and bounce off the policy layer. disallowedTools: SKILL_DISALLOWED_TOOLS, - canUseTool: async () => ({ - behavior: "deny", - message: "tools are disabled for skills in v1", - }), + canUseTool, env: sanitizedSubprocessEnv(), + ...(deps.mcpServer && { + mcpServers: { solrac: deps.mcpServer }, + }), + hooks: { + PreToolUse: [{ hooks: [preToolUseHook] }], + PostToolUse: [{ hooks: [postToolUseHook] }], + PostToolUseFailure: [{ hooks: [postToolUseHook] }], + }, }; let resultText = ""; @@ -1294,18 +1393,17 @@ async function runSkill( }); } + // PreToolUse-hook denies (cost cap, loop) surface as a SDK turn that often + // looks successful (the agent recovers gracefully from a hook deny). Promote + // the captured policy reason into errorMessage so the audit row + reply both + // tell the operator what actually happened. + if (policyDeny.event !== null && errorMessage === null) { + errorMessage = `policy_deny:${policyDeny.event.reason}: ${policyDeny.event.message}`; + } + const trimmed = resultText.trim(); if (errorMessage === null && trimmed === "") errorMessage = "empty_response"; - const auditId = deps.db.insertAudit({ - chatId: msg.chat.id, - fromId: msg.from!.id, - updateId, - prompt: truncateAuditPrompt(msg.text ?? ""), - startedAt, - model: engineModelTag, - }); - if (errorMessage !== null) { deps.db.updateAuditEnd({ id: auditId, @@ -1409,16 +1507,20 @@ function writeSkillAudit( // this with their own audit + reply / return-string handling. // // **RECURSION SAFETY INVARIANT** — this function MUST NOT add a `tools` field -// to the outgoing `/api/chat` body. Skills are tool-less by design; if a -// future "smart skills" change exposes tools here, a tool-callable skill -// would gain the ability to call itself (or another tool-callable skill) -// → infinite recursion. The recursion-safety test in `skill-tools.test.ts` -// asserts the request body has no `tools` key — keep both pieces in sync. +// to the outgoing `/api/chat` body. PR-skills-tools lifts the "tool-less" +// constraint: when `OllamaSkillDeps` is wired with `tools/toolTiers/broker`, +// the skill body sees the full MCP catalog MINUS its own `skills__` +// entry (recursion guard). The regression test in `skill-tools.test.ts` now +// asserts that filter — keep both in sync. export interface RunSkillBareResult { readonly text: string; readonly errorMessage: string | null; readonly inputTokens: number | null; readonly outputTokens: number | null; + // PR-skills-tools — populated when the tool-loop path runs (else empty). + // Mirrors `ToolLoopResult.toolCallSummaries` so callers can persist into + // the audit `tool_calls` column. + readonly toolCallSummaries: ReadonlyArray<{ name: string; input: unknown }>; } export async function runSkillBare( @@ -1426,6 +1528,18 @@ export async function runSkillBare( skill: Skill, args: string, ): Promise { + // PR-skills-tools dispatch. Tool surface wired → route through the tool + // loop so the body can call `mcp__solrac__*` / `skills__*` exactly like a + // regular Ollama turn. Mirrors the same gate in `runOllamaTurn`. + if ( + ollama.tools !== undefined && + ollama.tools.length > 0 && + ollama.toolTiers !== undefined && + ollama.broker !== undefined + ) { + return runSkillBareWithTools(ollama, skill, args); + } + const prompt = renderSkillTemplate(skill.body, args); const messages = [ { role: "system", content: ollama.soul }, @@ -1505,9 +1619,103 @@ export async function runSkillBare( errorMessage, inputTokens, outputTokens, + toolCallSummaries: [], }; } +// --------------------------------------------------------------------------- +// runSkillBareWithTools — PR-skills-tools tool-loop path +// --------------------------------------------------------------------------- +// +// Mirrors `runOllamaTurnWithTools` (ollama.ts) but skill-shaped: +// - No history, no SOLRAC.md overlay, no streaming UX (skills already cap +// their reply by template; live rendering would muddy the operator's +// intent baked into the skill body). +// - Recursion guard: this skill's own MCP entry (`skills__`) is +// filtered out of the catalog the body sees. Indirect recursion (A→B→A) +// is bounded by `runToolLoop`'s `maxIterations`. +// - `maxTurns` from the SKILL.md frontmatter doubles as `maxIterations` +// so the operator controls the budget per skill. +// +// Caller (`runOllamaSkill` for / typing, `skill-tools.ts` for +// agent-driven invocations) is responsible for wrapping this in +// `skillToolCtx.run(...)` so any nested `skills__*` calls have ALS context. +async function runSkillBareWithTools( + ollama: OllamaSkillDeps, + skill: Skill, + args: string, +): Promise { + // These are guaranteed non-undefined by the dispatch gate above. + const allTools = ollama.tools!; + const toolTiers = ollama.toolTiers!; + const broker = ollama.broker!; + + // Recursion guard. The skill's body must not see its own MCP entry. The + // model can still call OTHER skills tools; cycles longer than 1 are bounded + // by `maxIterations` and the loop detector. + const selfToolName = `${SKILL_TOOL_PREFIX}${skill.name}`; + const filteredTools = allTools.filter((t) => t.name !== selfToolName); + const toolMap = new Map(filteredTools.map((t) => [t.name, t])); + const toolDefs = mcpToOllamaTools(filteredTools); + const toolNames = filteredTools.map((t) => t.name); + + const prompt = renderSkillTemplate(skill.body, args); + // Skills are tier-stable (`tier: ollama` for tool-callable skills, per + // skills.ts Phase 1 restriction). Build the capability note as the default- + // engine variant — accurate when the skill body runs on the deploy's main + // Ollama model, which is always the case today. + const capabilityNote = buildToolCapabilityNote(toolNames, true); + + const initialMessages = [ + { role: "system" as const, content: `${ollama.soul}\n\n${capabilityNote}` }, + { role: "user" as const, content: prompt }, + ]; + + const ac = new AbortController(); + const timer = setTimeout(() => ac.abort(), ollama.timeoutMs); + const loopDetector = createLoopDetector({ threshold: LOOP_THRESHOLD }); + + try { + const result = await runToolLoop( + { + fetch: ollama.fetch, + url: ollama.url, + model: ollama.model, + signal: ac.signal, + tools: toolMap, + toolTiers, + toolDefs, + broker, + loopDetector, + maxIterations: skill.maxTurns, + // Skill audit row is owned by the caller (runOllamaSkill or + // skill-tools.ts::buildOneSkillTool); pass 0 here. The audit + // correlation in `log.info("ollama.tool_loop_start", ...)` is + // best-effort for skills. + auditId: 0, + // chatId is only used for log correlation inside runToolLoop. The + // ALS context (skillToolCtx) carries the real chatId for any + // nested skill calls. + chatId: 0, + }, + { initialMessages }, + ); + const text = result.assistantText.trim(); + let errorMessage = result.errorMessage; + if (errorMessage === null && text === "") errorMessage = "empty_response"; + return { + text, + errorMessage, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + toolCallSummaries: result.toolCallSummaries, + }; + } finally { + clearTimeout(timer); + ac.abort(); + } +} + // Ollama-tier skill: one-shot `/api/chat` (stream:false), no history, no tool // loop, no streaming stub. Mirrors Claude runSkill's audit + reply shape so // operator-side observability is identical (`skill.done` log, audit row tagged @@ -1546,9 +1754,9 @@ async function runOllamaSkill( const ollama = deps.ollamaSkillDeps; const engineModelTag = `ollama:${ollama.model}:skill:${skill.name}`; - const { text: trimmed, errorMessage, inputTokens, outputTokens } = - await runSkillBare(ollama, skill, args); - + // Insert audit row BEFORE running so the ALS context can carry the real + // parentAuditId — nested `skills__*` calls record it in their own + // `origin='tool_call'` rows for the cross-skill audit story. const auditId = deps.db.insertAudit({ chatId: msg.chat.id, fromId: msg.from!.id, @@ -1558,11 +1766,29 @@ async function runOllamaSkill( model: engineModelTag, }); + // Wrap in `skillToolCtx.run(...)` so the body — when it reaches the + // tool-loop path — can call `skills__other` and have those handlers + // see (chatId, fromId, updateId, parentAuditId) via ALS. Cheap to wrap + // unconditionally; no-op when the body never reaches a tool. + const { text: trimmed, errorMessage, inputTokens, outputTokens, toolCallSummaries } = + await skillToolCtx.run( + { + chatId: msg.chat.id, + fromId: msg.from!.id, + updateId, + parentAuditId: auditId, + }, + () => runSkillBare(ollama, skill, args), + ); + + const toolCallsJson = + toolCallSummaries.length > 0 ? JSON.stringify(toolCallSummaries) : null; + if (errorMessage !== null) { deps.db.updateAuditEnd({ id: auditId, response: null, - toolCalls: null, + toolCalls: toolCallsJson, inputTokens, outputTokens, cacheCreationInputTokens: null, @@ -1585,7 +1811,7 @@ async function runOllamaSkill( deps.db.updateAuditEnd({ id: auditId, response: snippet(trimmed, 200), - toolCalls: null, + toolCalls: toolCallsJson, inputTokens, outputTokens, cacheCreationInputTokens: null, diff --git a/src/main.ts b/src/main.ts index 89d6c81..526581b 100644 --- a/src/main.ts +++ b/src/main.ts @@ -607,6 +607,12 @@ async function main(): Promise { // reference is shared as `EMPTY_INTEGRATIONS_TOOLS`) when integrations // are off so downstream `Array.isArray + length>0` checks work uniformly. let integrationTools: ReadonlyArray> = []; + // Names of integrations that successfully registered ≥1 tool. Consumed by + // `loadSkillsSync` to gate `requires:`-declared skills at boot rather than + // letting them fail opaquely on first invocation. A probe-failed integration + // (sources entry with `toolCount === 0`) does NOT satisfy the gate — a + // skill that calls notion_search needs the tool to actually exist. + let loadedIntegrationNames: ReadonlySet = new Set(); if (config.integrationsEnabled) { // PNX-168 — builtins now load from a static-import registry so the // compiled binary includes them. Operator dirs still load via dynamic @@ -624,6 +630,9 @@ async function main(): Promise { integrationToolTiers = result.toolTiers; integrationConfirmFormatters = result.confirmFormatters; integrationTools = result.tools; + loadedIntegrationNames = new Set( + result.sources.filter((s) => s.toolCount > 0).map((s) => s.name), + ); if (result.tools.length > 0) { integrationsMcpServer = createSdkMcpServer({ name: "solrac", @@ -655,7 +664,12 @@ async function main(): Promise { const skillRegistry: SkillRegistry = config.skillsEnabled ? (() => { const reserved = new Set(BOT_COMMAND_REGISTRY.map((c) => c.command)); - const result = loadSkillsSync(config.skillsDir, reserved, config.defaultEngine); + const result = loadSkillsSync( + config.skillsDir, + reserved, + config.defaultEngine, + loadedIntegrationNames, + ); logSkillLoadResult(config.skillsDir, result); return result.registry; })() @@ -764,6 +778,17 @@ async function main(): Promise { timeoutMs: config.ollamaTimeoutMs, }); } + // PR-skills-tools — attach the tool surface to ollamaSkillDeps AFTER + // integrationTools/skillTools are merged and the broker is built. The + // `buildSkillTools` closure earlier captures ollamaSkillDeps by + // reference, so mutating the same object reaches every captured site. + // Telegram broker is wired here; the web transport rewrites the broker + // field in webCommandDeps below for browser-routed confirm prompts. + if (ollamaSkillDeps && ollamaToolsActive) { + ollamaSkillDeps.tools = integrationTools; + ollamaSkillDeps.toolTiers = integrationToolTiers; + ollamaSkillDeps.broker = broker; + } // PR-B — Ollama is the recommended default; probe the daemon at boot so // operators see a misconfiguration immediately (vs. on first user turn). // Non-fatal: a slow-starting daemon may not be ready yet under systemd @@ -845,6 +870,11 @@ async function main(): Promise { schedulerRef ? schedulerRef.triggerNow(name) : { kind: "unknown_task", name }, + // Skills now run with full tool surface (see commands.ts::runSkill); + // share the same canUseTool factory + MCP server with runAgent so the + // interactive confirm UX and integration tool catalog are identical. + createCanUseTool, + mcpServer: integrationsMcpServer, }; // Web UI transport (optional). The `webClient` was built earlier so the // `webBroker` could capture it; reuse the same instance here so all bus @@ -853,9 +883,23 @@ async function main(): Promise { let webCommandDeps: RunCommandDeps | null = null; let webOllamaDeps: OllamaRunDeps | null = null; if (webClient) { + // Web-routed / invocations: rewrite the broker so confirm + // prompts ride the SSE bus rather than Telegram (mirrors the + // webOllamaDeps swap below). `tools` and `toolTiers` are unchanged — + // only the broker differs per transport. + const webOllamaSkillDeps: OllamaSkillDeps | null = commandDeps.ollamaSkillDeps + ? { + ...commandDeps.ollamaSkillDeps, + broker: + commandDeps.ollamaSkillDeps.broker !== undefined + ? webBroker! + : undefined, + } + : null; webCommandDeps = { ...commandDeps, tg: webClient, + ollamaSkillDeps: webOllamaSkillDeps, }; // Ollama-on-web path needs the web broker (not the Telegram broker) // so confirm prompts ride the SSE bus to the operator's browser diff --git a/src/ollama.ts b/src/ollama.ts index 31cc199..173fa19 100644 --- a/src/ollama.ts +++ b/src/ollama.ts @@ -180,7 +180,9 @@ export function buildOllamaCapabilityNote(opts: OllamaCapabilityNoteOpts): strin * Backwards-shaped helper for the tools-on path. Defers to * `buildOllamaCapabilityNote` so the §3c matrix is the single source of truth. */ -function buildToolCapabilityNote( +// Exported so the skill tool-loop runner in commands.ts can build the same +// capability note for skill bodies without duplicating the matrix. +export function buildToolCapabilityNote( toolNames: ReadonlyArray, isDefaultEngine: boolean, ): string { diff --git a/src/skill-tools.test.ts b/src/skill-tools.test.ts index 0388439..4621b83 100644 --- a/src/skill-tools.test.ts +++ b/src/skill-tools.test.ts @@ -2,21 +2,36 @@ * @fileoverview Unit tests for skill-as-tool dispatcher. * @proves Tool definition shape, naming, registry filtering, ALS context * propagation, audit row content, AND the load-bearing recursion- - * safety invariant: the skill handler's outgoing fetch body has no - * `tools` field. If a future change adds tool surface to skill - * execution, an Ollama agent calling skills__foo could trigger foo - * calling skills__foo → infinite loop. This test catches that - * regression at CI time. + * safety invariant. + * + * Recursion-safety invariant — PR-skills-tools: + * PR-skills-tools lifts the "tool-less skill body" constraint. Skill bodies + * now see the same MCP catalog Ollama turns see, MINUS the skill's own + * `skills__` entry. The filter is the load-bearing guard against a + * tool-callable skill recursing into itself (infinite loop). + * + * Two cases: + * 1. `OllamaSkillDeps` has no tool surface wired → falls through to the + * single-shot /api/chat path. Outgoing body has no `tools` field. + * (Back-compat for pure text-transform skills like `tldr`.) + * 2. `OllamaSkillDeps` HAS tools wired → routes through `runToolLoop`. + * Outgoing body HAS a `tools` field. The skill's own + * `skills__` entry is filtered out. + * + * Indirect recursion (A → skills__B → skills__A) is bounded by + * `runToolLoop`'s `maxIterations` (= `skill.maxTurns`) and the loop detector. * * Cross-references: * - src/skill-tools.ts — implementation - * - src/commands.ts::runSkillBare — pure-execution helper invoked by handler + * - src/commands.ts::runSkillBareWithTools — tool-loop path + * - src/commands.ts::runSkillBare — dispatcher; bare path for no-tools case */ import { afterEach, beforeEach, describe, expect, test } from "bun:test"; import { mkdtempSync, rmSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { z } from "zod"; import type { OllamaSkillDeps } from "./commands.ts"; import { openDb, type SolracDb } from "./db.ts"; import { @@ -69,6 +84,8 @@ function fakeSkill(overrides: Partial = {}): Skill { body: "Summarize: {{args}}", sourcePath: "/test/SKILL.md", tool: true, + maxTurns: 1, + requires: [] as ReadonlyArray, ...overrides, }); } @@ -180,17 +197,14 @@ describe("buildSkillTools — tool definition shape", () => { }); // --------------------------------------------------------------------------- -// RECURSION SAFETY INVARIANT — outgoing fetch body has no `tools` field +// RECURSION SAFETY INVARIANT — outgoing fetch body filters `skills__` // --------------------------------------------------------------------------- describe("RECURSION SAFETY — handler fetch body", () => { - // Capture every outgoing fetch the handler makes so we can assert its body. - // If a future change accidentally adds `tools` to the request body (e.g. - // a "smart skills" refactor), this test fails — and it MUST fail, because - // a tool-enabled skill could call itself or another tool-callable skill, - // creating infinite recursion. This is the parser-level guard the design - // depends on. - test("outgoing /api/chat body has no `tools` key", async () => { + // Case 1: no tool surface wired in OllamaSkillDeps → falls through to the + // single-shot /api/chat path. Body has no `tools` field. Back-compat with + // pre-PR-skills-tools text-transform skills (e.g. `tldr`). + test("no tool surface → outgoing /api/chat body has no `tools` key", async () => { const db = await tempDb(); let captured: { url: string; body: any } | null = null; const fakeFetch = (async ( @@ -201,7 +215,6 @@ describe("RECURSION SAFETY — handler fetch body", () => { url: String(input), body: init?.body ? JSON.parse(String(init.body)) : null, }; - // Return a successful Ollama response so the handler completes. return new Response( JSON.stringify({ message: { content: "summary" }, @@ -226,20 +239,120 @@ describe("RECURSION SAFETY — handler fetch body", () => { db, ollamaSkillDeps, }); - // Invoke the handler under the ALS context (matches what - // runOllamaTurnWithTools does). Without this, the handler errors. await skillToolCtx.run(TEST_CTX, async () => { await tools[0]!.handler({ args: "hello world" }, undefined); }); expect(captured).not.toBeNull(); expect(captured!.url).toBe("http://test/api/chat"); expect(captured!.body).not.toBeNull(); - // THE invariant. expect(captured!.body).not.toHaveProperty("tools"); - // Sanity check: the body has the expected fields. expect(captured!.body.stream).toBe(false); expect(Array.isArray(captured!.body.messages)).toBe(true); }); + + // Case 2: tool surface wired → routes through runToolLoop. Body HAS a + // `tools` field that includes other tools but EXCLUDES `skills__`. + // This is the load-bearing filter for direct-recursion safety. + test("tool surface wired → `tools` array excludes skills__", async () => { + const db = await tempDb(); + // Capture only the FIRST outgoing POST — runToolLoop may follow up with a + // cap-finalize POST that doesn't carry the `tools` field, which would + // overwrite our capture and mask the real first-round body. + let captured: { url: string; body: any } | null = null; + const fakeFetch = (async ( + input: string | URL | Request, + init?: RequestInit, + ): Promise => { + if (captured === null) { + captured = { + url: String(input), + body: init?.body ? JSON.parse(String(init.body)) : null, + }; + } + // NDJSON-shaped reply (single frame + trailing newline) so the streaming + // parser in runToolLoop exits round 1 with a clean assistant text and + // skips the cap-finalize path. + const frame = + JSON.stringify({ + message: { content: "done", role: "assistant" }, + done: true, + prompt_eval_count: 10, + eval_count: 5, + }) + "\n"; + return new Response(frame, { + status: 200, + headers: { "content-type": "application/x-ndjson" }, + }); + }) as unknown as typeof fetch; + + // Hand-craft a minimal SdkMcpToolDefinition for the skill's own entry + // and one other tool. The handlers never fire because the model returns + // no tool_calls; only the catalog shape matters here. `inputSchema` must + // be a Zod raw shape (`{key: ZodType}`) because `mcpToOllamaTools` runs + // it through `z.object(...)` to derive the JSON schema for /api/chat. + const makeFakeTool = ( + name: string, + ): import("@anthropic-ai/claude-agent-sdk").SdkMcpToolDefinition => ({ + name, + description: `fake ${name}`, + inputSchema: { args: z.string() }, + handler: async () => ({ content: [{ type: "text", text: "ok" }] }), + }); + + const selfName = `${SKILL_TOOL_PREFIX}tldr`; + const otherSkillName = `${SKILL_TOOL_PREFIX}other`; + const integrationToolName = "notion_search"; + const wiredTools = [ + makeFakeTool(selfName), + makeFakeTool(otherSkillName), + makeFakeTool(integrationToolName), + ]; + const toolTiers = new Map< + string, + import("./integrations.ts").IntegrationTier + >([ + [selfName, "auto"], + [otherSkillName, "auto"], + [integrationToolName, "auto"], + ]); + const fakeBroker = { + request: async () => { + throw new Error("broker.request should not be called in this test"); + }, + }; + + const ollamaSkillDeps: OllamaSkillDeps = { + url: "http://test", + model: "m", + timeoutMs: 5000, + soul: "you are a test bot", + fetch: fakeFetch, + tools: wiredTools, + toolTiers, + broker: fakeBroker, + }; + const skill = fakeSkill({ name: "tldr" }); + const tools = buildSkillTools(makeRegistry([skill]), { + db, + ollamaSkillDeps, + }); + await skillToolCtx.run(TEST_CTX, async () => { + await tools[0]!.handler({ args: "hello world" }, undefined); + }); + expect(captured).not.toBeNull(); + expect(captured!.url).toBe("http://test/api/chat"); + expect(captured!.body).not.toBeNull(); + // The new contract: body HAS a `tools` array. + expect(Array.isArray(captured!.body.tools)).toBe(true); + const names = (captured!.body.tools as Array<{ function?: { name?: string } }>) + .map((t) => t.function?.name ?? "") + .filter(Boolean); + // Other tools are present. + expect(names).toContain(otherSkillName); + expect(names).toContain(integrationToolName); + // THE load-bearing filter — direct recursion prevention. + expect(names).not.toContain(selfName); + }); }); // --------------------------------------------------------------------------- diff --git a/src/skill-tools.ts b/src/skill-tools.ts index 29de52c..541cc6f 100644 --- a/src/skill-tools.ts +++ b/src/skill-tools.ts @@ -198,11 +198,16 @@ function buildOneSkillTool( origin: "tool_call", }); + const toolCallsJson = + result.toolCallSummaries.length > 0 + ? JSON.stringify(result.toolCallSummaries) + : null; + if (result.errorMessage !== null) { db.updateAuditEnd({ id: auditId, response: null, - toolCalls: null, + toolCalls: toolCallsJson, inputTokens: result.inputTokens, outputTokens: result.outputTokens, cacheCreationInputTokens: null, @@ -238,7 +243,7 @@ function buildOneSkillTool( db.updateAuditEnd({ id: auditId, response: result.text.slice(0, 200), - toolCalls: null, + toolCalls: toolCallsJson, inputTokens: result.inputTokens, outputTokens: result.outputTokens, cacheCreationInputTokens: null, diff --git a/src/skills.test.ts b/src/skills.test.ts index 5bfaa9b..dc84ea3 100644 --- a/src/skills.test.ts +++ b/src/skills.test.ts @@ -427,6 +427,92 @@ Body.`; }); }); +// --------------------------------------------------------------------------- +// parseSkillFile — `max_turns` field +// --------------------------------------------------------------------------- + +describe("parseSkillFile — max_turns field", () => { + test("defaults to 1 when omitted", () => { + const c = `--- +name: example +description: x +--- +Body.`; + const skill = parseSkillFile(c, "/p", RESERVED); + expect(skill.maxTurns).toBe(1); + }); + + test("accepts a valid integer in [1, 10]", () => { + const c = `--- +name: example +description: x +max_turns: 5 +--- +Body.`; + const skill = parseSkillFile(c, "/p", RESERVED); + expect(skill.maxTurns).toBe(5); + }); + + test("accepts upper bound 10", () => { + const c = `--- +name: example +description: x +max_turns: 10 +--- +Body.`; + const skill = parseSkillFile(c, "/p", RESERVED); + expect(skill.maxTurns).toBe(10); + }); + + test("rejects 0 (below lower bound)", () => { + const c = `--- +name: example +description: x +max_turns: 0 +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow( + /"max_turns" must be an integer in \[1, 10\]/, + ); + }); + + test("rejects negative", () => { + const c = `--- +name: example +description: x +max_turns: -1 +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow( + /"max_turns" must be an integer in \[1, 10\]/, + ); + }); + + test("rejects 11 (above upper bound)", () => { + const c = `--- +name: example +description: x +max_turns: 11 +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow( + /"max_turns" must be an integer in \[1, 10\]/, + ); + }); + + test("rejects non-integer string", () => { + const c = `--- +name: example +description: x +max_turns: "five" +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow( + /"max_turns" must be an integer in \[1, 10\]/, + ); + }); +}); + // --------------------------------------------------------------------------- // parseSkillFile — tier inheritance from defaultTier // --------------------------------------------------------------------------- @@ -476,6 +562,69 @@ Body.`; }); }); +// --------------------------------------------------------------------------- +// parseSkillFile — `requires` field +// --------------------------------------------------------------------------- + +describe("parseSkillFile — requires field", () => { + test("omitted → empty array (unconditional load)", () => { + const skill = parseSkillFile(MINIMAL, "/p", RESERVED); + expect(skill.requires).toEqual([]); + }); + + test("bare string normalized to single-element array", () => { + const c = `--- +name: log +description: x +requires: notion +--- +Body.`; + const skill = parseSkillFile(c, "/p", RESERVED); + expect(skill.requires).toEqual(["notion"]); + }); + + test("array form passes through", () => { + const c = `--- +name: log +description: x +requires: [notion, gmail] +--- +Body.`; + const skill = parseSkillFile(c, "/p", RESERVED); + expect(skill.requires).toEqual(["notion", "gmail"]); + }); + + test("empty string entry rejected", () => { + const c = `--- +name: log +description: x +requires: "" +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow(/"requires" entry must match/); + }); + + test("uppercase entry rejected (slug must be lowercase)", () => { + const c = `--- +name: log +description: x +requires: NOTION +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow(/"requires" entry must match/); + }); + + test("array entry with whitespace rejected", () => { + const c = `--- +name: log +description: x +requires: ["a b"] +--- +Body.`; + expect(() => parseSkillFile(c, "/p", RESERVED)).toThrow(/"requires" entry must match/); + }); +}); + // --------------------------------------------------------------------------- // loadSkillsSync // --------------------------------------------------------------------------- @@ -551,6 +700,59 @@ describe("loadSkillsSync", () => { expect(result.registry.get("fine")).toBeDefined(); expect(result.registry.get("clear")).toBeUndefined(); }); + + // requires: gating — every skill with a `requires:` list must have all + // entries present in `loadedIntegrations` or it's dropped at boot with a + // non-fatal error. Skills with no `requires:` load regardless. + const REQUIRES_SKILL = `--- +name: log +description: x +requires: notion +--- +Body.`; + + test("requires: skill loads when integration is present", () => { + const root = tempSkillsDir(); + writeSkill(root, "log", REQUIRES_SKILL); + const result = loadSkillsSync(root, RESERVED, "primary", new Set(["notion"])); + expect(result.loadedCount).toBe(1); + expect(result.errors.length).toBe(0); + expect(result.registry.get("log")).toBeDefined(); + }); + + test("requires: skill skipped + errored when integration is missing", () => { + const root = tempSkillsDir(); + writeSkill(root, "log", REQUIRES_SKILL); + const result = loadSkillsSync(root, RESERVED, "primary", new Set()); + expect(result.loadedCount).toBe(0); + expect(result.errors.length).toBe(1); + expect(result.errors[0]!.message).toMatch(/requires unloaded integration\(s\): notion/); + expect(result.registry.get("log")).toBeUndefined(); + }); + + test("multi-requires skill skipped when any entry is missing", () => { + const root = tempSkillsDir(); + const c = `--- +name: log +description: x +requires: [notion, gmail] +--- +Body.`; + writeSkill(root, "log", c); + const result = loadSkillsSync(root, RESERVED, "primary", new Set(["notion"])); + expect(result.loadedCount).toBe(0); + expect(result.errors.length).toBe(1); + expect(result.errors[0]!.message).toMatch(/gmail/); + expect(result.errors[0]!.message).not.toMatch(/notion,/); + }); + + test("skill with no requires loads regardless of loadedIntegrations", () => { + const root = tempSkillsDir(); + writeSkill(root, "plain", MINIMAL.replace("name: example", "name: plain")); + const result = loadSkillsSync(root, RESERVED, "primary", new Set()); + expect(result.loadedCount).toBe(1); + expect(result.registry.get("plain")).toBeDefined(); + }); }); // --------------------------------------------------------------------------- @@ -595,6 +797,8 @@ describe("skillsToBotCommands", () => { body: "x", sourcePath: "/p", tool: false, + maxTurns: 1, + requires: [] as ReadonlyArray, }); } diff --git a/src/skills.ts b/src/skills.ts index b7308fd..ce756d0 100644 --- a/src/skills.ts +++ b/src/skills.ts @@ -94,6 +94,18 @@ export interface Skill { // `tool: true` requires `tier: "ollama"` (free-only — avoids cross-engine // cost surprises from agent-driven invocations). readonly tool: boolean; + // Max model turns when running this skill's body. Pure text-transform skills + // (no tool calls) want 1; agentic skills that chain tool calls (e.g. a + // /log skill doing notion_get_database_schema → notion_create_page) need + // headroom. Frontmatter key: `max_turns`. Bounds: 1 ≤ maxTurns ≤ 10. + readonly maxTurns: number; + // Integration names this skill depends on. When any name is absent from the + // set of loaded integrations at boot, the skill is skipped with a non-fatal + // error so `/help` + Telegram autocomplete never advertise a skill that + // would fail at use-time. Empty array means "no integration deps" + // (unconditional load). Frontmatter key: `requires`; accepts a bare string + // or a string array. + readonly requires: ReadonlyArray; } export interface SkillLoadError { @@ -128,6 +140,12 @@ const NAME_RE = /^[a-z0-9_]{1,32}$/; const MAX_DESCRIPTION_LEN = 256; const FRONTMATTER_DELIM = "---"; +// Slug pattern for `requires:` entries. Mirrors the shape of integration +// subdirectory names (lowercase letters / digits / underscore / hyphen, leading +// alpha). Hyphens are permitted here even though skill `name` rejects them — +// integration directories may legitimately be hyphenated (`my-thing`). +const REQUIRES_NAME_RE = /^[a-z][a-z0-9_-]{0,31}$/; + // --------------------------------------------------------------------------- // Frontmatter parser (schema-restricted YAML subset) // --------------------------------------------------------------------------- @@ -301,13 +319,54 @@ export function parseSkillFile( toolFlag = v; } + // max_turns — model-turn budget for the skill's body. Default 1 preserves + // back-compat with pre-agentic-skills (single-shot text transforms like + // tldr). Cap at 10 to keep a runaway skill bounded; cost-cap is the + // ultimate backstop for Claude, OLLAMA_MAX_TOOL_ITERATIONS for Ollama. + let maxTurns = 1; + if ("max_turns" in f) { + const v = f.max_turns; + if (typeof v !== "number" || !Number.isInteger(v) || v < 1 || v > 10) { + throw new Error( + `${sourcePath}: "max_turns" must be an integer in [1, 10] (got ${String(v)})`, + ); + } + maxTurns = v; + } + + // requires — integration deps that must be loaded for this skill to register. + // Accepts a bare string (`requires: notion`) or an array + // (`requires: [notion, gmail]`). Empty / omitted → unconditional load + // (preserves back-compat for pre-`requires:` skills like `tldr`). The loader + // gates on the set of loaded integration names (see `loadSkillsSync`). + let requires: ReadonlyArray = Object.freeze([]); + if ("requires" in f) { + const v = f.requires; + let list: ReadonlyArray; + if (typeof v === "string") { + list = [v]; + } else if (Array.isArray(v) && v.every((x): x is string => typeof x === "string")) { + list = v; + } else { + throw new Error(`${sourcePath}: "requires" must be a string or string array`); + } + for (const entry of list) { + if (entry === "" || !REQUIRES_NAME_RE.test(entry)) { + throw new Error( + `${sourcePath}: "requires" entry must match ${REQUIRES_NAME_RE.source} (got "${entry}")`, + ); + } + } + requires = Object.freeze([...list]); + } + // body if (body.trim() === "") { throw new Error(`${sourcePath}: body must be non-empty (no prompt template)`); } // Reject unknown keys so typos don't silently skip. - const ALLOWED = new Set(["name", "description", "tier", "tool"]); + const ALLOWED = new Set(["name", "description", "tier", "tool", "max_turns", "requires"]); for (const k of Object.keys(f)) { if (!ALLOWED.has(k)) { throw new Error( @@ -323,6 +382,8 @@ export function parseSkillFile( body, sourcePath, tool: toolFlag, + maxTurns, + requires, }); } @@ -334,6 +395,7 @@ export function loadSkillsSync( dir: string, reservedNames: ReadonlySet, defaultTier: SkillTier, + loadedIntegrations: ReadonlySet = new Set(), ): SkillLoadResult { const errors: SkillLoadError[] = []; const byName = new Map(); @@ -378,6 +440,16 @@ export function loadSkillsSync( errors.push({ path: skillPath, message: (err as Error).message }); continue; } + if (skill.requires.length > 0) { + const missing = skill.requires.filter((r) => !loadedIntegrations.has(r)); + if (missing.length > 0) { + errors.push({ + path: skillPath, + message: `skill "${skill.name}" requires unloaded integration(s): ${missing.join(", ")}; skipping`, + }); + continue; + } + } if (byName.has(skill.name)) { const first = byName.get(skill.name)!; errors.push({