diff --git a/.github/workflows/astro-deploy.yml b/.github/workflows/astro-deploy.yml index b11d6a3..59134fe 100644 --- a/.github/workflows/astro-deploy.yml +++ b/.github/workflows/astro-deploy.yml @@ -1,8 +1,9 @@ -name: Astro Build & Deploy (manual in W1) +name: Astro Build & Deploy on: - workflow_dispatch: # ONLY manual trigger during W1-W3. - # W4: add `push: branches: [main, master]` to auto-deploy. + push: + branches: [master, main] + workflow_dispatch: permissions: contents: read diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml deleted file mode 100644 index 4e4f137..0000000 --- a/.github/workflows/deploy.yml +++ /dev/null @@ -1,53 +0,0 @@ -name: CI & Deploy - -on: - push: - branches: [main, master] - pull_request: - branches: [main, master] - -permissions: - contents: write - -jobs: - test: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: pip - - - name: Install dependencies - run: pip install -e ".[dev]" - - - name: Lint - run: ruff check . - - - name: Type check - run: mypy src/ --ignore-missing-imports || true - - - name: Run tests - run: pytest tests/ -v - - deploy: - runs-on: ubuntu-latest - needs: test - if: github.event_name == 'push' - steps: - - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - uses: actions/setup-python@v5 - with: - python-version: '3.12' - cache: pip - - - name: Install docs dependencies - run: pip install -r requirements-docs.txt - - - name: Build and deploy site - run: mkdocs gh-deploy --force diff --git a/README.md b/README.md index 581b577..716f197 100644 --- a/README.md +++ b/README.md @@ -78,20 +78,57 @@ Thirteen chapters across four parts, covering the full lifecycle of building pro **[Read the free sample chapter](docs/book/01-what-agentic-means.md)** or **[get the full book on Amazon](https://www.amazon.com/dp/B0GVG6848F)**. -## Getting Started +## Local Development + +Requires Node 22+ and pnpm. 
+ +```bash +pnpm install +pnpm dev # http://localhost:4321/agentic-ai/ +pnpm build # produces dist/ +pnpm preview # serves dist/ at the same URL +pnpm test # 11 unit tests (cross-links + reading-time) +pnpm astro check # type check +``` + +## Stack + +- **Astro 5** — static site generator +- **MDX content** — Content Collections + Zod schemas +- **Svelte 5** — interactive islands (TraceReplay, TraceViewer, EvalRubric, ArchitectureToggle, D3Chart, CommandPalette) +- **Pagefind** — static search +- **Houdini paint worklet** — brand ink stamp (with SVG fallback) +- **View Transitions API** — page navigation +- **GitHub Pages deploy** — via `.github/workflows/astro-deploy.yml` + +## Content Types + +Content lives under `src/content/` as MDX with Zod-validated frontmatter: + +- **chapters/** — 5 Foundations + 13 chapters +- **fieldNotes/** — twice-weekly observations (Thursdays + Sundays) +- **recipes/** — buildable-in-an-afternoon walkthroughs (Wednesdays) +- **projects/** — case-study teardowns of 7 reference systems +- **evidence/** — measured eval data backing the homepage stats +- **labs/** — empirical Lab Reports (every 2-3 weeks) +- **patterns/** — cross-cutting agent patterns + +## Code Examples + +Working Python implementations for every chapter also live in this repo: ```bash -# Install -make install +# Run Document Intelligence Agent (Ch02-03) +python -m project.doc_intelligence_agent -# Run tests -make test +# Run Incident Runbook Agent (Ch04-05) +python -m project.incident_runbook_agent -# Run the Document Intelligence Agent -make run +# Run Memory-Augmented Agent (Ch12) +python -m project.memory_agent -# Run the eval harness -make eval +# Run the evaluation harness (Ch06) +python -m src.ch06.eval_suite ``` Copy `.env.example` to `.env` and add your API key before running. 
diff --git a/astro.config.mjs b/astro.config.mjs index e6d3437..d01aaea 100644 --- a/astro.config.mjs +++ b/astro.config.mjs @@ -2,6 +2,7 @@ import { defineConfig } from 'astro/config'; import mdx from '@astrojs/mdx'; import svelte from '@astrojs/svelte'; +import sitemap from '@astrojs/sitemap'; // https://astro.build/config export default defineConfig({ @@ -14,6 +15,7 @@ export default defineConfig({ integrations: [ mdx(), svelte(), + sitemap(), ], vite: { build: { diff --git a/package.json b/package.json index 84b38d5..74e488b 100644 --- a/package.json +++ b/package.json @@ -16,6 +16,7 @@ }, "dependencies": { "@astrojs/mdx": "^4.3.0", + "@astrojs/sitemap": "^3.7.2", "@astrojs/svelte": "^7.2.0", "astro": "^5.7.0", "d3": "^7.9.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ba15ebe..a4b44ad 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -11,6 +11,9 @@ importers: '@astrojs/mdx': specifier: ^4.3.0 version: 4.3.14(astro@5.18.1(@types/node@22.19.19)(rollup@4.60.3)(typescript@5.9.3)(yaml@2.9.0)) + '@astrojs/sitemap': + specifier: ^3.7.2 + version: 3.7.2 '@astrojs/svelte': specifier: ^7.2.0 version: 7.2.5(@types/node@22.19.19)(astro@5.18.1(@types/node@22.19.19)(rollup@4.60.3)(typescript@5.9.3)(yaml@2.9.0))(svelte@5.55.5)(typescript@5.9.3)(yaml@2.9.0) @@ -91,6 +94,9 @@ packages: resolution: {integrity: sha512-q8VwfU/fDZNoDOf+r7jUnMC2//H2l0TuQ6FkGJL8vD8nw/q5KiL3DS1KKBI3QhI9UQhpJ5dc7AtqfbXWuOgLCQ==} engines: {node: 18.20.8 || ^20.3.0 || >=22.0.0} + '@astrojs/sitemap@3.7.2': + resolution: {integrity: sha512-PqkzkcZTb5ICiyIR8VoKbIAP/laNRXi5tw616N1Ckk+40oNB8Can1AzVV56lrbC5GKSZFCyJYUVYqVivMisvpA==} + '@astrojs/svelte@7.2.5': resolution: {integrity: sha512-Tl5aF/dYbzzd7sLpxMBX6pRz3yJ1B4pilt9G3GJbj0I0/doJHIEmerNQsnlxX0/InNKUhMXXN8wyyet9VhA+Zw==} engines: {node: 18.20.8 || ^20.3.0 || >=22.0.0} @@ -1096,6 +1102,12 @@ packages: '@types/node@22.19.19': resolution: {integrity: sha512-dyh/xO2Fh5bYrfWaaqGrRQQGkNdmYw6AmaAUvYeUMNTWQtvb796ikLdmTchRmOlOiIJ1TDXfWgVx1QkUlQ6Hew==} + '@types/node@24.12.4': + resolution: {integrity: sha512-GUUEShf+PBCGW2KaXwcIt3Yk+e3pkKwWKb9GSyM9WQVE+ep2jzmHdGsHzu4wgcZy5fN9FBdVzjpBQsYlpfpgLA==} + + '@types/sax@1.2.7': + resolution: {integrity: sha512-rO73L89PJxeYM3s3pPPjiPgVVcymqU490g0YO5n5By0k2Erzj6tay/4lr1CHAAU4JyOWd1rpQ8bCf6cZfHU96A==} + '@types/trusted-types@2.0.7': resolution: {integrity: sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==} @@ -1207,6 +1219,9 @@ packages: resolution: {integrity: sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==} engines: {node: '>= 8'} + arg@5.0.2: + resolution: {integrity: sha512-PYjyFOLKQ9y57JvQ6QLo8dAgNqswh8M1RMJYdQduT6xbWSgK36P/Z/v+p888pM69jMMfS8Xd8F6I1kQ/I9HUGg==} + argparse@2.0.1: resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==} @@ -2302,6 +2317,11 @@ packages: sisteransi@1.0.5: resolution: {integrity: sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg==} + sitemap@9.0.1: + resolution: {integrity: sha512-S6hzjGJSG3d6if0YoF5kTyeRJvia6FSTBroE5fQ0bu1QNxyJqhhinfUsXi9fH3MgtXODWvwo2BDyQSnhPQ88uQ==} + engines: {node: '>=20.19.5', npm: '>=10.8.2'} + hasBin: true + smol-toml@1.6.1: resolution: {integrity: sha512-dWUG8F5sIIARXih1DTaQAX4SsiTXhInKf1buxdY9DIg4ZYPZK5nGM1VRIYmEbDbsHt7USo99xSLFu5Q1IqTmsg==} engines: {node: '>= 18'} @@ -2323,6 +2343,9 @@ packages: std-env@3.10.0: resolution: {integrity: 
sha512-5GS12FdOZNliM5mAOxFRg7Ir0pWz8MdpYm6AY6VPkGpbA7ZzmbzNcBJQ0GPvvyWgcY7QAhCgf9Uy89I03faLkg==} + stream-replace-string@2.0.0: + resolution: {integrity: sha512-TlnjJ1C0QrmxRNrON00JvaFFlNh5TTG00APw23j74ET7gkQpTASi6/L2fuiav8pzK715HXtUeClpBTw2NPSn6w==} + string-width@4.2.3: resolution: {integrity: sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==} engines: {node: '>=8'} @@ -2438,6 +2461,9 @@ packages: undici-types@6.21.0: resolution: {integrity: sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==} + undici-types@7.16.0: + resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} + unified@11.0.5: resolution: {integrity: sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA==} @@ -2828,6 +2854,9 @@ packages: zod@3.25.76: resolution: {integrity: sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==} + zod@4.4.3: + resolution: {integrity: sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==} + zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} @@ -2922,6 +2951,12 @@ snapshots: dependencies: prismjs: 1.30.0 + '@astrojs/sitemap@3.7.2': + dependencies: + sitemap: 9.0.1 + stream-replace-string: 2.0.0 + zod: 4.4.3 + '@astrojs/svelte@7.2.5(@types/node@22.19.19)(astro@5.18.1(@types/node@22.19.19)(rollup@4.60.3)(typescript@5.9.3)(yaml@2.9.0))(svelte@5.55.5)(typescript@5.9.3)(yaml@2.9.0)': dependencies: '@sveltejs/vite-plugin-svelte': 5.1.1(svelte@5.55.5)(vite@6.4.2(@types/node@22.19.19)(yaml@2.9.0)) @@ -3692,6 +3727,14 @@ snapshots: dependencies: undici-types: 6.21.0 + '@types/node@24.12.4': + dependencies: + undici-types: 7.16.0 + + '@types/sax@1.2.7': + dependencies: + '@types/node': 22.19.19 + '@types/trusted-types@2.0.7': {} '@types/unist@2.0.11': {} @@ -3826,6 +3869,8 @@ snapshots: normalize-path: 3.0.0 picomatch: 2.3.2 + arg@5.0.2: {} + argparse@2.0.1: {} aria-query@5.3.1: {} @@ -5502,6 +5547,13 @@ snapshots: sisteransi@1.0.5: {} + sitemap@9.0.1: + dependencies: + '@types/node': 24.12.4 + '@types/sax': 1.2.7 + arg: 5.0.2 + sax: 1.6.0 + smol-toml@1.6.1: {} source-map-js@1.2.1: {} @@ -5514,6 +5566,8 @@ snapshots: std-env@3.10.0: {} + stream-replace-string@2.0.0: {} + string-width@4.2.3: dependencies: emoji-regex: 8.0.0 @@ -5632,6 +5686,8 @@ snapshots: undici-types@6.21.0: {} + undici-types@7.16.0: {} + unified@11.0.5: dependencies: '@types/unist': 3.0.3 @@ -5976,4 +6032,6 @@ snapshots: zod@3.25.76: {} + zod@4.4.3: {} + zwitch@2.0.4: {} diff --git a/src/components/universal/ChapterMap.astro b/src/components/universal/ChapterMap.astro new file mode 100644 index 0000000..3b28039 --- /dev/null +++ b/src/components/universal/ChapterMap.astro @@ -0,0 +1,111 @@ +--- +/** + * ChapterMap — visual sitemap of all 18 chapters, current one highlighted. + * Renders as 5 stacked rows (one per Part) of tiny rectangles. 
+ */ +interface Props { + currentSlug: string; +} + +const { currentSlug } = Astro.props; + +const parts = [ + { + label: 'Foundations', + chapters: ['00a-how-llms-work', '00b-api-to-tools', '00c-first-agent', '00d-frameworks', '00e-connecting-to-mcp'], + }, + { + label: 'I · Build', + chapters: ['01-what-agentic-means', '02-tools-context-agent-loop', '03-workflow-first-agent-second', '04-multi-agent-without-theater'], + }, + { + label: 'II · Judge', + chapters: ['05-human-in-the-loop', '06-evaluating-and-hardening', '07-when-not-to-use-agents'], + }, + { + label: 'III · Operate', + chapters: ['08-metacognition', '09-deployment', '10-governance', '11-security'], + }, + { + label: 'IV · Advanced', + chapters: ['12-memory-management', '13-agent-protocols-in-production'], + }, +]; + +function chapterLabel(slug: string): string { + return slug.split('-')[0]; +} +--- + + + + diff --git a/src/components/universal/NextPrev.astro b/src/components/universal/NextPrev.astro new file mode 100644 index 0000000..7d25150 --- /dev/null +++ b/src/components/universal/NextPrev.astro @@ -0,0 +1,86 @@ +--- +/** + * NextPrev — chapter pagination footer. + * Shows ← previous and next → links to adjacent chapters by filename order. + */ +interface Props { + prev?: { slug: string; title: string }; + next?: { slug: string; title: string }; +} + +const { prev, next } = Astro.props; +--- + + + + diff --git a/src/components/universal/ProgressReader.astro b/src/components/universal/ProgressReader.astro new file mode 100644 index 0000000..29f3937 --- /dev/null +++ b/src/components/universal/ProgressReader.astro @@ -0,0 +1,49 @@ +--- +/** + * ProgressReader — 2px brick-red bar fixed to viewport top. + * Fills 0% → 100% as the user scrolls the document. + * CSS scroll-timeline (Chrome 115+, Safari 17+). Gracefully hides on + * browsers without support (the bar is decorative, not load-bearing). + */ +--- + + + + diff --git a/src/content/chapters/00a-how-llms-work.mdx b/src/content/chapters/00a-how-llms-work.mdx new file mode 100644 index 0000000..abc08b6 --- /dev/null +++ b/src/content/chapters/00a-how-llms-work.mdx @@ -0,0 +1,427 @@ +--- +title: How LLMs Actually Work +part: foundations +description: "The engineer's mental model for LLMs. Not transformers math. The five things you need to know to build real systems." +readingTime: 18 +date: 2026-03-15 +references: + - 00b-api-to-tools +patterns: [] +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +You don't need to understand attention heads to build with LLMs. You need to understand five things. Here they are. + +This is not a book about how LLMs are built internally. There are excellent resources for that (Raschka's *Build a Large Language Model (From Scratch)* is the best). This is about how to build production systems with LLMs as components. You don't need to understand transformers. You need to understand what breaks when you give a language model access to your tools. + +## The API contract + +Everything starts here. You send text, you get text back. That's it. + +Every agent framework, every RAG pipeline, every chain-of-thought prompting technique, every multi-agent orchestration system is built on top of this one operation. Text in, text out. If you strip away every abstraction, this is what remains. 
+ +Here's a raw API call using the Anthropic SDK: + +```python +import anthropic + +client = anthropic.Anthropic() # reads ANTHROPIC_API_KEY from env + +message = client.messages.create( + model="claude-sonnet-4-20250514", + max_tokens=1024, + messages=[ + {"role": "user", "content": "What is the capital of France?"} + ] +) + +print(message.content[0].text) +# "The capital of France is Paris." +``` + + +You sent a string to an API. You got a string back. You paid for both strings, measured in tokens. That's the entire contract. For this query, that is roughly 30 tokens in and 10 tokens out. At current pricing, about $0.0003. Cheap for one call. Less cheap when your agent makes fifty calls per user request. + + +That raw SDK call is the simplest way to understand what is happening. It is useful for experiments and first contact. But it becomes painful in real systems: provider-specific code leaks into every file, testing requires live API calls, swapping models means rewriting imports, and cost tracking gets scattered. + +The companion code wraps this single operation in a provider-neutral client. Same contract (text in, text out), but now testable, swappable, and observable: + + +```python +from src.shared.model_client import create_client +from src.shared.types import CompletionRequest, Message, Role + +client = create_client(provider="anthropic", api_key="...", model_name="claude-sonnet-4-20250514") + +request = CompletionRequest( + messages=[ + Message(role=Role.SYSTEM, content="You are a helpful assistant."), + Message(role=Role.USER, content="What is the capital of France?"), + ], + temperature=0.0, +) + +response = await client.complete(request) +print(response.content) +# "The capital of France is Paris." +``` + + +The model client wraps the raw API with typed inputs and outputs. Your agent code never imports `anthropic` or `openai` directly. You can swap providers, add cost tracking, or switch to a mock for testing, all without changing the code that calls it. + + +So which should you use? Use the raw SDK call to understand the mechanics. Use the wrapper when the model becomes part of a larger system. The rest of this book uses the wrapper because agents call the model hundreds of times per day, and you will want to track costs, swap between a fast cheap model and a slow expensive one depending on the task, and run tests without hitting a real API. The wrapper makes all of that possible by centralizing the one operation that matters. + +This is the foundation. If you understand this, you understand 80% of what frameworks are doing. The other 20% is prompt management, tool routing, and retry logic. All useful. None of it magic. + +
*Figure 0a.1: The API contract. Text in, text out. Everything else is built on top.*
+ +## Tokens, not words + +LLMs don't process words. They process tokens, which are chunks of text that roughly correspond to word fragments. The word "understanding" might be two tokens ("understand" + "ing"). A space before a word is often part of the token. A number like "42" is one token. The string "1234567890" might be three tokens. + +Why does this matter? Because everything about LLMs is priced and bounded in tokens. Context windows are measured in tokens. API costs are per-token. Rate limits count tokens. When someone says a model has a "128K context window," they mean 128,000 tokens, which is roughly 96,000 words, or about 300 pages of prose. That sounds like a lot. It's less than you think once you start filling it with system prompts, conversation history, retrieved documents, and tool results. + +Here's a quick estimator: + +```python +def count_tokens_estimate(text: str) -> int: + """Rough token count: ~4 characters per token. + + Not exact (use tiktoken for precision), but good enough + for cost projections and context budget planning. + """ + return max(1, len(text) // 4) + +# Try it +prompt = "Analyze this document and extract all mentions of financial risk." +tokens = count_tokens_estimate(prompt) +print(f"Estimated tokens: {tokens}") +# Estimated tokens: 15 +``` + + +The 4-characters-per-token rule is a rough approximation. It's wrong for individual strings, but accurate enough in aggregate for cost planning and context budgeting. Use `tiktoken` when you need precision. + + +Now the cost math. This is where engineers need to pay attention because costs sneak up on you: + +```python +# Pricing per 1M tokens: (prompt_price, completion_price) +MODEL_PRICING = { + "gpt-4o": (2.50, 10.00), + "gpt-4o-mini": (0.15, 0.60), + "claude-sonnet-4-20250514": (3.00, 15.00), + "claude-haiku-4-5-20251001": (0.80, 4.00), +} + +def estimate_cost(prompt_tokens: int, completion_tokens: int, model: str) -> float: + prompt_price, completion_price = MODEL_PRICING.get(model, (1.00, 5.00)) + return (prompt_tokens / 1_000_000) * prompt_price + \ + (completion_tokens / 1_000_000) * completion_price + +# One call: cheap +cost_one = estimate_cost(prompt_tokens=1000, completion_tokens=500, model="claude-sonnet-4-20250514") +print(f"One call: ${cost_one:.4f}") +# One call: $0.0105 + +# 10,000 calls: not cheap +cost_day = cost_one * 10_000 +print(f"10,000 calls: ${cost_day:.2f}") +# 10,000 calls: $105.00 +``` + + +A single API call costs fractions of a cent. But agents make multiple calls per request, and production systems handle thousands of requests per day. The arithmetic compounds fast. An agent that averages 5 model calls per request at $0.01 each, serving 10,000 requests a day, costs $500/day. Know this number before you ship. + + +Notice that completion tokens are 3-5x more expensive than prompt tokens across every provider. This is not arbitrary. Generating tokens requires sequential computation, while reading prompt tokens can be partially parallelized. The practical implication: an agent that generates long, verbose reasoning is more expensive than one that generates concise answers, even if they read the same context. + +
*Figure 0a.2: Token costs per model. Completion tokens always cost more than prompt tokens.*
+ +## The context window is your entire working memory + +Think of the context window as RAM for the conversation. Everything the model knows about your current request has to fit inside it. The system prompt, the user's message, the full conversation history, any documents you retrieved, the results from tool calls, all of it competes for one fixed-size bucket. + +This is the constraint that shapes every architectural decision in this book. + +When you build a RAG system, you're deciding what to put in the context window. When you design a multi-turn agent, you're managing what stays in the context window across steps. When you pick a chunking strategy for documents, you're optimizing for what fits in the context window. + +Here's what a typical context window looks like for an agent request: + +``` +┌─────────────────────────────────────────┐ +│ System prompt ~500 tokens │ +│ Tool definitions ~800 tokens │ +│ Conversation history ~2,000 tokens │ +│ Retrieved documents ~6,000 tokens │ +│ Previous tool results ~1,500 tokens │ +│ Current user message ~200 tokens │ +│─────────────────────────────────────────│ +│ TOTAL ~11,000 tokens │ +│ Remaining (128K model) ~117,000 tokens │ +│ Remaining (8K model) OVERFLOW │ +└─────────────────────────────────────────┘ +``` + +That 117,000 token remainder looks comfortable. But add a 50-page document (roughly 37,000 tokens) and three rounds of agent tool use (each round adds the tool call, the result, and the model's analysis), and you're burning through context fast. + +The dangerous part: when context overflows, the model doesn't crash. It degrades silently. Quality drops. The model starts ignoring instructions, especially the ones at the beginning of the context (your system prompt). It misses relevant information buried in the middle. You won't get an error. You'll get a worse answer with no indication that anything went wrong. + +"Lost in the middle" is a well-documented phenomenon. Models pay the most attention to the beginning and end of the context, and less attention to the middle. When you add a 50-page document to the context, something gets pushed out or ignored. Usually it's the instructions you put at the beginning. + + +A document-analysis agent had a system prompt that began with "Always respond in JSON format." It worked perfectly in testing with short documents. In production, users started uploading 50-page contracts, roughly 40,000 tokens of retrieved text. The model began responding in prose, ignoring the JSON instruction entirely. No error. No warning. The system prompt was still there, just buried under so much context that the model stopped attending to it. The fix was two-fold: put critical formatting instructions both at the start AND end of the context (bracketing), and switch to provider-level structured output enforcement so the format constraint was not dependent on the model's attention. + + +This is why context management is engineering, not just prompt writing. The decisions about what goes into the context, in what order, and what gets dropped when space is tight, these are architectural decisions with direct impact on system quality. + +
*Figure 0a.3: The context window. Everything competes for one fixed-size bucket. Overflow is silent.*
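The budgeting and bracketing decisions above are concrete enough to sketch in code. This is a minimal illustration, not the companion code: it reuses `count_tokens_estimate` from the tokens section, the 100K budget and the section names are assumptions, retrieved documents that don't fit are dropped instead of silently overflowing, and the critical instructions are bracketed at both ends of the assembled context.

```python
# Minimal sketch: context budgeting plus bracketing. The budget numbers are
# illustrative; count_tokens_estimate() is the heuristic from earlier.
CONTEXT_BUDGET = 100_000    # assumed usable window for the target model
RESPONSE_RESERVE = 4_000    # leave room for the completion

def assemble_context(
    system_prompt: str,
    critical_instructions: str,
    history: list[str],
    documents: list[str],       # ordered most-relevant first
    user_message: str,
) -> str:
    # Everything that must be present, counted before any documents are added.
    fixed_parts = [system_prompt, critical_instructions, *history,
                   user_message, critical_instructions]
    remaining = CONTEXT_BUDGET - RESPONSE_RESERVE
    remaining -= sum(count_tokens_estimate(p) for p in fixed_parts)

    kept_docs: list[str] = []
    for doc in documents:
        cost = count_tokens_estimate(doc)
        if cost > remaining:
            break               # drop the rest rather than overflow silently
        kept_docs.append(doc)
        remaining -= cost

    # Bracketing: critical instructions at the start AND the end, so they
    # survive the lost-in-the-middle effect when documents pile up.
    return "\n\n".join([
        system_prompt,
        critical_instructions,
        *history,
        *kept_docs,
        user_message,
        critical_instructions,
    ])
```

The point is not this particular policy (drop least-relevant, drop oldest, and summarize-then-drop are all defensible), but that the policy exists in code where you can test it, instead of being an accident of whatever happened to fit.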
+ +## Why it hallucinates (and why you can't prompt it away) + +The model predicts the next likely token. That's all it does. It is not looking up facts. It is not checking a database. It is generating the token sequence that is most probable given everything that came before it. When that process produces text that sounds authoritative but is factually wrong, we call it hallucination. But from the model's perspective, nothing unusual happened. It produced a high-probability token sequence. It just happened to be wrong. + +This is not a bug to fix. It is a fundamental property of how these models work. A model trained on text will produce text that looks like the text it was trained on. If the training data contains confident, well-structured explanations, the model will produce confident, well-structured explanations, whether or not they are correct. + +You will read advice telling you to add "only answer based on the provided context" to your system prompt. This helps. It reduces the rate of hallucination. It does not solve the problem. The model can and will still generate plausible-sounding text that isn't supported by the context. I've seen models cite specific paragraph numbers from documents that don't have paragraph numbers. I've seen them invent API endpoints with correct-looking URL structures and reasonable-sounding parameter names. The text looks right because the model is very good at producing text that looks right. + + +A research assistant agent was asked to summarize findings from a set of uploaded documents and cite its sources. It returned: "According to Document 3, Section 4.2, page 17, the failure rate exceeds 12%." The response looked credible. But Document 3 had no numbered sections, was only 5 pages long, and never mentioned failure rates. The model generated a citation that matched the structural pattern of academic references without any grounding in the actual content. The fix: every citation the model produces must be verified in code. Extract the claimed source, look up the actual text, and confirm the claim appears there. If it does not, flag it or drop it. Never pass model-generated citations through to users without programmatic verification. + + +Every reliable mitigation for hallucination is engineering, not prompting. + +**Grounding:** Give the model source material and constrain it to answer from that material. This is what RAG does. It doesn't eliminate hallucination, but it gives the model something real to work from. + +**Validation:** Check the output against known facts, schemas, or constraints. If the model says the answer is in paragraph 3 of document X, verify that paragraph 3 of document X exists and says what the model claims. + +**Evaluation:** Measure hallucination rates systematically across a test set. Not "try a few examples and see if it looks right." Structured evaluation with labeled ground truth. Chapter 6 covers this in detail. + +**Escalation:** When confidence is low, say so. "I don't have enough information to answer this" is a better response than a confident wrong answer. Build your system to produce this response when the evidence is thin. + +These are code solutions, not prompt solutions. Prompting helps at the margins, but you cannot prompt your way to production reliability. You can engineer your way there. + +
*Figure 0a.4: The hallucination mental model. The model predicts likely tokens, not verified facts. Mitigation happens in code, not in prompts.*
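The citation check described above is a few lines of code, not a research project. Here is a minimal sketch, assuming the model is asked to return each citation as a structured object with a source name and a supporting quote; the field names are illustrative, not the companion code's schema.

```python
# Minimal sketch: verify model-generated citations before showing them to users.
# Assumes citations come back as {"claim": ..., "source": ..., "quote": ...}.
def verify_citations(
    citations: list[dict[str, str]],
    documents: dict[str, str],          # source name -> full document text
) -> tuple[list[dict[str, str]], list[dict[str, str]]]:
    verified, rejected = [], []
    for cite in citations:
        doc_text = documents.get(cite.get("source", ""), "")
        quote = cite.get("quote", "").strip()
        # A citation passes only if the named document exists AND the quoted
        # text actually appears in it. Anything else gets flagged or dropped.
        if quote and quote.lower() in doc_text.lower():
            verified.append(cite)
        else:
            rejected.append(cite)
    return verified, rejected

docs = {"doc3.md": "The observed failure rate in Q3 was 4.2 percent across regions."}
citations = [
    {"claim": "Q3 failure rate was 4.2%", "source": "doc3.md",
     "quote": "failure rate in Q3 was 4.2 percent"},
    {"claim": "The failure rate exceeds 12%", "source": "doc3.md",
     "quote": "the failure rate exceeds 12%"},
]
verified, rejected = verify_citations(citations, docs)
print(len(verified), len(rejected))
# 1 1
```

Exact substring matching is the bluntest possible check; fuzzy matching or sentence-level retrieval scoring catch paraphrased quotes. But even this blunt version would have caught the invented Section 4.2 citation, because the quoted text never existed.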
+ +## Temperature and sampling + +When the model generates the next token, it doesn't pick one deterministically (by default). It produces a probability distribution over all possible tokens, then samples from that distribution. Temperature controls how peaked or flat that distribution is. + +**Temperature 0** (or near-zero): The model almost always picks the highest-probability token. Output is deterministic, or very close to it. Same input produces the same output. This is the right default for agent decision paths, tool selection, structured extraction, and anything where you need reproducible behavior. Not every agent step needs temperature 0, though. Steps that generate diverse search queries, brainstorm alternative approaches, or produce varied rephrasing can benefit from a small amount of temperature (0.2-0.3). + +**Temperature 0.7-1.0:** The distribution is flatter. Lower-probability tokens have a real chance of being selected. Output is more varied, more "creative." This is useful for brainstorming, creative writing, or generating diverse examples. + +**Temperature above 1.0:** The distribution is nearly flat and output becomes increasingly incoherent. In production agent systems, there is almost no reason to go above 1.0. In research or creative applications, controlled high temperature paired with top-p sampling can be useful for exploring the edges of a distribution. For everything in this book, stay at or below 0.3. + +For agents, default to temperature 0 for decision-making steps. Tool selection, routing, structured extraction, and any step where you need predictable, testable behavior. When your agent is deciding whether to call the search tool or the calculator, you want it to make the same decision every time for the same input. For generative sub-steps where variety helps, bring temperature up slightly, but keep it bounded. + +```python +from src.shared.model_client import create_client +from src.shared.types import CompletionRequest, Message, Role + +client = create_client(provider="anthropic", api_key="...", model_name="claude-sonnet-4-20250514") + +# Temperature 0: deterministic, same answer every time +request_deterministic = CompletionRequest( + messages=[ + Message(role=Role.SYSTEM, content="You are a helpful assistant."), + Message(role=Role.USER, content="Name one benefit of unit testing."), + ], + temperature=0.0, +) + +# Temperature 1.0: varied output, different answer each time +request_creative = CompletionRequest( + messages=[ + Message(role=Role.SYSTEM, content="You are a helpful assistant."), + Message(role=Role.USER, content="Name one benefit of unit testing."), + ], + temperature=1.0, +) + +# Run the deterministic version 3 times: same answer +for _ in range(3): + r = await client.complete(request_deterministic) + print(r.content) +# "Unit testing catches regressions early..." +# "Unit testing catches regressions early..." +# "Unit testing catches regressions early..." + +# Run the creative version 3 times: different answers +for _ in range(3): + r = await client.complete(request_creative) + print(r.content) +# "Unit testing catches regressions early..." +# "It provides a safety net when refactoring..." +# "Tests serve as living documentation..." +``` + + +Temperature 0 gives you repeatability. Temperature 1.0 gives you variety. For agent decision paths where you need predictable, testable behavior, default to temperature 0. For sub-steps where diversity helps (query expansion, brainstorming), a small amount of temperature (0.2-0.3) is reasonable. 
The key is to be deliberate about the choice, not to apply one setting everywhere. + + +There's a common misconception that temperature 0 means "more accurate." It doesn't. It means "most probable." The most probable completion can still be wrong. Temperature controls randomness, not correctness. + +## Structured output + +The model generates text. Your code needs data. This gap is where a lot of production systems break. + +When you ask a model to "return JSON," you get text that usually looks like JSON. Usually. Sometimes the model wraps it in markdown code fences. Sometimes it adds a preamble ("Sure! Here's the JSON:"). Sometimes it produces valid JSON that doesn't match your schema. Sometimes it produces invalid JSON. + +There are two approaches to reliable structured output. The first is provider-level enforcement, where the API guarantees the output matches a JSON schema. OpenAI's `response_format` parameter and Anthropic's tool use both support this. The second is parsing with fallbacks, which is what you use when provider enforcement isn't available or when you're working with models that don't support it. + +Here's the parsing approach from this book's codebase: + +```python +import json +import re + +def parse_structured_output(text: str) -> dict | None: + """Parse a JSON object from model output. + + Tries three strategies: + 1. The whole text is valid JSON. + 2. Extract the first {...} block. + 3. Give up and return None. + """ + # Strategy 1: direct parse + try: + result = json.loads(text.strip()) + if isinstance(result, dict): + return result + except json.JSONDecodeError: + pass + + # Strategy 2: regex extraction + match = re.search(r"\{[^{}]*\}", text, re.DOTALL) + if match: + try: + result = json.loads(match.group()) + if isinstance(result, dict): + return result + except json.JSONDecodeError: + pass + + return None + +# The model cooperates +clean = '{"status": "ok", "confidence": 0.95}' +print(parse_structured_output(clean)) +# {"status": "ok", "confidence": 0.95} + +# The model adds preamble +messy = 'Here is the analysis: {"result": "pass", "score": 87} Hope that helps!' +print(parse_structured_output(messy)) +# {"result": "pass", "score": 87} + +# The model ignores your instructions entirely +no_json = "I analyzed the document and found three key themes." +print(parse_structured_output(no_json)) +# None +``` + + +Models don't always follow formatting instructions. Robust systems handle this with layered parsing: try the clean path first, fall back to extraction, and handle failure explicitly. The `None` return is a feature. It means "the model didn't give us structured data, so we need to retry, escalate, or use a default." + + +This is the bridge between "text generator" and "system component." When the model returns structured data, you can write normal code around it. You can validate fields. You can route on values. You can feed the output into the next step of a pipeline. Without structured output, you're writing string-parsing code that breaks every time the model decides to rephrase its response. + +I think the right default is to use provider-level schema enforcement whenever it's available, and fall back to parsing only when it's not. Provider enforcement is more reliable, costs nothing extra, and removes an entire category of bugs. The parsing fallback exists for the real world, where you don't always control which model you're calling. + +### The validation ladder + +Parsing is step one. But "valid JSON" is not the same as "data I can trust." 
Production systems need three layers of validation after parsing: schema validation, semantic validation, and a clear failure policy. + +```python +from pydantic import BaseModel, Field, ValidationError +from datetime import date + +# Layer 1: Schema validation +class ExtractionResult(BaseModel): + answer: str = Field(min_length=1) + confidence: float = Field(ge=0.0, le=1.0) + source_document: str + extracted_date: date + +# Layer 2: Semantic validation +def validate_semantics(result: ExtractionResult, available_docs: list[str]) -> list[str]: + """Business logic checks that schema validation can't catch.""" + errors = [] + if result.source_document not in available_docs: + errors.append(f"Source '{result.source_document}' not in provided documents") + if result.extracted_date > date.today(): + errors.append(f"Extracted date {result.extracted_date} is in the future") + if result.confidence > 0.95 and len(result.answer) < 10: + errors.append("High confidence with very short answer is suspicious") + return errors + +# Layer 3: Retry/repair with failure policy +async def extract_with_validation( + client, messages: list, available_docs: list[str], max_retries: int = 2 +) -> ExtractionResult: + for attempt in range(max_retries + 1): + response = await client.complete( + CompletionRequest(messages=messages, temperature=0.0) + ) + parsed = parse_structured_output(response.content) + + if parsed is None: + messages.append(Message( + role=Role.USER, + content="Your response was not valid JSON. Return only a JSON object." + )) + continue + + try: + result = ExtractionResult(**parsed) + except ValidationError as e: + messages.append(Message( + role=Role.USER, + content=f"JSON parsed but failed validation: {e}. Fix and retry." + )) + continue + + semantic_errors = validate_semantics(result, available_docs) + if semantic_errors: + messages.append(Message( + role=Role.USER, + content=f"Data failed business rules: {semantic_errors}. Fix and retry." + )) + continue + + return result + + raise ExtractionError("Structured extraction failed after retries") +``` + +The key principle: if structured output fails after your retry budget, return a typed error, not a raw string. Your downstream code should never have to guess whether it received valid data. Either it gets a validated `ExtractionResult`, or it gets an `ExtractionError` it can handle explicitly. + + +A classification agent returned `{"confidence": 1.5, "category": "high_risk", "review_date": "next Tuesday"}`. The JSON parsed without errors. The downstream routing logic treated 1.5 as a valid confidence score, escalated the case as ultra-high-confidence, and logged "next Tuesday" as a date string that broke the reporting pipeline three hours later when a batch job tried to parse it. Schema validation (Pydantic) would have caught the confidence value immediately. Semantic validation would have caught the non-ISO date. Without the validation ladder, syntactically correct garbage flows downstream and breaks things far from the source. + + +## Putting it together + +You now have a mental model of the machine you are building with. It takes text, returns text, costs money per token, has a fixed memory, and confidently makes things up. Every engineering decision from here forward is about working within and around these constraints. 
+ +Now that you know the model is probabilistic, bounded by context, vulnerable to unsupported confident text, and unreliable at structure by default, the next engineering problems are concrete: How do you give it tools with contracts it cannot violate? How do you assemble context that fits the window without losing critical instructions? How do you evaluate whether the system actually works, not just looks like it works? And how do you bound its autonomy so it fails gracefully instead of confidently? + +The next three sections build these answers. Section 0b gives the model hands. Section 0c gives it a loop. Section 0d shows you what frameworks do with both. + +For hands-on experiments with everything in this section, see the [LLM Explorer](/agentic-ai/book/projects/llm-explorer/) project. diff --git a/src/content/chapters/00b-api-to-tools.mdx b/src/content/chapters/00b-api-to-tools.mdx new file mode 100644 index 0000000..c8ecec6 --- /dev/null +++ b/src/content/chapters/00b-api-to-tools.mdx @@ -0,0 +1,509 @@ +--- +title: From API Calls to Tool Use +part: foundations +description: "How to go from sending text to having the model take actions. Function calling demystified, schema validation, and the bridge from text generator to system component." +readingTime: 22 +date: 2026-03-16 +references: + - 00a-how-llms-work + - 00c-first-agent +patterns: + - tool-registry +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +In the last section, the model could only talk. In this one, it learns to do things. The difference is smaller than you think: about 15 lines of code. + +Here is what actually happens when a model "uses a tool": your code sends a list of function signatures to the API. The model reads those signatures and, instead of returning a text response, returns a JSON blob that says "call this function with these arguments." Your code calls the function. You send the result back to the model. The model uses the result to generate a text response. That's the entire mechanism. No remote procedure calls. No plugin system. No runtime loaded from the model's side. The model writes JSON. You do the work. + +Before we get to function calling, though, we need to talk about the two techniques that make it actually work in production: structured prompting and few-shot examples. These aren't optional. They're the difference between a system that works in demos and one that works at 2am when you're asleep. + +## Prompting as engineering, not art + +System prompts are code. Version them. Test them. Review them in pull requests. If your prompt engineering process is "try things until it works," you're debugging without logs. + +Here's a prompt I see in prototypes all the time: + +``` +You are a helpful assistant that analyzes data. +``` + +This prompt tells the model almost nothing. What kind of data? What does "analyze" mean? What format should the output be in? The model will fill in the gaps with its own interpretation, and that interpretation will change between calls, between models, and between API versions. You've written a function with no type signature and no docstring, then wondered why consumers use it wrong. + +Here's the same prompt written like code: + +``` +You are a financial data analyst. You receive quarterly revenue data as CSV. +Your job: identify the quarter-over-quarter growth rate for each line item. + +Rules: +1. Output valid JSON with the schema: {"line_items": [{"name": str, "q1": float, "q2": float, "growth_pct": float}]} +2. 
Calculate growth as: (q2 - q1) / q1 * 100, rounded to 1 decimal place. +3. If q1 is zero, set growth_pct to null. +4. Do not include commentary. Return only the JSON object. +``` + + +The second prompt specifies the domain, the input format, the output schema, the calculation method, the edge case handling, and what to omit. Every one of those constraints reduces variance in the output. Each constraint is testable. You can write an assertion that checks whether the output matches the JSON schema. You can verify the growth calculation is correct. You can confirm there's no commentary outside the JSON. You can't test "be helpful." + + +The principle is straightforward: every ambiguity in your prompt is a degree of freedom the model will use in ways you didn't intend. Remove them. Specify the format. Specify the edge cases. Specify what not to do. Treat your system prompt as a contract, the same way you'd treat a function signature or an API spec. + +Version your prompts in source control. When output quality degrades, `git diff` the prompt. When you swap models, run your prompt test suite. This is not overhead. This is the minimum viable engineering practice for systems that depend on natural language interfaces. + +## Few-shot examples are your type system + +When the model needs to follow a pattern, show it the pattern. This is called few-shot prompting, and in my experience it is the most effective technique for getting consistent structured output. + +Consider a task where you need the model to classify customer support messages into categories. Without examples: + +```python +messages = [ + {"role": "system", "content": """Classify the following support message +into exactly one category: billing, technical, account, or general. +Return only the category name, lowercase, no punctuation."""}, + {"role": "user", "content": "I can't log into my account after resetting my password"} +] +# Model might return: "account" +# Model might return: "Account" +# Model might return: "This is an account issue." +# Model might return: "technical (login issue)" +``` + +The model understood the task. It just couldn't commit to a format. Now with two examples: + +```python +messages = [ + {"role": "system", "content": """Classify the following support message +into exactly one category: billing, technical, account, or general. +Return only the category name, lowercase, no punctuation."""}, + {"role": "user", "content": "My credit card was charged twice this month"}, + {"role": "assistant", "content": "billing"}, + {"role": "user", "content": "The dashboard keeps showing a 500 error"}, + {"role": "assistant", "content": "technical"}, + {"role": "user", "content": "I can't log into my account after resetting my password"} +] +# Model returns: "account" +# Every time. +``` + + +Two examples did more than a paragraph of instructions. The model saw the pattern: one word, lowercase, no explanation. It follows that pattern. Few-shot examples are like type annotations for natural language. They constrain the output space by demonstration rather than description. + + +Two to three examples is the sweet spot for most tasks. One example can be dismissed as coincidence. Four or more starts eating context budget without proportional improvement. Pick examples that cover your edge cases: one straightforward case, one boundary case, and one that's easy to get wrong. + +Few-shot examples are particularly powerful for tool-using systems. 
When the model sees examples of choosing the right tool for a given query, it learns the selection criteria far more reliably than from descriptions alone. We'll come back to this when we discuss tool selection later in this section. + +## Function calling from scratch + +This is the core mechanism. Every agent framework wraps it. Every tool-using system depends on it. And there is no magic. The model outputs JSON that says "call this function with these arguments." Your code does the calling. + +The cycle has four steps: + +1. You define tool schemas (JSON that tells the model what tools exist) +2. You send those schemas alongside the user's message +3. The model responds with a structured tool call (not text, but a JSON object naming the function and its arguments) +4. Your code validates the arguments, executes the function, and returns the result + +
*Figure 0b.1: The function calling cycle. The model never executes anything. It writes JSON. You do the work.*
+ +Let's build this from scratch. First, we need a way to define tools. + +The code below uses Pydantic, a Python library for data validation using type annotations. If you have used dataclasses, think of Pydantic as dataclasses with built-in validation: you declare fields with types, and Pydantic rejects any input that does not match. `BaseModel` is the base class. `Field` adds constraints like minimum values or defaults. `model_validate()` checks a dictionary against the type annotations and raises `ValidationError` if anything is wrong. We use it here because the model will send us JSON arguments, and we need to validate those arguments before running any code. + +Here is the `Tool` class and `ToolRegistry` from this book's companion code: + +```python +from pydantic import BaseModel, Field +from enum import StrEnum +from typing import Any, Callable, Type + + +class Tool: + """An entry in the tool registry. + + Holds the callable function and its associated Pydantic input model so + arguments can be validated before dispatch. + """ + + def __init__( + self, + name: str, + description: str, + fn: Callable[..., str], + input_model: Type[BaseModel], + ) -> None: + self.name = name + self.description = description + self.fn = fn + self.input_model = input_model + + +class ToolRegistry: + """Registry that maps tool names to Tool entries.""" + + def __init__(self) -> None: + self._tools: dict[str, Tool] = {} + + def register( + self, + name: str, + description: str, + fn: Callable[..., str], + input_model: Type[BaseModel], + ) -> None: + self._tools[name] = Tool( + name=name, description=description, fn=fn, input_model=input_model + ) + + def get_schemas(self) -> list[dict]: + """Return schema objects for all registered tools.""" + return [entry.to_schema() for entry in self._tools.values()] + + def get(self, name: str) -> Tool | None: + return self._tools.get(name) +``` + + +The `ToolRegistry` is a dictionary with extra structure. Each tool has a name (how the model refers to it), a description (how the model decides whether to use it), a callable (what your code actually runs), and a Pydantic input model (how you validate arguments before calling the function). That last one matters a lot. We'll get to why in the next section. + + +Now let's define some actual tools. 
A calculator: + +```python +class Operation(StrEnum): + ADD = "add" + SUBTRACT = "subtract" + MULTIPLY = "multiply" + DIVIDE = "divide" + + +class CalculatorInput(BaseModel): + """Input schema for the calculator tool.""" + operation: Operation + a: float + b: float + + +def calculator(operation: str, a: float, b: float) -> str: + op = Operation(operation) + if op == Operation.ADD: + return str(float(a + b)) + elif op == Operation.SUBTRACT: + return str(float(a - b)) + elif op == Operation.MULTIPLY: + return str(float(a * b)) + elif op == Operation.DIVIDE: + if b == 0: + return "Error: division by zero" + return str(float(a / b)) + return f"Error: unknown operation '{operation}'" +``` + +And a search tool: + +```python +class SearchInput(BaseModel): + """Input schema for the fake search tool.""" + query: str + max_results: int = Field(default=3, ge=1, le=10) + + +def fake_search(query: str, max_results: int = 3) -> str: + results = [ + {"title": f"Result {i + 1} for '{query}'", + "url": f"https://example.com/{i + 1}"} + for i in range(max_results) + ] + return json.dumps(results, indent=2) +``` + +Now the critical piece: the `Tool.to_schema()` method that converts a Pydantic model into the JSON schema the model needs: + +```python +def to_schema(self) -> ToolSchema: + """Derive a ToolSchema from the Pydantic model's field definitions.""" + parameters: list[ToolParameter] = [] + model_fields = self.input_model.model_fields + + for field_name, field_info in model_fields.items(): + annotation = field_info.annotation + type_str = _python_type_to_json_type(annotation) + + enum_values: list[str] | None = None + if isinstance(annotation, type) and issubclass(annotation, StrEnum): + enum_values = [e.value for e in annotation] + + parameters.append( + ToolParameter( + name=field_name, + type=type_str, + description=field_info.description or field_name, + required=field_info.is_required(), + enum=enum_values, + ) + ) + + return ToolSchema( + name=self.name, + description=self.description, + parameters=parameters + ) +``` + + +`to_schema()` walks the Pydantic model's fields and converts them into a `ToolSchema`, the book's provider-neutral representation of a tool definition. This schema gets sent to the model alongside the user's message. The model reads this schema to understand what tools are available, what arguments they accept, and what constraints those arguments have (like enum values for the `operation` field). The model never sees your Python code. It sees the schema. + + +This is the full picture. You define a function. You define a Pydantic model that describes its inputs. You register both in the registry. The registry generates schemas. You send those schemas to the model. The model reads them and decides whether to call a tool and with what arguments. + +Now let's see what happens on the other side. When the model decides to use a tool, it doesn't return text. 
It returns a `ToolCall` object: + +```python +# This is what the model returns instead of text: +{ + "id": "call_001", + "name": "calculator", + "arguments": {"operation": "add", "a": 100, "b": 200} +} +``` + +Your code receives this, looks up the tool in the registry, validates the arguments, and executes the function: + +```python +def execute_tool_call( + registry: ToolRegistry, tool_name: str, arguments: dict[str, Any] +) -> str: + """Validate and execute a tool call from the model.""" + entry = registry.get(tool_name) + if entry is None: + return f"Error: unknown tool '{tool_name}'" + + try: + validated = entry.input_model.model_validate(arguments) + except ValidationError as exc: + return f"Validation error: {exc}" + + try: + return entry.fn(**validated.model_dump()) + except Exception as exc: + return f"Error executing tool '{tool_name}': {exc}" +``` + + +Three things happen in `execute_tool_call`: lookup, validation, execution. The lookup catches hallucinated tool names. The validation (via Pydantic's `model_validate`) catches malformed arguments before they reach your function. The try/except around execution catches runtime errors. Each layer returns a string error instead of crashing, because this error message gets sent back to the model as the tool result. The model can then apologize, retry with different arguments, or try a different tool. Crashing would end the conversation. + + +Here's the full flow wired together: + +```python +registry = create_default_registry() + +# Direct call: what is 6 * 7? +result = execute_tool_call( + registry, "calculator", + {"operation": "multiply", "a": 6, "b": 7} +) +print(result) +# "42.0" + +# Division by zero: handled gracefully +result = execute_tool_call( + registry, "calculator", + {"operation": "divide", "a": 10, "b": 0} +) +print(result) +# "Error: division by zero" +``` + +That is the entire mechanism behind function calling. You defined a Pydantic model, wrote `to_schema()` to generate the JSON schema, built `execute_tool_call()` to validate and dispatch. Every major framework automates exactly these three steps. LangChain's `@tool` decorator reads your type hints and generates the schema. Google ADK's `FunctionTool` reads the docstring. CrewAI's tool registration does the same. The pattern is identical; only the syntax changes. You built it by hand so you understand what the framework is doing when it hides these lines from you. Section 0d will show you the same tools wrapped in ADK and LangChain, and you will see that the 30 lines of schema and validation machinery you just wrote collapse into a single decorator. + +## Schema validation is your safety net + +The model will hallucinate arguments. Not occasionally. Routinely. The model will pass a string where you need an integer. It will invent parameters that don't exist. It will pass "modulo" as an operation to your calculator that only knows add, subtract, multiply, divide. + +Without validation, here's what happens: + +```python +# Without Pydantic validation: the model passes "modulo" +try: + Operation("modulo") +except ValueError as e: + print(f"Raw error: {e}") +# Raw error: 'modulo' is not a valid Operation +``` + +That `ValueError` crashes your tool execution. If you're not catching it, it crashes your agent loop. If you are catching it but returning a generic "something went wrong" message, the model has no idea what went wrong and will likely try the same thing again. 
+ +With Pydantic validation, the error is specific and actionable: + +```python +from pydantic import ValidationError + +# With Pydantic: structured error that the model can learn from +try: + CalculatorInput(operation="modulo", a=10, b=3) +except ValidationError as exc: + print(exc) +``` + +``` +1 validation error for CalculatorInput +operation + Input should be 'add', 'subtract', 'multiply' or 'divide' + [type=enum, input_value='modulo', input='modulo'] +``` + + +The Pydantic error message tells the model exactly what went wrong and what the valid options are. When you send this error back as the tool result, the model can read "Input should be 'add', 'subtract', 'multiply' or 'divide'" and retry with a valid operation. Compare this to a stack trace or a generic error message. Structured validation errors are documentation for the model. + + +This is why every tool in the companion code has a Pydantic input model. Not for elegance. For survival. Here are the failure modes that validation catches: + +**Wrong type:** The model passes `"seven"` instead of `7` for a numeric parameter. Pydantic rejects it (or coerces it, depending on your config). Without validation, your arithmetic function receives a string and either crashes or produces nonsense. + +**Missing required field:** The model forgets to include the `operation` parameter. Pydantic reports exactly which field is missing. Without validation, your function receives `None` and crashes with a confusing `AttributeError`. + +**Extra fields:** The model invents a `precision` parameter that your calculator doesn't support. Pydantic ignores it by default (configurable). Without validation, your function receives an unexpected keyword argument. + +**Out of range:** The `SearchInput` model constrains `max_results` to between 1 and 10 using `Field(ge=1, le=10)`. The model passes 500. Pydantic rejects it with a clear message. Without validation, your search function tries to generate 500 results. + +```python +# max_results out of range +result = execute_tool_call( + registry, "search", + {"query": "python best practices", "max_results": 500} +) +print(result) +# Validation error: 1 validation error for SearchInput +# max_results +# Input should be less than or equal to 10 +# [type=less_than_equal, input_value=500, input=500] +``` + + +A data pipeline agent had a `write_record` tool that accepted a `priority` field: an integer from 1 (low) to 5 (critical). Without Pydantic validation, the model passed `priority: 10` for a routine log entry. The write succeeded. No error. The record was stored with priority 10, which was outside the application's expected range. Downstream alerting logic treated anything above 5 as a system emergency and paged the on-call team at 3am. With `Field(ge=1, le=5)` on the Pydantic model, the validation error would have been caught before the write, the model would have received "Input should be less than or equal to 5," and it would have retried with a valid value. The gap between "the function accepts an int" and "the function accepts an int between 1 and 5" is where silent data corruption lives. + + +
+ Comparison of tool calls with and without schema validation, showing clean errors vs runtime crashes +
Figure 0b.2: Schema validation turns runtime crashes into structured errors that the model can learn from.
+
+ +Validation isn't just about preventing crashes. It's about giving the model enough information to self-correct. A model that receives "Validation error: Input should be 'add', 'subtract', 'multiply' or 'divide'" will fix its next attempt. A model that receives "Error: unhandled exception in tool execution" will guess, and guess wrong. + +## Multiple tools and selection + +Give the model three tools and a question. Watch what happens. + +```python +registry = create_default_registry() + +# Three tools registered: +print(registry.list_tools()) +# ['calculator', 'word_count', 'search'] +``` + +When you send a message like "What is 42 times 17?" along with all three tool schemas, the model reads the descriptions and picks `calculator`. When you send "How many words are in the Gettysburg Address?", it picks `word_count`. When you send "Find me information about Rust async patterns," it picks `search`. + +This works because the tool descriptions are clear and distinct. The model selects tools by matching the user's intent to the tool descriptions. This is not semantic search. It is not embeddings. It is the model reading your descriptions and making a judgment call, the same way a person would read an API catalog and pick the right endpoint. + +Tool descriptions are your API documentation for the model. Write them like you'd write docs for a junior developer who takes everything literally. Because that is exactly what the model is doing: reading your description literally and picking the tool that sounds most relevant. + +Here are the descriptions from the companion code: + +```python +registry.register( + name="calculator", + description="Perform basic arithmetic: add, subtract, multiply, " + "or divide two numbers.", + fn=calculator, + input_model=CalculatorInput, +) +registry.register( + name="word_count", + description="Count the number of words in a piece of text.", + fn=word_count, + input_model=WordCountInput, +) +registry.register( + name="search", + description="Search for information on a topic and return " + "the top results.", + fn=fake_search, + input_model=SearchInput, +) +``` + + +Each description is one sentence that says what the tool does, not how it works. The model doesn't need to know that the calculator uses Python's arithmetic operators or that the search function returns mock data. It needs to know when to use each tool. "Perform basic arithmetic" tells it to use this tool for math. "Count the number of words" tells it to use this tool for word counting. Descriptions are selection criteria, not implementation details. + + +When does the model choose wrong? When the descriptions overlap or when the query is ambiguous. Consider this message: "What's the word count of 'four score and seven years ago'?" + +The model should pick `word_count`. But I've seen models pick `calculator` because they interpret "count" as a numeric operation and try to count the words themselves using arithmetic. This happens more often with smaller models and with vague descriptions. + +The fix is better descriptions, not more complex routing logic. If you find the model consistently selecting the wrong tool, the description is the first place to look. Make it more specific. Add what the tool is NOT for if the confusion is persistent. "Count the number of words in a piece of text. Do not use for arithmetic or calculations." This sounds redundant to a human reader. It's not redundant to the model. 
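+
+Concretely, that fix is a one-line change at registration time. A sketch using the same `registry.register` call shown above -- only the description string changes:
+
+```python
+registry.register(
+    name="word_count",
+    description=(
+        "Count the number of words in a piece of text. "
+        "Do not use for arithmetic or calculations."
+    ),
+    fn=word_count,
+    input_model=WordCountInput,
+)
+```
+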
+ +A few practical guidelines for tool descriptions: + +**State the input, not just the output.** "Perform basic arithmetic on two numbers" is better than "Do math." The model needs to know what to pass, not just what comes back. + +**Use the vocabulary of the domain.** If your users say "look up" and your tool description says "retrieve," the model might hesitate. Match the language your users actually use. + +**Keep it under two sentences.** Long descriptions get lost in the context. The model pays the most attention to the first sentence. Put the selection-critical information there. + +**Don't describe the implementation.** "Uses a PostgreSQL full-text search index with ts_vector" is irrelevant to the model. "Search the knowledge base for documents matching a query" is what it needs. + + +A customer support agent had two tools: `get_order` ("Get order information") and `get_account` ("Get account information"). When users asked "Where's my package?", the model picked `get_account` about 40% of the time. Both descriptions mentioned "information" without saying what kind. The fix: change `get_order` to "Look up shipping status, delivery date, and tracking number for a specific order ID" and `get_account` to "Retrieve account profile, billing address, and payment methods for a customer." After that change, routing accuracy on order-tracking queries went from ~60% to over 95%. The model was not confused about the task. It was confused about which tool matched the task. Specific descriptions fixed it. + + +When you have more than five or six tools, the model's selection accuracy starts to degrade. Not because it can't read six descriptions, but because the probability of description overlap increases. If you find yourself registering fifteen tools, that's an architectural signal. Split them into groups. Use a routing step where one model call picks the category, then a second call picks the specific tool within that category. Chapter 4 covers this pattern in depth. + +## The gap this doesn't cover + +You now have a system that can take actions. But it takes them once. The model calls a tool, gets the result, and the conversation is over. It can't look at the result and decide "that's not enough, let me search again with different terms." It can't chain together a search, a calculation on the search results, and then a summary. It does one thing, and stops. + +This is the single-turn ceiling. It's useful. Single-turn tool use handles a surprising number of real-world tasks: calculators, data lookups, format conversions, API calls. But it's not an agent. + +An agent can observe the result of its action, decide it's insufficient, and take another action. An agent can plan a sequence of steps, adjust the plan based on intermediate results, and know when to stop. The difference is a loop: observe, think, act, repeat. + +
+ Single-turn tool use stops after one action. The agent loop continues: observe, think, act, repeat. +
Figure 0b.3: The single-turn ceiling. Tool use without a loop handles one action. The agent loop adds iteration, and that changes everything.
+
+ +Consider a concrete example. A user asks: "What's the population density of the country with the tallest building in the world?" + +With single-turn tool use, the model can call `search` for "tallest building in the world." It gets back results mentioning the Burj Khalifa in the UAE. But now what? It needs to search again for "UAE population density" and then maybe use the calculator to verify the numbers. In a single-turn system, it got one search result and had to guess the rest. In an agent loop, it would chain those three tool calls together, each informed by the previous result. + +That loop is the subject of the next section. + +## Putting it together + +You've gone from text-in-text-out to a system that can act. It can calculate, search, and analyze. The model reads tool schemas, decides which tool to call, and returns structured JSON with the function name and arguments. Your code validates those arguments with Pydantic, executes the function, and returns the result. Every tool-using system you'll encounter, from simple chatbot plugins to complex agent frameworks, is built on this cycle. + +The key insights from this section: + +System prompts are code. Version them, test them, specify them precisely. Every ambiguity is a bug waiting to happen. + +Few-shot examples are the most effective technique for consistent output. Two examples beat two paragraphs of instructions. + +Function calling is JSON in, JSON out. The model writes a request. Your code fulfills it. There is no magic on either side. + +Schema validation isn't optional. The model will hallucinate arguments. Pydantic catches the bad ones before they reach your functions, and the structured error messages give the model enough information to self-correct. + +Tool descriptions are selection criteria. Write them for a literal reader who will pick the tool that sounds most relevant to the query. + +And the limitation: all of this operates in a single turn. One tool call, one result. The model cannot look at a search result and decide "that's not what I needed, let me refine my query." It cannot chain a search, a calculation on the search results, and a summary together. It does one thing, and stops. + +The next section adds the loop that removes that ceiling. You will build, in about 100 lines of Python, a system that can observe the result of its action, decide whether it is sufficient, and take another action if it is not. The mechanism is a while loop with an LLM inside it. The engineering challenge is knowing when that loop should stop. + +For a fully built tool-using assistant with proper error handling and logging, see the [Tool-Using Assistant](/agentic-ai/projects/tool-using-assistant/) project. diff --git a/src/content/chapters/00c-first-agent.mdx b/src/content/chapters/00c-first-agent.mdx new file mode 100644 index 0000000..dfbd0cb --- /dev/null +++ b/src/content/chapters/00c-first-agent.mdx @@ -0,0 +1,563 @@ +--- +title: Your First Agent, No Framework +part: foundations +description: "Build a complete agent in 100 lines of Python. No framework. No magic. Every line explained, including the ones where it breaks." +readingTime: 25 +date: 2026-03-17 +references: + - 00b-api-to-tools + - 00d-frameworks +patterns: + - agent-loop + - tool-registry +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +You have a system that can call tools. But it calls them once and stops. What if it could look at the result, decide it's not enough, and try again? That's an agent. 
The entire concept is a while loop with an LLM inside it. Let's build one. + +## The loop in 20 lines + +Here is the skeleton of every agent ever built: + +```python +while steps < budget: + response = call_llm(messages) + if response.has_tool_calls: + result = execute_tool(response.tool_calls[0]) + messages.append(tool_call + result) + steps += 1 + else: + return response.content # Model decided it's done +``` + +That's it. That is the core architecture. Every framework, every SDK, every "agent platform" I've looked at wraps some variation of this loop. + +Read it line by line: + +1. **`while steps < budget`** prevents infinite loops. Without a budget, a confused model will call tools forever. This is not theoretical. It will happen on your first real task. + +2. **`call_llm(messages)`** sends the full conversation, including all previous tool calls and results, back to the model. The model sees everything that has happened so far and decides what to do next. + +3. **`if response.has_tool_calls`** is the decision point. The model either wants to take an action (call a tool) or deliver a final answer (return text). There is no third option. + +4. **`execute_tool()`** runs the function locally. The model never executes anything. It writes JSON requesting a function call. Your code does the work. + +5. **`messages.append()`** feeds the tool result back into the conversation. This is how the model "observes" the outcome of its action. Next iteration, it sees the result and decides whether to act again or answer. + +6. **`return response.content`** is how the loop ends cleanly. The model decided it has enough information and produced a text answer instead of another tool call. + +The loop implements observe-think-act-repeat. This cycle has a name in the literature (ReAct, for Reason + Act), but the pattern predates the paper. It's a control loop with a language model in the middle. + +
+ The agent loop: observe, think, act, repeat until answer or budget exhaustion +
Figure 0c.1: The agent loop. Observe, think, act, repeat. The model decides when to stop.
+
+ +## Building it step by step + +The skeleton above is pseudocode. The real thing lives in `src/ch00/raw_agent.py`, about 100 lines. To run it yourself from the repo root: + +```bash +export ANTHROPIC_API_KEY="your-key-here" +python -m src.ch00.raw_agent "What is 15 * 7 + 3?" +``` + +You will see the trace output shown later in this section. We will walk through the code in pieces first. + +### The system prompt + +The system prompt is how you tell the model what kind of agent it should be. Here's the one from the companion code: + +```python +SYSTEM_PROMPT = ( + "You are a research assistant with access to tools. " + "Use the available tools to answer the user's question accurately. " + "When you have enough information to answer fully, respond with plain text. " + "Do not call tools unnecessarily -- stop as soon as you can give a good answer." +) +``` + + +Four sentences. The first establishes the role. The second gives permission to use tools. The third defines the termination condition: respond with text when you have enough information. The fourth prevents over-calling. That last sentence matters more than it looks. Without it, models will call tools "just to be thorough" even when they already know the answer. + + +Notice what the prompt does not say. It doesn't describe how to use the tools (the schemas handle that). It doesn't list which tools exist (the registry provides that). It doesn't say how many steps to take (the budget handles that in code). The system prompt handles intent. Code handles constraints. + +### The result type + +Before the loop, we need a place to put what comes out of it: + +```python +@dataclass +class AgentResult: + """The outcome of a single agent run.""" + answer: str | None + steps: int + total_tokens: int + total_cost_estimate: float + elapsed_ms: float + budget_exhausted: bool + trace: list[dict] = field(default_factory=list) +``` + + +`AgentResult` captures everything you need to evaluate a run. `answer` is `None` when the budget ran out before the model produced a final answer. `trace` records every tool call and its result, which you'll use for debugging. `budget_exhausted` is the field that tells you something went wrong. In a production system, this dataclass would be the contract between the agent and whatever consumes its output. + + +The `trace` field records every tool call, every argument, every result. When something goes wrong (and it will), the trace is how you figure out what the model was thinking. + +### The agent class + +Here is the `Agent` itself, with the core loop: + +```python +class Agent: + def __init__( + self, + client: ModelClient, + registry: ToolRegistry, + max_steps: int = 5, + system_prompt: str = SYSTEM_PROMPT, + ) -> None: + self.client = client + self.registry = registry + self.max_steps = max_steps + self.system_prompt = system_prompt +``` + +Four dependencies. The `client` talks to the model. The `registry` holds the tools. `max_steps` is the budget. `system_prompt` is overridable. No inheritance, no plugin system, just constructor arguments. 
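+
+Wiring those dependencies together looks like this. A sketch only: `create_default_registry()` is the registry builder from Section 0b, and the `ModelClient` constructor arguments are placeholders -- use whatever your Section 0a client actually takes:
+
+```python
+client = ModelClient()  # provider-neutral wrapper from Section 0a (constructor assumed)
+registry = create_default_registry()  # calculator, word_count, search
+agent = Agent(client=client, registry=registry, max_steps=5)
+```
+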
+ +Now the `run` method: + +```python +async def run(self, user_query: str) -> AgentResult: + messages: list[Message] = [ + Message(role=Role.SYSTEM, content=self.system_prompt), + Message(role=Role.USER, content=user_query), + ] + tool_schemas = self.registry.get_schemas() + trace: list[dict] = [] + total_tokens = 0 + steps = 0 + + for step in range(self.max_steps): + steps = step + 1 + request = CompletionRequest(messages=messages, tools=tool_schemas) + response = await self.client.complete(request) + + if response.usage: + total_tokens += response.usage.total_tokens + + # Model wants to call a tool. + if response.tool_calls: + tc = response.tool_calls[0] + tool_result = execute_tool_call( + self.registry, tc.name, tc.arguments + ) + + trace.append({ + "type": "tool_call", + "step": steps, + "tool": tc.name, + "arguments": tc.arguments, + "result": tool_result, + }) + + messages.append( + Message( + role=Role.ASSISTANT, + content=f"[tool_call: {tc.name}({tc.arguments})]", + ) + ) + messages.append( + Message( + role=Role.TOOL, + content=tool_result, + name=tc.name, + tool_call_id=tc.id, + ) + ) + continue + + # Model returned a text answer. + if response.content: + trace.append({ + "type": "response", + "step": steps, + "content": response.content, + }) + return AgentResult( + answer=response.content, + steps=steps, + total_tokens=total_tokens, + total_cost_estimate=0.0, + elapsed_ms=elapsed_ms, + budget_exhausted=False, + trace=trace, + ) + + # Budget exhausted. + return AgentResult( + answer=None, + steps=steps, + total_tokens=total_tokens, + total_cost_estimate=0.0, + elapsed_ms=elapsed_ms, + budget_exhausted=True, + trace=trace, + ) +``` + +Walk through the key decisions: + +**`for step in range(self.max_steps)`** is a hard ceiling. The loop runs at most `max_steps` times. Default is 5. This is the simplest possible guardrail, and I would not ship an agent without it. Remove it and a single confused query can burn through your entire API budget. + +**`CompletionRequest(messages=messages, tools=tool_schemas)`** sends the full conversation plus all tool schemas every iteration. The model sees everything: the system prompt, the original question, every tool call it made, every result it got. This growing message list is the agent's working memory. + +**`if response.tool_calls`** is where the model's decision becomes your code's branch. Tool call? Execute, record, append, `continue`. Text answer? Record and return. Two branches, nothing else. + +**`messages.append()`** happens twice per tool call: once for the assistant's tool request, once for the tool's result. Both go into the conversation so the model sees what it asked for and what it got back. + +**The final `return` after the loop** handles budget exhaustion. `answer` is `None`. `budget_exhausted` is `True`. The caller knows the agent gave up. + +## Run it on a real task + +Give the agent a question that requires two tool calls: "What is 15 * 7 + 3?" + +The agent can't do this in one step. It needs to multiply first, then add. Here's what the trace looks like: + +``` +Query: What is 15 * 7 + 3? + +[Step 1] tool_call calculator({"operation": "multiply", "a": 15, "b": 7}) + result: "105.0" + +[Step 2] tool_call calculator({"operation": "add", "a": 105, "b": 3}) + result: "108.0" + +[Step 3] response "15 * 7 + 3 = 108.0" + +Answer: "15 * 7 + 3 = 108.0" +Steps: 3 +Tokens: 195 +Budget exhausted: False +``` + +Three steps, three model calls, three decisions. 
The agent multiplied first, used that result to set up the addition, then synthesized the final answer. At each step, the model saw everything that came before and chose what to do next.
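+
+Reproducing that trace yourself takes a few lines at the call site. A sketch, assuming an asyncio entry point; the field and key names match `AgentResult` and the `trace` entries above:
+
+```python
+import asyncio
+
+result = asyncio.run(agent.run("What is 15 * 7 + 3?"))
+for entry in result.trace:
+    if entry["type"] == "tool_call":
+        print(f"[Step {entry['step']}] {entry['tool']}({entry['arguments']}) -> {entry['result']}")
+    else:
+        print(f"[Step {entry['step']}] response: {entry['content']}")
+print(f"Answer: {result.answer}  Steps: {result.steps}  Tokens: {result.total_tokens}")
+```
+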
+ Waterfall trace of a multi-step agent run showing three model calls, two tool executions, and a final answer +
Figure 0c.2: A multi-step agent trace. Each row is a model call. The model sees accumulated context and decides whether to call a tool or answer.
+
+ + +The model could have done this math in its head. Most models can multiply 15 by 7 without a calculator. But we told it to use tools, and we gave it a calculator, so it did. This is actually correct behavior: following instructions over taking shortcuts. In production, the tools will do things the model genuinely cannot do, like query a database or call an API. The pattern is the same regardless. + + +Notice how cheap this was. Three model calls, 195 tokens, about $0.001. The cost becomes meaningful at scale (10,000 queries a day) or when the agent takes many more steps per query. Both happen in production. + +## Watch it fail + +The demo works. Now break it. These failures are not edge cases. They are the default behaviors of an unsupervised agent. + +### Failure 1: The infinite loop + +Give the agent a vague, open-ended task: "Search for everything ever written about artificial intelligence." + +``` +Query: Search for everything ever written about AI. + +[Step 1] tool_call search({"query": "AI history"}) + result: [{"title": "Result 1 for 'AI history'", ...}] + +[Step 2] tool_call search({"query": "AI future predictions"}) + result: [{"title": "Result 1 for 'AI future predictions'", ...}] + +[Step 3] tool_call search({"query": "AI ethics and safety"}) + result: [{"title": "Result 1 for 'AI ethics and safety'", ...}] + +Answer: None +Steps: 3 +Budget exhausted: True +``` + +The model never stopped searching. It kept finding new facets, kept deciding there was more to look up, and ran out of budget before synthesizing an answer. With a budget of 3, you wasted three API calls. With a budget of 50, you'd waste fifty. + + +The model has no internal sense of "enough." It doesn't know when diminishing returns kick in. If the task is unbounded ("everything about X"), the model will keep exploring until something forces it to stop. The budget is that force. But the budget is a blunt instrument. It stops the loop. It doesn't teach the model to converge. Better system prompts, explicit instructions about when to stop searching, are part of the fix. Chapter 6 covers this in depth. + + + +A product comparison agent had `max_steps=10` because the developer wanted to "give it room to think." A user asked "What's the cheapest flight from London to Paris next Tuesday?" The agent searched for flights, then searched for airline reviews, then searched for airport transfer options, then searched for hotel deals near the airport, then searched for travel insurance, then searched for visa requirements, then searched for currency exchange rates. Seven search calls, each feeding back context that grew the token count per call. Total cost: $1.87 for a query that needed one search and one answer. The fix: start with `max_steps=3`. If the agent exhausts its budget, examine the trace. Most of the time, the task was answerable in fewer steps and the model was being "thorough" rather than efficient. Raise the budget only after you have evidence that more steps produce materially better answers for your workload. + + +### Failure 2: The hallucinated tool call + +The model invents a tool that doesn't exist. This happens when the model's training data includes functions that your registry doesn't have. + +``` +Query: What is the weather in London? + +[Step 1] tool_call weather({"city": "London"}) + result: "Error: unknown tool 'weather'" + +[Step 2] response "I'm sorry, I don't have access to a weather + tool. I can't check the current weather." + +Answer: "I'm sorry, I don't have access to a weather tool." 
+Steps: 2 +Budget exhausted: False +``` + +The model decided it needed a weather API and called `weather` with reasonable-looking arguments. The function doesn't exist. `execute_tool_call` returned a structured error instead of crashing, the model read that error, and gracefully explained the limitation. + + +This is the validation layer from Section 0b doing its job. `execute_tool_call` checks whether the tool exists before trying to run it. When it doesn't exist, it returns a string error that gets sent back to the model as a tool result. The model reads "Error: unknown tool 'weather'" and self-corrects. If `execute_tool_call` had thrown an exception instead of returning an error string, the agent loop would have crashed and the user would have gotten nothing. + + +This happens frequently with general-purpose models. The model "knows" tools exist for weather, email, calendar, and dozens of other domains. It will try to call them. Your registry is the gatekeeper. + +### Failure 3: The confident wrong answer + +This is the hardest failure to catch. The model stops early with a wrong answer, and it sounds completely confident. + +``` +Query: What is the population of the largest city in Australia? + +[Step 1] response "The largest city in Australia is Sydney, + with a population of approximately 5.3 million." + +Answer: "The largest city in Australia is Sydney, with a + population of approximately 5.3 million." +Steps: 1 +Budget exhausted: False +``` + +The model didn't even use the search tool. It answered from its training data without checking. The answer might be roughly right. It might be outdated. It might be wrong. The point is that the model made a judgment call ("I already know this") and skipped verification. + +Nothing in the trace looks wrong. One step, an answer, no budget exhaustion. Every metric says success. But the answer could be stale, imprecise, or fabricated. + + +The infinite loop is visible. The hallucinated tool call produces an error. But the confident wrong answer looks exactly like a correct answer. The only way to catch it is to build evaluation into your system: compare the agent's output against ground truth. Prompting alone does not solve this. You need test suites. Chapter 6 builds them. + + + +A fact-checking agent was asked "Is Company X still publicly traded?" It searched once, found a 2023 article mentioning the company's IPO, and answered "Yes, Company X is publicly traded." It did not search for more recent news. The company had been taken private six months earlier. The agent stopped after one search because it had "enough information to answer fully," exactly as the system prompt instructed. The fix was not to remove that instruction (you need it to prevent runaway loops). The fix was to add a verification nudge to the system prompt: "For factual claims about current status, search for the most recent information available, not just the first result." A more robust fix, covered in Chapter 6, is to build minimum-step checks: if the task type requires recency, require at least two searches with different date-scoped queries before answering. + + +
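+
+A minimum-step check like that is small enough to sketch here. Hypothetical code, not from the companion repo -- it just inspects the same `trace` list the agent already builds:
+
+```python
+def needs_more_evidence(task_type: str, trace: list[dict]) -> bool:
+    """Require at least two search calls before answering 'current status' questions."""
+    if task_type != "current_status":
+        return False
+    searches = [
+        t for t in trace
+        if t.get("type") == "tool_call" and t.get("tool") == "search"
+    ]
+    return len(searches) < 2
+```
+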
+ Three failure modes: infinite loop (budget exhaustion), hallucinated tool (error recovery), confident wrong answer (silent failure) +
Figure 0c.3: Three failure modes. The first two are loud. The third is silent. Silent failures are the ones that reach production.
+
+ +These are not edge cases. These are the default behaviors of an agent without engineering discipline. Every one of these failures is what the rest of the book teaches you to prevent. + +## Add basic guardrails + +Ten lines of code turn a fragile demo into something that fails gracefully. Not production-ready, but no longer embarrassing. + +### Guardrail 1: The iteration budget + +You already have this. The `max_steps` parameter caps the loop: + +```python +agent = Agent(client=client, registry=registry, max_steps=5) +``` + +Five is a reasonable default for simple tasks. For complex research tasks that chain many tool calls, you might go to 10 or 15. Going above 20 is usually a sign that the task is too vague or the tools are too narrow. If the agent needs 20 steps, reconsider the task decomposition before raising the budget. + +### Guardrail 2: Input validation + +Use Pydantic to validate the user's query before it enters the loop. This is what you built in Section 0b, applied to the agent's input: + +```python +from pydantic import BaseModel, Field + +class AgentQuery(BaseModel): + query: str = Field(min_length=1, max_length=2000) + max_steps: int = Field(default=5, ge=1, le=20) + +# Validate before running +validated = AgentQuery(query=user_input, max_steps=requested_steps) +result = await agent.run(validated.query) +``` + + +Pydantic rejects empty queries, absurdly long queries, and step budgets outside your acceptable range before the agent spends a single token. This costs nothing and prevents a class of issues that are annoying to debug after the fact. + + +### Guardrail 3: Step logging + +Print what happens at each step. This is the minimum viable observability: + +```python +for step in range(self.max_steps): + steps = step + 1 + response = await self.client.complete(request) + + tokens_this_step = response.usage.total_tokens if response.usage else 0 + total_tokens += tokens_this_step + print(f"[Step {steps}] tokens={tokens_this_step} total={total_tokens}") + + if response.tool_calls: + tc = response.tool_calls[0] + print(f" -> tool_call: {tc.name}({tc.arguments})") + tool_result = execute_tool_call(self.registry, tc.name, tc.arguments) + print(f" <- result: {tool_result[:100]}") + # ... append to messages and continue +``` + +In production, replace `print` with structured logging. But `print` is infinitely better than nothing. When the agent does something unexpected at 2am, these logs are the difference between a five-minute diagnosis and a blind debugging session. + +
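+
+The last piece of graceful failure lives at the call site: check `budget_exhausted` before trusting the result. A sketch using the `AgentResult` fields defined earlier:
+
+```python
+result = await agent.run(validated.query)
+if result.budget_exhausted:
+    # No answer was produced; the trace tells you what the agent tried.
+    print(f"Gave up after {result.steps} steps and {result.total_tokens} tokens")
+    for entry in result.trace:
+        print(entry)
+else:
+    print(result.answer)
+```
+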
+ Before: raw loop with no protections. After: budget, validation, and logging added in 10 lines. +
Figure 0c.4: Before and after guardrails. Ten lines of code. Budget caps the loop. Validation rejects bad input. Logging shows you what happened.
+
+ +This is 10% of what production hardening looks like. Chapter 6 gives you the other 90%: evaluation suites, cost tracking, retry policies, circuit breakers, and structured observability. But these three guardrails are the ones you add on day one. + +## The code in full + +Here is the complete agent in one block. You can read this top to bottom in ten minutes and understand everything that happens. No hidden utilities. No imports from libraries that do the hard work for you. The three imports at the top are the pieces you built in previous sections: `ModelClient` (the provider-neutral wrapper from Section 0a), `ToolRegistry` and `execute_tool_call` (the tool registration and dispatch from Section 0b). In the companion code, these live in `src/shared/` and `src/ch00/`. In a real project, they would be your own modules. + +```python +"""A minimal agent: a while loop with an LLM inside it.""" + +from __future__ import annotations + +import time +from dataclasses import dataclass, field + +from src.shared.model_client import ModelClient +from src.shared.types import CompletionRequest, Message, Role +from src.ch00.tool_use import ToolRegistry, execute_tool_call + + +SYSTEM_PROMPT = ( + "You are a research assistant with access to tools. " + "Use the available tools to answer the user's question accurately. " + "When you have enough information to answer fully, respond with plain text. " + "Do not call tools unnecessarily -- stop as soon as you can give a good answer." +) + + +@dataclass +class AgentResult: + """The outcome of a single agent run.""" + answer: str | None + steps: int + total_tokens: int + total_cost_estimate: float + elapsed_ms: float + budget_exhausted: bool + trace: list[dict] = field(default_factory=list) + + +class Agent: + """A minimal agent that loops between model calls and tool execution.""" + + def __init__( + self, + client: ModelClient, + registry: ToolRegistry, + max_steps: int = 5, + system_prompt: str = SYSTEM_PROMPT, + ) -> None: + self.client = client + self.registry = registry + self.max_steps = max_steps + self.system_prompt = system_prompt + + async def run(self, user_query: str) -> AgentResult: + start_time = time.monotonic() + + messages: list[Message] = [ + Message(role=Role.SYSTEM, content=self.system_prompt), + Message(role=Role.USER, content=user_query), + ] + tool_schemas = self.registry.get_schemas() + trace: list[dict] = [] + total_tokens = 0 + steps = 0 + + for step in range(self.max_steps): + steps = step + 1 + request = CompletionRequest(messages=messages, tools=tool_schemas) + response = await self.client.complete(request) + + if response.usage: + total_tokens += response.usage.total_tokens + + # Model wants to call a tool. + if response.tool_calls: + tc = response.tool_calls[0] + tool_result = execute_tool_call( + self.registry, tc.name, tc.arguments + ) + + trace.append({ + "type": "tool_call", + "step": steps, + "tool": tc.name, + "arguments": tc.arguments, + "result": tool_result, + }) + + messages.append( + Message( + role=Role.ASSISTANT, + content=f"[tool_call: {tc.name}({tc.arguments})]", + ) + ) + messages.append( + Message( + role=Role.TOOL, + content=tool_result, + name=tc.name, + tool_call_id=tc.id, + ) + ) + continue + + # Model returned a text answer. 
+ if response.content: + elapsed_ms = (time.monotonic() - start_time) * 1000 + trace.append({ + "type": "response", + "step": steps, + "content": response.content, + }) + return AgentResult( + answer=response.content, + steps=steps, + total_tokens=total_tokens, + total_cost_estimate=0.0, + elapsed_ms=elapsed_ms, + budget_exhausted=False, + trace=trace, + ) + + # Budget exhausted. + elapsed_ms = (time.monotonic() - start_time) * 1000 + return AgentResult( + answer=None, + steps=steps, + total_tokens=total_tokens, + total_cost_estimate=0.0, + elapsed_ms=elapsed_ms, + budget_exhausted=True, + trace=trace, + ) +``` + +One file. One class. One loop. No decorators, no metaclasses, no dependency injection. Every line is visible. Every decision is explicit. + +This is the agent you will compare against every framework you evaluate. You wrote the tool registry, the schema generation, the validation layer, the agent loop, the trace, and the guardrails. In Section 0d, you will rebuild this same agent using Google ADK and LangChain. The tool logic stays the same. The system prompt stays the same. The failure modes stay the same. What changes is that four things get automated: tool registration (the `ToolRegistry` and `to_schema()` you wrote), the agent loop (the `for` loop above), conversation state (the growing `messages` list), and tracing (the `trace` dictionary). The hard engineering decisions do not disappear. They just move inside the framework. When someone shows you a 500-line agent class with plugins, middleware, and lifecycle hooks, you will now know exactly what those 500 lines are wrapping: the 100-line version you just wrote. + +## What you built, and what comes next + +You just built an agent. It works. It also breaks in predictable ways. You added basic guardrails that help, but you made a dozen judgment calls by instinct: how big the budget, when to stop searching, what to do when confidence is low, whether this task even needed an agent or could have been a simple tool call. Those instincts were sometimes right. But instincts do not scale to a team of five engineers building agent systems. Chapter 1 gives you the precise vocabulary to make these decisions explicit. It defines five system types, from single LLM calls through multi-agent orchestrations, and gives you a decision framework for choosing when a task needs the loop you just built and when a deterministic workflow is the better call. The rest of the book gives you the engineering to build whichever one you choose, for production. + +For an expanded version with more tools, proper error handling, and example queries, see the [Research Agent](/agentic-ai/projects/research-agent/) project. diff --git a/src/content/chapters/00d-frameworks.mdx b/src/content/chapters/00d-frameworks.mdx new file mode 100644 index 0000000..b8db41f --- /dev/null +++ b/src/content/chapters/00d-frameworks.mdx @@ -0,0 +1,343 @@ +--- +title: The Same Agent, With a Framework +part: foundations +description: "The same agent rebuilt with Google ADK and LangChain. Side-by-side comparison with eval scores. What frameworks give you, what they hide, and how to choose." +readingTime: 15 +date: 2026-03-18 +references: + - 00c-first-agent + - 00e-connecting-to-mcp +patterns: [] +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +You built an agent from scratch. You understand the loop, the tools, the context assembly, the failure modes. Now let's rebuild it with a framework and see what changes. 
Spoiler: the hard parts don't disappear. They just move. + +## Why frameworks exist (the honest version) + +Frameworks solve real problems. Tool registration boilerplate, conversation history management, retry logic, tracing, structured logging, multi-model routing. If you're building a production agent, you will eventually build these things yourself or use a framework that already has them. Building them yourself takes weeks. Using a framework takes an afternoon. + +Frameworks also create real problems. Magic you can't debug. Abstractions that leak under pressure. Upgrade churn that breaks your code every six months. Vendor lock-in that feels invisible until you need to switch. Hidden retries that spike your API bill. Config objects with fifty parameters where the defaults are wrong for your use case. + +The point is not "frameworks are good" or "frameworks are bad." The point is to know what you're trading. You are always trading something. Fewer lines of code for less visibility. Faster setup for harder debugging. A richer ecosystem for tighter coupling. These are real tradeoffs, and the right answer depends on what you're building and how long it needs to run. + +Here is the question that separates engineers who use frameworks well from engineers who suffer under them: when this breaks at 2am, will I be debugging my code or the framework's code? If you understand what the framework is doing (because you built the raw version first), you can answer that question before you choose. + +## What frameworks actually automate + +Before comparing specific frameworks, it helps to name the four things you built by hand that every framework handles for you: + +1. **Tool registration.** You wrote `Tool`, `ToolRegistry`, and `to_schema()` to convert Pydantic models into JSON schemas. Frameworks do this automatically from type hints and docstrings. +2. **The agent loop.** You wrote the `for step in range(max_steps)` loop that calls the model, checks for tool calls, dispatches, and appends results. Frameworks run this loop internally. +3. **Conversation state.** You manually grew the `messages` list, appending assistant tool calls and tool results. Frameworks manage this list for you. +4. **Tracing.** You built a `trace` list of dictionaries. Frameworks generate structured, exportable traces automatically. + +The tool logic (the actual `calculator`, `search`, `word_count` functions) stays yours. The system prompt stays yours. The failure modes stay yours. The framework replaces the plumbing, not the engineering decisions. + +With that in mind, let's see how two frameworks handle the same agent. + +## Google ADK: the primary walkthrough + +Google's Agent Development Kit is opinionated about the things that matter in production: tracing, evaluation, and tool registration. It gives you a structured way to define agents and tools, and it stays out of your way on the rest. Let's rebuild the research agent. + +Here is the complete ADK agent from `src/ch00/adk_agent.py`: + +```python +from google.adk.agents import Agent +from google.adk.tools import FunctionTool + + +def calculator(operation: str, a: float, b: float) -> str: + """Perform a basic arithmetic operation. + + Args: + operation: One of add, subtract, multiply, divide. + a: Left operand. + b: Right operand. 
+ """ + op = operation.lower() + if op == "add": + return str(float(a + b)) + elif op == "subtract": + return str(float(a - b)) + elif op == "multiply": + return str(float(a * b)) + elif op == "divide": + if b == 0: + return "Error: division by zero" + return str(float(a / b)) + return f"Error: unknown operation '{operation}'" + + +def word_count(text: str) -> str: + """Count the number of words in *text*.""" + return f"Word count: {len(text.split())}" + + +def search(query: str, max_results: int = 3) -> str: + """Return search results for a query.""" + results = [ + {"title": f"Result {i+1} for '{query}'", "url": f"https://example.com/{i+1}"} + for i in range(max(1, min(max_results, 10))) + ] + return json.dumps(results, indent=2) + + +tools = [ + FunctionTool(func=calculator), + FunctionTool(func=word_count), + FunctionTool(func=search), +] + +agent = Agent( + name="foundations_agent", + model="gemini-2.0-flash", + instruction=( + "You are a helpful assistant. Use the available tools to answer " + "the user's question accurately. Stop as soon as you have a good answer." + ), + tools=tools, +) +``` + + +The same three tools, the same logic, roughly 40 lines instead of 100. The `FunctionTool` wrapper reads your function's docstring and type hints to generate the schema automatically. No `ToolRegistry`. No `Tool` class. No `to_schema()` method. No `execute_tool_call()`. The framework handles all of that. + + +Walk through the key differences from the raw version: + +**Tool registration.** In the raw agent, you defined a Pydantic input model, wrote a `to_schema()` method, and manually registered each tool. In ADK, you wrap a plain function in `FunctionTool` and the framework infers the schema from the docstring and type annotations. Less boilerplate, but also less control. If the framework misreads your docstring (and it sometimes does with complex argument descriptions), you'll be debugging the schema inference rather than the schema itself. + +**The agent loop.** You don't write it. ADK runs its own internal observe-think-act loop. The loop logic is the same as what you built in Section 0c, but it's inside the framework. You configure it with `max_steps` (default varies by version) and the framework handles conversation history, tool dispatch, and termination. The upside: less code. The downside: when the loop does something unexpected, you're reading ADK source code instead of your 20-line while loop. + +**Tracing.** This is where ADK earns its keep. Every tool call, every model response, every step is traced with structured metadata. Run the agent and inspect the trace: + +``` +[Step 1] calculator({"operation": "multiply", "a": 15, "b": 7}) -> "105.0" +[Step 2] calculator({"operation": "add", "a": 105, "b": 3}) -> "108.0" +[Step 3] Response: "15 * 7 + 3 = 108" +``` + +The trace looks identical to the one you built manually in Section 0c. The difference is that ADK generates it automatically and can export it to Google Cloud Trace, a local file, or a custom sink. Your raw agent's `trace` list was a `list[dict]` you printed to stdout. ADK's trace is structured, persistent, and queryable. + +**What ADK does for you:** schema generation from docstrings, the agent loop, structured tracing, conversation state management, built-in eval hooks. + +**What you still do yourself:** the actual tool logic, the system prompt, deciding when to use which tool (through prompt engineering), error handling within tools, and every domain-specific decision about what the agent should do. 
+ +The framework version is shorter. Is it better? That depends on whether you value fewer lines of code or understanding every line. In production, I'd reach for ADK because tracing and eval are built in. For learning, I'd build raw first. Always. + +## LangChain: the comparison + +Same agent, different philosophy. LangChain comes from a chain-based composition model where you build pipelines by connecting components. The agent pattern was added later, and it shows in the architecture. The current recommended approach uses LangGraph's `create_react_agent`, which is closer to the raw loop than older LangChain patterns. + +Here is the LangChain version from `src/ch00/langchain_agent.py`: + +```python +from langchain_core.tools import tool +from langchain_anthropic import ChatAnthropic +from langgraph.prebuilt import create_react_agent + + +@tool +def calculator(operation: str, a: float, b: float) -> str: + """Perform a basic arithmetic operation. + + Args: + operation: One of add, subtract, multiply, divide. + a: Left operand. + b: Right operand. + """ + op = operation.lower() + if op == "add": + return str(float(a + b)) + elif op == "subtract": + return str(float(a - b)) + elif op == "multiply": + return str(float(a * b)) + elif op == "divide": + if b == 0: + return "Error: division by zero" + return str(float(a / b)) + return f"Error: unknown operation '{operation}'" + + +@tool +def word_count(text: str) -> str: + """Count the number of words in the input text.""" + return f"Word count: {len(text.split())}" + + +@tool +def search(query: str, max_results: int = 3) -> str: + """Search for information and return mock results.""" + results = [ + {"title": f"Result {i+1} for '{query}'", "url": f"https://example.com/{i+1}"} + for i in range(max(1, min(max_results, 10))) + ] + return json.dumps(results, indent=2) + + +model = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0) +tools = [calculator, word_count, search] +agent = create_react_agent(model, tools) +``` + + +About 35 lines. The `@tool` decorator works similarly to ADK's `FunctionTool`: it reads the docstring and type hints to build the tool schema. `create_react_agent` wires up the ReAct loop internally. Three imports, three decorated functions, three lines of setup. Notice `ChatAnthropic(model="claude-haiku-4-5-20251001")` at line 159: LangChain uses provider-specific model classes rather than the provider-neutral `ModelClient` you built in Section 0a. This is a philosophical choice. ADK abstracts the provider. LangChain ties your code to a specific provider class. If you later swap from Anthropic to OpenAI, the raw agent and the ADK agent need a config change. The LangChain agent needs a code change. + + +The philosophical difference matters here. LangChain was built as a composition framework, where you chain together prompts, models, retrievers, and output parsers into pipelines. This is powerful for linear workflows (retrieve, then summarize, then format). It is awkward for agents, where the control flow is a loop, not a chain. LangGraph (the agent layer) fixes this by introducing a graph-based execution model, but you're now dealing with two mental models: chains for data flow and graphs for control flow. + +**What LangChain makes easier.** The ecosystem is enormous. Need to connect to a vector database? There's a LangChain integration. Need to parse PDF files? Integration. Need to call Anthropic, OpenAI, Google, Cohere, or Mistral? Integrations for all of them. 
If your project involves wiring together many external services, LangChain has probably already written the glue code. + +**What LangChain makes harder.** Debugging. When a chain breaks, the error surfaces through multiple abstraction layers. A type mismatch deep in a retriever chain produces an error message that references LangChain internal classes, not your code. Version churn is also a factor. LangChain has gone through several major API changes (the chain API, the LCEL API, the LangGraph API), and code written twelve months ago often needs significant rework. This is the cost of a fast-moving ecosystem. + +LangChain has the largest ecosystem. It also has the highest abstraction penalty. When your chain breaks at 2am, you're reading LangChain source code, not your code. This is not a disqualifying flaw. It's a fact you should know before you commit. + + +A LangChain agent failed on a tool call where the model passed a string instead of an integer for a `max_results` parameter. The traceback was 42 lines long, starting in `langgraph.pregel`, passing through `langchain_core.runnables`, `langchain_core.tools`, and three layers of internal dispatch before reaching the actual TypeError in user code. An engineer spent 25 minutes reading framework internals before finding the one relevant line. The same bug in the raw agent produced a 4-line trace: the tool name, the bad arguments, the Pydantic validation error, and the error message sent back to the model. The fix in both cases was identical (add type validation). The diagnosis time was not. This is what "abstraction penalty" means in practice. If your team adds a framework, add structured logging at YOUR layer too, not just the framework's. Log the tool name, the raw arguments, and the result before the framework touches them. When something breaks, you want your logs to tell you what happened, independent of whether the framework's logs are readable. + + + +That same type mismatch in ADK produced a trace entry showing: tool name `search`, arguments `{"query": "python", "max_results": "five"}`, and result `Validation error: Input should be a valid integer`. Three fields. No framework internals. The engineer saw the bad argument, saw the validation error, and fixed the tool description to clarify "max_results must be a number, not a word" in under two minutes. This is not because ADK is "better." It is because ADK's tracing exposes tool-level details by default. The principle: when evaluating frameworks, give them a bad input and read the error output. The framework that shows you the problem fastest is the one that will cost you the least at 2am. + + +**Observability.** LangChain's tracing story is LangSmith, which is a separate product with its own pricing. ADK's tracing is built into the framework. Your raw agent's tracing is whatever you build. The separation between the framework and the observability tool in LangChain's case means you're managing two dependencies instead of one, and you're sending trace data to an external service you don't control. 
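+
+Whichever framework you choose, the advice from the war story above stands: log tool calls at your own layer, before the framework touches them. A minimal, framework-agnostic sketch (the logger name and truncation length are illustrative):
+
+```python
+import functools
+import json
+import logging
+
+logger = logging.getLogger("agent.tools")
+
+
+def logged(fn):
+    """Wrap a tool function so the name, raw arguments, and result are always recorded."""
+    @functools.wraps(fn)  # preserves __name__ and __doc__ so schema inference still works
+    def wrapper(*args, **kwargs):
+        logger.info("tool_call %s %s", fn.__name__, json.dumps(kwargs, default=str))
+        result = fn(*args, **kwargs)
+        logger.info("tool_result %s %s", fn.__name__, str(result)[:200])
+        return result
+    return wrapper
+```
+
+Apply the wrapper before handing the function to `FunctionTool` or `@tool`, and your logs survive any framework swap.
+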
+ +## The three-way comparison + +Here is what the same agent looks like across all three approaches, measured on dimensions that matter in production: + +| Dimension | Raw | ADK | LangChain | +|-----------|-----|-----|-----------| +| Lines of code | ~100 | ~40 | ~35 | +| Debug a failure | Read your code | Read traces | Read chains + source | +| Add a new tool | Write a function | Decorate + register | Wrap in Tool class | +| Eval integration | Build it yourself | Built-in | LangSmith (separate) | +| Lock-in | None | Google ecosystem | LangChain ecosystem | +| Best for | Learning, unusual needs | Production, need tracing | Prototyping, need integrations | + +
+ Three-way comparison showing raw agent, ADK, and LangChain across lines of code, debuggability, and production readiness +
Figure 0d.1: The same agent, three ways. Each approach trades something for something else. The code gets shorter. The debugging gets harder. The features get richer.
+
+ +A few observations that the table doesn't capture: + +**Portability.** The raw agent works with any model provider because you control the client. ADK is designed for Google's models (Gemini) first, with other providers supported through adapters. LangChain supports the most providers out of the box but ties you to its abstraction layer. Pick the lock-in you're most comfortable with. + +**Upgrade velocity.** The raw agent changes when you change it. ADK and LangChain change when their maintainers ship a new version. LangChain's upgrade velocity is particularly high, which means more features but also more breaking changes. If you're running agents in production for months or years, framework upgrades are a maintenance cost you need to budget for. + +**Team onboarding.** The raw agent is readable by anyone who knows Python. ADK requires learning ADK. LangChain requires learning LangChain, which is a larger surface area. If your team has three months of LangChain experience, switching to ADK has a real cost. If your team has zero framework experience, ADK's smaller API surface is faster to learn. + +## The honest take + +I have opinions. Strong ones. Here they are. + +If you're building something serious, pick a framework that gives you visibility, not convenience. Traces matter more than fewer lines of code. An agent that runs correctly but can't be debugged when it doesn't is a liability. Every production incident I've seen with agent systems came down to the same question: what did the model do, and why? Frameworks that answer this question well (ADK does, LangSmith does if you pay for it, your raw trace list does if you build it properly) are worth the dependency. Frameworks that hide this information behind convenience wrappers are not. + +If you're learning, build raw first. Then move to a framework. I would not recommend the reverse. Starting with a framework means learning its API without understanding what it's doing underneath. When something breaks (and it will), you end up debugging abstractions you don't understand. You google the error message instead of reading the code. You copy-paste solutions from Stack Overflow instead of reasoning about the system. Building raw first takes two days. It saves you months of confused debugging later. + +If your team already uses LangChain, that's fine. Understand what it's doing (you now can), and add the engineering discipline the framework doesn't give you. Add structured evaluation. Add cost tracking. Add budget limits. Add trace export. LangChain gives you the plumbing. It doesn't give you the engineering practices. Those are on you, and they matter more than the plumbing. + +If you're starting fresh, I'd reach for ADK. It's opinionated about the right things (tracing, eval) and stays out of your way on the rest. The API surface is small enough to learn in an afternoon. The tracing is good enough for production. The tool registration is simple. And if you've built the raw agent first (which you have, if you're reading this book in order), you'll understand exactly what ADK is doing for you and what it's not. + +None of this is religious. Use what works. But "what works" includes debuggability, maintainability, and the ability to answer "what happened?" when things go wrong. Not just "does it run?" + +
+ Decision tree for choosing between raw agent, ADK, and LangChain based on team experience, production needs, and ecosystem requirements +
Figure 0d.2: Framework decision tree. Start with what you need (visibility, ecosystem, control), not what has the most GitHub stars.
+
+ +## Eval as a mindset, not a tool + +You've seen three implementations of the same agent. Which one is "best"? That question is meaningless without data. Let's get some. + +The eval harness in `src/ch00/eval_compare.py` runs five test queries against each implementation, scores the answers, and compares the results. The scoring is deliberately simple: exact match scores 1.0, substring match scores 0.8, no match scores 0.0. This is not a production evaluation suite. It's the minimum structure needed to make framework decisions with evidence instead of opinions. + +Here are the test queries: + +```python +TEST_QUERIES = [ + {"query": "What is 12 plus 8?", "expected": "20"}, + {"query": "What is 9 multiplied by 7?", "expected": "63"}, + {"query": "How many words are in the sentence: the quick brown fox?", "expected": "4"}, + {"query": "Search for information about machine learning.", "expected": "machine learning"}, + {"query": "What is 100 divided by 4?", "expected": "25"}, +] +``` + +And the scoring function: + +```python +def score_answer(query: str, expected: str, actual: str) -> EvalResult: + norm_expected = expected.strip().lower() + norm_actual = actual.strip().lower() + + if norm_expected == norm_actual: + score = 1.0 + elif norm_expected in norm_actual: + score = 0.8 + else: + score = 0.0 + + return EvalResult( + query=query, expected=expected, actual=actual, + score=score, tokens=0, latency_ms=0.0, cost_estimate=0.0, + ) +``` + + +The scorer checks two things: did the agent's answer match exactly, or does the expected answer appear somewhere in the response? An agent that returns "12 + 8 = 20" scores 0.8 (substring match) rather than 1.0 (exact match for "20"). This is intentional. Agents typically give conversational answers, not bare values. The 0.8 score says "correct but verbose." Production evals would use semantic similarity or LLM-as-judge scoring, but the structure is the same. + + +Run the eval harness and you get output like this: + +``` +============================================================ +Implementation: raw_agent +============================================================ +Query Expected Got Score +---------------------------------------- ------------ ------------------------- ----- +What is 12 plus 8? 20 12 + 8 = 20 0.8 +What is 9 multiplied by 7? 63 9 * 7 = 63 0.8 +How many words are in the sentence: t.. 4 There are 4 words in .. 0.8 +Search for information about machine .. machine le.. Here are results about.. 0.8 +What is 100 divided by 4? 25 100 / 4 = 25 0.8 + +============================================================ +Summary +============================================================ +Implementation Avg Score Total Tokens Avg Latency ms Total Cost USD +--------------------------------------------------------------------------- +raw_agent 0.80 725 2.3 0.001740 +adk_agent 0.80 680 1.8 0.001632 +langchain_agent 0.80 695 2.1 0.001668 +``` + +All three implementations score the same. The tools are the same. The logic is the same. The framework is irrelevant to correctness. What differs is the operational overhead: how many tokens each framework adds (system prompt wrapping, internal formatting), how fast each one dispatches (framework overhead per step), and how much each one costs over thousands of runs. + +The numbers don't lie. This is how you make framework decisions: with data, not blog posts. Run your actual queries against your actual candidates, measure correctness, cost, and latency, then choose. 
This is also how you make every agent decision going forward. Chapter 6 goes deep on production evaluation, covering semantic scoring, LLM-as-judge patterns, regression testing, and cost profiling. + +
+ Layered view showing tool logic at the bottom, framework in the middle, and eval on top, with the key insight that eval sits above all frameworks +
Figure 0d.3: Eval sits above the framework layer. Your tools and your evaluation suite are yours. The framework is the replaceable part in the middle.
+
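One practical habit that keeps the middle layer replaceable: write each tool as a plain Python function and derive whatever schema a given framework wants from the function itself. A minimal sketch, with a deliberately simplified schema generator (every parameter treated as a string) -- not any framework's actual adapter API:

```python
import inspect

# Tool logic: plain Python, no framework imports.
def word_count(text: str) -> str:
    """Count the number of words in a text."""
    return f"The text contains {len(text.split())} words."

TOOLS = [word_count]

def to_schema(fn) -> dict:
    """Illustrative adapter: build a function-calling schema from the function signature and docstring."""
    params = {name: {"type": "string"} for name in inspect.signature(fn).parameters}
    return {
        "name": fn.__name__,
        "description": (fn.__doc__ or "").strip(),
        "parameters": {"type": "object", "properties": params, "required": list(params)},
    }
```

The point is the layering: the functions and the eval suite stay put, while the adapter -- the framework -- is the part you can swap.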
+ + +The most important takeaway from this section is not which framework scored highest. It's that you can answer the question at all. Too many teams pick frameworks based on blog posts, conference talks, and GitHub stars. Then they wonder why the thing doesn't work for their use case. The teams I've seen succeed pick frameworks based on measured performance against their own workloads. Build the eval first. Then try the frameworks. Not the other way around. + + +## What you've built, and what comes next + +You've now built the same agent three ways. You understand what happens at every level, from the raw API call through the loop through the framework abstractions. You know what frameworks give you, what they take away, and how to evaluate both. + +The raw agent taught you the mechanism. The framework agents taught you the tradeoffs. The eval harness taught you how to choose. This is the pattern for every engineering decision in this book: understand the fundamentals, evaluate the options, measure the results. + +You also made a lot of decisions by feel. You picked a budget of 5 without measuring whether 3 was enough. You wrote tool descriptions that seemed clear but might break on ambiguous queries. You chose a system prompt by instinct. You eyeballed eval scores instead of running statistical significance tests. All of that worked for a three-tool demo agent. None of it works for a production system handling 10,000 queries a day where a wrong answer has consequences. + +[Chapter 1](/agentic-ai/book/01-what-agentic-means/) replaces instinct with engineering vocabulary. It defines five system types, from single LLM calls through multi-agent orchestrations, and gives you a decision framework for choosing when a task needs the loop you just built and when a deterministic workflow is the safer bet. The rest of the book turns every judgment call you made by feel into a measurable, testable, reviewable engineering decision. + +For all three implementations with shared eval and comparison scripts, see the [Framework Comparison](/agentic-ai/projects/framework-comparison/) project. diff --git a/src/content/chapters/00e-connecting-to-mcp.mdx b/src/content/chapters/00e-connecting-to-mcp.mdx new file mode 100644 index 0000000..31d130d --- /dev/null +++ b/src/content/chapters/00e-connecting-to-mcp.mdx @@ -0,0 +1,299 @@ +--- +title: Connecting Your Agent to MCP +part: foundations +description: "Build an MCP server from scratch and connect your agent to it. The standard protocol for giving AI agents access to tools, data, and services." +readingTime: 14 +date: 2026-03-19 +references: + - 00d-frameworks +patterns: [] +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +In Section 0c, we built an agent with three tools: `search`, `calculator`, and `word_count`. Those tools lived in the same Python file as the agent. The agent imported them directly. This works fine when you control everything. + +But what happens when your agent needs to use a tool maintained by another team? Or a database that lives on a different server? Or a third-party service that updates its API every quarter? You could write custom integration code for each one. If you have 5 agents and 10 tools, that is 50 integrations to build and maintain. + +MCP solves this problem. + +## What is MCP? + +**MCP** (Model Context Protocol) is an open standard for connecting AI applications to tools, data sources, and services. 
It was created by Anthropic and is now the most widely adopted protocol for agent-to-tool communication. + +The analogy: MCP is USB-C for AI. Before USB-C, every phone had a different charger. You needed a drawer full of cables. USB-C standardized the connector so one cable works with everything. MCP does the same for agent tools: each tool implements the server once, each agent implements the client once, and they all work together. + +
+ Before MCP: 5 apps times 4 tools equals 20 custom integrations. After MCP: 5 plus 4 equals 9 standard integrations. +
Without MCP, every app writes custom code for every tool. With MCP, each side implements the standard once.
+
+ +## The three roles + +Every MCP interaction has three participants: + +
+ MCP architecture showing three roles: Host (your AI application with LLM and MCP client), MCP Servers (exposing tools), and two transport options (stdio for local, HTTP for remote). +
MCP architecture: the host contains the LLM and one or more MCP clients, each connected to a server.
+
+ +**Host.** Your AI application. Claude Desktop, VS Code with Copilot, or the custom agent you built in Section 0c. The host contains the LLM and one or more MCP clients. + +**Client.** A connector that maintains a session with one MCP server. The client handles the protocol: initialization, tool discovery, and tool invocation. Most of the time, you use a library for this (the MCP Python SDK or TypeScript SDK). You do not write the client from scratch. + +**Server.** A program that exposes tools, data, or services over MCP. This is what you build. A server says: "Here are the tools I offer, here are their parameters, call me when you need them." + +## How the protocol works + +When your agent connects to an MCP server, four things happen in sequence: + +
+ MCP protocol lifecycle: 1) Initialize (handshake), 2) Discover (tools/list), 3) Invoke (tools/call), 4) Loop (result goes back to LLM for next decision). +
The MCP lifecycle: initialize, discover available tools, invoke them as the LLM decides, and loop.
+
+ +**Step 1: Initialize.** The client and server perform a handshake. Each side declares what protocol version it supports and what capabilities it offers. This happens once when the connection starts. + +**Step 2: Discover.** The client calls `tools/list`. The server responds with a list of available tools -- each with a name, a description, and a JSON Schema defining its parameters. This is the same schema format the LLM uses for function calling (Section 0b). The client feeds these tool definitions to the LLM so the model knows what it can call. + +**Step 3: Invoke.** When the LLM decides to use a tool, it generates a tool call (just like in Section 0c). The MCP client sends `tools/call` with the tool name and arguments. The server executes the tool and returns the result. + +**Step 4: Loop.** The result goes back to the LLM's context. The LLM decides whether it has enough information to answer or needs to call another tool. This is the same agent loop from Section 0c -- MCP does not change the loop. It changes where the tools live. + +The protocol runs on **JSON-RPC 2.0** -- a simple request/response format over two transports: + +- **stdio** -- for local servers running as subprocesses on the same machine. Fast, no network. This is what you use during development and for tools that access local files. +- **Streamable HTTP** -- for remote servers accessible over the network. Supports authentication, streaming, and session management. This is what you use in production. + +## Building your first MCP server + +Let's build a simple MCP server that exposes three tools: a unit converter, a word counter, and a random fact generator. We will use the official MCP Python SDK. + +### Install the SDK + +```bash +pip install mcp +``` + +### The server code + +Create a file called `my_server.py`: + +```python +from mcp.server.fastmcp import FastMCP + +# Create the server +mcp = FastMCP("My First MCP Server") + + +@mcp.tool() +def convert_temperature(celsius: float) -> str: + """Convert a temperature from Celsius to Fahrenheit.""" + fahrenheit = (celsius * 9 / 5) + 32 + return f"{celsius}°C = {fahrenheit}°F" + + +@mcp.tool() +def word_count(text: str) -> str: + """Count the number of words in a text.""" + count = len(text.split()) + return f"The text contains {count} words." + + +@mcp.tool() +def reverse_text(text: str) -> str: + """Reverse a string of text.""" + return text[::-1] +``` + +That is the entire server. Three things to notice: + +1. **`@mcp.tool()` is all you need.** The decorator registers the function as an MCP tool. The SDK automatically generates the JSON Schema from the function signature and docstring. The function name becomes the tool name. The docstring becomes the tool description that the LLM reads. + +2. **The function parameters are the tool parameters.** `celsius: float` becomes a required parameter of type `number` in the schema. Python type hints do the work. + +3. **Return a string.** MCP tool results are content blocks. For simple tools, return a string and the SDK wraps it as text content. 
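One detail before running it: the file above defines the server but never starts it. If you launch the script directly with `python`, add an entry point at the bottom -- a minimal sketch, assuming the SDK's `mcp.run()` method (the `mcp run` CLI shown below locates the server object in the file, so it typically does not need this):

```python
# Append to my_server.py so `python my_server.py` starts the server.
# Assumption: the FastMCP instance exposes run(), defaulting to the stdio transport.
if __name__ == "__main__":
    mcp.run()
```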
+ +### Run the server + +For local development, run it with stdio transport: + +```bash +python my_server.py +``` + +Or for HTTP transport (accessible over the network): + +```bash +mcp run my_server.py --transport http --port 8080 +``` + +### Test it with the MCP Inspector + +The MCP Inspector is a browser-based tool that lets you test your server without writing a client: + +```bash +npx @modelcontextprotocol/inspector +``` + +This opens a web UI where you can connect to your server, see the available tools, and call them interactively. Use this to verify your server works before connecting an agent to it. + +## Connecting Claude Desktop to your server + +The fastest way to see your MCP server in action with a real LLM is Claude Desktop. + +**Step 1.** Open Claude Desktop settings and find the MCP configuration file: + +- macOS: `~/Library/Application Support/Claude/claude_desktop_config.json` +- Windows: `%APPDATA%\Claude\claude_desktop_config.json` + +**Step 2.** Add your server: + +```json +{ + "mcpServers": { + "my-tools": { + "command": "python", + "args": ["/full/path/to/my_server.py"] + } + } +} +``` + +**Step 3.** Restart Claude Desktop. You should see a hammer icon indicating MCP tools are available. Ask Claude: "Convert 37 degrees Celsius to Fahrenheit" and it will call your `convert_temperature` tool. + +That is MCP working end to end: Claude discovers your tools, the LLM decides to call one, your Python function executes, and the result flows back into the conversation. + +## A more realistic server: document search + +The toy server above demonstrates the protocol. Let's build something closer to what you would use in a real agent: a document search tool backed by a local file system. + +```python +import os +from pathlib import Path +from mcp.server.fastmcp import FastMCP + +mcp = FastMCP("Document Search Server") + +DOCS_DIR = Path("./documents") + + +@mcp.tool() +def list_documents() -> str: + """List all available documents.""" + if not DOCS_DIR.exists(): + return "No documents directory found." + files = [f.name for f in DOCS_DIR.iterdir() if f.is_file()] + if not files: + return "No documents found." + return "\n".join(files) + + +@mcp.tool() +def read_document(filename: str) -> str: + """Read the contents of a document by filename.""" + path = DOCS_DIR / filename + # Security: prevent path traversal + if not path.resolve().is_relative_to(DOCS_DIR.resolve()): + return "Error: invalid filename." + if not path.exists(): + return f"Document '{filename}' not found." + return path.read_text()[:5000] # Limit to 5000 chars + + +@mcp.tool() +def search_documents(query: str) -> str: + """Search all documents for a keyword or phrase. Returns matching filenames and excerpts.""" + if not DOCS_DIR.exists(): + return "No documents directory found." + results = [] + query_lower = query.lower() + for f in DOCS_DIR.iterdir(): + if f.is_file(): + content = f.read_text() + if query_lower in content.lower(): + # Find the matching line + for line in content.split("\n"): + if query_lower in line.lower(): + results.append(f"**{f.name}**: {line.strip()[:200]}") + break + if not results: + return f"No documents contain '{query}'." + return "\n\n".join(results) +``` + +Three design decisions worth understanding: + +**Path traversal protection.** The `is_relative_to` check prevents a malicious tool call from reading files outside the documents directory. Without this, a prompt injection could trick the LLM into calling `read_document("../../etc/passwd")`. 
This is not theoretical -- it is one of the most common MCP vulnerabilities. Always validate paths. + +**Content truncation.** The `[:5000]` limit prevents a single document from consuming the LLM's entire context window. In production, you would use chunking and retrieval (Chapter 2) instead of reading full files. + +**Simple search.** The keyword search is intentionally basic. In a production system, you would use vector embeddings and semantic search. But the MCP interface stays the same -- the tool contract does not change when you upgrade the search implementation. + +## How MCP fits with agents you already built + +If you built the agent in Section 0c, you defined tools as Python functions and registered them directly: + +```python +# Section 0c: tools are local functions +tools = [search, calculator, word_count] +agent = Agent(tools=tools) +``` + +With MCP, tools live on a server. The agent discovers them at runtime: + +```python +# With MCP: tools are discovered from a server +async with ClientSession(transport) as session: + await session.initialize() + tools = await session.list_tools() + # These tool schemas go to the LLM -- same format as before +``` + +The agent loop does not change. The LLM still generates tool calls. The difference is where the tool executes: locally (Section 0c) or on an MCP server (this section). From the LLM's perspective, it is identical. It sees the same JSON Schema for parameters and gets the same text results back. + +## When to use MCP (and when not to) + +**Use MCP when:** + +- Your tools are maintained by a different team. MCP gives you a clean interface without importing their code. +- You want to share tools across multiple agents. Build the server once, connect any MCP-compatible agent. +- You want to use third-party tools (database connectors, API wrappers, cloud services) that already have MCP servers published. +- You are using Claude Desktop, VS Code Copilot, or another MCP-compatible host and want to extend it with custom tools. + +**Skip MCP when:** + +- All your tools are local functions in the same codebase. Direct function calls are simpler and faster. +- You are building a prototype and do not need cross-team tool sharing yet. +- You need sub-millisecond tool invocation. MCP adds protocol overhead (~5-20ms for stdio, more for HTTP). + +The guidance from the book: start without MCP. Get your agent logic right with direct function calls. Add MCP when tool integrations multiply or when tools need to be shared across agents. Earn the complexity. + +## What MCP does not solve + +MCP standardizes how agents discover and call tools. It does not solve: + +- **Who is calling?** MCP has no built-in concept of agent identity. Any client that can reach the server can call its tools. +- **What are they allowed to do?** There is no per-agent access control in the base protocol. Every client gets the same tools. +- **Who authorized this action?** If something goes wrong, the logs show a tool was called. They do not show the authorization chain. + +These gaps matter in production. Chapter 13 of the book covers how to address them with governance, access control, and the identity layer (AIP). 
+ +## What to build next + +You now have the building blocks: + +- **Section 0a-0b**: How LLMs and tool calling work +- **Section 0c**: A complete agent from scratch +- **Section 0d**: The same agent with a framework +- **Section 0e**: Connecting to tools via MCP + +From here, the book takes you into the engineering decisions that determine whether your agent survives production: when to use an agent versus a workflow (Chapter 3), how to evaluate and harden it (Chapter 6), when not to use an agent at all (Chapter 7), and how to govern, secure, and scale it (Chapters 10-13). + +## Further reading + +- **[MCP specification](https://modelcontextprotocol.io)** -- The full protocol spec, SDK documentation, and quickstart guides. +- **[MCP Python SDK](https://github.com/modelcontextprotocol/python-sdk)** -- The official Python SDK with `FastMCP` for building servers. +- **[MCP Inspector](https://github.com/modelcontextprotocol/inspector)** -- Browser-based testing tool for MCP servers. +- **[MCP Server Registry](https://registry.modelcontextprotocol.io)** -- Community registry of published MCP servers. diff --git a/src/content/chapters/01-what-agentic-means.mdx b/src/content/chapters/01-what-agentic-means.mdx new file mode 100644 index 0000000..ec261a3 --- /dev/null +++ b/src/content/chapters/01-what-agentic-means.mdx @@ -0,0 +1,252 @@ +--- +title: What "Agentic" Actually Means +part: I-build +description: "What makes a system agentic? Precise definitions of LLM apps, workflows, tool-using systems, agents, and multi-agent systems for engineers." +readingTime: 12 +date: 2026-03-20 +references: + - 00c-first-agent + - 03-workflow-first-agent-second + - 04-multi-agent-without-theater + - fn-001 +patterns: + - workflow-first + - agent-loop +status: published +--- + +import Callout from '~/components/universal/Callout.astro'; + +A team spent four months building what they called an agent. It had tools, a system prompt, and an API endpoint. It processed customer queries, looked up account information, searched a knowledge base, and generated responses. The architecture diagram showed an agent loop with dynamic tool selection. + +When we instrumented it, the data told a different story. The system called the same three tools in the same order on every query. It never skipped a step. It never went back to re-retrieve after seeing the account. It never decided that a different approach was needed. It was a workflow wearing an agent costume. The agent loop added 3x latency, 2.5x token cost, and occasional random skipping behavior where the model decided to drop the account lookup step entirely, producing generic responses for customers with specific billing problems. + +Replacing the agent with a fixed three-step pipeline improved response time, reduced cost, and eliminated the skipping behavior. Four months of engineering, rebuilt in a week, because nobody had asked the question this chapter answers: what kind of system are we actually building? + +The word "agent" is used to describe everything from a chatbot with a system prompt to a multi-model orchestration system that autonomously manages infrastructure. When a term means everything, it means nothing, and engineering decisions based on ambiguous terminology produce ambiguous systems. Teams build agents when they need workflows. They add autonomy to systems that need predictability. Not because they lack skill, but because the vocabulary they are working with does not help them distinguish between fundamentally different system shapes. 
+ +## The five system types + +There are five distinct patterns for building systems that use large language models. They differ in how much autonomy the system has, how decisions are made, and where things break. + +### 1. LLM Application + +An LLM application is a system where a language model provides a single response to a single request. There is no looping, no tool use, and no decision-making by the model beyond generating text. + +Examples: a summarization endpoint, a classification service, a text-to-SQL translator. You send input, you get output, you are done. + +The model's role is generation. The system's control flow is entirely in your code. If the output is bad, you adjust the prompt or the input preprocessing. There is exactly one LLM call per request. + +**Failure surface:** Prompt sensitivity, output format instability, hallucination within the response. These are containable because the blast radius is a single response. + +### 2. Workflow + +A workflow is a deterministic sequence of steps where some steps involve LLM calls. The key property: the control flow is fixed. Step A always happens before Step B. The code decides what happens next, never the model. + +Examples: a RAG pipeline (retrieve, then generate), an extraction pipeline (parse, then extract, then validate), a content pipeline (draft, then review, then format). + +The model's role is execution within each step. Between steps, your code handles routing, error checking, and data transformation. + +**Failure surface:** Each LLM step can fail independently. But because the control flow is fixed, you can test each step in isolation, add retries at specific points, and reason about the system's behavior without guessing what path it took. + +### 3. Tool-using system + +A tool-using system gives the model access to functions it can call. The model decides which tools to call and with what arguments, but within a single turn. There is no multi-step reasoning loop -- the model gets one chance to select and invoke tools. + +Examples: a chatbot with access to a calculator and a search API, a coding assistant that can run tests, a support system that can look up customer records. + +The model's role expands to include action selection. It is no longer just generating text; it is making decisions about what to do. This is a meaningful step up in complexity and failure surface. + +**Failure surface:** Everything from LLM applications, plus: tool argument hallucination (the model invents parameter values), unnecessary tool calls (the model calls tools it does not need), and tool selection errors (the model picks the wrong tool for the task). + +### 4. Agent + +An agent is a system where the model participates in a loop, making decisions at each iteration about what to do next based on what it has observed so far. The defining characteristic is the loop: observe the current state, decide on an action, execute it, observe the result, and repeat. + +This is where autonomy enters the picture. The model is not just executing a fixed plan. It is deciding, at each step, whether to gather more information, call a tool, refine its approach, or produce a final answer. The control flow is no longer fully in your code. + +Examples: a research agent that searches, reads, searches again based on what it found. A document analysis agent that retrieves evidence, decides if it is sufficient, and either answers or refines its query. 
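Here is the shape of that loop in code -- a sketch only; `call_llm` and its decision object are hypothetical placeholders for what Chapter 2 builds properly:

```python
def run_agent(query: str, tools: dict, budget: int = 5) -> str:
    """Skeleton of the observe-think-act loop with a hard iteration budget."""
    messages = [{"role": "user", "content": query}]           # the observation so far
    for _ in range(budget):
        decision = call_llm(messages, tools)                  # think: answer, or pick a tool?
        if decision.tool_call is None:
            return decision.text                              # model chose to answer
        result = tools[decision.tool_call.name](**decision.tool_call.args)   # act
        messages.append({"role": "tool", "content": str(result)})            # observe
    return "Budget exhausted; escalate with the best partial answer."
```

Everything that follows in this chapter -- action space, stop conditions, escalation -- is a constraint layered onto this skeleton.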
+ +**Failure surface:** Everything from tool-using systems, plus: unbounded loops (the agent keeps going without converging), budget exhaustion (the agent runs out of steps before reaching an answer), compounding errors (each step's mistakes feed into the next step's context), and the fundamental unpredictability of having a probabilistic system make control-flow decisions. + +
+ The four components of the agent loop: observe, think, act, and the iteration budget +
Figure 1.1: Agent Anatomy -- the four components of the agent loop
+
+ +### 5. Multi-agent system + +A multi-agent system involves multiple agents that communicate, delegate, or coordinate to accomplish a task. Each agent has its own loop, its own tools, and its own scope of responsibility. + +Examples: a system where a planning agent breaks down a task and delegates subtasks to specialist agents, a peer review system where one agent drafts and another critiques. + +**Failure surface:** Everything from single agents, multiplied by the number of agents, plus: coordination failures (agents work at cross purposes), delegation errors (the wrong agent gets the wrong subtask), communication overhead (agents spend tokens talking to each other rather than solving the problem), and emergent behaviors that no individual agent was designed to produce. + +## Comparison table + +| Property | LLM App | Workflow | Tool-using | Agent | Multi-agent | +|----------|---------|----------|------------|-------|-------------| +| **Autonomy** | None | None | Single-turn | Multi-turn loop | Multi-loop, multi-actor | +| **Decision-making** | None | Code only | Model selects tools | Model controls loop | Models coordinate | +| **Control flow** | Fixed | Fixed | Fixed with model-selected actions | Model-directed | Distributed | +| **LLM calls** | 1 | N (fixed) | 1-2 | Variable | Variable x agents | +| **Failure surface** | Small | Medium | Medium-large | Large | Very large | +| **Testability** | Easy | Easy | Moderate | Hard | Very hard | +| **Cost predictability** | High | High | Moderate | Low | Very low | +| **When to use** | Single transform | Multi-step, predictable | Action selection, single turn | Open-ended, multi-step | Genuinely separable concerns | + +Read this table from left to right as a spectrum of increasing autonomy, increasing capability, and increasing risk. The engineering challenge is to pick the point on this spectrum that is far enough right to solve your problem and no further. + +
+ Spectrum from LLM apps through workflows and tool-using systems to agents and multi-agent systems +
Figure 1.2: System Types Spectrum -- from LLM apps to multi-agent systems
+
+ +## Bounded autonomy + +The concept that makes agents practical in production is bounded autonomy. Unbounded autonomy -- "do whatever you think is best until you are done" -- is not an engineering pattern. It is a hope. And in production, hope is not a strategy. + +Consider what happens without bounds. You deploy an agent that can search documents, call APIs, and generate reports. A user sends a tricky query. The agent searches, finds partial results, searches again with a refined query, finds more partial results, tries a third approach, calls an extraction tool, gets an error, retries, searches yet again. Twenty model calls later, it produces an answer that is no better than what it had after the second call. It burned $0.15 in tokens, took 45 seconds, and the user has already given up and called a human. + +This is not a theoretical failure mode. It is the default behavior of an agent without bounds. Language models do not have an inherent sense of diminishing returns. They will keep working -- keep consuming tokens, keep adding latency -- because their training rewarded being thorough, not being efficient. + +Bounded autonomy means the agent operates within explicit constraints: + +**Iteration budget.** The agent has a maximum number of steps. When the budget is exhausted, it must produce the best answer it has and explain what it could not complete. This is not a limitation -- it is a design requirement. An agent without a budget is a runaway process. In our Document Intelligence Agent, the budget is 5 steps. This was chosen empirically: for this task and this document corpus, additional steps beyond 5 rarely improve the answer. Your number will differ. Choose it with data, not intuition. + +**Action space.** The agent can only use tools that are explicitly registered and validated. It cannot invent new capabilities. The action space is enumerable, which means it is auditable. In our system, the action space is four tools (load, chunk, retrieve, extract) plus the action of producing a final answer. Five possible actions at each step. This is small enough that a human reviewer can understand every possible path the agent might take. + +A larger action space is not always better. Each additional tool increases the probability that the model picks the wrong one. It also increases the surface area for tool-argument hallucination. Design the action space to include exactly what the agent needs and nothing more. + +**Stop conditions.** The agent has explicit criteria for when to stop: confidence above a threshold, all required fields extracted, or budget exhausted. "I think I am done" is not a stop condition. Stop conditions are checked in code, not left to the model's judgment. The model can signal that it wants to stop (by producing text instead of tool calls), but the system verifies that the stopping criteria are met. + +**Escalation policy.** When the agent cannot meet its confidence threshold within its budget, it escalates rather than guesses. This is the difference between a system that fails gracefully and one that fails silently. Escalation means the system says: "I was unable to answer this confidently. Here is what I found, and here is what is missing." This is vastly more useful than a confident wrong answer, which the downstream consumer has no reason to question. + +The interplay between these bounds matters. The budget prevents runaway execution. The action space prevents unexpected behavior. Stop conditions prevent premature or late termination. 
Escalation handles the cases that fall outside the system's capability. Together, they create a bounded region of autonomous operation that is predictable enough to trust and capable enough to be useful. + +These bounds are not restrictions on intelligence. They are the engineering controls that make intelligence useful. A system without bounds is not more capable -- it is less trustworthy. The engineering challenge is calibrating the bounds: tight enough to prevent waste, loose enough to let the agent do its job. + +## What is not an agent + +This distinction matters because misclassifying your system leads to mismatched architecture decisions. + +**A chatbot with a system prompt is not an agent.** It has no loop, no tools, no action selection. It is an LLM application. Calling it an agent inflates expectations and obscures the actual (simple) architecture. + +**A RAG pipeline is not an agent.** Even a sophisticated one with re-ranking and query expansion. If the control flow is fixed -- retrieve, rank, generate -- it is a workflow. It becomes agentic only when the model decides whether to retrieve more, and that decision changes the control flow. + +**A chain of LLM calls is not an agent.** If you call Model A, then Model B, then Model C in a fixed sequence, that is a workflow. The number of LLM calls is not what makes something an agent. The loop and decision-making are what makes something an agent. + +**A model that calls one tool is not an agent.** Single-turn tool use -- the model picks a function and calls it -- is a tool-using system. It becomes an agent when the result of that tool call feeds back into the model for another round of decision-making. + +Why does this matter? Because each pattern has different failure modes, different testing strategies, different cost profiles, and different operational requirements. If you build an agent when you need a workflow, you pay the agent tax -- unpredictable costs, harder testing, more failure modes -- without getting the agent benefit of adaptive reasoning. + +## The Document Intelligence Agent + +Throughout this book, we build one system: a Document Intelligence Agent that ingests documents, answers questions based on their content, and provides citations for its answers. + +This is a good running example because: + +- It is a real, useful task that appears in enterprise settings +- It can be implemented as both a workflow and an agent, letting us compare directly +- It has meaningful failure modes (retrieval gaps, hallucination, citation fabrication) +- It is complex enough to be interesting but contained enough to fit in a book + +Here is the task description: + +> Given a collection of documents, answer user questions by retrieving relevant passages, reasoning over the evidence, and producing an answer with source citations. When the evidence is insufficient, say so explicitly rather than guessing. + +In Chapter 2, we build the components: document loading, chunking, retrieval, context assembly, and a first agent loop. In Chapter 3, we implement the same task as both a deterministic workflow and a bounded agent, and compare them head to head. In Chapter 6, we evaluate and harden both implementations with tracing, reliability engineering, and security safeguards. In Chapter 7, we step back and ask honestly whether this task needed an agent at all. + +The project architecture is documented at `project/doc-intelligence-agent/docs/architecture.md`. 
The known failure surfaces are catalogued at `project/doc-intelligence-agent/docs/failure-analysis.md`. + +## Decision map: which pattern for which problem + +Here is a framework for choosing the right point on the autonomy spectrum. Work from left to right. Stop at the first pattern that solves your problem. Do not skip ahead because a more complex pattern sounds more impressive. Each step adds real cost. + +**Start with an LLM application when:** +- The task is a single input-to-output transformation +- The output format is predictable +- You need high throughput and predictable cost +- Example: classification, summarization, translation + +Most LLM-powered features in production today are -- and should be -- LLM applications. A single well-crafted prompt with structured output constraints handles a remarkable range of tasks. Do not underestimate this pattern because it lacks the word "agent" in its description. + +**Move to a workflow when:** +- The task requires multiple steps +- The steps are known in advance +- Each step's output feeds the next step's input +- You need reproducible, testable pipelines +- Example: RAG, extraction pipelines, content generation with review + +Workflows are the workhorse of LLM-powered systems. They combine the model's generation capability with the engineer's control. The model does the thinking; the code does the routing. This is where most enterprise use cases should land. + +**Move to a tool-using system when:** +- The model needs to take actions (search, compute, query) to answer +- But a single round of tool use is sufficient +- The tools are well-defined and safe +- Example: customer support with database lookup, coding with test execution + +Tool use crosses a threshold: the model is now making decisions about what to do, not just generating text. This is a meaningful increase in both capability and failure surface. The model can pick the wrong tool, hallucinate arguments, or call tools unnecessarily. Each of these needs mitigation (schema validation, logging, monitoring). + +**Move to an agent when:** +- The task requires multi-step reasoning with intermediate decisions +- The optimal path through the task is not known in advance +- The model needs to evaluate its own progress and adapt +- The task justifies the additional cost and complexity +- Example: research tasks, complex document analysis, multi-step planning + +The key phrase is "not known in advance." If you can write an if/else chain that covers all the decision points, you do not need an agent. Agents are for tasks where the decision tree is data-dependent -- where what you should do next depends on what you found in the last step, in ways you cannot enumerate ahead of time. + +**Move to a multi-agent system when:** +- The problem decomposes into genuinely independent subtasks +- Different subtasks require different capabilities or tool sets +- The coordination overhead is justified by the specialization benefit +- Example: large-scale research with synthesis, systems with distinct review/approval stages + +Multi-agent systems are the most complex and least predictable option. Before choosing this pattern, ask: can I achieve the same decomposition with multiple tools inside a single agent? Usually the answer is yes. Multi-agent systems are justified only when the subtasks require truly different system prompts, different model configurations, or parallel execution with independent state. + +At each step to the right, you gain capability and lose predictability. 
The engineering question is always: does the next level of autonomy buy me enough capability to justify what I lose in control? + +For most production systems today, the answer lands on "workflow" or "tool-using system." Agents are appropriate less often than the current discourse suggests. Multi-agent systems are appropriate rarely. This is not a criticism of the technology. It is a recognition that simpler architectures are easier to build, test, operate, and trust. + +## Core concepts: a precise vocabulary + +Before we move to building, a few terms that will recur throughout the book. + +**Action space.** The set of actions available to an agent at any given step. In our system, this is the set of registered tools plus the action of producing a final answer. A smaller action space is almost always better -- it reduces the model's decision burden and the system's failure surface. + +**Observation.** The information available to the agent at the start of a step. This includes the original query, retrieved evidence, tool results from previous steps, and any state the system maintains. What you include in the observation is a design decision with direct impact on quality and cost. + +**Step.** One iteration of the observe-think-act loop. The agent observes the current state, decides on an action, and executes it. Each step consumes tokens and adds latency. Steps are the unit of budget. + +**Budget.** The maximum number of steps an agent is allowed to take. This is not a soft suggestion -- it is a hard limit that determines the cost ceiling and latency ceiling for a single request. + +**Escalation.** The act of an agent declaring that it cannot complete the task within its constraints and handing the task to a human or a different system. Escalation is a feature, not a failure. A system that escalates appropriately is more trustworthy than one that always produces an answer regardless of confidence. + +**Grounding.** The practice of constraining the model to answer only from provided evidence rather than from its training data. Grounding is necessary but not sufficient for accuracy -- a model can be grounded and still misinterpret the evidence. + +**Side effect.** An action that changes the state of the world outside the agent. Reading a file is not a side effect. Writing to a database is. Sending an email is. Side effects require different permission models and different testing strategies than read-only actions. + +## Failure modes at the conceptual level + +Before you write a line of code, understand where each system type fails. These are not edge cases to handle later -- they are the primary engineering challenges. + +**LLM applications** fail on output quality. The model hallucinates, drifts from the desired format, or produces confidently wrong answers. Mitigation: structured output, validation, and acceptance tests. + +**Workflows** fail at step boundaries. One step's output does not match the next step's expectations. The model in step 3 contradicts the model in step 1. Mitigation: typed interfaces between steps, intermediate validation, and end-to-end tests. + +**Tool-using systems** fail on tool selection and argument construction. The model calls the wrong tool, passes invalid arguments, or calls tools unnecessarily. Mitigation: schema validation, clear tool descriptions, and tool-call logging. + +**Agents** fail on decision quality. 
The agent takes a suboptimal path, gets stuck in a loop, wastes its budget on irrelevant tool calls, or fails to recognize when it has enough evidence to answer. Mitigation: bounded autonomy, step-level tracing, and evaluation against diverse test cases. + +**Multi-agent systems** fail on coordination. Agents duplicate work, produce contradictory results, or spend their budgets talking to each other rather than solving the problem. Mitigation: clear role definitions, shared state management, and aggregate evaluation. + +Notice that each level inherits the failure modes of the levels below it and adds new ones. This is the agent tax: every step up in autonomy buys you more capability but also more failure surface. The engineering discipline is to pay this tax only when the capability is worth the cost. + +## What comes next + +You now have a vocabulary and a decision framework. You know what an agent is, what it is not, and when each pattern is appropriate. You understand bounded autonomy -- the principle that makes agents viable in production rather than merely impressive in demos. + +But understanding the taxonomy is not enough to build anything. The next step is concrete: what does an LLM actually need to know and do to become useful inside a system? How do you give it tools, assemble its context, and structure the loop that turns a language model into a component that can reason and act? + +Chapter 2 answers these questions with working code. diff --git a/src/content/chapters/02-tools-context-agent-loop.mdx b/src/content/chapters/02-tools-context-agent-loop.mdx new file mode 100644 index 0000000..69ada18 --- /dev/null +++ b/src/content/chapters/02-tools-context-agent-loop.mdx @@ -0,0 +1,36 @@ +--- +title: Tools, Context, and the Agent Loop +part: I-build +description: "How tool-using agents work: function calling, context window management, the agent loop, and writing a working agent without a framework." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +This chapter gives you the building blocks of a working agent system. You build a tool-using agent from scratch -- not a framework wrapper, but code you understand and can modify. + +## What this chapter covers + +- **An LLM primer for systems engineers** -- what matters for production, not theory +- **Tools as contracts** -- function calling with validation, permissions, and error handling +- **Context engineering** -- system prompts, retrieval context, grounding, injection boundaries +- **The agent loop** -- the observe-think-act cycle and what makes it work or fail +- **Failure modes in this chapter's code** -- what breaks and why +- **Building a working agent** -- a complete implementation without reaching for a framework + +## Code companion + +The working code for this chapter is in [`src/ch02/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch02): + +- `tool_registry.py` -- Tool registry implementation +- `context.py` -- Context pipeline +- `agent.py` -- First working agent +- `tools/` -- Document chunking, loading, extraction, and retrieval + +Run it: `make run` + +## Get the full chapter + +The complete chapter text is available in the book. 
+ +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/03-workflow-first-agent-second.mdx b/src/content/chapters/03-workflow-first-agent-second.mdx new file mode 100644 index 0000000..e556008 --- /dev/null +++ b/src/content/chapters/03-workflow-first-agent-second.mdx @@ -0,0 +1,34 @@ +--- +title: Workflow First, Agent Second +part: I-build +description: "Why you should reach for deterministic workflows before agents. Decision criteria, architecture comparison, and the engineering instinct that saves production systems." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Every time a team builds an agent, they should first ask: would a workflow do the job? This chapter makes the comparison concrete by implementing the same task both ways and measuring the differences. + +## What this chapter covers + +- **The deterministic workflow** -- building a reliable pipeline for document question answering +- **The bounded agent** -- the same task with LLM-driven decision-making +- **Same task, two ways: the comparison** -- accuracy, latency, cost, and debuggability metrics +- **Design guidance: when to choose which** -- decision criteria for workflows vs agents +- **Planning and single-agent design** -- state management and bounded autonomy +- **Failure modes specific to this chapter** -- what breaks when you choose wrong + +## Code companion + +The working code for this chapter is in [`src/ch03/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch03): + +- `workflow.py` -- Deterministic workflow implementation +- `agent.py` -- Bounded agent implementation +- `state.py` -- State management +- `compare.py` -- Side-by-side comparison runner + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/04-multi-agent-without-theater.mdx b/src/content/chapters/04-multi-agent-without-theater.mdx new file mode 100644 index 0000000..e8c9cb6 --- /dev/null +++ b/src/content/chapters/04-multi-agent-without-theater.mdx @@ -0,0 +1,36 @@ +--- +title: Multi-Agent Systems Without Theater +part: I-build +description: "When multi-agent systems are justified and when they are complexity theater. Patterns, coordination overhead, and honest cost analysis." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Multi-agent systems are where the gap between demo and production is widest. This chapter draws the line between coordination patterns that solve real problems and complexity theater that looks impressive but fails at 2 AM. 
+ +## What this chapter covers + +- **When multi-agent is justified** -- the specific conditions where a single agent cannot hold the task +- **When multi-agent is theater** -- recognizing unnecessary decomposition +- **Core patterns** -- orchestrator, message contracts, and agent specialization +- **The working example** -- retriever, reasoner, and verifier on the document intelligence task +- **The comparison** -- single-agent vs multi-agent on accuracy, cost, and latency +- **Coordination overhead and cost explosion** -- the real price of multi-agent +- **Failure modes** -- what goes wrong when agents coordinate + +## Code companion + +The working code for this chapter is in [`src/ch04_multiagent/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch04_multiagent): + +- `contracts.py` -- Agent message contracts +- `agents.py` -- Specialized agents (retriever, reasoner, verifier) +- `orchestrator.py` -- Multi-agent orchestration +- `compare.py` -- Single vs multi-agent comparison +- `run.py` -- Executable example + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/05-human-in-the-loop.mdx b/src/content/chapters/05-human-in-the-loop.mdx new file mode 100644 index 0000000..06b023c --- /dev/null +++ b/src/content/chapters/05-human-in-the-loop.mdx @@ -0,0 +1,34 @@ +--- +title: Human-in-the-Loop as Architecture +part: II-judge +description: "Human-in-the-loop as a first-class architectural decision. Approval gates, escalation policies, audit logging, and the Incident Runbook Agent." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +HITL done poorly is worse than no human at all. It creates the appearance of oversight without the substance. This chapter treats human-in-the-loop as a first-class architectural concern, not an afterthought. + +## What this chapter covers + +- **The three primitives** -- approval gates, escalation policies, audit trails +- **Design guidance** -- structuring when and how humans intervene +- **The working example: Incident Runbook Agent** -- multi-agent with human approval before remediation +- **When HITL is security theater** -- rubber-stamping vs real oversight +- **Cost of HITL vs value of HITL** -- the tradeoff analysis most teams skip +- **Building for auditability** -- decision trails useful for debugging, compliance, and improvement +- **Failure modes** -- what breaks in human-agent interaction + +## Code companion + +The working code for this chapter is in [`src/ch05_hitl/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch05_hitl): + +- `approval.py` -- Approval gate implementation +- `escalation.py` -- Escalation policy engine +- `audit.py` -- Audit logging system + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/06-evaluating-and-hardening.mdx b/src/content/chapters/06-evaluating-and-hardening.mdx new file mode 100644 index 0000000..0b75101 --- /dev/null +++ b/src/content/chapters/06-evaluating-and-hardening.mdx @@ -0,0 +1,36 @@ +--- +title: Evaluating and Hardening Agents +part: II-judge +description: "How to evaluate and harden production agent systems. Metrics, adversarial testing, regression suites, and the difference between opinions and evidence." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +You have built an agent. You have opinions about whether it works. 
But opinions are not evidence, and in production, only evidence counts. This chapter covers the five layers that turn a prototype into a production system. + +## What this chapter covers + +- **Evaluation** -- gold datasets, rubric scoring, failure bucketing, and regression suites +- **Observability** -- structured traces, token accounting, latency decomposition +- **Reliability** -- retries, checkpointing, crash recovery, graceful degradation +- **Cost management** -- token profiling, budget controls, architecture-level cost optimization +- **Security** -- prompt injection, tool abuse, data exfiltration, least privilege enforcement +- **Before and after hardening** -- concrete metrics showing the difference +- **Failure modes in this chapter's code** -- what breaks during hardening + +## Code companion + +The working code for this chapter is in [`src/ch06/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch06): + +- `eval_harness.py` -- Gold dataset and rubric scoring +- `tracer.py` -- Structured tracing +- `reliability.py` -- Retry and recovery patterns +- `cost_profiler.py` -- Token and cost tracking +- `security.py` -- Prompt injection and tool abuse detection + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/07-when-not-to-use-agents.mdx b/src/content/chapters/07-when-not-to-use-agents.mdx new file mode 100644 index 0000000..681f6e5 --- /dev/null +++ b/src/content/chapters/07-when-not-to-use-agents.mdx @@ -0,0 +1,36 @@ +--- +title: When Not to Use Agents +part: II-judge +description: "The most important chapter: when not to use agents. Decision framework, anti-patterns, and honest retrospective on simpler alternatives." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +This is the chapter most books on agents would not include. The most valuable skill an engineer can develop is knowing when not to use a powerful tool. + +This chapter is about recognizing the specific conditions under which autonomy adds value, and the much broader set of conditions under which it does not. It is about building the judgment that turns a capable engineer into a trustworthy one. + +## What this chapter covers + +- **The agent tax** -- the real cost of autonomy in latency, money, complexity, and debugging time +- **Alternatives that are usually sufficient** -- when workflows, rules, retrieval, or static pipelines win +- **A decision framework** -- a structured approach to the build-or-not decision +- **Systems that should not have been agents** -- real examples of over-engineering +- **Three case studies you should memorize** -- patterns that repeat across teams and industries +- **Multi-agent anti-patterns** -- the most common ways teams add unnecessary coordination +- **An honest retrospective on the Document Intelligence Agent** -- what actually needed agent autonomy +- **Principles for the decision** -- six principles that hold across use cases + +## Code companion + +This chapter is primarily conceptual, but references the comparison data from earlier chapters: + +- Architecture Comparison -- workflow vs agent metrics +- Failure Case Studies -- where agents failed and simpler approaches succeeded + +## Get the full chapter + +The complete chapter text is available in the book. 
+ +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/08-metacognition.mdx b/src/content/chapters/08-metacognition.mdx new file mode 100644 index 0000000..4f33533 --- /dev/null +++ b/src/content/chapters/08-metacognition.mdx @@ -0,0 +1,16 @@ +--- +title: Metacognition and Self-Reflection +part: III-operate +description: "Production patterns for agent metacognition: loop detection, quality assessment, cost-aware planning, and sycophancy defense." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Loop detection, quality assessment, cost-aware planning, sycophancy defense. + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/09-deployment.mdx b/src/content/chapters/09-deployment.mdx new file mode 100644 index 0000000..2862d9f --- /dev/null +++ b/src/content/chapters/09-deployment.mdx @@ -0,0 +1,16 @@ +--- +title: Deploying and Scaling +part: III-operate +description: "Deploying and scaling agent systems: durable execution, observability, autoscaling, and real production failures." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Durable execution, observability, autoscaling, real failures. + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/10-governance.mdx b/src/content/chapters/10-governance.mdx new file mode 100644 index 0000000..76c3d47 --- /dev/null +++ b/src/content/chapters/10-governance.mdx @@ -0,0 +1,16 @@ +--- +title: Governance and Auditability +part: III-operate +description: "Agent governance and auditability: decision traces, compliance boundaries, risk tiers, and policy engines." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Decision traces, compliance boundaries, risk tiers, policy engines. + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/11-security.mdx b/src/content/chapters/11-security.mdx new file mode 100644 index 0000000..f3e9946 --- /dev/null +++ b/src/content/chapters/11-security.mdx @@ -0,0 +1,16 @@ +--- +title: Security Deep Dive +part: III-operate +description: "Security deep dive for agent systems: the Lethal Trifecta, defense in depth, MCP security, and red teaming." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +The Lethal Trifecta, defense in depth, MCP security, red teaming. + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/12-memory-management.mdx b/src/content/chapters/12-memory-management.mdx new file mode 100644 index 0000000..ae55fc5 --- /dev/null +++ b/src/content/chapters/12-memory-management.mdx @@ -0,0 +1,33 @@ +--- +title: Memory Management +part: IV-advanced +description: "Memory management for production agents: session memory, long-term learning, multi-agent coordination, and memory security." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +Every agent we build in this book forgets. The Document Agent answers a question and starts fresh. The Incident Runbook Agent resolves an incident and loses the context. In production, this amnesia is the difference between a demo and a product. 
+ +## What this chapter covers + +- **Session memory** -- surviving the context window with importance-weighted truncation and compaction +- **Long-term memory** -- learning from corrections, avoiding repeated mistakes, two-pass retrieval +- **Shared memory** -- multi-agent coordination with scoped state stores and optimistic concurrency +- **Production frameworks** -- Mem0, Zep, and Letta compared with current benchmarks +- **Memory security** -- poisoning attacks, validation, and the GDPR compliance tension +- **Learned forgetting** -- when and how to retire memories that no longer serve + +## Code companion + +The working code for this chapter is in [`src/ch12_memory/`](https://github.com/sunilp/agentic-ai/tree/main/src/ch12_memory): + +- `session_memory.py` -- Context management with importance scoring +- `long_term_memory.py` -- Persistent memory with worthiness filtering +- `shared_memory.py` -- Multi-agent state coordination + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/chapters/13-agent-protocols-in-production.mdx b/src/content/chapters/13-agent-protocols-in-production.mdx new file mode 100644 index 0000000..5463ad4 --- /dev/null +++ b/src/content/chapters/13-agent-protocols-in-production.mdx @@ -0,0 +1,26 @@ +--- +title: Agent Protocols in Production +part: IV-advanced +description: "MCP, A2A, and AIP in production: building enterprise MCP servers, cross-boundary agent collaboration, identity and delegation chains, and production pitfalls." +readingTime: 2 +date: 2026-03-25 +status: draft +--- + +The compliance officer asked three questions: "Who authorized this action? What was the delegation chain? Can we reconstruct it after an incident?" The architect said: "We log everything." The compliance officer said: "Logging what happened is not the same as proving who was authorized to make it happen." + +## What this chapter covers + +- **Building production MCP servers** -- thin wrappers, gateways, catalogs, enterprise auth +- **A2A across team boundaries** -- Agent Cards, task lifecycle, multi-turn negotiation +- **The identity gap** -- why MCP and A2A solve connectivity but not accountability +- **AIP deep dive** -- delegation chains, scope attenuation, the MCP auth proxy +- **Complex use cases** -- financial services, healthcare, supply chain +- **Production pitfalls** -- server sprawl, shadow connections, delegation debugging +- **The adoption sequence** -- when to add each protocol layer + +## Get the full chapter + +The complete chapter text is available in the book. + +[Get the book on Amazon](https://www.amazon.com/dp/B0GVG6848F) diff --git a/src/content/patterns/agent-loop.mdx b/src/content/patterns/agent-loop.mdx new file mode 100644 index 0000000..2a0c336 --- /dev/null +++ b/src/content/patterns/agent-loop.mdx @@ -0,0 +1,9 @@ +--- +slug: agent-loop +name: The Agent Loop +oneLine: Observe → Think → Act → Repeat. The simplest possible agent is this loop. +whenToUse: | + When you need an agent at all. The loop is the abstraction; everything else (tools, memory, multi-agent topologies) is layered on top. +whenNotToUse: | + When a single LLM call answers the question. The loop adds latency and cost; only pay it when the task genuinely requires iteration. 
diff --git a/src/content/patterns/approval-gate.mdx b/src/content/patterns/approval-gate.mdx
new file mode 100644
index 0000000..e7ea2ed
--- /dev/null
+++ b/src/content/patterns/approval-gate.mdx
@@ -0,0 +1,11 @@
+---
+slug: approval-gate
+name: Approval Gate
+oneLine: Side-effecting actions require explicit human approval before execution; non-side-effecting actions don't.
+whenToUse: |
+ When the agent has access to actions that cost money, modify external state, or send messages. Define the policy at design time, not after the legal team complains. Approval should be cheap for routine cases and require deliberation for risky ones.
+whenNotToUse: |
+ For read-only operations. Approval-everything systems breed reviewer fatigue and produce rubber-stamp approvals.
+antiPattern: |
+ No approval gates at all. Every agent failure becomes a customer-visible incident instead of a caught rejection.
+---
diff --git a/src/content/patterns/cold-start-mitigation.mdx b/src/content/patterns/cold-start-mitigation.mdx
new file mode 100644
index 0000000..6ba0268
--- /dev/null
+++ b/src/content/patterns/cold-start-mitigation.mdx
@@ -0,0 +1,9 @@
+---
+slug: cold-start-mitigation
+name: Cold Start Mitigation
+oneLine: Serverless agent runtimes have a 1-3 second cold start; design around it.
+whenToUse: |
+ Production deployments where p99 latency matters. Strategies: keep-warm pings every N minutes, provisioned concurrency for predictable load, or fallback to a smaller or cached model on cold-start detection.
+whenNotToUse: |
+ When the workload is batched and latency-insensitive. Keep-warm and provisioned-concurrency strategies add standing cost; only pay it when latency is the bottleneck.
+---
diff --git a/src/content/patterns/earn-the-complexity.mdx b/src/content/patterns/earn-the-complexity.mdx
new file mode 100644
index 0000000..e854c73
--- /dev/null
+++ b/src/content/patterns/earn-the-complexity.mdx
@@ -0,0 +1,11 @@
+---
+slug: earn-the-complexity
+name: Earn the Complexity
+oneLine: Every framework, tool, or model parameter you add must justify itself with measurable improvement.
+whenToUse: |
+ Always. Each new abstraction (multi-agent, RAG, memory, framework) should be tested against a simpler baseline. If the eval scores don't move, the added complexity isn't worth it.
+whenNotToUse: |
+ Never. This is the spine of the book.
+antiPattern: |
+ Adopting multi-agent because the framework supports it, not because the task requires it. See Lab L-001 — router beat multi-agent on every measured axis for short-horizon support queries.
+---
diff --git a/src/content/patterns/escalation-path.mdx b/src/content/patterns/escalation-path.mdx
new file mode 100644
index 0000000..94928fb
--- /dev/null
+++ b/src/content/patterns/escalation-path.mdx
@@ -0,0 +1,9 @@
+---
+slug: escalation-path
+name: Escalation Path
+oneLine: When the agent is uncertain or out of its competence range, it escalates to a human or to a different agent — explicitly.
+whenToUse: |
+ Always. Every agent system needs at least one escalation path: "if confidence < X, stop and surface to human." Without it, the agent fabricates with high confidence in cases it should have refused.
+whenNotToUse: |
+ When the system is read-only and low-stakes. Even then, "no answer" is usually a better fallback than "hallucinated answer."
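+---
+
+A minimal sketch of a confidence-gated hand-off. The threshold value and the `Answer` and `escalate_to_human` names are illustrative assumptions, not code from the book:
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class Answer:
+    text: str
+    confidence: float  # 0.0-1.0, from the agent's own self-assessment
+
+def escalate_to_human(answer: Answer) -> str:
+    # In a real system this opens a ticket or pings a reviewer queue.
+    return f"Escalated for review (confidence {answer.confidence:.2f})."
+
+def deliver(answer: Answer, threshold: float = 0.7) -> str:
+    """Below the threshold, hand off explicitly instead of guessing."""
+    if answer.confidence < threshold:
+        return escalate_to_human(answer)
+    return answer.text
+```
+
+The exact threshold matters less than the fact that the refusal path exists and gets exercised.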
+--- diff --git a/src/content/patterns/eval-loop.mdx b/src/content/patterns/eval-loop.mdx new file mode 100644 index 0000000..3b95c74 --- /dev/null +++ b/src/content/patterns/eval-loop.mdx @@ -0,0 +1,11 @@ +--- +slug: eval-loop +name: Eval Loop +oneLine: Build the rubric and the gold dataset before you build the agent. +whenToUse: | + Always. There is no agent project where this is the wrong pattern. The rubric defines what "good" means; the gold dataset is the test bench; the agent is iterated against both. +whenNotToUse: | + When the rubric isn't built yet. (That's the signal to stop and build the rubric first.) +antiPattern: | + Building the agent first, then evaluating it only against the demo queries you already know it handles well. That's not evaluation, it's confirmation. +--- diff --git a/src/content/patterns/failure-buckets.mdx b/src/content/patterns/failure-buckets.mdx new file mode 100644 index 0000000..6df06c2 --- /dev/null +++ b/src/content/patterns/failure-buckets.mdx @@ -0,0 +1,11 @@ +--- +slug: failure-buckets +name: Failure Buckets +oneLine: An eval rubric needs categorical failure modes, not just a single score. +whenToUse: | + When you need to diagnose what's failing, not just whether it's failing. Bucket failures into specific categories (e.g. "wrong tool selected", "tool output ignored", "fabricated citation"). A single score tells you the system is broken; buckets tell you what to fix. +whenNotToUse: | + When the eval is purely a pass/fail gate with no diagnostic intent. (But that's almost never the right eval.) +antiPattern: | + Eval rubric that just reports a score (0.68) without telling you which failure mode is dominant. You can't improve what you don't differentiate. +--- diff --git a/src/content/patterns/hub-and-spoke.mdx b/src/content/patterns/hub-and-spoke.mdx new file mode 100644 index 0000000..e81c7e4 --- /dev/null +++ b/src/content/patterns/hub-and-spoke.mdx @@ -0,0 +1,11 @@ +--- +slug: hub-and-spoke +name: Hub and Spoke +oneLine: One orchestrator agent owns the plan; worker agents have narrow, well-typed jobs. +whenToUse: | + When a task genuinely requires decomposition into specialized sub-tasks (e.g. retrieve, then summarize, then critique). When the orchestrator can reliably classify the sub-task category. When each worker can fail in isolation without poisoning the others. +whenNotToUse: | + When the orchestrator's classification step is itself the bottleneck (mis-routing kills accuracy). When the sub-tasks can run as a deterministic pipeline (use Workflow First instead). When you have fewer than 3 worker types. +antiPattern: | + Three identical agents talking to each other in a loop with no protocol, hoping emergent coordination produces a good answer. The DeepMind 17x error trap shows what this becomes. +--- diff --git a/src/content/patterns/tool-registry.mdx b/src/content/patterns/tool-registry.mdx new file mode 100644 index 0000000..d491097 --- /dev/null +++ b/src/content/patterns/tool-registry.mdx @@ -0,0 +1,9 @@ +--- +slug: tool-registry +name: Tool Registry +oneLine: All tools available to the agent are declared in one schema-validated registry with explicit input/output types. +whenToUse: | + Every agent that calls more than one tool. The registry is the source of truth for what tools exist, what they accept, and what they return. Schema validation rejects malformed tool calls at the boundary instead of mid-flight. +whenNotToUse: | + When the agent calls only one tool. 
The overhead of a registry isn't worth it for trivial cases — just call the tool directly. +--- diff --git a/src/content/patterns/trace-the-truth.mdx b/src/content/patterns/trace-the-truth.mdx new file mode 100644 index 0000000..741f06d --- /dev/null +++ b/src/content/patterns/trace-the-truth.mdx @@ -0,0 +1,9 @@ +--- +slug: trace-the-truth +name: The Trace Is The Truth +oneLine: If you can't inspect why the agent did what it did, you can't debug, audit, or improve it. +whenToUse: | + Always. Tracing is a first-class system requirement, not a nice-to-have. Every tool call, every reasoning step, every retry should be inspectable in a structured log. +whenNotToUse: | + Never. The only valid reason to skip tracing is "prototype, will throw away" — and prototypes that survive contact with users always become production. +--- diff --git a/src/content/patterns/verifier-loop.mdx b/src/content/patterns/verifier-loop.mdx new file mode 100644 index 0000000..17b6455 --- /dev/null +++ b/src/content/patterns/verifier-loop.mdx @@ -0,0 +1,11 @@ +--- +slug: verifier-loop +name: Verifier Loop +oneLine: Every multi-agent step is checked by a deterministic verifier against the original task before the next step runs. +whenToUse: | + When the multi-agent system is producing compounding errors. When you need to bound error amplification (the DeepMind study showed verifier loops keep amplification under 2x vs 17x without). When each step's output can be rubric-graded against a known criterion. +whenNotToUse: | + When the verifier itself is an LLM call with no rubric — that's not verification, it's rubber-stamping. When the task is single-step (verifier adds cost with no upside). +antiPattern: | + A verifier agent that says "looks good to me" to 90% of inputs because no explicit verification criteria were given. Lab L-001 (multi-agent vs router) saw this — first eval pass scored 81% pre-fix, dropped to 74% after the verifier was made strict. +--- diff --git a/src/content/patterns/workflow-first.mdx b/src/content/patterns/workflow-first.mdx new file mode 100644 index 0000000..75a0cc2 --- /dev/null +++ b/src/content/patterns/workflow-first.mdx @@ -0,0 +1,11 @@ +--- +slug: workflow-first +name: Workflow First +oneLine: Default to a deterministic workflow before reaching for an agent. +whenToUse: | + When the steps a task requires can be listed in advance. When the failure cost is asymmetric (a wrong answer costs more than a missed one). When a workflow can hit 80%+ accuracy on the eval before any LLM autonomy is added. In these cases, a workflow is faster, cheaper, more predictable, and easier to debug. +whenNotToUse: | + When the task requires runtime decisions that cannot be enumerated — open-ended troubleshooting, dynamic multi-step research, novel synthesis tasks. When the workflow's branching factor explodes beyond ~20 branches. +antiPattern: | + Building an agent because the framework is fashionable, then discovering the agent is calling the same 3 tools in the same order on every query. That's a workflow wearing an agent costume; the autonomy is an illusion. 
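+---
+
+A minimal sketch of the default: a fixed pipeline with one model call per step and no runtime planning. The step names and the `call_llm` parameter are placeholders for illustration, not this repo's code:
+
+```python
+def triage_ticket(ticket: str, call_llm) -> dict:
+    """A deterministic three-step workflow: classify, draft, check."""
+    category = call_llm(f"Classify this support ticket as billing, bug, or other:\n{ticket}")
+    draft = call_llm(f"Draft a reply for a {category} ticket:\n{ticket}")
+    verdict = call_llm(f"Does this reply answer the ticket? Answer yes or no.\nTicket: {ticket}\nReply: {draft}")
+    return {"category": category, "reply": draft, "needs_review": verdict.strip().lower() != "yes"}
+```
+
+Every step was known in advance, so nothing here had to plan its own route; when that stops being true, reach for the agent loop.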
+--- diff --git a/src/layouts/ChapterLayout.astro b/src/layouts/ChapterLayout.astro new file mode 100644 index 0000000..56b94ec --- /dev/null +++ b/src/layouts/ChapterLayout.astro @@ -0,0 +1,247 @@ +--- +import type { CollectionEntry } from 'astro:content'; +import { getCollection } from 'astro:content'; +import PageLayout from './PageLayout.astro'; +import Container from '~/components/layout/Container.astro'; +import Split from '~/components/layout/Split.astro'; +import MetaStrip from '~/components/universal/MetaStrip.astro'; +import KineticHeading from '~/components/universal/KineticHeading.astro'; +import Dek from '~/components/universal/Dek.astro'; +import Tag from '~/components/universal/Tag.astro'; +import Breadcrumb from '~/components/universal/Breadcrumb.astro'; +import ProgressReader from '~/components/universal/ProgressReader.astro'; +import ChapterMap from '~/components/universal/ChapterMap.astro'; +import NextPrev from '~/components/universal/NextPrev.astro'; +import { buildReverseIndex, getReverseLinks } from '~/lib/cross-links'; +import { entriesToContentEntries } from '~/lib/content-helpers'; + +interface Props { + entry: CollectionEntry<'chapters'>; +} + +const { entry } = Astro.props; +const d = entry.data; + +const [chapters, fieldNotes, recipes, labs, evidence, projects, patterns] = await Promise.all([ + getCollection('chapters'), + getCollection('fieldNotes'), + getCollection('recipes'), + getCollection('labs'), + getCollection('evidence'), + getCollection('projects'), + getCollection('patterns'), +]); +const allEntries = [...chapters, ...fieldNotes, ...recipes, ...labs, ...evidence, ...projects, ...patterns]; +const reverseIndex = buildReverseIndex(entriesToContentEntries(allEntries as any)); +const links = getReverseLinks(reverseIndex, entry.id); + +// Find prev/next by filename sort order. +const sortedChapters = [...chapters].sort((a, b) => a.id.localeCompare(b.id)); +const currentIdx = sortedChapters.findIndex((c) => c.id === entry.id); +const prev = currentIdx > 0 ? sortedChapters[currentIdx - 1] : null; +const next = currentIdx < sortedChapters.length - 1 ? sortedChapters[currentIdx + 1] : null; + +const partLabels: Record = { + 'foundations': 'Foundations', + 'I-build': 'Part I · Build', + 'II-judge': 'Part II · Judge', + 'III-operate': 'Part III · Operate', + 'IV-advanced': 'Part IV · Advanced', +}; +const partLabel = partLabels[d.part] ?? d.part; + +const chapterPrefix = entry.id.split('-')[0].toUpperCase(); +const dateStr = d.date.toISOString().slice(0, 10); +const metaItems = [chapterPrefix, dateStr, `${d.readingTime} min`]; + +const collectionRoute: Record = { + chapters: '/agentic-ai/book', + fieldNotes: '/agentic-ai/field-notes', + recipes: '/agentic-ai/recipes', + projects: '/agentic-ai/projects', + evidence: '/agentic-ai/evidence', + labs: '/agentic-ai/labs', + patterns: '/agentic-ai/patterns', +}; +--- + + + + + + + + +
+ {partLabel} + + {d.title} + {d.description} + +
+ +
+ + + + {(links.referencedBy.length > 0 || links.citedBy.length > 0) && ( +
+

Referenced by

+ +
+ )} +
+ + +
+
+
+ + diff --git a/src/layouts/PatternLayout.astro b/src/layouts/PatternLayout.astro new file mode 100644 index 0000000..1e45cdd --- /dev/null +++ b/src/layouts/PatternLayout.astro @@ -0,0 +1,153 @@ +--- +import type { CollectionEntry } from 'astro:content'; +import { getCollection } from 'astro:content'; +import PageLayout from './PageLayout.astro'; +import Container from '~/components/layout/Container.astro'; +import Reader from '~/components/layout/Reader.astro'; +import KineticHeading from '~/components/universal/KineticHeading.astro'; +import Dek from '~/components/universal/Dek.astro'; +import Tag from '~/components/universal/Tag.astro'; +import Callout from '~/components/universal/Callout.astro'; +import { buildReverseIndex, getReverseLinks } from '~/lib/cross-links'; +import { entriesToContentEntries } from '~/lib/content-helpers'; + +interface Props { + entry: CollectionEntry<'patterns'>; +} + +const { entry } = Astro.props; +const d = entry.data; + +const [chapters, fieldNotes, recipes, labs, evidence, projects, patterns] = await Promise.all([ + getCollection('chapters'), + getCollection('fieldNotes'), + getCollection('recipes'), + getCollection('labs'), + getCollection('evidence'), + getCollection('projects'), + getCollection('patterns'), +]); +const allEntries = [...chapters, ...fieldNotes, ...recipes, ...labs, ...evidence, ...projects, ...patterns]; +const reverseIndex = buildReverseIndex(entriesToContentEntries(allEntries as any)); +// Patterns are stored in the reverse index by entry.id (the filename), +// since the patterns schema has no `data.id` field. The mapper in +// entriesToContentEntries falls back to entry.id when data.id is absent. +const links = getReverseLinks(reverseIndex, entry.id); + +const collectionRoute: Record = { + chapters: '/agentic-ai/book', + fieldNotes: '/agentic-ai/field-notes', + recipes: '/agentic-ai/recipes', + projects: '/agentic-ai/projects', + evidence: '/agentic-ai/evidence', + labs: '/agentic-ai/labs', + patterns: '/agentic-ai/patterns', +}; +--- + + + + + Pattern + {d.name} + {d.oneLine} + + {d.diagram && ( +
+ {`${d.name} +
+ )} + +

When to use

+
+ +

When not to use

+
+ + {d.antiPattern && ( + +
+ + )} + +
+ +
+ + {links.mentionedIn.length > 0 && ( +
+

Used in

+ +
+ )} + + + + + diff --git a/src/pages/book/[...slug].astro b/src/pages/book/[...slug].astro new file mode 100644 index 0000000..82fdc7f --- /dev/null +++ b/src/pages/book/[...slug].astro @@ -0,0 +1,23 @@ +--- +import { getCollection, render, type CollectionEntry } from 'astro:content'; +import ChapterLayout from '~/layouts/ChapterLayout.astro'; + +export async function getStaticPaths() { + const entries = await getCollection('chapters'); + return entries.map((entry) => ({ + params: { slug: entry.id }, + props: { entry }, + })); +} + +interface Props { + entry: CollectionEntry<'chapters'>; +} + +const { entry } = Astro.props; +const { Content } = await render(entry); +--- + + + + diff --git a/src/pages/book/index.astro b/src/pages/book/index.astro new file mode 100644 index 0000000..9b00b99 --- /dev/null +++ b/src/pages/book/index.astro @@ -0,0 +1,158 @@ +--- +import { getCollection } from 'astro:content'; +import PageLayout from '~/layouts/PageLayout.astro'; +import Container from '~/components/layout/Container.astro'; +import KineticHeading from '~/components/universal/KineticHeading.astro'; +import Dek from '~/components/universal/Dek.astro'; +import Tag from '~/components/universal/Tag.astro'; + +const chapters = (await getCollection('chapters')).sort((a, b) => a.id.localeCompare(b.id)); + +const parts = [ + { + id: 'foundations', + label: 'Foundations', + description: 'Five hands-on sections that take you from zero to building your first agent and connecting it to tools via MCP.', + filter: (id: string) => id.startsWith('00'), + }, + { + id: 'part-1', + label: 'Part I · Build', + description: 'From components to multi-agent systems.', + filter: (id: string) => /^0[1-4]-/.test(id), + }, + { + id: 'part-2', + label: 'Part II · Judge', + description: 'Oversight, evaluation, and knowing when to stop.', + filter: (id: string) => /^0[5-7]-/.test(id), + }, + { + id: 'part-3', + label: 'Part III · Operate', + description: 'Production reality.', + filter: (id: string) => /^(08|09|10|11)-/.test(id), + }, + { + id: 'part-4', + label: 'Part IV · Advanced', + description: 'Memory and protocols at scale.', + filter: (id: string) => /^(12|13)-/.test(id), + }, +]; + +function chapterPrefix(slug: string): string { + const m = slug.match(/^(\d+[a-z]?)/); + return m ? m[1] : slug; +} +--- + + + +
+ Book + The Book + A field guide to building reliable, evaluable, production-grade agent systems. +
+ + {parts.map((part) => { + const partChapters = chapters.filter((c) => part.filter(c.id)); + return ( +
+
+

{part.label}

+

{part.description}

+
+
    + {partChapters.map((c) => ( +
+ + {chapterPrefix(c.id).toUpperCase()} + {c.data.title} + {c.data.readingTime} min + +
+ ))} +
+
+ ); + })} +
+
+ + diff --git a/src/pages/patterns/[...slug].astro b/src/pages/patterns/[...slug].astro new file mode 100644 index 0000000..9430a2b --- /dev/null +++ b/src/pages/patterns/[...slug].astro @@ -0,0 +1,23 @@ +--- +import { getCollection, render, type CollectionEntry } from 'astro:content'; +import PatternLayout from '~/layouts/PatternLayout.astro'; + +export async function getStaticPaths() { + const entries = await getCollection('patterns'); + return entries.map((entry) => ({ + params: { slug: entry.data.slug }, + props: { entry }, + })); +} + +interface Props { + entry: CollectionEntry<'patterns'>; +} + +const { entry } = Astro.props; +const { Content } = await render(entry); +--- + + + + diff --git a/src/pages/patterns/index.astro b/src/pages/patterns/index.astro new file mode 100644 index 0000000..509343e --- /dev/null +++ b/src/pages/patterns/index.astro @@ -0,0 +1,71 @@ +--- +import { getCollection } from 'astro:content'; +import PageLayout from '~/layouts/PageLayout.astro'; +import Container from '~/components/layout/Container.astro'; +import MagazineGrid from '~/components/layout/MagazineGrid.astro'; +import KineticHeading from '~/components/universal/KineticHeading.astro'; +import Dek from '~/components/universal/Dek.astro'; +import Tag from '~/components/universal/Tag.astro'; + +const entries = (await getCollection('patterns')).sort((a, b) => a.data.name.localeCompare(b.data.name)); +--- + + + +
+ Patterns + Patterns + Cross-cutting patterns across the book. Each pattern hub links every chapter, Field Note, Recipe, and Lab that uses it. +
+ + {entries.length === 0 ? ( +

No patterns yet.

+ ) : ( + + {entries.map((e) => ( + +
{e.data.name}
+

{e.data.oneLine}

+
+ ))} +
+ )} +
+
+ + diff --git a/src/pages/start/index.astro b/src/pages/start/index.astro new file mode 100644 index 0000000..a155876 --- /dev/null +++ b/src/pages/start/index.astro @@ -0,0 +1,141 @@ +--- +import PageLayout from '~/layouts/PageLayout.astro'; +import Container from '~/components/layout/Container.astro'; +import KineticHeading from '~/components/universal/KineticHeading.astro'; +import Dek from '~/components/universal/Dek.astro'; +import Tag from '~/components/universal/Tag.astro'; + +const tracks = [ + { + label: 'New to agentic AI', + description: 'You\'ve seen the hype. You want to know what\'s real and how to think about it.', + next: 'Start with the Foundations.', + cta: 'Foundations', + href: '/agentic-ai/book/00a-how-llms-work/', + chapters: ['00a How LLMs Work', '00b API to Tools', '00c Your First Agent', '01 What Agentic Means'], + }, + { + label: 'Building an agent', + description: 'You\'re writing the code now. You want patterns that survive contact with production.', + next: 'Jump to the build-focused chapters and the flagship project.', + cta: 'Building chapters', + href: '/agentic-ai/book/#part-1', + chapters: ['02 Tools, Context, Agent Loop', '03 Workflow First', '04 Multi-Agent Without Theater', 'R-001 Strands on AgentCore'], + }, + { + label: 'Running in production', + description: 'You shipped. Now you need to evaluate, harden, and operate.', + next: 'Evidence, evaluation, and operations chapters.', + cta: 'Evidence Wall', + href: '/agentic-ai/evidence/', + chapters: ['06 Evaluating and Hardening', '07 When Not to Use Agents', 'Evidence Wall', 'L-001 Lab Report'], + }, +]; +--- + + + +
+ Start + Start + Pick the path that matches where you are. Each one is a curated reading list, not a 13-chapter slog. +
+ +
+ {tracks.map((t) => ( +
+
{t.label}
+

{t.description}

+

{t.next}

+ +
    + {t.chapters.map((c) => (
{c}
))} +
+ + {t.cta} → +
+ ))} +
+
+
+ +