vasylenko · vasylenko · May 13, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -58,8 +58,9 @@ jobs:
           TAG_NAME: ${{ github.ref_name }}
         run: |
           VERSION=${TAG_NAME#v}
-          # Print everything between "## [VERSION]" and the next "## [" header.
-          # The trailing blank lines are trimmed by awk's NR/sub trick below.
+          # Prints lines after the matched "## [VERSION]" header until the
+          # next "## [" header (or EOF). The matched header line itself is
+          # skipped via `next`.
           CHANGELOG_BODY=$(awk -v ver="$VERSION" '
             /^## \[/ {
               if (found) exit

diff --git a/.npmignore b/.npmignore
@@ -1,4 +1,4 @@
-# Source files (compiled to dist/, which is in `files`)
+# Source, tests, and TS build config (not needed at runtime; dist/ ships instead)
 src/
 tests/
 *.ts
@@ -12,7 +12,7 @@ tsconfig.json
 docs/
 scripts/
 
-# Project files (rendered into the GitHub release body, not the npm package)
+# Repo-only docs (CHANGELOG.md is rendered into the GitHub release body; CLAUDE.md is agent instructions). Excluded from the npm tarball.
 CHANGELOG.md
 CLAUDE.md
 

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # markfetch
 
-**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — at a real Chrome's request rate, not curl's.**
+**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — with a real Chrome's request fingerprint, not curl's.**
 
 [![npm](https://img.shields.io/npm/v/markfetch.svg?color=10b981&label=npm)](https://www.npmjs.com/package/markfetch)
 [![ci](https://github.com/vasylenko/markfetch/actions/workflows/ci.yml/badge.svg)](https://github.com/vasylenko/markfetch/actions/workflows/ci.yml)
@@ -32,7 +32,7 @@ markfetch https://en.wikipedia.org/wiki/Markdown
 }
 ```
 
-That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same binary from a shell.
+That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same command from a shell.
 
 ## MCP install commands
 
@@ -65,15 +65,15 @@ gemini mcp add -s user markfetch npx -y markfetch
 
 - **Reader-View-quality extraction.** [linkedom](https://github.com/WebReflection/linkedom) → [@mozilla/readability](https://github.com/mozilla/readability) → [turndown](https://github.com/mixmark-io/turndown) with GFM tables, strikethrough, and task lists. Code fences preserve `language-X` hints. Sphinx-style bare `<pre>` blocks render as code, not escaped prose. Intraword underscores stay un-escaped — no more `list\_tools`.
 
-- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Modern MCP clients hide `content[]` when `structuredContent` is present — `markfetch` deliberately stays on the channel your LLM can actually read.
+- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — `markfetch` deliberately stays on the channel your LLM can actually read.
 
 - **`savePath` / `-o` escape valve.** Pass an absolute path (MCP `savePath`) or `-o <path>` (CLI) and the markdown lands on disk instead of the response channel. Use it when your client's inline tool-result cap would truncate large responses, or to redirect output from a shell pipeline. The file is only ever the markdown of the URL — fetch errors return a `[code]` string and never touch the disk.
 
 - **Whole document or honest failure.** No pagination, no truncation. If the document doesn't fit in `MARKFETCH_MAX_BYTES`, you get `too_large` — never a half-truth.
 
-- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes that could corrupt protocol framing.
+- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes — keeping stderr parseable for shell consumers.
 
-- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. Single TypeScript executable on Node 24+ — one process whether you invoke it as an MCP server or from the shell.
+- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. A single Node process, whether you invoke it as an MCP server or from the shell.
 
 ## CLI usage
 
@@ -112,7 +112,7 @@ Errors go to stderr with the same `[code] message` shape the MCP tool returns (s
 
 - **Not a crawler.** No recursion, no `robots.txt` parsing, no rate-limit orchestration. One URL in, one document out.
 - **Not authenticated.** Anonymous fetch only — no cookie jar, no auth headers, no session reuse. Pages behind login walls return whatever the public response is, usually surfaced as `http_error`.
-- **Not a JS renderer.** Single-page apps that paint their content client-side return `extraction_failed`. Use this on server-rendered pages.
+- **Not a JS renderer.** Pure client-rendered SPAs with no static content return `extraction_failed`. SPAs with server-rendered or SEO-prerendered HTML will extract whatever static content they ship.
 
 ## Configuration
 
@@ -142,7 +142,7 @@ Pass overrides via the `env` block of your MCP client config:
 
 Requires Node.js ≥ 24.
 
-When iterating on CLI changes, `tsx src/index.ts <url>` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the compiled binary — no rebuild needed between edits.
+When iterating on CLI changes, `tsx src/index.ts <url>` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the built `dist/index.js` — no rebuild needed between edits.
 
 To point an MCP client at a local source build, swap `npx` for `node` + an absolute path to `dist/index.js`:
 

diff --git a/docs/SPEC.md b/docs/SPEC.md
@@ -21,11 +21,11 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `
 
 - **Lazy adapter imports.** The dispatcher uses `await import()` to load exactly one adapter. The only `console.log` in the project lives in `cli.ts`; under MCP, `cli.ts` never loads, so stdout-discipline is enforced by the module graph — not by linter or convention.
 
-- **Core throws, adapters translate.** All 7 error codes throw from `core.ts`; `classifyError` normalizes underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change.
+- **Core throws, adapters translate.** All 7 error codes surface from `core.ts` — five are thrown explicitly as `MarkfetchError`; `network_error`, `timeout`, and (sometimes) `http_error` are translated by `classifyError` from underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change.
 
 - **HTTP/2 + coherent Chrome fingerprint.** Wire protocol, headers, and UA must agree — a Chrome UA over HTTP/1.1 or without `Sec-CH-UA-*` is *more* suspicious than curl. `Sec-CH-UA-*` is derived from `MARKFETCH_USER_AGENT` at startup so override-coherence is mechanical.
 
-- **Single-channel MCP response.** `content[0].text` only. Modern MCP clients hide `content[]` when `structuredContent` is present, which would route the response away from the LLM that called the tool.
+- **Single-channel MCP response.** `content[0].text` only. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — a single-channel response keeps the markdown reachable from those clients.
 
 - **Whole document or `too_large`.** No pagination. Partial content lets the agent reason over truncated bodies without knowing they're truncated. `savePath` / `-o` is the escape valve for genuinely large documents.
 
@@ -36,7 +36,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `
 ## Ideas for future
 
 - **Authentication.** `MARKFETCH_AUTH_HEADER` env var (simple), or Chrome-cookie import for sites where the user is already logged in (frictionless, platform-specific, security-sensitive). Trigger: first useful internal / paywalled doc.
-- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean binary stays lean. Trigger: enough useful sites returning `extraction_failed`.
+- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean package stays lean. Trigger: enough useful sites returning `extraction_failed`.
 - **CloudFlare `/markdown` fallback.** Gated by `CF_AUTH_TOKEN`; fall back when Readability fails. Trigger: extraction failure rate stays high after Readability tuning.
 - **Cookie reuse across redirects within a single fetch.** Currently none. Trigger: a target serves content only after a session-cookie redirect.
 - **Proxy support** (`MARKFETCH_PROXY_URL`) and **`Accept-Language` control** (`MARKFETCH_ACCEPT_LANGUAGE`). Trigger: corporate proxy / locale-specific content.

diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "name": "markfetch",
   "version": "0.5.0",
-  "description": "MCP server: fetch a URL, return clean markdown. Built for AI agents.",
+  "description": "Fetch a URL, return clean markdown. MCP server and CLI for AI agents.",
   "license": "MIT",
   "author": {
     "name": "Serhii Vasylenko",

diff --git a/scripts/postbuild.mjs b/scripts/postbuild.mjs
@@ -1,6 +1,7 @@
-// Sets execute bit on dist/index.js so the npm `bin` entry resolves correctly
-// when invoked via `npx markfetch` or as a direct script. tsc preserves the
-// shebang but doesn't chmod its outputs.
+// Sets execute bit on dist/index.js so the shebang-based launch works —
+// both when npm links the `bin` entry (npm/npx exec the linked target)
+// and when running ./dist/index.js directly. tsc preserves the shebang
+// but doesn't chmod its outputs.
 import { chmodSync } from "node:fs";
 
 chmodSync("dist/index.js", 0o755);
diff --git a/src/core.ts b/src/core.ts
@@ -42,10 +42,10 @@ const config = {
   userAgent: process.env.MARKFETCH_USER_AGENT || DEFAULT_USER_AGENT,
 };
 
-// Derive Sec-CH-UA-* client hints from the User-Agent. PRD §4 calls out that a
-// Chrome UA paired with mismatched (or absent) client hints is a stronger bot
-// signal than a curl UA — the two MUST agree. Deriving from a single source
-// makes that invariant mechanical: override the UA, the hints follow.
+// Derive Sec-CH-UA-* client hints from the User-Agent. A Chrome UA paired
+// with mismatched (or absent) client hints is a stronger bot signal than a
+// curl UA — the two MUST agree. Deriving from a single source makes that
+// invariant mechanical: override the UA, the hints follow.
 function deriveClientHints(ua: string): {
   brands: string;
   mobile: string;
@@ -58,8 +58,12 @@ function deriveClientHints(ua: string): {
     );
   }
   const major = versionMatch[1];
-  // The "Not?A_Brand" decoy rotates per Chrome major (130 ships v="99"). Servers
-  // don't fingerprint the decoy version, so pinning v="99" is acceptable.
+  // Chrome's GREASE rotation changes BOTH the decoy brand token AND its
+  // version per major: Chrome 130 ships "Not?A_Brand";v="99", Chrome 131
+  // ships "Not_A Brand";v="24". We hard-code the Chrome-130 values; if a
+  // caller overrides MARKFETCH_USER_AGENT to a different Chrome major, the
+  // decoy shape will be stale. That is acceptable because bot detectors
+  // don't fingerprint the decoy itself — only the real brand pair.
   const brands = `"Chromium";v="${major}", "Google Chrome";v="${major}", "Not?A_Brand";v="99"`;
   // Chrome's mobile UAs include a literal " Mobile " token; tablets/desktop omit it.
   const mobile = /\bMobile\b/.test(ua) ? "?1" : "?0";
@@ -83,11 +87,11 @@ function deriveClientHints(ua: string): {
 const clientHints = deriveClientHints(config.userAgent);
 
 // Enable HTTP/2 via TLS ALPN. Modern bot-detection systems and CDNs consider
-// wire protocol alongside header fingerprint; HTTP/2 paired with a Chrome
-// header set is internally consistent, HTTP/1.1 + Chrome headers is not.
-// Servers that don't advertise h2 in ALPN fall back to HTTP/1.1 transparently
-// during the TLS handshake — no manual retry needed. Plain-HTTP connections
-// (port 80) skip ALPN entirely and use HTTP/1.1.
+// wire protocol alongside header fingerprint; HTTP/2 over TLS pairs cleanly
+// with a Chrome header set. Servers that don't advertise h2 in ALPN fall back
+// to HTTP/1.1 transparently during the TLS handshake — no manual retry needed.
+// Plain-HTTP connections (port 80) skip ALPN entirely and use HTTP/1.1,
+// accepting the protocol/fingerprint mismatch in that case.
 setGlobalDispatcher(new Agent({ allowH2: true }));
 
 const TURNDOWN = new TurndownService({
@@ -111,9 +115,10 @@ TURNDOWN.use(gfm);
 //     the start of each text node, not start-of-line. After inline
 //     elements, the next text node often begins with `-suffix` / `=value`,
 //     and gets escaped even though it sits mid-line in the rendered
-//     markdown. CommonMark requires `- ` (dash + space) for an unordered
-//     list and `===` alone for a setext underline, so `\-X` / `\=X` where
-//     X is alphanumeric is never structurally meaningful.
+//     markdown. CommonMark setext underlines are `=` or `-` characters on
+//     a line by themselves; unordered-list markers require `-`/`+`/`*`
+//     followed by whitespace or end-of-line. `\-X` / `\=X` where X is
+//     alphanumeric cannot match either rule, so the escape is pure noise.
 //
 // Drop both. The negative lookbehind `(?<!\\)` on the second replace
 // protects literal-backslash content: source HTML containing `\-X`
@@ -165,7 +170,8 @@ export function classifyError(err: unknown): { code: ErrorCode; message: string
   if (err instanceof MarkfetchError) {
     return { code: err.code, message: err.message };
   }
-  // AbortSignal.timeout produces DOMException with name "TimeoutError".
+  // AbortSignal.timeout normally produces a DOMException named "TimeoutError";
+  // some undici code paths surface AbortError instead, so accept both.
   if (
     err instanceof Error &&
     (err.name === "TimeoutError" || err.name === "AbortError")
@@ -210,6 +216,9 @@ function chromeHeaders(): Record<string, string> {
     "Sec-Fetch-Dest": "document",
     "Sec-Fetch-Mode": "navigate",
     "Sec-Fetch-Site": "none",
+    // Always-on. Real browsers omit this header when there's no user
+    // activation; we model a user-initiated top-level navigation (typed
+    // URL or bookmark), consistent with `Sec-Fetch-Site: "none"` above.
     "Sec-Fetch-User": "?1",
     "Sec-CH-UA": clientHints.brands,
     "Sec-CH-UA-Mobile": clientHints.mobile,
@@ -267,9 +276,9 @@ function enforceTooLarge(stage: string, actual: number): MarkfetchError {
 // rather than real `<code>` elements. Decode those specific tag patterns so
 // turndown processes them as real elements and converts to backticks.
 // Pattern accepts `<code>`, `<code class="...">`, `</code>`, `<pre>` etc., but
-// rejects `<codename>`, `<preview>`, `<codeblock>` — the trailing requirement
-// is whitespace, "/", or end-of-tag, so element names with extra characters
-// after `code`/`pre` are not matched.
+// rejects `<codename>`, `<preview>`, `<codeblock>` — the next char after
+// `code`/`pre` must be whitespace, `/`, or `&` (the start of `&gt;`), so
+// element names with extra characters are not matched.
 function decodeEncodedCodeTags(html: string): string {
   return html.replaceAll(
     /&lt;(\/?(?:code|pre)(?:\s[^&]*?)?\/?)&gt;/g,
@@ -389,7 +398,7 @@ function convertToMarkdown(article: {
 //
 // Errors are thrown uniformly as MarkfetchError. Adapters catch and translate:
 //   - mcp.ts catches → errorResult(code, message) → MCP {isError, content}
-//   - cli.ts catches → console.error("[code] message") → exit code 1
+//   - cli.ts catches → console.error("[code] message") → sets process.exitCode = 1
 //
 // The full set of error codes this can throw:
 //   network_error, http_error, timeout, unsupported_content_type,

diff --git a/src/index.ts b/src/index.ts
@@ -3,7 +3,7 @@
 // Argv-discriminated dispatcher.
 //
 // `process.argv.length === 2` means the user provided zero arguments
-// (argv[0] is the node binary, argv[1] is this script path). That's the
+// (argv[0] is the path to node, argv[1] is this script path). That's the
 // shape every MCP client uses when spawning a server — so bare invocation
 // routes to the MCP adapter and preserves every existing client config.
 //

diff --git a/src/mcp.ts b/src/mcp.ts
@@ -27,7 +27,7 @@ server.registerTool(
   "fetch_markdown",
   {
     description:
-      "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. JavaScript-rendered SPAs and non-HTML responses return structured errors instead of partial content. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
+      "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. Non-HTML responses return `unsupported_content_type`. Pure client-rendered SPAs with no extractable static HTML return `extraction_failed`; SPAs that ship server-rendered or SEO-prerendered HTML will extract whatever static content they expose. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
     inputSchema: {
       url: z
         .string()

diff --git a/tests/cli.test.ts b/tests/cli.test.ts
@@ -18,10 +18,10 @@ import { join, resolve as resolvePath } from "node:path";
 const execFileAsync = promisify(execFile);
 
 // Resolved at module load against the test runner's cwd (the project root).
-// Tests that override `cwd` to a tmpdir still need to find the tsx binary
+// Tests that override `cwd` to a tmpdir still need to find the tsx CLI
 // and the source entry — passing relative paths would resolve against the
 // new cwd and produce a confusing ENOENT instead of the behavior under test.
-const TSX_BIN = resolvePath("./node_modules/.bin/tsx");
+const TSX_CLI = resolvePath("./node_modules/.bin/tsx");
 const ENTRY = resolvePath("src/index.ts");
 
 const HAPPY_FIXTURE = `<!DOCTYPE html>
@@ -80,7 +80,7 @@ async function runCli(
 ): Promise<RunResult> {
   try {
     const { stdout, stderr } = await execFileAsync(
-      TSX_BIN,
+      TSX_CLI,
       [ENTRY, ...args],
       {
         env: { ...process.env, ...env } as Record<string, string>,