From b162fb23e777f88ce102c2290928b07fefc0a33e Mon Sep 17 00:00:00 2001 From: Serhii Vasylenko Date: Tue, 12 May 2026 21:04:32 +0200 Subject: [PATCH 1/2] Fix terminology mislabels and inaccurate technical claims MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling the published JS artifact a "binary" or "TypeScript executable" misled readers — the project ships `dist/index.js` (a JS file with a node shebang) via npm, no native binary anywhere. Replaced those usages in README, e2e test names, comments, and the e2e fixture body with "compiled output" / "built dist/index.js" / "compiled JS output" depending on context. Kept npm-`bin` jargon (BUILT_BIN, TSX_BIN, spawnCompiled) and the SPEC line 43 future-feature mention of `bun --compile` / Node SEA — those uses are accurate. Also corrected several other inaccuracies: - release.yml line 62: the comment claimed awk did NR/sub trimming, but the script only prints-after-match until the next header. Rewrote the comment to describe what the awk block actually does. - .npmignore: the group headers misclassified tests/ + tsconfig.json as "source files" and claimed CLAUDE.md is rendered into the GitHub release body (only CHANGELOG.md is). Rewrote both headers. - package.json description: omitted CLI mode. Replaced with one line that covers both MCP and CLI surfaces. - scripts/postbuild.mjs: misattributed the +x effect to npm `bin` resolution. Rewrote to credit the shebang-based launch path. - docs/SPEC.md: "All 7 error codes throw from core.ts" was imprecise — only five throw explicitly; network_error, timeout, and sometimes http_error are translated by classifyError. Tightened the wording. - src/core.ts: softened the HTTP/2 framing comment to acknowledge the plain- HTTP fallback the same comment already noted; documented the AbortError branch in classifyError that the comment previously omitted. Text-only changes. `npm run build` and `npm test` (50/50) pass. 
--- .github/workflows/release.yml | 5 +++-- .npmignore | 4 ++-- README.md | 6 +++--- docs/SPEC.md | 4 ++-- package.json | 2 +- scripts/postbuild.mjs | 7 ++++--- src/core.ts | 13 +++++++------ tests/cli.test.ts | 2 +- tests/e2e.test.ts | 22 +++++++++++----------- 9 files changed, 34 insertions(+), 31 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 08f8723..db27a29 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -58,8 +58,9 @@ jobs: TAG_NAME: ${{ github.ref_name }} run: | VERSION=${TAG_NAME#v} - # Print everything between "## [VERSION]" and the next "## [" header. - # The trailing blank lines are trimmed by awk's NR/sub trick below. + # Prints lines after the matched "## [VERSION]" header until the + # next "## [" header (or EOF). The matched header line itself is + # skipped via `next`. CHANGELOG_BODY=$(awk -v ver="$VERSION" ' /^## \[/ { if (found) exit diff --git a/.npmignore b/.npmignore index 8230ec0..3b8569f 100644 --- a/.npmignore +++ b/.npmignore @@ -1,4 +1,4 @@ -# Source files (compiled to dist/, which is in `files`) +# Source, tests, and TS build config (not needed at runtime; dist/ ships instead) src/ tests/ *.ts @@ -12,7 +12,7 @@ tsconfig.json docs/ scripts/ -# Project files (rendered into the GitHub release body, not the npm package) +# Repo-only docs (CHANGELOG.md is rendered into the GitHub release body; CLAUDE.md is agent instructions). Excluded from the npm tarball. CHANGELOG.md CLAUDE.md diff --git a/README.md b/README.md index f739e58..e649560 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ markfetch https://en.wikipedia.org/wiki/Markdown } ``` -That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same binary from a shell. +That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same command from a shell. 
## MCP install commands @@ -73,7 +73,7 @@ gemini mcp add -s user markfetch npx -y markfetch - **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes that could corrupt protocol framing. -- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. Single TypeScript executable on Node 24+ — one process whether you invoke it as an MCP server or from the shell. +- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. Single Node process — one Node process whether you invoke it as an MCP server or from the shell. ## CLI usage @@ -142,7 +142,7 @@ Pass overrides via the `env` block of your MCP client config: Requires Node.js ≥ 24. -When iterating on CLI changes, `tsx src/index.ts ` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the compiled binary — no rebuild needed between edits. +When iterating on CLI changes, `tsx src/index.ts ` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the built `dist/index.js` — no rebuild needed between edits. To point an MCP client at a local source build, swap `npx` for `node` + an absolute path to `dist/index.js`: diff --git a/docs/SPEC.md b/docs/SPEC.md index 4c9ee40..f8c6305 100644 --- a/docs/SPEC.md +++ b/docs/SPEC.md @@ -21,7 +21,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: ` - **Lazy adapter imports.** The dispatcher uses `await import()` to load exactly one adapter. The only `console.log` in the project lives in `cli.ts`; under MCP, `cli.ts` never loads, so stdout-discipline is enforced by the module graph — not by linter or convention. -- **Core throws, adapters translate.** All 7 error codes throw from `core.ts`; `classifyError` normalizes underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change. 
+- **Core throws, adapters translate.** All 7 error codes surface from `core.ts` — five are thrown explicitly as `MarkfetchError`; `network_error`, `timeout`, and (sometimes) `http_error` are translated by `classifyError` from underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change. - **HTTP/2 + coherent Chrome fingerprint.** Wire protocol, headers, and UA must agree — a Chrome UA over HTTP/1.1 or without `Sec-CH-UA-*` is *more* suspicious than curl. `Sec-CH-UA-*` is derived from `MARKFETCH_USER_AGENT` at startup so override-coherence is mechanical. @@ -36,7 +36,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: ` ## Ideas for future - **Authentication.** `MARKFETCH_AUTH_HEADER` env var (simple), or Chrome-cookie import for sites where the user is already logged in (frictionless, platform-specific, security-sensitive). Trigger: first useful internal / paywalled doc. -- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean binary stays lean. Trigger: enough useful sites returning `extraction_failed`. +- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean package stays lean. Trigger: enough useful sites returning `extraction_failed`. - **CloudFlare `/markdown` fallback.** Gated by `CF_AUTH_TOKEN`; fall back when Readability fails. Trigger: extraction failure rate stays high after Readability tuning. - **Cookie reuse across redirects within a single fetch.** Currently none. Trigger: a target serves content only after a session-cookie redirect. - **Proxy support** (`MARKFETCH_PROXY_URL`) and **`Accept-Language` control** (`MARKFETCH_ACCEPT_LANGUAGE`). Trigger: corporate proxy / locale-specific content. 
diff --git a/package.json b/package.json index f81c12c..445f75b 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "markfetch", "version": "0.5.0", - "description": "MCP server: fetch a URL, return clean markdown. Built for AI agents.", + "description": "Fetch a URL, return clean markdown. MCP server and CLI for AI agents.", "license": "MIT", "author": { "name": "Serhii Vasylenko", diff --git a/scripts/postbuild.mjs b/scripts/postbuild.mjs index 8fd8b9d..ada56b7 100644 --- a/scripts/postbuild.mjs +++ b/scripts/postbuild.mjs @@ -1,6 +1,7 @@ -// Sets execute bit on dist/index.js so the npm `bin` entry resolves correctly -// when invoked via `npx markfetch` or as a direct script. tsc preserves the -// shebang but doesn't chmod its outputs. +// Sets execute bit on dist/index.js so the shebang-based launch works — +// both when npm links the `bin` entry (npm/npx exec the linked target) +// and when running ./dist/index.js directly. tsc preserves the shebang +// but doesn't chmod its outputs. import { chmodSync } from "node:fs"; chmodSync("dist/index.js", 0o755); diff --git a/src/core.ts b/src/core.ts index eff4f29..2c7e1cd 100644 --- a/src/core.ts +++ b/src/core.ts @@ -83,11 +83,11 @@ function deriveClientHints(ua: string): { const clientHints = deriveClientHints(config.userAgent); // Enable HTTP/2 via TLS ALPN. Modern bot-detection systems and CDNs consider -// wire protocol alongside header fingerprint; HTTP/2 paired with a Chrome -// header set is internally consistent, HTTP/1.1 + Chrome headers is not. -// Servers that don't advertise h2 in ALPN fall back to HTTP/1.1 transparently -// during the TLS handshake — no manual retry needed. Plain-HTTP connections -// (port 80) skip ALPN entirely and use HTTP/1.1. +// wire protocol alongside header fingerprint; HTTP/2 over TLS pairs cleanly +// with a Chrome header set. Servers that don't advertise h2 in ALPN fall back +// to HTTP/1.1 transparently during the TLS handshake — no manual retry needed. 
+// Plain-HTTP connections (port 80) skip ALPN entirely and use HTTP/1.1, +// accepting the protocol/fingerprint mismatch in that case. setGlobalDispatcher(new Agent({ allowH2: true })); const TURNDOWN = new TurndownService({ @@ -165,7 +165,8 @@ export function classifyError(err: unknown): { code: ErrorCode; message: string if (err instanceof MarkfetchError) { return { code: err.code, message: err.message }; } - // AbortSignal.timeout produces DOMException with name "TimeoutError". + // AbortSignal.timeout normally produces a DOMException named "TimeoutError"; + // some undici code paths surface AbortError instead, so accept both. if ( err instanceof Error && (err.name === "TimeoutError" || err.name === "AbortError") diff --git a/tests/cli.test.ts b/tests/cli.test.ts index bbac5a0..271e600 100644 --- a/tests/cli.test.ts +++ b/tests/cli.test.ts @@ -18,7 +18,7 @@ import { join, resolve as resolvePath } from "node:path"; const execFileAsync = promisify(execFile); // Resolved at module load against the test runner's cwd (the project root). -// Tests that override `cwd` to a tmpdir still need to find the tsx binary +// Tests that override `cwd` to a tmpdir still need to find the tsx CLI // and the source entry — passing relative paths would resolve against the // new cwd and produce a confusing ENOENT instead of the behavior under test. const TSX_BIN = resolvePath("./node_modules/.bin/tsx"); diff --git a/tests/e2e.test.ts b/tests/e2e.test.ts index 5f56230..e90f9f9 100644 --- a/tests/e2e.test.ts +++ b/tests/e2e.test.ts @@ -1,4 +1,4 @@ -// E2E tests against the COMPILED binary (`node dist/index.js`), not the dev +// E2E tests against the COMPILED JS output (`node dist/index.js`), not the dev // source. server.test.ts already exercises the full surface via tsx; this file // verifies that `tsc` output is itself correct and runnable. 
If server.test.ts // passes but this file fails, the bug lives in the build pipeline, not the @@ -21,7 +21,7 @@ import { join, resolve as resolvePath } from "node:path"; const execFileAsync = promisify(execFile); // Resolved absolute paths so a test that overrides cwd still locates the -// built binary. node is on PATH, so a bare command name is fine for it. +// built JS entry. node is on PATH, so a bare command name is fine for it. const BUILT_BIN = resolvePath("dist/index.js"); before(() => { @@ -74,7 +74,7 @@ const HAPPY_FIXTURE = `

E2E Fixture Heading

-

This is a deterministic fixture for verifying the compiled binary's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.

+

This is a deterministic fixture for verifying the compiled output's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.

Sub-section

Second paragraph adds more substance so the extracted markdown has multiple structural elements to assert against. Lorem ipsum dolor sit amet.

@@ -83,7 +83,7 @@ const HAPPY_FIXTURE = ` `; -test("e2e: compiled binary boots, exposes fetch_markdown, pins version", async () => { +test("e2e: compiled output boots, exposes fetch_markdown, pins version", async () => { const client = await spawnCompiled(); try { const info = client.getServerVersion(); @@ -97,7 +97,7 @@ test("e2e: compiled binary boots, exposes fetch_markdown, pins version", async ( } }); -test("e2e: compiled binary returns markdown for a mock fixture", async () => { +test("e2e: compiled output returns markdown for a mock fixture", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); @@ -121,7 +121,7 @@ test("e2e: compiled binary returns markdown for a mock fixture", async () => { } }); -test("e2e: compiled binary returns [network_error] for invalid host", async () => { +test("e2e: compiled output returns [network_error] for invalid host", async () => { const client = await spawnCompiled(); try { const result = await client.callTool({ @@ -135,10 +135,10 @@ test("e2e: compiled binary returns [network_error] for invalid host", async () = } }); -// E1 — savePath against the compiled binary. Pins the build pipeline against +// E1 — savePath against the compiled JS output. Pins the build pipeline against // the new code path. If T1 (server.test) passes but this fails, the bug is // in tsc/postbuild, not the runtime logic. -test("e2e: compiled binary writes markdown to savePath, returns confirmation", async () => { +test("e2e: compiled output writes markdown to savePath, returns confirmation", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); @@ -166,12 +166,12 @@ test("e2e: compiled binary writes markdown to savePath, returns confirmation", a } }); -// CLI-mode e2e tests. These spawn the compiled binary with arguments so the +// CLI-mode e2e tests. 
These spawn the compiled JS output with arguments so the // dispatcher in dist/index.js routes to dist/cli.js — exercising the lazy // import path that tsc must emit correctly. If the corresponding cli.test // passes but these fail, the bug is in the build pipeline, not runtime logic. -test("e2e: compiled binary CLI prints markdown to stdout, exit 0", async () => { +test("e2e: compiled output CLI prints markdown to stdout, exit 0", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); @@ -189,7 +189,7 @@ test("e2e: compiled binary CLI prints markdown to stdout, exit 0", async () => { } }); -test("e2e: compiled binary --version prints package version, exit 0", async () => { +test("e2e: compiled output --version prints package version, exit 0", async () => { const { stdout, stderr } = await execFileAsync( "node", [BUILT_BIN, "--version"], From 2a5faad134378c47395f0518c95c0a47ad590ea1 Mon Sep 17 00:00:00 2001 From: Serhii Vasylenko Date: Tue, 12 May 2026 21:35:44 +0200 Subject: [PATCH 2/2] Clean up remaining build vocabulary and tighten technical claims MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A follow-up to b162fb2 that addresses items the previous pass kept as "npm-`bin` jargon" plus several technical inaccuracies surfaced by a fresh audit. Build vocabulary (the dist/ artifact is plain JavaScript transpiled by tsc — not a binary or compiled executable): - tests/e2e.test.ts: spawnCompiled → spawnBuilt, BUILT_BIN → BUILT_JS, "compiled output" → "built output" across 10 test names and comments. - tests/cli.test.ts: TSX_BIN → TSX_CLI. - src/index.ts: "node binary" → "path to node" in the argv comment. Kept legitimate uses intact: Bun's --compile / Node SEA references in docs/SPEC.md:43 (those tools really do produce binaries), npm's literal .bin/ folder paths, libuv "native" assertion, "pure Node" idiom. 
Technical accuracy in src/core.ts (comment-only — runtime untouched): - Sec-CH-UA decoy comment: Chrome's GREASE rotation changes BOTH the brand token and its version per major (130: "Not?A_Brand";v="99", 131: "Not_A Brand";v="24"). Previous comment claimed only the version rotated. - CommonMark escape rules: setext underlines are = or - on a line by themselves; list markers require -/+/* then whitespace or EOL. The previous "===" / "- " restatement was overly specific. - Sec-Fetch-User: ?1 is always-on; real browsers omit it on non-user-activated navigations. Added a one-line comment noting the deliberate simplification. - decodeEncodedCodeTags: trailing requirement is whitespace, /, or & (the start of >), not "end-of-tag". - "exit code 1" → "sets process.exitCode = 1" — matches cli.ts's actual mechanism documented in its own comments. Overreaching claims softened: - src/mcp.ts tool description + README:115: only pure client-rendered SPAs with no extractable static HTML return extraction_failed. SPAs with server-rendered or SEO-prerendered HTML extract whatever they ship. Previous wording implied a determinism the pipeline doesn't have. - README:68 + docs/SPEC.md:28: "Modern MCP clients hide content[]" replaced with concrete client names (Claude Code CLI, VS Code/Copilot). Behavior varies across clients; concrete examples are honest. - README:3 lede: "request rate" → "request fingerprint". markfetch has no rate-limit logic; the actual contrast is shape (HTTP/2 + headers), not pacing. - README:74 Stdio-clean ANSI clause: "could corrupt protocol framing" → "keeping stderr parseable for shell consumers". Stdout-discipline is already covered by the prior sentence in the same bullet. Stale PRD references (the PRD was deleted in commit 52e2139): - src/core.ts: dropped "PRD §4 calls out that" from the client-hints derivation comment. - tests/server.test.ts:436: "(Principle #4: stderr is fatal-only)" → "(stderr-is-fatal-only invariant per SPEC.md)". 
- tests/server.test.ts:670: "PRD §5: file at savePath is only ever the markdown" → reformulated to cite README and SPEC.md instead. Text-only changes. `npm run build` and `npm test` (50/50) pass. --- README.md | 8 ++++---- docs/SPEC.md | 2 +- src/core.ts | 34 +++++++++++++++++++++------------- src/index.ts | 2 +- src/mcp.ts | 2 +- tests/cli.test.ts | 4 ++-- tests/e2e.test.ts | 36 ++++++++++++++++++------------------ tests/server.test.ts | 4 ++-- 8 files changed, 50 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index e649560..6e5c7fe 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # markfetch -**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — at a real Chrome's request rate, not curl's.** +**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — with a real Chrome's request fingerprint, not curl's.** [![npm](https://img.shields.io/npm/v/markfetch.svg?color=10b981&label=npm)](https://www.npmjs.com/package/markfetch) [![ci](https://github.com/vasylenko/markfetch/actions/workflows/ci.yml/badge.svg)](https://github.com/vasylenko/markfetch/actions/workflows/ci.yml) @@ -65,13 +65,13 @@ gemini mcp add -s user markfetch npx -y markfetch - **Reader-View-quality extraction.** [linkedom](https://github.com/WebReflection/linkedom) → [@mozilla/readability](https://github.com/mozilla/readability) → [turndown](https://github.com/mixmark-io/turndown) with GFM tables, strikethrough, and task lists. Code fences preserve `language-X` hints. Sphinx-style bare `
` blocks render as code, not escaped prose. Intraword underscores stay un-escaped — no more `list\_tools`.
 
-- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Modern MCP clients hide `content[]` when `structuredContent` is present — `markfetch` deliberately stays on the channel your LLM can actually read.
+- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — `markfetch` deliberately stays on the channel your LLM can actually read.
 
 - **`savePath` / `-o` escape valve.** Pass an absolute path (MCP `savePath`) or `-o ` (CLI) and the markdown lands on disk instead of the response channel. Use it when your client's inline tool-result cap would truncate large responses, or to redirect output from a shell pipeline. The file is only ever the markdown of the URL — fetch errors return a `[code]` string and never touch the disk.
 
 - **Whole document or honest failure.** No pagination, no truncation. If the document doesn't fit in `MARKFETCH_MAX_BYTES`, you get `too_large` — never a half-truth.
 
-- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes that could corrupt protocol framing.
+- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes — keeping stderr parseable for shell consumers.
 
 - **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. Single Node process — one Node process whether you invoke it as an MCP server or from the shell.
 
@@ -112,7 +112,7 @@ Errors go to stderr with the same `[code] message` shape the MCP tool returns (s
 
 - **Not a crawler.** No recursion, no `robots.txt` parsing, no rate-limit orchestration. One URL in, one document out.
 - **Not authenticated.** Anonymous fetch only — no cookie jar, no auth headers, no session reuse. Pages behind login walls return whatever the public response is, usually surfaced as `http_error`.
-- **Not a JS renderer.** Single-page apps that paint their content client-side return `extraction_failed`. Use this on server-rendered pages.
+- **Not a JS renderer.** Pure client-rendered SPAs with no static content return `extraction_failed`. For SPAs that ship server-rendered or SEO-prerendered HTML, markfetch extracts whatever static content is in the response.
 
 ## Configuration
 
diff --git a/docs/SPEC.md b/docs/SPEC.md
index f8c6305..3429baf 100644
--- a/docs/SPEC.md
+++ b/docs/SPEC.md
@@ -25,7 +25,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `
 
 - **HTTP/2 + coherent Chrome fingerprint.** Wire protocol, headers, and UA must agree — a Chrome UA over HTTP/1.1 or without `Sec-CH-UA-*` is *more* suspicious than curl. `Sec-CH-UA-*` is derived from `MARKFETCH_USER_AGENT` at startup so override-coherence is mechanical.
 
-- **Single-channel MCP response.** `content[0].text` only. Modern MCP clients hide `content[]` when `structuredContent` is present, which would route the response away from the LLM that called the tool.
+- **Single-channel MCP response.** `content[0].text` only. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — a single-channel response keeps the markdown reachable from those clients.
 
 - **Whole document or `too_large`.** No pagination. Partial content lets the agent reason over truncated bodies without knowing they're truncated. `savePath` / `-o` is the escape valve for genuinely large documents.
 
diff --git a/src/core.ts b/src/core.ts
index 2c7e1cd..f6122bc 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -42,10 +42,10 @@ const config = {
   userAgent: process.env.MARKFETCH_USER_AGENT || DEFAULT_USER_AGENT,
 };
 
-// Derive Sec-CH-UA-* client hints from the User-Agent. PRD §4 calls out that a
-// Chrome UA paired with mismatched (or absent) client hints is a stronger bot
-// signal than a curl UA — the two MUST agree. Deriving from a single source
-// makes that invariant mechanical: override the UA, the hints follow.
+// Derive Sec-CH-UA-* client hints from the User-Agent. A Chrome UA paired
+// with mismatched (or absent) client hints is a stronger bot signal than a
+// curl UA — the two MUST agree. Deriving from a single source makes that
+// invariant mechanical: override the UA, the hints follow.
 function deriveClientHints(ua: string): {
   brands: string;
   mobile: string;
@@ -58,8 +58,12 @@ function deriveClientHints(ua: string): {
     );
   }
   const major = versionMatch[1];
-  // The "Not?A_Brand" decoy rotates per Chrome major (130 ships v="99"). Servers
-  // don't fingerprint the decoy version, so pinning v="99" is acceptable.
+  // Chrome's GREASE rotation changes BOTH the decoy brand token AND its
+  // version per major: Chrome 130 ships "Not?A_Brand";v="99", Chrome 131
+  // ships "Not_A Brand";v="24". We hard-code the Chrome-130 values; if a
+  // caller overrides MARKFETCH_USER_AGENT to a different Chrome major, the
+  // decoy shape will be stale. That is acceptable because bot detectors
+  // don't fingerprint the decoy itself — only the real brand pair.
   const brands = `"Chromium";v="${major}", "Google Chrome";v="${major}", "Not?A_Brand";v="99"`;
   // Chrome's mobile UAs include a literal " Mobile " token; tablets/desktop omit it.
   const mobile = /\bMobile\b/.test(ua) ? "?1" : "?0";
@@ -111,9 +115,10 @@ TURNDOWN.use(gfm);
 //     the start of each text node, not start-of-line. After inline
 //     elements, the next text node often begins with `-suffix` / `=value`,
 //     and gets escaped even though it sits mid-line in the rendered
-//     markdown. CommonMark requires `- ` (dash + space) for an unordered
-//     list and `===` alone for a setext underline, so `\-X` / `\=X` where
-//     X is alphanumeric is never structurally meaningful.
+//     markdown. CommonMark setext underlines are `=` or `-` characters on
+//     a line by themselves; unordered-list markers require `-`/`+`/`*`
+//     followed by whitespace or end-of-line. `\-X` / `\=X` where X is
+//     alphanumeric cannot match either rule, so the escape is pure noise.
 //
 // Drop both. The negative lookbehind `(? {
     "Sec-Fetch-Dest": "document",
     "Sec-Fetch-Mode": "navigate",
     "Sec-Fetch-Site": "none",
+    // Always-on. Real browsers omit this header when there's no user
+    // activation; we model a user-initiated top-level navigation (typed
+    // URL or bookmark), which is what `Sec-Fetch-Site: "none"` above implies.
     "Sec-Fetch-User": "?1",
     "Sec-CH-UA": clientHints.brands,
     "Sec-CH-UA-Mobile": clientHints.mobile,
@@ -268,9 +276,9 @@ function enforceTooLarge(stage: string, actual: number): MarkfetchError {
 // rather than real `` elements. Decode those specific tag patterns so
 // turndown processes them as real elements and converts to backticks.
 // Pattern accepts ``, ``, ``, `
` etc., but
-// rejects ``, ``, `` — the trailing requirement
-// is whitespace, "/", or end-of-tag, so element names with extra characters
-// after `code`/`pre` are not matched.
+// rejects ``, ``, `` — the next char after
+// `code`/`pre` must be whitespace, `/`, or `&` (the start of `&gt;`), so
+// element names with extra characters are not matched.
 function decodeEncodedCodeTags(html: string): string {
   return html.replaceAll(
     /<(\/?(?:code|pre)(?:\s[^&]*?)?\/?)>/g,
@@ -390,7 +398,7 @@ function convertToMarkdown(article: {
 //
 // Errors are thrown uniformly as MarkfetchError. Adapters catch and translate:
 //   - mcp.ts catches → errorResult(code, message) → MCP {isError, content}
-//   - cli.ts catches → console.error("[code] message") → exit code 1
+//   - cli.ts catches → console.error("[code] message") → sets process.exitCode = 1
 //
 // The full set of error codes this can throw:
 //   network_error, http_error, timeout, unsupported_content_type,
diff --git a/src/index.ts b/src/index.ts
index 2215e77..f4a0345 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,7 +3,7 @@
 // Argv-discriminated dispatcher.
 //
 // `process.argv.length === 2` means the user provided zero arguments
-// (argv[0] is the node binary, argv[1] is this script path). That's the
+// (argv[0] is the path to node, argv[1] is this script path). That's the
 // shape every MCP client uses when spawning a server — so bare invocation
 // routes to the MCP adapter and preserves every existing client config.
 //
diff --git a/src/mcp.ts b/src/mcp.ts
index 62e0518..94f0d7c 100644
--- a/src/mcp.ts
+++ b/src/mcp.ts
@@ -27,7 +27,7 @@ server.registerTool(
   "fetch_markdown",
   {
     description:
-      "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. JavaScript-rendered SPAs and non-HTML responses return structured errors instead of partial content. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
+      "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. Non-HTML responses return `unsupported_content_type`. Pure client-rendered SPAs with no extractable static HTML return `extraction_failed`; SPAs that ship server-rendered or SEO-prerendered HTML will extract whatever static content they expose. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
     inputSchema: {
       url: z
         .string()
diff --git a/tests/cli.test.ts b/tests/cli.test.ts
index 271e600..5087374 100644
--- a/tests/cli.test.ts
+++ b/tests/cli.test.ts
@@ -21,7 +21,7 @@ const execFileAsync = promisify(execFile);
 // Tests that override `cwd` to a tmpdir still need to find the tsx CLI
 // and the source entry — passing relative paths would resolve against the
 // new cwd and produce a confusing ENOENT instead of the behavior under test.
-const TSX_BIN = resolvePath("./node_modules/.bin/tsx");
+const TSX_CLI = resolvePath("./node_modules/.bin/tsx");
 const ENTRY = resolvePath("src/index.ts");
 
 const HAPPY_FIXTURE = `
@@ -80,7 +80,7 @@ async function runCli(
 ): Promise {
   try {
     const { stdout, stderr } = await execFileAsync(
-      TSX_BIN,
+      TSX_CLI,
       [ENTRY, ...args],
       {
         env: { ...process.env, ...env } as Record,
diff --git a/tests/e2e.test.ts b/tests/e2e.test.ts
index e90f9f9..e637447 100644
--- a/tests/e2e.test.ts
+++ b/tests/e2e.test.ts
@@ -1,4 +1,4 @@
-// E2E tests against the COMPILED JS output (`node dist/index.js`), not the dev
+// E2E tests against the BUILT JS output (`node dist/index.js`), not the dev
 // source. server.test.ts already exercises the full surface via tsx; this file
 // verifies that `tsc` output is itself correct and runnable. If server.test.ts
 // passes but this file fails, the bug lives in the build pipeline, not the
@@ -22,14 +22,14 @@ const execFileAsync = promisify(execFile);
 
 // Resolved absolute paths so a test that overrides cwd still locates the
 // built JS entry. node is on PATH, so a bare command name is fine for it.
-const BUILT_BIN = resolvePath("dist/index.js");
+const BUILT_JS = resolvePath("dist/index.js");
 
 before(() => {
   // Always rebuild so e2e tests run against current source, not a stale dist/.
   execSync("npm run build", { stdio: "inherit" });
 });
 
-async function spawnCompiled(env: Record = {}) {
+async function spawnBuilt(env: Record = {}) {
   const transport = new StdioClientTransport({
     command: "node",
     args: ["dist/index.js"],
@@ -74,7 +74,7 @@ const HAPPY_FIXTURE = `
   

E2E Fixture Heading

-

This is a deterministic fixture for verifying the compiled output's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.

+

This is a deterministic fixture for verifying the built output's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.

Sub-section

Second paragraph adds more substance so the extracted markdown has multiple structural elements to assert against. Lorem ipsum dolor sit amet.

@@ -83,8 +83,8 @@ const HAPPY_FIXTURE = ` `; -test("e2e: compiled output boots, exposes fetch_markdown, pins version", async () => { - const client = await spawnCompiled(); +test("e2e: built output boots, exposes fetch_markdown, pins version", async () => { + const client = await spawnBuilt(); try { const info = client.getServerVersion(); assert.equal(info?.name, "markfetch"); @@ -97,12 +97,12 @@ test("e2e: compiled output boots, exposes fetch_markdown, pins version", async ( } }); -test("e2e: compiled output returns markdown for a mock fixture", async () => { +test("e2e: built output returns markdown for a mock fixture", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); }); - const client = await spawnCompiled(); + const client = await spawnBuilt(); try { const result = await client.callTool({ name: "fetch_markdown", @@ -121,8 +121,8 @@ test("e2e: compiled output returns markdown for a mock fixture", async () => { } }); -test("e2e: compiled output returns [network_error] for invalid host", async () => { - const client = await spawnCompiled(); +test("e2e: built output returns [network_error] for invalid host", async () => { + const client = await spawnBuilt(); try { const result = await client.callTool({ name: "fetch_markdown", @@ -135,17 +135,17 @@ test("e2e: compiled output returns [network_error] for invalid host", async () = } }); -// E1 — savePath against the compiled JS output. Pins the build pipeline against +// E1 — savePath against the built JS output. Pins the build pipeline against // the new code path. If T1 (server.test) passes but this fails, the bug is // in tsc/postbuild, not the runtime logic. 
-test("e2e: compiled output writes markdown to savePath, returns confirmation", async () => { +test("e2e: built output writes markdown to savePath, returns confirmation", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); }); const dir = await mkdtemp(join(tmpdir(), "mf-e2e-savepath-")); const savePath = join(dir, "out.md"); - const client = await spawnCompiled(); + const client = await spawnBuilt(); try { const result = await client.callTool({ name: "fetch_markdown", @@ -166,12 +166,12 @@ test("e2e: compiled output writes markdown to savePath, returns confirmation", a } }); -// CLI-mode e2e tests. These spawn the compiled JS output with arguments so the +// CLI-mode e2e tests. These spawn the built JS output with arguments so the // dispatcher in dist/index.js routes to dist/cli.js — exercising the lazy // import path that tsc must emit correctly. If the corresponding cli.test // passes but these fail, the bug is in the build pipeline, not runtime logic. 
-test("e2e: compiled output CLI prints markdown to stdout, exit 0", async () => { +test("e2e: built output CLI prints markdown to stdout, exit 0", async () => { const mock = await startMock((_req, res) => { res.writeHead(200, { "Content-Type": "text/html; charset=utf-8" }); res.end(HAPPY_FIXTURE); @@ -179,7 +179,7 @@ test("e2e: compiled output CLI prints markdown to stdout, exit 0", async () => { try { const { stdout, stderr } = await execFileAsync( "node", - [BUILT_BIN, mock.url], + [BUILT_JS, mock.url], { timeout: 10_000, maxBuffer: 5_000_000 }, ); assert.equal(stderr, "", "stderr must stay empty on happy path"); @@ -189,10 +189,10 @@ test("e2e: compiled output CLI prints markdown to stdout, exit 0", async () => { } }); -test("e2e: compiled output --version prints package version, exit 0", async () => { +test("e2e: built output --version prints package version, exit 0", async () => { const { stdout, stderr } = await execFileAsync( "node", - [BUILT_BIN, "--version"], + [BUILT_JS, "--version"], { timeout: 10_000 }, ); assert.equal(stderr, ""); diff --git a/tests/server.test.ts b/tests/server.test.ts index 4129fa6..0d174ef 100644 --- a/tests/server.test.ts +++ b/tests/server.test.ts @@ -433,7 +433,7 @@ test("Sec-CH-UA-* client hints are derived from MARKFETCH_USER_AGENT", async () } }); -test("per-request errors do not leak to stderr (Principle #4: stderr is fatal-only)", async () => { +test("per-request errors do not leak to stderr (stderr-is-fatal-only invariant per SPEC.md)", async () => { // Connect with stderr: "pipe" so we observe the server's stderr directly // while it handles a per-request failure. A network_error from an // unresolvable host is the cheapest reliable per-request failure. @@ -667,7 +667,7 @@ test("savePath: writeFile rejection surfaces as [save_failed] with errno; file i } }); -// T6 — THE Invariant. PRD §5: file at savePath is only ever the markdown. +// T6 — THE Invariant. 
The file at savePath is only ever the markdown of the URL (per README and SPEC.md). test("savePath INVARIANT: fetch error + savePath → file is NOT written", async () => { const mock = await startMock((_req, res) => { res.writeHead(404, { "Content-Type": "text/html" });