diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 08f8723..db27a29 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -58,8 +58,9 @@ jobs:
TAG_NAME: ${{ github.ref_name }}
run: |
VERSION=${TAG_NAME#v}
- # Print everything between "## [VERSION]" and the next "## [" header.
- # The trailing blank lines are trimmed by awk's NR/sub trick below.
+ # Prints lines after the matched "## [VERSION]" header until the
+ # next "## [" header (or EOF). The matched header line itself is
+ # skipped via `next`.
CHANGELOG_BODY=$(awk -v ver="$VERSION" '
/^## \[/ {
if (found) exit
diff --git a/.npmignore b/.npmignore
index 8230ec0..3b8569f 100644
--- a/.npmignore
+++ b/.npmignore
@@ -1,4 +1,4 @@
-# Source files (compiled to dist/, which is in `files`)
+# Source, tests, and TS build config (not needed at runtime; dist/ ships instead)
src/
tests/
*.ts
@@ -12,7 +12,7 @@ tsconfig.json
docs/
scripts/
-# Project files (rendered into the GitHub release body, not the npm package)
+# Repo-only docs (CHANGELOG.md is rendered into the GitHub release body; CLAUDE.md is agent instructions). Excluded from the npm tarball.
CHANGELOG.md
CLAUDE.md
diff --git a/README.md b/README.md
index f739e58..6e5c7fe 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# markfetch
-**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — at a real Chrome's request rate, not curl's.**
+**Reader View for AI agents and your shell. Fetch any URL, get back clean markdown — with a real Chrome's request fingerprint, not curl's.**
[](https://www.npmjs.com/package/markfetch)
[](https://github.com/vasylenko/markfetch/actions/workflows/ci.yml)
@@ -32,7 +32,7 @@ markfetch https://en.wikipedia.org/wiki/Markdown
}
```
-That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same binary from a shell.
+That snippet is the whole MCP setup — or jump to [CLI usage](#cli-usage) to drive the same command from a shell.
## MCP install commands
@@ -65,15 +65,15 @@ gemini mcp add -s user markfetch npx -y markfetch
- **Reader-View-quality extraction.** [linkedom](https://github.com/WebReflection/linkedom) → [@mozilla/readability](https://github.com/mozilla/readability) → [turndown](https://github.com/mixmark-io/turndown) with GFM tables, strikethrough, and task lists. Code fences preserve `language-X` hints. Sphinx-style bare `<pre>` blocks render as code, not escaped prose. Intraword underscores stay un-escaped — no more `list\_tools`.
-- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Modern MCP clients hide `content[]` when `structuredContent` is present — `markfetch` deliberately stays on the channel your LLM can actually read.
+- **One tool, one shape (MCP).** `fetch_markdown(url, savePath?)` returns markdown in `content[0].text`. No `structuredContent`, no frontmatter, no metadata fields. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — `markfetch` deliberately stays on the channel your LLM can actually read.
- **`savePath` / `-o` escape valve.** Pass an absolute path (MCP `savePath`) or `-o <path>` (CLI) and the markdown lands on disk instead of the response channel. Use it when your client's inline tool-result cap would truncate large responses, or to redirect output from a shell pipeline. The file is only ever the markdown of the URL — fetch errors return a `[code]` string and never touch the disk.
- **Whole document or honest failure.** No pagination, no truncation. If the document doesn't fit in `MARKFETCH_MAX_BYTES`, you get `too_large` — never a half-truth.
-- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes that could corrupt protocol framing.
+- **Stdio-clean.** Stdout is reserved for MCP frames. Stderr is fatal-only. No log spam, no ANSI escapes — keeping stderr parseable for shell consumers.
-- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. Single TypeScript executable on Node 24+ — one process whether you invoke it as an MCP server or from the shell.
+- **Pure Node, no subprocesses.** No Playwright, no headless Chromium, no Python hop. A single Node process, whether you invoke it as an MCP server or from the shell.
## CLI usage
@@ -112,7 +112,7 @@ Errors go to stderr with the same `[code] message` shape the MCP tool returns (s
- **Not a crawler.** No recursion, no `robots.txt` parsing, no rate-limit orchestration. One URL in, one document out.
- **Not authenticated.** Anonymous fetch only — no cookie jar, no auth headers, no session reuse. Pages behind login walls return whatever the public response is, usually surfaced as `http_error`.
-- **Not a JS renderer.** Single-page apps that paint their content client-side return `extraction_failed`. Use this on server-rendered pages.
+- **Not a JS renderer.** Pure client-rendered SPAs with no static content return `extraction_failed`. SPAs with server-rendered or SEO-prerendered HTML will extract whatever static content they ship.
## Configuration
@@ -142,7 +142,7 @@ Pass overrides via the `env` block of your MCP client config:
Requires Node.js ≥ 24.
-When iterating on CLI changes, `tsx src/index.ts ` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the compiled binary — no rebuild needed between edits.
+When iterating on CLI changes, `tsx src/index.ts <url>` and `tsx src/index.ts --help` route through the same argv-discriminated dispatcher as the built `dist/index.js` — no rebuild needed between edits.
To point an MCP client at a local source build, swap `npx` for `node` + an absolute path to `dist/index.js`:
diff --git a/docs/SPEC.md b/docs/SPEC.md
index 4c9ee40..3429baf 100644
--- a/docs/SPEC.md
+++ b/docs/SPEC.md
@@ -21,11 +21,11 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `
- **Lazy adapter imports.** The dispatcher uses `await import()` to load exactly one adapter. The only `console.log` in the project lives in `cli.ts`; under MCP, `cli.ts` never loads, so stdout-discipline is enforced by the module graph — not by linter or convention.
-- **Core throws, adapters translate.** All 7 error codes throw from `core.ts`; `classifyError` normalizes underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change.
+- **Core throws, adapters translate.** All 7 error codes surface from `core.ts` — five are thrown explicitly as `MarkfetchError`; `network_error`, `timeout`, and (sometimes) `http_error` are translated by `classifyError` from underlying-API errors (undici TypeErrors, AbortSignal timeouts). New codes need an `ErrorCode` union member + a throw site; adapters don't change.
- **HTTP/2 + coherent Chrome fingerprint.** Wire protocol, headers, and UA must agree — a Chrome UA over HTTP/1.1 or without `Sec-CH-UA-*` is *more* suspicious than curl. `Sec-CH-UA-*` is derived from `MARKFETCH_USER_AGENT` at startup so override-coherence is mechanical.
-- **Single-channel MCP response.** `content[0].text` only. Modern MCP clients hide `content[]` when `structuredContent` is present, which would route the response away from the LLM that called the tool.
+- **Single-channel MCP response.** `content[0].text` only. Several major MCP clients (Claude Code CLI, VS Code/Copilot) forward only `structuredContent` to the model and drop `content[]` when both are present — a single-channel response keeps the markdown reachable from those clients.
- **Whole document or `too_large`.** No pagination. Partial content lets the agent reason over truncated bodies without knowing they're truncated. `savePath` / `-o` is the escape valve for genuinely large documents.
@@ -36,7 +36,7 @@ Errors throw `MarkfetchError` uniformly from core; adapters catch once. Codes: `
## Ideas for future
- **Authentication.** `MARKFETCH_AUTH_HEADER` env var (simple), or Chrome-cookie import for sites where the user is already logged in (frictionless, platform-specific, security-sensitive). Trigger: first useful internal / paywalled doc.
-- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean binary stays lean. Trigger: enough useful sites returning `extraction_failed`.
+- **JS rendering fallback for SPAs.** Playwright / headless Chrome as a companion package (`markfetch-heavy`) so the lean package stays lean. Trigger: enough useful sites returning `extraction_failed`.
- **CloudFlare `/markdown` fallback.** Gated by `CF_AUTH_TOKEN`; fall back when Readability fails. Trigger: extraction failure rate stays high after Readability tuning.
- **Cookie reuse across redirects within a single fetch.** Currently none. Trigger: a target serves content only after a session-cookie redirect.
- **Proxy support** (`MARKFETCH_PROXY_URL`) and **`Accept-Language` control** (`MARKFETCH_ACCEPT_LANGUAGE`). Trigger: corporate proxy / locale-specific content.
diff --git a/package.json b/package.json
index f81c12c..445f75b 100644
--- a/package.json
+++ b/package.json
@@ -1,7 +1,7 @@
{
"name": "markfetch",
"version": "0.5.0",
- "description": "MCP server: fetch a URL, return clean markdown. Built for AI agents.",
+ "description": "Fetch a URL, return clean markdown. MCP server and CLI for AI agents.",
"license": "MIT",
"author": {
"name": "Serhii Vasylenko",
diff --git a/scripts/postbuild.mjs b/scripts/postbuild.mjs
index 8fd8b9d..ada56b7 100644
--- a/scripts/postbuild.mjs
+++ b/scripts/postbuild.mjs
@@ -1,6 +1,7 @@
-// Sets execute bit on dist/index.js so the npm `bin` entry resolves correctly
-// when invoked via `npx markfetch` or as a direct script. tsc preserves the
-// shebang but doesn't chmod its outputs.
+// Sets execute bit on dist/index.js so the shebang-based launch works —
+// both when npm links the `bin` entry (npm/npx exec the linked target)
+// and when running ./dist/index.js directly. tsc preserves the shebang
+// but doesn't chmod its outputs.
import { chmodSync } from "node:fs";
chmodSync("dist/index.js", 0o755);
diff --git a/src/core.ts b/src/core.ts
index eff4f29..f6122bc 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -42,10 +42,10 @@ const config = {
userAgent: process.env.MARKFETCH_USER_AGENT || DEFAULT_USER_AGENT,
};
-// Derive Sec-CH-UA-* client hints from the User-Agent. PRD §4 calls out that a
-// Chrome UA paired with mismatched (or absent) client hints is a stronger bot
-// signal than a curl UA — the two MUST agree. Deriving from a single source
-// makes that invariant mechanical: override the UA, the hints follow.
+// Derive Sec-CH-UA-* client hints from the User-Agent. A Chrome UA paired
+// with mismatched (or absent) client hints is a stronger bot signal than a
+// curl UA — the two MUST agree. Deriving from a single source makes that
+// invariant mechanical: override the UA, the hints follow.
function deriveClientHints(ua: string): {
brands: string;
mobile: string;
@@ -58,8 +58,12 @@ function deriveClientHints(ua: string): {
);
}
const major = versionMatch[1];
- // The "Not?A_Brand" decoy rotates per Chrome major (130 ships v="99"). Servers
- // don't fingerprint the decoy version, so pinning v="99" is acceptable.
+ // Chrome's GREASE rotation changes BOTH the decoy brand token AND its
+ // version per major: Chrome 130 ships "Not?A_Brand";v="99", Chrome 131
+ // ships "Not_A Brand";v="24". We hard-code the Chrome-130 values; if a
+ // caller overrides MARKFETCH_USER_AGENT to a different Chrome major, the
+ // decoy shape will be stale. That is acceptable because bot detectors
+ // don't fingerprint the decoy itself — only the real brand pair.
const brands = `"Chromium";v="${major}", "Google Chrome";v="${major}", "Not?A_Brand";v="99"`;
// Chrome's mobile UAs include a literal " Mobile " token; tablets/desktop omit it.
const mobile = /\bMobile\b/.test(ua) ? "?1" : "?0";
@@ -83,11 +87,11 @@ function deriveClientHints(ua: string): {
const clientHints = deriveClientHints(config.userAgent);
// Enable HTTP/2 via TLS ALPN. Modern bot-detection systems and CDNs consider
-// wire protocol alongside header fingerprint; HTTP/2 paired with a Chrome
-// header set is internally consistent, HTTP/1.1 + Chrome headers is not.
-// Servers that don't advertise h2 in ALPN fall back to HTTP/1.1 transparently
-// during the TLS handshake — no manual retry needed. Plain-HTTP connections
-// (port 80) skip ALPN entirely and use HTTP/1.1.
+// wire protocol alongside header fingerprint; HTTP/2 over TLS pairs cleanly
+// with a Chrome header set. Servers that don't advertise h2 in ALPN fall back
+// to HTTP/1.1 transparently during the TLS handshake — no manual retry needed.
+// Plain-HTTP connections (port 80) skip ALPN entirely and use HTTP/1.1,
+// accepting the protocol/fingerprint mismatch in that case.
setGlobalDispatcher(new Agent({ allowH2: true }));
const TURNDOWN = new TurndownService({
@@ -111,9 +115,10 @@ TURNDOWN.use(gfm);
// the start of each text node, not start-of-line. After inline
// elements, the next text node often begins with `-suffix` / `=value`,
// and gets escaped even though it sits mid-line in the rendered
-// markdown. CommonMark requires `- ` (dash + space) for an unordered
-// list and `===` alone for a setext underline, so `\-X` / `\=X` where
-// X is alphanumeric is never structurally meaningful.
+// markdown. CommonMark setext underlines are `=` or `-` characters on
+// a line by themselves; unordered-list markers require `-`/`+`/`*`
+// followed by whitespace or end-of-line. `\-X` / `\=X` where X is
+// alphanumeric cannot match either rule, so the escape is pure noise.
//
// Drop both. The negative lookbehind `(? {
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "none",
+ // Always-on. Real browsers omit this header when there's no user
+ // activation; we model a user-initiated top-level navigation (typed
+ // URL or bookmark), consistent with `Sec-Fetch-Site: "none"` above.
"Sec-Fetch-User": "?1",
"Sec-CH-UA": clientHints.brands,
"Sec-CH-UA-Mobile": clientHints.mobile,
@@ -267,9 +276,9 @@ function enforceTooLarge(stage: string, actual: number): MarkfetchError {
// rather than real `` elements. Decode those specific tag patterns so
// turndown processes them as real elements and converts to backticks.
// Pattern accepts ``, ``, ``, `` etc., but
-// rejects ``, ``, `` — the trailing requirement
-// is whitespace, "/", or end-of-tag, so element names with extra characters
-// after `code`/`pre` are not matched.
+// rejects ``, ``, `` — the next char after
+// `code`/`pre` must be whitespace, `/`, or `&` (the start of `&gt;`), so
+// element names with extra characters are not matched.
function decodeEncodedCodeTags(html: string): string {
return html.replaceAll(
/<(\/?(?:code|pre)(?:\s[^&]*?)?\/?)>/g,
@@ -389,7 +398,7 @@ function convertToMarkdown(article: {
//
// Errors are thrown uniformly as MarkfetchError. Adapters catch and translate:
// - mcp.ts catches → errorResult(code, message) → MCP {isError, content}
-// - cli.ts catches → console.error("[code] message") → exit code 1
+// - cli.ts catches → console.error("[code] message") → sets process.exitCode = 1
//
// The full set of error codes this can throw:
// network_error, http_error, timeout, unsupported_content_type,
diff --git a/src/index.ts b/src/index.ts
index 2215e77..f4a0345 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,7 +3,7 @@
// Argv-discriminated dispatcher.
//
// `process.argv.length === 2` means the user provided zero arguments
-// (argv[0] is the node binary, argv[1] is this script path). That's the
+// (argv[0] is the path to node, argv[1] is this script path). That's the
// shape every MCP client uses when spawning a server — so bare invocation
// routes to the MCP adapter and preserves every existing client config.
//
diff --git a/src/mcp.ts b/src/mcp.ts
index 62e0518..94f0d7c 100644
--- a/src/mcp.ts
+++ b/src/mcp.ts
@@ -27,7 +27,7 @@ server.registerTool(
"fetch_markdown",
{
description:
- "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. JavaScript-rendered SPAs and non-HTML responses return structured errors instead of partial content. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
+ "Fetch a single public HTTP/S URL and return its main article content as clean markdown. Best for articles, documentation, blog posts, news, and reference pages. Non-HTML responses return `unsupported_content_type`. Pure client-rendered SPAs with no extractable static HTML return `extraction_failed`; SPAs that ship server-rendered or SEO-prerendered HTML will extract whatever static content they expose. Also supports saving the markdown to a file, e.g., to bypass client tool-result size limits or to reuse later.",
inputSchema: {
url: z
.string()
diff --git a/tests/cli.test.ts b/tests/cli.test.ts
index bbac5a0..5087374 100644
--- a/tests/cli.test.ts
+++ b/tests/cli.test.ts
@@ -18,10 +18,10 @@ import { join, resolve as resolvePath } from "node:path";
const execFileAsync = promisify(execFile);
// Resolved at module load against the test runner's cwd (the project root).
-// Tests that override `cwd` to a tmpdir still need to find the tsx binary
+// Tests that override `cwd` to a tmpdir still need to find the tsx CLI
// and the source entry — passing relative paths would resolve against the
// new cwd and produce a confusing ENOENT instead of the behavior under test.
-const TSX_BIN = resolvePath("./node_modules/.bin/tsx");
+const TSX_CLI = resolvePath("./node_modules/.bin/tsx");
const ENTRY = resolvePath("src/index.ts");
const HAPPY_FIXTURE = `
@@ -80,7 +80,7 @@ async function runCli(
): Promise {
try {
const { stdout, stderr } = await execFileAsync(
- TSX_BIN,
+ TSX_CLI,
[ENTRY, ...args],
{
env: { ...process.env, ...env } as Record,
diff --git a/tests/e2e.test.ts b/tests/e2e.test.ts
index 5f56230..e637447 100644
--- a/tests/e2e.test.ts
+++ b/tests/e2e.test.ts
@@ -1,4 +1,4 @@
-// E2E tests against the COMPILED binary (`node dist/index.js`), not the dev
+// E2E tests against the BUILT JS output (`node dist/index.js`), not the dev
// source. server.test.ts already exercises the full surface via tsx; this file
// verifies that `tsc` output is itself correct and runnable. If server.test.ts
// passes but this file fails, the bug lives in the build pipeline, not the
@@ -21,15 +21,15 @@ import { join, resolve as resolvePath } from "node:path";
const execFileAsync = promisify(execFile);
// Resolved absolute paths so a test that overrides cwd still locates the
-// built binary. node is on PATH, so a bare command name is fine for it.
-const BUILT_BIN = resolvePath("dist/index.js");
+// built JS entry. node is on PATH, so a bare command name is fine for it.
+const BUILT_JS = resolvePath("dist/index.js");
before(() => {
// Always rebuild so e2e tests run against current source, not a stale dist/.
execSync("npm run build", { stdio: "inherit" });
});
-async function spawnCompiled(env: Record = {}) {
+async function spawnBuilt(env: Record = {}) {
const transport = new StdioClientTransport({
command: "node",
args: ["dist/index.js"],
@@ -74,7 +74,7 @@ const HAPPY_FIXTURE = `
E2E Fixture Heading
- This is a deterministic fixture for verifying the compiled binary's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.
+ This is a deterministic fixture for verifying the built output's full pipeline. The article contains enough prose to pass Readability scoring without depending on any external network resource.
Sub-section
Second paragraph adds more substance so the extracted markdown has multiple structural elements to assert against. Lorem ipsum dolor sit amet.
@@ -83,8 +83,8 @@ const HAPPY_FIXTURE = `