From c33cd28b1b4b03a97aceca2a932b0bf49eca6be3 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 6 May 2026 13:22:05 +0000 Subject: [PATCH] Allow Googlebot to fetch public read-only APIs in robots.txt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The public server's robots.txt had a blanket Disallow: /api/, which blocked Googlebot from fetching the JSON the public site hydrates from. JS-rendering crawlers that can't load /api/* fall back to the SSR shell and skip the second render pass, hurting indexing. The public server only exposes GET-only, rate-limited, sensitive-field- filtered endpoints by design, so emit explicit Allow: rules for each public read-only API path before the trailing Disallow: /api/ — the longest-prefix-wins rule keeps anything not on the allow-list blocked. Extracted a shared buildRobotsTxt(req) helper so the dual-server and PUBLIC_ONLY robots.txt handlers can't drift apart. --- CHANGELOG.md | 5 ++++ package-lock.json | 4 +-- package.json | 2 +- src/server.js | 49 ++++++++++++++++++++++++++-------- tests/backend.test.js | 61 +++++++++++++++++++++++++++++++++++++++++++ version.json | 2 +- 6 files changed, 108 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dbba376..f371d05 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to CV Manager will be documented in this file. Format follows [Keep a Changelog](https://keepachangelog.com/), versioning follows [Semantic Versioning](https://semver.org/). +## [1.49.5] - 2026-05-06 + +### Fixed +- **Googlebot reported `Blocked by robots.txt` for public read-only API endpoints** (e.g. `/api/datasets/id/:id`, `/api/settings/language`). The public site hydrates client-side from `/api/*` JSON, so a JS-rendering crawler that can't fetch those endpoints sees the SSR shell only and skips re-render — degrading indexing. 
The public server's `robots.txt` previously had a blanket `Disallow: /api/`, which blocked all of it. The public server only ever exposes a curated set of GET-only, rate-limited, sensitive-field-filtered endpoints, so it's safe to expose those to crawlers. `robots.txt` now emits explicit `Allow:` rules for each public read-only API path (`/api/profile`, `/api/sections`, `/api/settings`, `/api/experiences`, `/api/certifications`, `/api/education`, `/api/skills`, `/api/projects`, `/api/timeline`, `/api/custom-sections`, `/api/layout-types`, `/api/social-platforms`, `/api/cv`, `/api/datasets/slug/`, `/api/datasets/id/`) before the trailing `Disallow: /api/`, so the longest-prefix-wins rule keeps anything not on the allow-list blocked by default. Both `robots.txt` handlers (the dual-server and PUBLIC_ONLY paths) now share a single `buildRobotsTxt(req)` helper in `src/server.js` so they can't drift, with regression tests covering both the indexable and `noindex` branches. + ## [1.49.4] - 2026-05-06 ### Added diff --git a/package-lock.json b/package-lock.json index 88175a4..4470560 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "cv-manager", - "version": "1.49.4", + "version": "1.49.5", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "cv-manager", - "version": "1.49.4", + "version": "1.49.5", "dependencies": { "archiver": "^7.0.1", "better-sqlite3": "^9.4.3", diff --git a/package.json b/package.json index b7ded9d..57d9259 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "cv-manager", - "version": "1.49.4", + "version": "1.49.5", "description": "Professional CV Management System", "main": "src/server.js", "scripts": { diff --git a/src/server.js b/src/server.js index f130d81..f2f80df 100644 --- a/src/server.js +++ b/src/server.js @@ -307,6 +307,42 @@ function buildCanonicalTag(req) { return ` `; } +// Public read-only API paths the public site fetches client-side. 
+// Listed here (rather than just dropping `Disallow: /api/`) so that
+// JS-rendering crawlers like Googlebot can hydrate the page while any future
+// `/api/*` route that isn't on this list stays blocked by the trailing
+// `Disallow: /api/` — most specific match wins per Google's robots.txt rules.
+// Entries are path prefixes, so `/api/settings` also covers
+// `/api/settings/language`. Keep in sync with the `publicApp.get('/api/...')`
+// routes below.
+const PUBLIC_API_ALLOW_PATHS = [
+    '/api/profile',
+    '/api/sections',
+    '/api/settings',
+    '/api/experiences',
+    '/api/certifications',
+    '/api/education',
+    '/api/skills',
+    '/api/projects',
+    '/api/timeline',
+    '/api/custom-sections',
+    '/api/layout-types',
+    '/api/social-platforms',
+    '/api/cv',
+    '/api/datasets/slug/',
+    '/api/datasets/id/'
+];
+
+/**
+ * Build the robots.txt body served by the public `/robots.txt` handlers.
+ *
+ * Returns a site-wide `Disallow: /` when the stored `robotsMeta` setting
+ * contains `noindex`; otherwise allows the site plus every entry of
+ * PUBLIC_API_ALLOW_PATHS, keeps `Disallow: /api/` as the catch-all and
+ * advertises the sitemap URL derived from the request.
+ *
+ * @param {object} req - Incoming request; `headers` and `protocol` are read.
+ * @returns {string} robots.txt content (no trailing newline).
+ */
+function buildRobotsTxt(req) {
+    // Chained proxies append to X-Forwarded-* ("https, http"); only the first
+    // entry belongs in the Sitemap URL, otherwise the line is malformed.
+    const firstValue = (header) => String(header ?? '').split(',')[0].trim();
+    const protocol = firstValue(req.headers['x-forwarded-proto']) || req.protocol || 'https';
+    const host = firstValue(req.headers['x-forwarded-host']) || req.headers.host || 'localhost';
+    const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta');
+    const metaValue = robotsMeta?.value || 'index, follow';
+    if (metaValue.includes('noindex')) {
+        return `User-agent: *\nDisallow: /`;
+    }
+    const allows = PUBLIC_API_ALLOW_PATHS.map(p => `Allow: ${p}`).join('\n');
+    return `User-agent: *\nAllow: /\n${allows}\nDisallow: /api/\nSitemap: ${protocol}://${host}/sitemap.xml`;
+}
+
 // Pull the current live CV into the same shape as a saved-dataset blob so
 // the SSR helper has one input format to deal with.
function gatherLiveCvData() { @@ -1978,17 +2014,8 @@ if (PUBLIC_ONLY) { }); publicApp.get('/robots.txt', (req, res) => { - const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https'; - const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost'; - const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta'); - const metaValue = robotsMeta?.value || 'index, follow'; - const isNoIndex = metaValue.includes('noindex'); res.setHeader('Content-Type', 'text/plain'); - if (isNoIndex) { - res.send(`User-agent: *\nDisallow: /`); - } else { - res.send(`User-agent: *\nAllow: /\nSitemap: ${protocol}://${host}/sitemap.xml\nDisallow: /api/`); - } + res.send(buildRobotsTxt(req)); }); publicApp.use('/shared', express.static(path.join(__dirname, '../public/shared'))); @@ -4400,7 +4427,7 @@ if (PUBLIC_ONLY) { next(); }); publicApp.get('/sitemap.xml', (req, res) => { const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https'; const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost'; res.setHeader('Content-Type', 'application/xml'); res.send(`${protocol}://${host}/${new Date().toISOString().split('T')[0]}weekly1.0`); }); - publicApp.get('/robots.txt', (req, res) => { const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https'; const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost'; const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta'); const metaValue = robotsMeta?.value || 'index, follow'; const isNoIndex = metaValue.includes('noindex'); res.setHeader('Content-Type', 'text/plain'); if (isNoIndex) { res.send(`User-agent: *\nDisallow: /`); } else { res.send(`User-agent: *\nAllow: /\nSitemap: ${protocol}://${host}/sitemap.xml\nDisallow: /api/`); } }); + publicApp.get('/robots.txt', (req, res) => { res.setHeader('Content-Type', 'text/plain'); res.send(buildRobotsTxt(req)); }); publicApp.use('/shared', 
express.static(path.join(__dirname, '../public/shared')));
     // Favicon and icons (public uses icon-public.png with eye badge)
     const publicIconPathB = path.join(__dirname, '../icon-public.png');
diff --git a/tests/backend.test.js b/tests/backend.test.js
index c50c97a..cf4b04c 100644
--- a/tests/backend.test.js
+++ b/tests/backend.test.js
@@ -2634,6 +2634,71 @@ describe('Backend API', () => {
         });
     });
 
+    describe('robots.txt API allow-list', () => {
+        // Persist the robotsMeta setting via the BASE_URL server's settings endpoint.
+        const setRobotsMeta = (value) =>
+            fetch(`${BASE_URL}/api/settings/robotsMeta`, {
+                method: 'PUT',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ value }),
+            });
+
+        it('does not block public read-only API paths from JS-rendering crawlers', async () => {
+            // Make sure the indexable branch is exercised.
+            await setRobotsMeta('index, follow');
+
+            const res = await fetch(`${PUBLIC_URL}/robots.txt`);
+            assert.strictEqual(res.status, 200);
+            const text = await res.text();
+
+            // The catch-all Disallow must remain as the default-deny fallback,
+            // and every public read-only endpoint needs its own Allow rule
+            // (for Google the longest matching path wins, regardless of order).
+            // Mirrors PUBLIC_API_ALLOW_PATHS in src/server.js.
+            assert.ok(
+                text.includes('Disallow: /api/'),
+                `robots.txt lost its /api/ catch-all; full body:\n${text}`,
+            );
+            const requiredAllows = [
+                '/api/profile',
+                '/api/sections',
+                '/api/settings',
+                '/api/experiences',
+                '/api/certifications',
+                '/api/education',
+                '/api/skills',
+                '/api/projects',
+                '/api/timeline',
+                '/api/custom-sections',
+                '/api/layout-types',
+                '/api/social-platforms',
+                '/api/cv',
+                '/api/datasets/slug/',
+                '/api/datasets/id/',
+            ];
+            for (const apiPath of requiredAllows) {
+                assert.ok(
+                    text.includes(`Allow: ${apiPath}`),
+                    `robots.txt is missing Allow rule for ${apiPath}; full body:\n${text}`,
+                );
+            }
+        });
+
+        it('still emits a single global Disallow when robotsMeta is noindex', async () => {
+            await setRobotsMeta('noindex, nofollow');
+            try {
+                const res = await fetch(`${PUBLIC_URL}/robots.txt`);
+                assert.strictEqual(res.status, 200);
+                const text = await res.text();
+                assert.match(text, /^User-agent: \*\nDisallow: \/$/);
+            } finally {
+                // Restore the default even when an assertion fails so later
+                // tests still see the indexable branch.
+                await setRobotsMeta('index, follow');
+            }
+        });
+    });
+
     describe('Canonical link injection', () => {
         it('emits canonical from request host on public root', async () => {
             // Node's fetch reserves the Host header, so simulate the deployed-host
diff --git a/version.json b/version.json
index 4d635a4..f9350dc 100644
--- a/version.json
+++ b/version.json
@@ -1,4 +1,4 @@
 {
-  "version": "1.49.4",
+  "version": "1.49.5",
   "changelog": "https://github.com/vincentmakes/cv-manager/blob/main/CHANGELOG.md"
 }