From c33cd28b1b4b03a97aceca2a932b0bf49eca6be3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 6 May 2026 13:22:05 +0000
Subject: [PATCH] Allow Googlebot to fetch public read-only APIs in robots.txt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The public server's robots.txt had a blanket Disallow: /api/, which
blocked Googlebot from fetching the JSON the public site hydrates from.
JS-rendering crawlers that can't load /api/* fall back to the SSR shell
and skip the second render pass, hurting indexing.

The public server only exposes GET-only, rate-limited endpoints with
sensitive fields filtered out by design, so emit an explicit Allow: rule
for each public read-only API path alongside the trailing Disallow: /api/.
Crawlers apply the longest matching rule, so any /api/ path that is not on
the allow-list stays blocked.
Extracted a shared buildRobotsTxt(req) helper so the dual-server and
PUBLIC_ONLY robots.txt handlers can't drift apart.
---
 CHANGELOG.md          |  5 ++++
 package-lock.json     |  4 +--
 package.json          |  2 +-
 src/server.js         | 49 ++++++++++++++++++++++++++--------
 tests/backend.test.js | 61 +++++++++++++++++++++++++++++++++++++++++++
 version.json          |  2 +-
 6 files changed, 108 insertions(+), 15 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dbba376..f371d05 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to CV Manager will be documented in this file.
 
 Format follows [Keep a Changelog](https://keepachangelog.com/), versioning follows [Semantic Versioning](https://semver.org/).
 
+## [1.49.5] - 2026-05-06
+
+### Fixed
+- **Googlebot reported `Blocked by robots.txt` for public read-only API endpoints** (e.g. `/api/datasets/id/:id`, `/api/settings/language`). The public site hydrates client-side from `/api/*` JSON, so a JS-rendering crawler that can't fetch those endpoints sees the SSR shell only and skips re-render — degrading indexing. The public server's `robots.txt` previously had a blanket `Disallow: /api/`, which blocked all of it. The public server only ever exposes a curated set of GET-only, rate-limited, sensitive-field-filtered endpoints, so it's safe to expose those to crawlers. `robots.txt` now emits explicit `Allow:` rules for each public read-only API path (`/api/profile`, `/api/sections`, `/api/settings`, `/api/experiences`, `/api/certifications`, `/api/education`, `/api/skills`, `/api/projects`, `/api/timeline`, `/api/custom-sections`, `/api/layout-types`, `/api/social-platforms`, `/api/cv`, `/api/datasets/slug/`, `/api/datasets/id/`) before the trailing `Disallow: /api/`, so the longest-prefix-wins rule keeps anything not on the allow-list blocked by default. Both `robots.txt` handlers (the dual-server and PUBLIC_ONLY paths) now share a single `buildRobotsTxt(req)` helper in `src/server.js` so they can't drift, with regression tests covering both the indexable and `noindex` branches.
+
 ## [1.49.4] - 2026-05-06
 
 ### Added
diff --git a/package-lock.json b/package-lock.json
index 88175a4..4470560 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "cv-manager",
-  "version": "1.49.4",
+  "version": "1.49.5",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "cv-manager",
-      "version": "1.49.4",
+      "version": "1.49.5",
       "dependencies": {
         "archiver": "^7.0.1",
         "better-sqlite3": "^9.4.3",
diff --git a/package.json b/package.json
index b7ded9d..57d9259 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "cv-manager",
-  "version": "1.49.4",
+  "version": "1.49.5",
   "description": "Professional CV Management System",
   "main": "src/server.js",
   "scripts": {
diff --git a/src/server.js b/src/server.js
index f130d81..f2f80df 100644
--- a/src/server.js
+++ b/src/server.js
@@ -307,6 +307,42 @@ function buildCanonicalTag(req) {
     return `    <link rel="canonical" href="${escapeHtmlServer(url)}">`;
 }
 
+// Public read-only API paths the public site fetches client-side; each entry
+// becomes an `Allow:` rule in buildRobotsTxt(). Listed explicitly (rather than
+// dropping `Disallow: /api/`) so JS-rendering crawlers like Googlebot can
+// hydrate the page, while any `/api/*` route not on this list stays blocked by
+// the trailing `Disallow: /api/` (most specific match wins per Google's rules).
+// Keep in sync with the `publicApp.get('/api/...')` routes below.
+const PUBLIC_API_ALLOW_PATHS = [
+    '/api/profile',
+    '/api/sections',
+    '/api/settings',
+    '/api/experiences',
+    '/api/certifications',
+    '/api/education',
+    '/api/skills',
+    '/api/projects',
+    '/api/timeline',
+    '/api/custom-sections',
+    '/api/layout-types',
+    '/api/social-platforms',
+    '/api/cv',
+    '/api/datasets/slug/',
+    '/api/datasets/id/'
+];
+
+function buildRobotsTxt(req) {
+    const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https';
+    const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost';
+    const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta');
+    const metaValue = robotsMeta?.value || 'index, follow';
+    if (metaValue.includes('noindex')) {
+        return `User-agent: *\nDisallow: /`;
+    }
+    const allows = PUBLIC_API_ALLOW_PATHS.map(p => `Allow: ${p}`).join('\n');
+    return `User-agent: *\nAllow: /\n${allows}\nDisallow: /api/\nSitemap: ${protocol}://${host}/sitemap.xml`;
+}
+
 // Pull the current live CV into the same shape as a saved-dataset blob so
 // the SSR helper has one input format to deal with.
 function gatherLiveCvData() {
@@ -1978,17 +2014,8 @@ if (PUBLIC_ONLY) {
     });
 
     publicApp.get('/robots.txt', (req, res) => {
-        const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https';
-        const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost';
-        const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta');
-        const metaValue = robotsMeta?.value || 'index, follow';
-        const isNoIndex = metaValue.includes('noindex');
         res.setHeader('Content-Type', 'text/plain');
-        if (isNoIndex) {
-            res.send(`User-agent: *\nDisallow: /`);
-        } else {
-            res.send(`User-agent: *\nAllow: /\nSitemap: ${protocol}://${host}/sitemap.xml\nDisallow: /api/`);
-        }
+        res.send(buildRobotsTxt(req));
     });
 
     publicApp.use('/shared', express.static(path.join(__dirname, '../public/shared')));
@@ -4400,7 +4427,7 @@ if (PUBLIC_ONLY) {
         next();
     });
     publicApp.get('/sitemap.xml', (req, res) => { const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https'; const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost'; res.setHeader('Content-Type', 'application/xml'); res.send(`<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>${protocol}://${host}/</loc><lastmod>${new Date().toISOString().split('T')[0]}</lastmod><changefreq>weekly</changefreq><priority>1.0</priority></url></urlset>`); });
-    publicApp.get('/robots.txt', (req, res) => { const protocol = req.headers['x-forwarded-proto'] || req.protocol || 'https'; const host = req.headers['x-forwarded-host'] || req.headers.host || 'localhost'; const robotsMeta = db.prepare('SELECT value FROM settings WHERE key = ?').get('robotsMeta'); const metaValue = robotsMeta?.value || 'index, follow'; const isNoIndex = metaValue.includes('noindex'); res.setHeader('Content-Type', 'text/plain'); if (isNoIndex) { res.send(`User-agent: *\nDisallow: /`); } else { res.send(`User-agent: *\nAllow: /\nSitemap: ${protocol}://${host}/sitemap.xml\nDisallow: /api/`); } });
+    publicApp.get('/robots.txt', (req, res) => { res.setHeader('Content-Type', 'text/plain'); res.send(buildRobotsTxt(req)); });
     publicApp.use('/shared', express.static(path.join(__dirname, '../public/shared')));
     // Favicon and icons (public uses icon-public.png with eye badge)
     const publicIconPathB = path.join(__dirname, '../icon-public.png');
diff --git a/tests/backend.test.js b/tests/backend.test.js
index c50c97a..cf4b04c 100644
--- a/tests/backend.test.js
+++ b/tests/backend.test.js
@@ -2634,6 +2634,67 @@ describe('Backend API', () => {
         });
     });
 
+    describe('robots.txt API allow-list', () => {
+        it('does not block public read-only API paths from JS-rendering crawlers', async () => {
+            // Make sure the indexable branch is exercised.
+            await fetch(`${BASE_URL}/api/settings/robotsMeta`, {
+                method: 'PUT',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ value: 'index, follow' }),
+            });
+
+            const res = await fetch(`${PUBLIC_URL}/robots.txt`);
+            assert.strictEqual(res.status, 200);
+            const text = await res.text();
+
+            // Sanity: the rule the public site relies on for hydration must not
+            // be a bare blanket block. The Disallow may still appear as the
+            // catch-all fallback, but explicit Allow rules for the read-only
+            // endpoints must precede it (longer-prefix Allow wins for Google).
+            const requiredAllows = [
+                '/api/profile',
+                '/api/sections',
+                '/api/settings',
+                '/api/experiences',
+                '/api/certifications',
+                '/api/education',
+                '/api/skills',
+                '/api/projects',
+                '/api/timeline',
+                '/api/custom-sections',
+                '/api/cv',
+                '/api/datasets/slug/',
+                '/api/datasets/id/',
+            ];
+            for (const path of requiredAllows) {
+                assert.ok(
+                    text.includes(`Allow: ${path}`),
+                    `robots.txt is missing Allow rule for ${path}; full body:\n${text}`,
+                );
+            }
+        });
+
+        it('still emits a single global Disallow when robotsMeta is noindex', async () => {
+            await fetch(`${BASE_URL}/api/settings/robotsMeta`, {
+                method: 'PUT',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ value: 'noindex, nofollow' }),
+            });
+
+            const res = await fetch(`${PUBLIC_URL}/robots.txt`);
+            assert.strictEqual(res.status, 200);
+            const text = await res.text();
+            assert.match(text, /^User-agent: \*\nDisallow: \/$/);
+
+            // Restore default so subsequent tests see the indexable branch.
+            await fetch(`${BASE_URL}/api/settings/robotsMeta`, {
+                method: 'PUT',
+                headers: { 'Content-Type': 'application/json' },
+                body: JSON.stringify({ value: 'index, follow' }),
+            });
+        });
+    });
+
     describe('Canonical link injection', () => {
         it('emits canonical from request host on public root', async () => {
             // Node's fetch reserves the Host header, so simulate the deployed-host
diff --git a/version.json b/version.json
index 4d635a4..f9350dc 100644
--- a/version.json
+++ b/version.json
@@ -1,4 +1,4 @@
 {
-  "version": "1.49.4",
+  "version": "1.49.5",
   "changelog": "https://github.com/vincentmakes/cv-manager/blob/main/CHANGELOG.md"
 }