From b2ff9fdb7a0c690f92c69370de34049dca6de2ae Mon Sep 17 00:00:00 2001 From: Ronald Tse Date: Thu, 18 Jun 2026 10:36:54 +0800 Subject: [PATCH] fix(docs): regenerate sitemap post-merge so it lists all 4,283 formula pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #266 added VitePress's built-in sitemap config, but on the live site the deployed sitemap only listed ~10 URLs (browse index, guide pages, licenses). The 4,283 formula pages were missing. Root cause: VitePress generates its sitemap per-batch during vitepress build. The combine job in docs.yml deploys dist-main's sitemap, which only lists the main site. Browse pages merged in from build-batch matrix jobs don't update the sitemap. Fix: docs/generate-sitemap.js — a post-build script that walks the final dist/ structure (after the multi-batch merge AND the clean-URLs post-process), finds every index.html, and emits a complete sitemap.xml. Excludes 404. Stable-sorted for reproducible builds. Wired in two places: (1) docs/build.js — runs after rewriteCleanUrls, so local builds and link_checker get the complete sitemap; (2) docs.yml combine job — runs after Merge batch browse pages and assets AND after Rewrite clean URLs, just before upload-pages-artifact. Verified locally: with 3 formula pages in dist, generated sitemap lists 42 URLs (was 10 before). In production with all 4,283 formulas, it will list ~4,300 URLs. 
--- .github/workflows/docs.yml | 9 ++++- docs/build.js | 4 ++ docs/generate-sitemap.js | 81 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 docs/generate-sitemap.js diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index f2a08cf9..6087aa7d 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -287,9 +287,16 @@ jobs: mkdir -p "$dir" mv "$f" "$dir/index.html" done - echo "Rewrote $(find dist -type d -name index.html -exec dirname {} \; | wc -l) pages" shell: bash + - name: Regenerate sitemap + # VitePress sitemap is generated per-batch and only lists that batch's + # pages. The combined dist has 4,283 browse pages from the matrix build + # that the deployed sitemap would otherwise miss. Scan the final dist/ + # and emit a complete sitemap.xml. + working-directory: docs + run: node generate-sitemap.js ../dist + - name: Upload combined artifact uses: actions/upload-pages-artifact@v4 with: diff --git a/docs/build.js b/docs/build.js index 63ab2720..24c54aa0 100644 --- a/docs/build.js +++ b/docs/build.js @@ -186,6 +186,10 @@ async function main() { const rewritten = await rewriteCleanUrls(DIST_DIR); console.log(`Rewrote ${rewritten} pages`); + console.log("=== Regenerating sitemap ==="); + const sitemapResult = spawnSync("node", ["generate-sitemap.js", DIST_DIR], { stdio: "inherit" }); + if (sitemapResult.status !== 0) process.exit(sitemapResult.status ?? 1); + console.log(`\n=== Build complete: ${await countHtmlFiles(DIST_DIR)} total HTML pages ===`); } diff --git a/docs/generate-sitemap.js b/docs/generate-sitemap.js new file mode 100644 index 00000000..7d6a5f2c --- /dev/null +++ b/docs/generate-sitemap.js @@ -0,0 +1,81 @@ +#!/usr/bin/env node +// Regenerate dist/sitemap.xml from the final dist/ structure. 
//
// VitePress's built-in sitemap config only sees the pages from its own build
// — in the multi-batch CI pipeline (docs.yml build-main + build-batch matrix)
// each batch's sitemap lists only that batch's pages. The combine job deploys
// dist-main's sitemap, which is missing all 4,283 formula browse pages that
// were merged in from the batches.
//
// Run this AFTER all browse pages have been merged AND the clean-URLs post-
// process has converted foo.html -> foo/index.html. The script scans dist/
// for index.html files and emits one entry per page.
//
// Usage: node generate-sitemap.js [distDir]
//
// Environment:
//   SITE_URL   origin prepended to every URL (default http://localhost:5173)
//   BASE_PATH  path prefix the site is served under (default /formulas/)

import { readdir, stat, writeFile } from "node:fs/promises";
import { join, relative, sep } from "node:path";
import { fileURLToPath } from "node:url";

const __dirname = fileURLToPath(new URL(".", import.meta.url));
const DIST = process.argv[2] || join(__dirname, "..", ".vitepress", "dist");

// `||` rather than `??` so that an empty environment variable also falls back
// to the default. Trailing slashes are dropped from the origin so that
// ORIGIN + path never contains "//" after the host.
const ORIGIN = (process.env.SITE_URL || "http://localhost:5173").replace(/\/+$/, "");

// Always exactly one leading and one trailing slash, whatever form was given
// ("formulas", "/formulas", "/formulas/" and "/" all normalise correctly).
const BASE_PATH = `/${process.env.BASE_PATH || "/formulas/"}/`.replace(/\/{2,}/g, "/");

// Namespace required on the root <urlset> element by the sitemap protocol.
const SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9";

// 404 etc. shouldn't appear in the sitemap. Entries are directory paths
// relative to DIST, using "/" as the separator.
const EXCLUDE_PATHS = new Set(["404"]);

// The five characters that must be entity-escaped inside sitemap values.
const XML_ENTITIES = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  "'": "&apos;",
  '"': "&quot;",
};

/**
 * Recursively collect every file named `index.html` below `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Paths of the index.html files found, each
 *   built by joining `dir` with the entry names on the way down.
 */
async function findIndexFiles(dir) {
  const found = [];
  const entries = await readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const full = join(dir, entry.name);
    if (entry.isDirectory()) {
      // Push one by one instead of spreading, so a very large subtree cannot
      // exceed the engine's argument-count limit.
      for (const nested of await findIndexFiles(full)) {
        found.push(nested);
      }
    } else if (entry.name === "index.html") {
      found.push(full);
    }
  }
  return found;
}

/**
 * Escape the characters that are not allowed verbatim in XML text.
 *
 * @param {string} s - Raw text.
 * @returns {string} `s` with <, >, &, ' and " replaced by entity references.
 */
function escapeXml(s) {
  return s.replace(/[<>&'"]/g, (c) => XML_ENTITIES[c]);
}

/**
 * Scan DIST for pages and write DIST/sitemap.xml listing all of them.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const files = await findIndexFiles(DIST);
  const pages = [];
  for (const file of files) {
    // Directory containing the index.html, relative to DIST, with "/"
    // separators on every platform. The root page yields "".
    const relPath = relative(DIST, file).split(sep).slice(0, -1).join("/");
    if (EXCLUDE_PATHS.has(relPath)) continue;
    // Collapse the "//" produced when relPath is empty (the root page).
    const path = `${BASE_PATH}${relPath}/`.replace(/\/{2,}/g, "/");
    let lastmod;
    try {
      lastmod = (await stat(file)).mtime.toISOString();
    } catch (err) {
      // Best effort: a page without a readable mtime is still listed, but the
      // fallback is reported because it makes the output non-reproducible.
      console.warn(`Could not stat ${file} (${err.message}); using current time as lastmod`);
      lastmod = new Date().toISOString();
    }
    pages.push({ loc: `${ORIGIN}${path}`, lastmod });
  }
  // Stable sort by URL for reproducible builds (plain code-unit order, so the
  // result does not depend on the machine's locale).
  pages.sort((a, b) => (a.loc < b.loc ? -1 : a.loc > b.loc ? 1 : 0));
  const entries = pages.map(
    ({ loc, lastmod }) =>
      `  <url><loc>${escapeXml(loc)}</loc><lastmod>${lastmod}</lastmod></url>`
  );
  const xml =
    `<?xml version="1.0" encoding="UTF-8"?>\n` +
    `<urlset xmlns="${SITEMAP_NS}">\n` +
    `${entries.join("\n")}\n` +
    `</urlset>\n`;
  await writeFile(join(DIST, "sitemap.xml"), xml);
  console.log(`Generated sitemap.xml with ${entries.length} URLs`);
}

main().catch((e) => {
  console.error(e);
  process.exit(1);
});