From 72d00bf76b3749dcb43244710e7573d2d04b92f1 Mon Sep 17 00:00:00 2001 From: vyctorbrzezowski Date: Sat, 6 Jun 2026 15:32:41 -0300 Subject: [PATCH 1/3] fix: tolerate docs corpus outages --- README.md | 2 +- scripts/smoke.ts | 148 +++++++++++++++++++++++++++++++++++++++++++ src/retrieval.ts | 159 +++++++++++++++++++++++++++++++++++++++++++---- src/types.ts | 1 + wrangler.toml | 1 + 5 files changed, 298 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index bf80bdb..15f78b7 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ The Worker expects `OPENAI_API_KEY` as a Cloudflare Worker secret. The default m Docs chat auth is brokered through ClawHub: - `CLAWHUB_AUTH_URL` sends users to `https://hub.openclaw.ai/docs/auth`. -- Corpus and workspace URLs default to `https://docs.openclaw.ai`; the legacy +- Docs index, corpus, and workspace URLs default to `https://docs.openclaw.ai`; the legacy documentation host remains routed only for compatibility. - `CLAWHUB_SESSION_VERIFY_URL` verifies the ClawHub Convex Auth token once. - `ASK_MOLTY_AUTH_SECRET` signs the docs-only session cookie; set it in production so OpenAI key rotation does not invalidate sessions. diff --git a/scripts/smoke.ts b/scripts/smoke.ts index c7ac0e0..9d159e4 100644 --- a/scripts/smoke.ts +++ b/scripts/smoke.ts @@ -1,6 +1,10 @@ #!/usr/bin/env node +/// + import fs from "node:fs"; import path from "node:path"; +import { buildWorkspace } from "../src/retrieval"; +import type { Env } from "../src/types"; const root = process.cwd(); const outDir = path.join(root, "dist", "test"); @@ -29,4 +33,148 @@ const github = fs.readFileSync(path.join(outDir, "github-search.jsonl"), "utf8") if (!github.includes("github.com/openclaw/openclaw")) throw new Error("github index missing OpenClaw links"); +await smokeRuntimeRetrieval(); + console.log(`ask-molty smoke ok: ${fileCount} workspace files`); + +async function smokeRuntimeRetrieval(): Promise { + const docsIndexUrl = "https://example.test/docs-search.json"; + const docsCorpusUrl = "https://example.test/llms-full.txt"; + const sourceIndexUrl = "https://example.test/source-index.jsonl"; + const githubIndexUrl = "https://example.test/github-search.jsonl"; + const emptyIndex = " ".repeat(1000); + const env: Env = { + OPENAI_API_KEY: "test", + DOCS_INDEX_URL: docsIndexUrl, + DOCS_CORPUS_URL: docsCorpusUrl, + SOURCE_INDEX_URL: sourceIndexUrl, + GITHUB_INDEX_URL: githubIndexUrl, + }; + + const docsIndex = JSON.stringify({ + version: 1, + entries: [ + { + title: "Getting started", + url: "/start/getting-started", + snippet: "Install and configure OpenClaw.", + search: "getting started install configure ".repeat(80), + }, + ], + }); + + const sourceIndex = `${JSON.stringify({ + path: "src/settings.ts", + search: "settings implementation issue ".repeat(80), + })}\n`; + const githubIndex = `${JSON.stringify({ + path: "/workspace/github/000.md#issue-123", + number: 123, + state: "open", + title: "Settings issue", + url: "https://github.com/openclaw/openclaw/issues/123", + search: "settings implementation issue ".repeat(80), + })}\n`; + const docsCorpus = [ + "# Deep fallback page", + "Source: https://docs.openclaw.ai/deep/fallback", + "", + "obscure fallback phrase ".repeat(80), + ].join("\n"); + + const firstCalls: string[] = []; + await withMockNetwork( + async (url) => { + firstCalls.push(url); + if (url === docsIndexUrl) return new Response(docsIndex); + if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex); + return new Response("missing", { status: 404 }); + }, + async () => { + const files = await buildWorkspace(env, "getting started"); + if (!files.some((file) => file.path === "/docs/start__getting-started.md")) { + throw new Error("runtime retrieval: docs-search.json record was not mounted"); + } + }, + ); + if (firstCalls.includes(docsCorpusUrl)) { + throw new Error("runtime retrieval: docs corpus loaded despite a usable docs index"); + } + + await withMockNetwork( + async (url) => { + if (url === docsIndexUrl) return new Response("missing", { status: 503 }); + if (url === docsCorpusUrl) return new Response(docsCorpus); + if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex); + return new Response("missing", { status: 404 }); + }, + async () => { + const files = await buildWorkspace(env, "obscure fallback phrase"); + if (!files.some((file) => file.url === "https://docs.openclaw.ai/deep/fallback")) { + throw new Error("runtime retrieval: docs corpus fallback was not mounted"); + } + }, + ); + + await withMockNetwork( + async (url) => { + if (url === sourceIndexUrl) return new Response(sourceIndex); + if (url === githubIndexUrl) return new Response(githubIndex); + return new Response("missing", { status: 522 }); + }, + async () => { + const files = await buildWorkspace(env, "settings implementation issue"); + if (!files.some((file) => file.path === "/workspace/docs/unavailable.md")) { + throw new Error("runtime retrieval: missing docs unavailable workspace note"); + } + if (!files.some((file) => file.kind === "source")) { + throw new Error("runtime retrieval: source context was blocked by docs outage"); + } + if (!files.some((file) => file.kind === "github")) { + throw new Error("runtime retrieval: GitHub context was blocked by docs outage"); + } + }, + ); +} + +async function withMockNetwork( + fetchText: (url: string) => Promise, + run: () => Promise, +): Promise { + const originalFetch = globalThis.fetch; + const originalCaches = (globalThis as unknown as { caches?: CacheStorage }).caches; + const cache = new Map(); + Object.defineProperty(globalThis, "fetch", { + configurable: true, + value: (input: RequestInfo | URL) => { + const url = typeof input === "string" ? input : input instanceof URL ? input.href : input.url; + return fetchText(url); + }, + }); + Object.defineProperty(globalThis, "caches", { + configurable: true, + value: { + default: { + match: async (request: Request) => { + const text = cache.get(request.url); + return text === undefined ? undefined : new Response(text); + }, + put: async (request: Request, response: Response) => { + cache.set(request.url, await response.clone().text()); + }, + }, + }, + }); + try { + await run(); + } finally { + Object.defineProperty(globalThis, "fetch", { + configurable: true, + value: originalFetch, + }); + Object.defineProperty(globalThis, "caches", { + configurable: true, + value: originalCaches, + }); + } +} diff --git a/src/retrieval.ts b/src/retrieval.ts index 5d73a98..c10bd6e 100644 --- a/src/retrieval.ts +++ b/src/retrieval.ts @@ -1,25 +1,44 @@ import type { Env, SearchRecord, WorkspaceFile } from "./types"; const docsCorpusUrl = "https://docs.openclaw.ai/llms-full.txt"; +const docsCorpusFallbackUrl = "https://docs.openclaw.ai/.well-known/llms-full.txt"; +const docsSearchIndexUrl = "https://docs.openclaw.ai/docs-search.json"; const sourceIndexUrl = "https://docs.openclaw.ai/source-index.jsonl"; const githubIndexUrl = "https://docs.openclaw.ai/ask-molty/github-search.jsonl"; const workspaceManifestUrl = "https://docs.openclaw.ai/ask-molty/workspace-manifest.json"; +const loadTextRetryDelaysMs = [150, 450]; export async function buildWorkspace(env: Env, query: string): Promise { - const [docsCorpus, sourceIndex, githubIndex] = await Promise.all([ - loadText(env.DOCS_CORPUS_URL ?? docsCorpusUrl, 1000), + const [docsResult, sourceIndex, githubIndex] = await Promise.all([ + loadDocsRecords(env).catch(() => ({ records: [], usesSearchIndex: false })), loadText(env.SOURCE_INDEX_URL ?? sourceIndexUrl, 1000).catch(() => ""), loadText(env.GITHUB_INDEX_URL ?? githubIndexUrl, 1000).catch(() => ""), ]); - const docs = docsRecordsFromCorpus(docsCorpus); const source = recordsFromJsonl(sourceIndex, "source"); const github = recordsFromJsonl(githubIndex, "github"); - const docMatches = selectRecords(docs, query, 10); + let docs = docsResult.records; + let docMatches = selectRecords(docs, query, 10); const sourceMatches = selectRecords(source, query, sourceSeeking(query) ? 10 : 5); const githubMatches = selectRecords(github, query, githubSeeking(query) ? 12 : 4); + if ( + !docMatches.length && + !sourceMatches.length && + !githubMatches.length && + docsResult.usesSearchIndex + ) { + const corpusDocs = await loadDocsCorpus(env) + .then((corpus) => docsRecordsFromCorpus(corpus)) + .catch(() => []); + const corpusMatches = selectRecords(corpusDocs, query, 10); + if (corpusMatches.length) { + docs = corpusDocs; + docMatches = corpusMatches; + } + } const files: WorkspaceFile[] = []; + if (!docs.length) files.push(docsUnavailableFile()); const githubSummary = githubSummaryFile(github, query); if (githubSummary) files.push(githubSummary); for (const record of docMatches) files.push(recordToWorkspaceFile(record)); @@ -80,24 +99,115 @@ export function workspaceContext(files: WorkspaceFile[]): string { .join("\n\n---\n\n"); } -async function loadText(url: string, minLength: number): Promise { +async function loadText( + url: string, + minLength: number, + retryDelaysMs: readonly number[] = [], +): Promise { const cache = caches.default; const key = new Request(url, { method: "GET" }); const cached = await cache.match(key); if (cached?.ok) return cached.text(); - const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } }); - if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`); - const text = await response.text(); - if (text.startsWith("") || text.length < minLength) - throw new Error(`Invalid text from ${url}`); - await cache.put(key, new Response(text, { headers: { "Cache-Control": "public, max-age=300" } })); - return text; + let lastError: unknown; + for (let attempt = 0; attempt <= retryDelaysMs.length; attempt += 1) { + try { + const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } }); + if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`); + const text = await response.text(); + if (text.startsWith("") || text.length < minLength) + throw new Error(`Invalid text from ${url}`); + await cache.put( + key, + new Response(text, { headers: { "Cache-Control": "public, max-age=300" } }), + ); + return text; + } catch (error) { + lastError = error; + const delay = retryDelaysMs[attempt]; + if (delay) await sleep(delay); + } + } + throw lastError instanceof Error ? lastError : new Error(`Unable to load ${url}`); +} + +async function loadDocsCorpus(env: Env): Promise { + for (const url of docsCorpusUrls(env)) { + try { + return await loadText(url, 1000, loadTextRetryDelaysMs); + } catch (error) { + console.warn("docs corpus fetch failed", { + url, + error: error instanceof Error ? error.message : String(error), + }); + } + } + throw new Error("Docs corpus is temporarily unavailable. Please retry in a moment."); +} + +async function loadDocsRecords(env: Env): Promise { + try { + return { + records: docsRecordsFromSearchIndex( + await loadText(env.DOCS_INDEX_URL ?? docsSearchIndexUrl, 1000, loadTextRetryDelaysMs), + ), + usesSearchIndex: true, + }; + } catch (error) { + console.warn("docs search index fetch failed", { + error: error instanceof Error ? error.message : String(error), + }); + } + return { + records: docsRecordsFromCorpus(await loadDocsCorpus(env)), + usesSearchIndex: false, + }; +} + +function docsCorpusUrls(env: Env): string[] { + const primary = env.DOCS_CORPUS_URL ?? docsCorpusUrl; + return primary === docsCorpusUrl ? [docsCorpusUrl, docsCorpusFallbackUrl] : [primary]; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); } async function loadJson(url: string): Promise { return JSON.parse(await loadText(url, 2)) as T; } +function docsRecordsFromSearchIndex(text: string): SearchRecord[] { + const parsed = JSON.parse(text) as Partial; + if (!Array.isArray(parsed.entries)) throw new Error("Invalid docs search index"); + const records = parsed.entries + .map((entry) => docsSearchEntryToRecord(entry)) + .filter((record): record is SearchRecord => Boolean(record)); + if (!records.length) throw new Error("Docs search index has no usable entries"); + return records; +} + +function docsSearchEntryToRecord(entry: DocsSearchEntry): SearchRecord | undefined { + if (!entry.url || !entry.search) return undefined; + const url = new URL(entry.url, "https://docs.openclaw.ai").toString(); + const title = entry.title || titleFromRoute(entry.url); + return { + kind: "docs", + path: `/docs/${flatPath(entry.url.replace(/^\/+/, "") || "index")}.md`, + title, + url, + search: [title, entry.snippet, entry.search].filter(Boolean).join("\n\n"), + }; +} + +function titleFromRoute(value: string): string { + const base = value.replace(/\/$/u, "").split("/").pop() || "Docs"; + return base + .split(/[-_]+/u) + .filter(Boolean) + .map((part) => `${part.slice(0, 1).toUpperCase()}${part.slice(1)}`) + .join(" "); +} + function docsRecordsFromCorpus(corpus: string): SearchRecord[] { return corpus .split(/\n---\n/g) @@ -116,6 +226,15 @@ function docsRecordsFromCorpus(corpus: string): SearchRecord[] { }); } +function docsUnavailableFile(): WorkspaceFile { + return { + path: "/workspace/docs/unavailable.md", + kind: "docs", + content: + "# Docs unavailable\n\nThe OpenClaw docs index could not be loaded for this answer. If the question needs documentation context, tell the user Molty cannot load the docs right now and ask them to retry in a moment. Do not invent documentation details.", + }; +} + function recordsFromJsonl(text: string, kind: SearchRecord["kind"]): SearchRecord[] { const records: SearchRecord[] = []; for (const line of text.split("\n")) { @@ -439,3 +558,19 @@ interface WorkspaceManifest { baseUrl?: string; files?: Record; } + +interface DocsRecordSet { + records: SearchRecord[]; + usesSearchIndex: boolean; +} + +interface DocsSearchIndex { + entries?: DocsSearchEntry[]; +} + +interface DocsSearchEntry { + search?: string; + snippet?: string; + title?: string; + url?: string; +} diff --git a/src/types.ts b/src/types.ts index 6b48c29..c30b234 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,6 @@ export interface Env { OPENAI_API_KEY: string; + DOCS_INDEX_URL?: string; DOCS_CORPUS_URL?: string; SOURCE_INDEX_URL?: string; GITHUB_INDEX_URL?: string; diff --git a/wrangler.toml b/wrangler.toml index ef41fa4..0b31957 100644 --- a/wrangler.toml +++ b/wrangler.toml @@ -11,6 +11,7 @@ routes = [ ] [vars] +DOCS_INDEX_URL = "https://docs.openclaw.ai/docs-search.json" DOCS_CORPUS_URL = "https://docs.openclaw.ai/llms-full.txt" SOURCE_INDEX_URL = "https://docs.openclaw.ai/source-index.jsonl" GITHUB_INDEX_URL = "https://docs.openclaw.ai/ask-molty/github-search.jsonl" From 6e0f3d3292cb4dfe6f4529a65522940a65e2231e Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 6 Jun 2026 22:42:17 -0700 Subject: [PATCH 2/3] test: print retrieval fallback proof --- scripts/smoke.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/smoke.ts b/scripts/smoke.ts index 9d159e4..4b9127e 100644 --- a/scripts/smoke.ts +++ b/scripts/smoke.ts @@ -100,6 +100,7 @@ async function smokeRuntimeRetrieval(): Promise { if (firstCalls.includes(docsCorpusUrl)) { throw new Error("runtime retrieval: docs corpus loaded despite a usable docs index"); } + console.log("runtime retrieval ok: docs-search.json mounted without corpus fetch"); await withMockNetwork( async (url) => { @@ -115,6 +116,7 @@ async function smokeRuntimeRetrieval(): Promise { } }, ); + console.log("runtime retrieval ok: docs corpus fallback mounted after index failure"); await withMockNetwork( async (url) => { @@ -135,6 +137,7 @@ async function smokeRuntimeRetrieval(): Promise { } }, ); + console.log("runtime retrieval ok: docs outage keeps source and GitHub context"); } async function withMockNetwork( From 9fc511dc594ccdca488fd21e637316054b0362d4 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 6 Jun 2026 22:46:55 -0700 Subject: [PATCH 3/3] fix: resolve docs index links from override origin --- scripts/smoke.ts | 8 +++++++- src/retrieval.ts | 12 +++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/smoke.ts b/scripts/smoke.ts index 4b9127e..9251bf4 100644 --- a/scripts/smoke.ts +++ b/scripts/smoke.ts @@ -92,7 +92,13 @@ async function smokeRuntimeRetrieval(): Promise { }, async () => { const files = await buildWorkspace(env, "getting started"); - if (!files.some((file) => file.path === "/docs/start__getting-started.md")) { + if ( + !files.some( + (file) => + file.path === "/docs/start__getting-started.md" && + file.url === "https://example.test/start/getting-started", + ) + ) { throw new Error("runtime retrieval: docs-search.json record was not mounted"); } }, diff --git a/src/retrieval.ts b/src/retrieval.ts index c10bd6e..6e1c398 100644 --- a/src/retrieval.ts +++ b/src/retrieval.ts @@ -145,10 +145,12 @@ async function loadDocsCorpus(env: Env): Promise { } async function loadDocsRecords(env: Env): Promise { + const indexUrl = env.DOCS_INDEX_URL ?? docsSearchIndexUrl; try { return { records: docsRecordsFromSearchIndex( - await loadText(env.DOCS_INDEX_URL ?? docsSearchIndexUrl, 1000, loadTextRetryDelaysMs), + await loadText(indexUrl, 1000, loadTextRetryDelaysMs), + new URL(indexUrl).origin, ), usesSearchIndex: true, }; @@ -176,19 +178,19 @@ async function loadJson(url: string): Promise { return JSON.parse(await loadText(url, 2)) as T; } -function docsRecordsFromSearchIndex(text: string): SearchRecord[] { +function docsRecordsFromSearchIndex(text: string, origin: string): SearchRecord[] { const parsed = JSON.parse(text) as Partial; if (!Array.isArray(parsed.entries)) throw new Error("Invalid docs search index"); const records = parsed.entries - .map((entry) => docsSearchEntryToRecord(entry)) + .map((entry) => docsSearchEntryToRecord(entry, origin)) .filter((record): record is SearchRecord => Boolean(record)); if (!records.length) throw new Error("Docs search index has no usable entries"); return records; } -function docsSearchEntryToRecord(entry: DocsSearchEntry): SearchRecord | undefined { +function docsSearchEntryToRecord(entry: DocsSearchEntry, origin: string): SearchRecord | undefined { if (!entry.url || !entry.search) return undefined; - const url = new URL(entry.url, "https://docs.openclaw.ai").toString(); + const url = new URL(entry.url, origin).toString(); const title = entry.title || titleFromRoute(entry.url); return { kind: "docs",