diff --git a/README.md b/README.md index bf80bdb..15f78b7 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ The Worker expects `OPENAI_API_KEY` as a Cloudflare Worker secret. The default m Docs chat auth is brokered through ClawHub: - `CLAWHUB_AUTH_URL` sends users to `https://hub.openclaw.ai/docs/auth`. -- Corpus and workspace URLs default to `https://docs.openclaw.ai`; the legacy +- Docs index, corpus, and workspace URLs default to `https://docs.openclaw.ai`; the legacy documentation host remains routed only for compatibility. - `CLAWHUB_SESSION_VERIFY_URL` verifies the ClawHub Convex Auth token once. - `ASK_MOLTY_AUTH_SECRET` signs the docs-only session cookie; set it in production so OpenAI key rotation does not invalidate sessions. diff --git a/scripts/smoke.ts b/scripts/smoke.ts index c7ac0e0..9251bf4 100644 --- a/scripts/smoke.ts +++ b/scripts/smoke.ts @@ -1,6 +1,10 @@ #!/usr/bin/env node +/// + import fs from "node:fs"; import path from "node:path"; +import { buildWorkspace } from "../src/retrieval"; +import type { Env } from "../src/types"; const root = process.cwd(); const outDir = path.join(root, "dist", "test"); @@ -29,4 +33,157 @@ const github = fs.readFileSync(path.join(outDir, "github-search.jsonl"), "utf8") if (!github.includes("github.com/openclaw/openclaw")) throw new Error("github index missing OpenClaw links"); +await smokeRuntimeRetrieval(); + console.log(`ask-molty smoke ok: ${fileCount} workspace files`); + +async function smokeRuntimeRetrieval(): Promise { + const docsIndexUrl = "https://example.test/docs-search.json"; + const docsCorpusUrl = "https://example.test/llms-full.txt"; + const sourceIndexUrl = "https://example.test/source-index.jsonl"; + const githubIndexUrl = "https://example.test/github-search.jsonl"; + const emptyIndex = " ".repeat(1000); + const env: Env = { + OPENAI_API_KEY: "test", + DOCS_INDEX_URL: docsIndexUrl, + DOCS_CORPUS_URL: docsCorpusUrl, + SOURCE_INDEX_URL: sourceIndexUrl, + GITHUB_INDEX_URL: githubIndexUrl, + }; + + const docsIndex = JSON.stringify({ + version: 1, + entries: [ + { + title: "Getting started", + url: "/start/getting-started", + snippet: "Install and configure OpenClaw.", + search: "getting started install configure ".repeat(80), + }, + ], + }); + + const sourceIndex = `${JSON.stringify({ + path: "src/settings.ts", + search: "settings implementation issue ".repeat(80), + })}\n`; + const githubIndex = `${JSON.stringify({ + path: "/workspace/github/000.md#issue-123", + number: 123, + state: "open", + title: "Settings issue", + url: "https://github.com/openclaw/openclaw/issues/123", + search: "settings implementation issue ".repeat(80), + })}\n`; + const docsCorpus = [ + "# Deep fallback page", + "Source: https://docs.openclaw.ai/deep/fallback", + "", + "obscure fallback phrase ".repeat(80), + ].join("\n"); + + const firstCalls: string[] = []; + await withMockNetwork( + async (url) => { + firstCalls.push(url); + if (url === docsIndexUrl) return new Response(docsIndex); + if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex); + return new Response("missing", { status: 404 }); + }, + async () => { + const files = await buildWorkspace(env, "getting started"); + if ( + !files.some( + (file) => + file.path === "/docs/start__getting-started.md" && + file.url === "https://example.test/start/getting-started", + ) + ) { + throw new Error("runtime retrieval: docs-search.json record was not mounted"); + } + }, + ); + if (firstCalls.includes(docsCorpusUrl)) { + throw new Error("runtime retrieval: docs corpus loaded despite a usable docs index"); + } + console.log("runtime retrieval ok: docs-search.json mounted without corpus fetch"); + + await withMockNetwork( + async (url) => { + if (url === docsIndexUrl) return new Response("missing", { status: 503 }); + if (url === docsCorpusUrl) return new Response(docsCorpus); + if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex); + return new Response("missing", { status: 404 }); + }, + async () => { + const files = await buildWorkspace(env, "obscure fallback phrase"); + if (!files.some((file) => file.url === "https://docs.openclaw.ai/deep/fallback")) { + throw new Error("runtime retrieval: docs corpus fallback was not mounted"); + } + }, + ); + console.log("runtime retrieval ok: docs corpus fallback mounted after index failure"); + + await withMockNetwork( + async (url) => { + if (url === sourceIndexUrl) return new Response(sourceIndex); + if (url === githubIndexUrl) return new Response(githubIndex); + return new Response("missing", { status: 522 }); + }, + async () => { + const files = await buildWorkspace(env, "settings implementation issue"); + if (!files.some((file) => file.path === "/workspace/docs/unavailable.md")) { + throw new Error("runtime retrieval: missing docs unavailable workspace note"); + } + if (!files.some((file) => file.kind === "source")) { + throw new Error("runtime retrieval: source context was blocked by docs outage"); + } + if (!files.some((file) => file.kind === "github")) { + throw new Error("runtime retrieval: GitHub context was blocked by docs outage"); + } + }, + ); + console.log("runtime retrieval ok: docs outage keeps source and GitHub context"); +} + +async function withMockNetwork( + fetchText: (url: string) => Promise, + run: () => Promise, +): Promise { + const originalFetch = globalThis.fetch; + const originalCaches = (globalThis as unknown as { caches?: CacheStorage }).caches; + const cache = new Map(); + Object.defineProperty(globalThis, "fetch", { + configurable: true, + value: (input: RequestInfo | URL) => { + const url = typeof input === "string" ? input : input instanceof URL ? input.href : input.url; + return fetchText(url); + }, + }); + Object.defineProperty(globalThis, "caches", { + configurable: true, + value: { + default: { + match: async (request: Request) => { + const text = cache.get(request.url); + return text === undefined ? undefined : new Response(text); + }, + put: async (request: Request, response: Response) => { + cache.set(request.url, await response.clone().text()); + }, + }, + }, + }); + try { + await run(); + } finally { + Object.defineProperty(globalThis, "fetch", { + configurable: true, + value: originalFetch, + }); + Object.defineProperty(globalThis, "caches", { + configurable: true, + value: originalCaches, + }); + } +} diff --git a/src/retrieval.ts b/src/retrieval.ts index 5d73a98..6e1c398 100644 --- a/src/retrieval.ts +++ b/src/retrieval.ts @@ -1,25 +1,44 @@ import type { Env, SearchRecord, WorkspaceFile } from "./types"; const docsCorpusUrl = "https://docs.openclaw.ai/llms-full.txt"; +const docsCorpusFallbackUrl = "https://docs.openclaw.ai/.well-known/llms-full.txt"; +const docsSearchIndexUrl = "https://docs.openclaw.ai/docs-search.json"; const sourceIndexUrl = "https://docs.openclaw.ai/source-index.jsonl"; const githubIndexUrl = "https://docs.openclaw.ai/ask-molty/github-search.jsonl"; const workspaceManifestUrl = "https://docs.openclaw.ai/ask-molty/workspace-manifest.json"; +const loadTextRetryDelaysMs = [150, 450]; export async function buildWorkspace(env: Env, query: string): Promise { - const [docsCorpus, sourceIndex, githubIndex] = await Promise.all([ - loadText(env.DOCS_CORPUS_URL ?? docsCorpusUrl, 1000), + const [docsResult, sourceIndex, githubIndex] = await Promise.all([ + loadDocsRecords(env).catch(() => ({ records: [], usesSearchIndex: false })), loadText(env.SOURCE_INDEX_URL ?? sourceIndexUrl, 1000).catch(() => ""), loadText(env.GITHUB_INDEX_URL ?? githubIndexUrl, 1000).catch(() => ""), ]); - const docs = docsRecordsFromCorpus(docsCorpus); const source = recordsFromJsonl(sourceIndex, "source"); const github = recordsFromJsonl(githubIndex, "github"); - const docMatches = selectRecords(docs, query, 10); + let docs = docsResult.records; + let docMatches = selectRecords(docs, query, 10); const sourceMatches = selectRecords(source, query, sourceSeeking(query) ? 10 : 5); const githubMatches = selectRecords(github, query, githubSeeking(query) ? 12 : 4); + if ( + !docMatches.length && + !sourceMatches.length && + !githubMatches.length && + docsResult.usesSearchIndex + ) { + const corpusDocs = await loadDocsCorpus(env) + .then((corpus) => docsRecordsFromCorpus(corpus)) + .catch(() => []); + const corpusMatches = selectRecords(corpusDocs, query, 10); + if (corpusMatches.length) { + docs = corpusDocs; + docMatches = corpusMatches; + } + } const files: WorkspaceFile[] = []; + if (!docs.length) files.push(docsUnavailableFile()); const githubSummary = githubSummaryFile(github, query); if (githubSummary) files.push(githubSummary); for (const record of docMatches) files.push(recordToWorkspaceFile(record)); @@ -80,24 +99,117 @@ export function workspaceContext(files: WorkspaceFile[]): string { .join("\n\n---\n\n"); } -async function loadText(url: string, minLength: number): Promise { +async function loadText( + url: string, + minLength: number, + retryDelaysMs: readonly number[] = [], +): Promise { const cache = caches.default; const key = new Request(url, { method: "GET" }); const cached = await cache.match(key); if (cached?.ok) return cached.text(); - const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } }); - if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`); - const text = await response.text(); - if (text.startsWith("") || text.length < minLength) - throw new Error(`Invalid text from ${url}`); - await cache.put(key, new Response(text, { headers: { "Cache-Control": "public, max-age=300" } })); - return text; + let lastError: unknown; + for (let attempt = 0; attempt <= retryDelaysMs.length; attempt += 1) { + try { + const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } }); + if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`); + const text = await response.text(); + if (text.startsWith("") || text.length < minLength) + throw new Error(`Invalid text from ${url}`); + await cache.put( + key, + new Response(text, { headers: { "Cache-Control": "public, max-age=300" } }), + ); + return text; + } catch (error) { + lastError = error; + const delay = retryDelaysMs[attempt]; + if (delay) await sleep(delay); + } + } + throw lastError instanceof Error ? lastError : new Error(`Unable to load ${url}`); +} + +async function loadDocsCorpus(env: Env): Promise { + for (const url of docsCorpusUrls(env)) { + try { + return await loadText(url, 1000, loadTextRetryDelaysMs); + } catch (error) { + console.warn("docs corpus fetch failed", { + url, + error: error instanceof Error ? error.message : String(error), + }); + } + } + throw new Error("Docs corpus is temporarily unavailable. Please retry in a moment."); +} + +async function loadDocsRecords(env: Env): Promise { + const indexUrl = env.DOCS_INDEX_URL ?? docsSearchIndexUrl; + try { + return { + records: docsRecordsFromSearchIndex( + await loadText(indexUrl, 1000, loadTextRetryDelaysMs), + new URL(indexUrl).origin, + ), + usesSearchIndex: true, + }; + } catch (error) { + console.warn("docs search index fetch failed", { + error: error instanceof Error ? error.message : String(error), + }); + } + return { + records: docsRecordsFromCorpus(await loadDocsCorpus(env)), + usesSearchIndex: false, + }; +} + +function docsCorpusUrls(env: Env): string[] { + const primary = env.DOCS_CORPUS_URL ?? docsCorpusUrl; + return primary === docsCorpusUrl ? [docsCorpusUrl, docsCorpusFallbackUrl] : [primary]; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); } async function loadJson(url: string): Promise { return JSON.parse(await loadText(url, 2)) as T; } +function docsRecordsFromSearchIndex(text: string, origin: string): SearchRecord[] { + const parsed = JSON.parse(text) as Partial; + if (!Array.isArray(parsed.entries)) throw new Error("Invalid docs search index"); + const records = parsed.entries + .map((entry) => docsSearchEntryToRecord(entry, origin)) + .filter((record): record is SearchRecord => Boolean(record)); + if (!records.length) throw new Error("Docs search index has no usable entries"); + return records; +} + +function docsSearchEntryToRecord(entry: DocsSearchEntry, origin: string): SearchRecord | undefined { + if (!entry.url || !entry.search) return undefined; + const url = new URL(entry.url, origin).toString(); + const title = entry.title || titleFromRoute(entry.url); + return { + kind: "docs", + path: `/docs/${flatPath(entry.url.replace(/^\/+/, "") || "index")}.md`, + title, + url, + search: [title, entry.snippet, entry.search].filter(Boolean).join("\n\n"), + }; +} + +function titleFromRoute(value: string): string { + const base = value.replace(/\/$/u, "").split("/").pop() || "Docs"; + return base + .split(/[-_]+/u) + .filter(Boolean) + .map((part) => `${part.slice(0, 1).toUpperCase()}${part.slice(1)}`) + .join(" "); +} + function docsRecordsFromCorpus(corpus: string): SearchRecord[] { return corpus .split(/\n---\n/g) @@ -116,6 +228,15 @@ function docsRecordsFromCorpus(corpus: string): SearchRecord[] { }); } +function docsUnavailableFile(): WorkspaceFile { + return { + path: "/workspace/docs/unavailable.md", + kind: "docs", + content: + "# Docs unavailable\n\nThe OpenClaw docs index could not be loaded for this answer. If the question needs documentation context, tell the user Molty cannot load the docs right now and ask them to retry in a moment. Do not invent documentation details.", + }; +} + function recordsFromJsonl(text: string, kind: SearchRecord["kind"]): SearchRecord[] { const records: SearchRecord[] = []; for (const line of text.split("\n")) { @@ -439,3 +560,19 @@ interface WorkspaceManifest { baseUrl?: string; files?: Record; } + +interface DocsRecordSet { + records: SearchRecord[]; + usesSearchIndex: boolean; +} + +interface DocsSearchIndex { + entries?: DocsSearchEntry[]; +} + +interface DocsSearchEntry { + search?: string; + snippet?: string; + title?: string; + url?: string; +} diff --git a/src/types.ts b/src/types.ts index 6b48c29..c30b234 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,5 +1,6 @@ export interface Env { OPENAI_API_KEY: string; + DOCS_INDEX_URL?: string; DOCS_CORPUS_URL?: string; SOURCE_INDEX_URL?: string; GITHUB_INDEX_URL?: string; diff --git a/wrangler.toml b/wrangler.toml index ef41fa4..0b31957 100644 --- a/wrangler.toml +++ b/wrangler.toml @@ -11,6 +11,7 @@ routes = [ ] [vars] +DOCS_INDEX_URL = "https://docs.openclaw.ai/docs-search.json" DOCS_CORPUS_URL = "https://docs.openclaw.ai/llms-full.txt" SOURCE_INDEX_URL = "https://docs.openclaw.ai/source-index.jsonl" GITHUB_INDEX_URL = "https://docs.openclaw.ai/ask-molty/github-search.jsonl"