Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ The Worker expects `OPENAI_API_KEY` as a Cloudflare Worker secret. The default m
Docs chat auth is brokered through ClawHub:

- `CLAWHUB_AUTH_URL` sends users to `https://hub.openclaw.ai/docs/auth`.
- Corpus and workspace URLs default to `https://docs.openclaw.ai`; the legacy
- Docs index, corpus, and workspace URLs default to `https://docs.openclaw.ai`; the legacy
documentation host remains routed only for compatibility.
- `CLAWHUB_SESSION_VERIFY_URL` verifies the ClawHub Convex Auth token once.
- `ASK_MOLTY_AUTH_SECRET` signs the docs-only session cookie; set it in production so OpenAI key rotation does not invalidate sessions.
Expand Down
157 changes: 157 additions & 0 deletions scripts/smoke.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#!/usr/bin/env node
/// <reference types="@cloudflare/workers-types" />

import fs from "node:fs";
import path from "node:path";
import { buildWorkspace } from "../src/retrieval";
import type { Env } from "../src/types";

const root = process.cwd();
const outDir = path.join(root, "dist", "test");
Expand Down Expand Up @@ -29,4 +33,157 @@ const github = fs.readFileSync(path.join(outDir, "github-search.jsonl"), "utf8")
if (!github.includes("github.com/openclaw/openclaw"))
throw new Error("github index missing OpenClaw links");

await smokeRuntimeRetrieval();

console.log(`ask-molty smoke ok: ${fileCount} workspace files`);

/**
 * Runs `buildWorkspace` against a mocked network in three scenarios and throws
 * an `Error` on the first expectation that does not hold:
 *
 * 1. the docs search index loads, so its record is mounted and the docs corpus
 *    URL is never requested;
 * 2. the docs search index answers 503, so the docs corpus is used instead;
 * 3. every docs URL answers 522, so an "unavailable" note is mounted while
 *    source and GitHub records are still returned.
 */
async function smokeRuntimeRetrieval(): Promise<void> {
// All fixture URLs live on example.test; they are only ever seen by the mock fetch.
const docsIndexUrl = "https://example.test/docs-search.json";
const docsCorpusUrl = "https://example.test/llms-full.txt";
const sourceIndexUrl = "https://example.test/source-index.jsonl";
const githubIndexUrl = "https://example.test/github-search.jsonl";
// 1000 spaces: a body with no content at all.
// NOTE(review): presumably sized to satisfy the loader's minimum-length check while yielding no records — confirm.
const emptyIndex = " ".repeat(1000);
const env: Env = {
OPENAI_API_KEY: "test",
DOCS_INDEX_URL: docsIndexUrl,
DOCS_CORPUS_URL: docsCorpusUrl,
SOURCE_INDEX_URL: sourceIndexUrl,
GITHUB_INDEX_URL: githubIndexUrl,
};

// Docs search index fixture with a single entry whose URL is relative.
// NOTE(review): the `.repeat(80)` padding here and below presumably keeps each payload above the loader's minimum length — confirm.
const docsIndex = JSON.stringify({
version: 1,
entries: [
{
title: "Getting started",
url: "/start/getting-started",
snippet: "Install and configure OpenClaw.",
search: "getting started install configure ".repeat(80),
},
],
});

// One-line JSONL fixtures for the source and GitHub indexes.
const sourceIndex = `${JSON.stringify({
path: "src/settings.ts",
search: "settings implementation issue ".repeat(80),
})}\n`;
const githubIndex = `${JSON.stringify({
path: "/workspace/github/000.md#issue-123",
number: 123,
state: "open",
title: "Settings issue",
url: "https://github.com/openclaw/openclaw/issues/123",
search: "settings implementation issue ".repeat(80),
})}\n`;
// Docs corpus fixture: a heading, a "Source:" line, and body text.
const docsCorpus = [
"# Deep fallback page",
"Source: https://docs.openclaw.ai/deep/fallback",
"",
"obscure fallback phrase ".repeat(80),
].join("\n");

// Scenario 1: the docs index is served; record every requested URL so the
// absence of a corpus fetch can be asserted afterwards.
const firstCalls: string[] = [];
await withMockNetwork(
async (url) => {
firstCalls.push(url);
if (url === docsIndexUrl) return new Response(docsIndex);
if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex);
return new Response("missing", { status: 404 });
},
async () => {
const files = await buildWorkspace(env, "getting started");
// The relative entry URL is expected to be resolved against the index URL's origin.
if (
!files.some(
(file) =>
file.path === "/docs/start__getting-started.md" &&
file.url === "https://example.test/start/getting-started",
)
) {
throw new Error("runtime retrieval: docs-search.json record was not mounted");
}
},
);
if (firstCalls.includes(docsCorpusUrl)) {
throw new Error("runtime retrieval: docs corpus loaded despite a usable docs index");
}
console.log("runtime retrieval ok: docs-search.json mounted without corpus fetch");

// Scenario 2: the docs index fails with 503 while the corpus is served.
await withMockNetwork(
async (url) => {
if (url === docsIndexUrl) return new Response("missing", { status: 503 });
if (url === docsCorpusUrl) return new Response(docsCorpus);
if (url === sourceIndexUrl || url === githubIndexUrl) return new Response(emptyIndex);
return new Response("missing", { status: 404 });
},
async () => {
const files = await buildWorkspace(env, "obscure fallback phrase");
// The expected URL is the one on the corpus fixture's "Source:" line.
if (!files.some((file) => file.url === "https://docs.openclaw.ai/deep/fallback")) {
throw new Error("runtime retrieval: docs corpus fallback was not mounted");
}
},
);
console.log("runtime retrieval ok: docs corpus fallback mounted after index failure");

// Scenario 3: only the source and GitHub indexes are served; every other
// URL (including both docs URLs) answers 522.
await withMockNetwork(
async (url) => {
if (url === sourceIndexUrl) return new Response(sourceIndex);
if (url === githubIndexUrl) return new Response(githubIndex);
return new Response("missing", { status: 522 });
},
async () => {
const files = await buildWorkspace(env, "settings implementation issue");
if (!files.some((file) => file.path === "/workspace/docs/unavailable.md")) {
throw new Error("runtime retrieval: missing docs unavailable workspace note");
}
if (!files.some((file) => file.kind === "source")) {
throw new Error("runtime retrieval: source context was blocked by docs outage");
}
if (!files.some((file) => file.kind === "github")) {
throw new Error("runtime retrieval: GitHub context was blocked by docs outage");
}
},
);
console.log("runtime retrieval ok: docs outage keeps source and GitHub context");
}

/**
 * Runs `run` with `globalThis.fetch` and `globalThis.caches` replaced by
 * in-memory stand-ins, then puts the globals back exactly as they were.
 *
 * @param fetchText - Produces the response for a requested URL. It receives
 *   the URL as a string whether the caller passed a string, a `URL`, or a
 *   `Request`.
 * @param run - The code to execute while the mocks are installed.
 * @returns A promise that settles with the outcome of `run`; the globals are
 *   restored whether `run` resolves or rejects.
 */
async function withMockNetwork(
  fetchText: (url: string) => Promise<Response>,
  run: () => Promise<void>,
): Promise<void> {
  // Capture full property descriptors rather than bare values, so the restore
  // step reinstates writability/enumerability and can tell "absent" apart
  // from "present but undefined".
  const originalFetch = Object.getOwnPropertyDescriptor(globalThis, "fetch");
  const originalCaches = Object.getOwnPropertyDescriptor(globalThis, "caches");
  const restoreGlobal = (name: string, descriptor: PropertyDescriptor | undefined): void => {
    if (descriptor) Object.defineProperty(globalThis, name, descriptor);
    else Reflect.deleteProperty(globalThis, name);
  };

  // Each invocation gets its own cache, so scenarios never see each other's entries.
  const cache = new Map<string, string>();
  Object.defineProperty(globalThis, "fetch", {
    configurable: true,
    value: (input: RequestInfo | URL) => {
      const url = typeof input === "string" ? input : input instanceof URL ? input.href : input.url;
      return fetchText(url);
    },
  });
  Object.defineProperty(globalThis, "caches", {
    configurable: true,
    value: {
      default: {
        match: async (request: Request) => {
          const text = cache.get(request.url);
          return text === undefined ? undefined : new Response(text);
        },
        put: async (request: Request, response: Response) => {
          // Clone before reading so the caller's response body stays usable.
          cache.set(request.url, await response.clone().text());
        },
      },
    },
  });
  try {
    await run();
  } finally {
    restoreGlobal("fetch", originalFetch);
    restoreGlobal("caches", originalCaches);
  }
}
161 changes: 149 additions & 12 deletions src/retrieval.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
import type { Env, SearchRecord, WorkspaceFile } from "./types";

const docsCorpusUrl = "https://docs.openclaw.ai/llms-full.txt";
const docsCorpusFallbackUrl = "https://docs.openclaw.ai/.well-known/llms-full.txt";
const docsSearchIndexUrl = "https://docs.openclaw.ai/docs-search.json";
const sourceIndexUrl = "https://docs.openclaw.ai/source-index.jsonl";
const githubIndexUrl = "https://docs.openclaw.ai/ask-molty/github-search.jsonl";
const workspaceManifestUrl = "https://docs.openclaw.ai/ask-molty/workspace-manifest.json";
const loadTextRetryDelaysMs = [150, 450];

export async function buildWorkspace(env: Env, query: string): Promise<WorkspaceFile[]> {
const [docsCorpus, sourceIndex, githubIndex] = await Promise.all([
loadText(env.DOCS_CORPUS_URL ?? docsCorpusUrl, 1000),
const [docsResult, sourceIndex, githubIndex] = await Promise.all([
loadDocsRecords(env).catch(() => ({ records: [], usesSearchIndex: false })),
loadText(env.SOURCE_INDEX_URL ?? sourceIndexUrl, 1000).catch(() => ""),
loadText(env.GITHUB_INDEX_URL ?? githubIndexUrl, 1000).catch(() => ""),
]);
const docs = docsRecordsFromCorpus(docsCorpus);
const source = recordsFromJsonl(sourceIndex, "source");
const github = recordsFromJsonl(githubIndex, "github");

const docMatches = selectRecords(docs, query, 10);
let docs = docsResult.records;
let docMatches = selectRecords(docs, query, 10);
const sourceMatches = selectRecords(source, query, sourceSeeking(query) ? 10 : 5);
const githubMatches = selectRecords(github, query, githubSeeking(query) ? 12 : 4);
if (
!docMatches.length &&
!sourceMatches.length &&
!githubMatches.length &&
docsResult.usesSearchIndex
) {
const corpusDocs = await loadDocsCorpus(env)
.then((corpus) => docsRecordsFromCorpus(corpus))
.catch(() => []);
const corpusMatches = selectRecords(corpusDocs, query, 10);
if (corpusMatches.length) {
docs = corpusDocs;
docMatches = corpusMatches;
}
}

const files: WorkspaceFile[] = [];
if (!docs.length) files.push(docsUnavailableFile());
const githubSummary = githubSummaryFile(github, query);
if (githubSummary) files.push(githubSummary);
for (const record of docMatches) files.push(recordToWorkspaceFile(record));
Expand Down Expand Up @@ -80,24 +99,117 @@ export function workspaceContext(files: WorkspaceFile[]): string {
.join("\n\n---\n\n");
}

async function loadText(url: string, minLength: number): Promise<string> {
async function loadText(
url: string,
minLength: number,
retryDelaysMs: readonly number[] = [],
): Promise<string> {
const cache = caches.default;
const key = new Request(url, { method: "GET" });
const cached = await cache.match(key);
if (cached?.ok) return cached.text();
const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } });
if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`);
const text = await response.text();
if (text.startsWith("<!DOCTYPE html>") || text.length < minLength)
throw new Error(`Invalid text from ${url}`);
await cache.put(key, new Response(text, { headers: { "Cache-Control": "public, max-age=300" } }));
return text;
let lastError: unknown;
for (let attempt = 0; attempt <= retryDelaysMs.length; attempt += 1) {
try {
const response = await fetch(url, { cf: { cacheEverything: true, cacheTtl: 300 } });
if (!response.ok) throw new Error(`Unable to load ${url}: ${response.status}`);
const text = await response.text();
if (text.startsWith("<!DOCTYPE html>") || text.length < minLength)
throw new Error(`Invalid text from ${url}`);
await cache.put(
key,
new Response(text, { headers: { "Cache-Control": "public, max-age=300" } }),
);
return text;
} catch (error) {
lastError = error;
const delay = retryDelaysMs[attempt];
if (delay) await sleep(delay);
}
}
throw lastError instanceof Error ? lastError : new Error(`Unable to load ${url}`);
}

/**
 * Fetches the docs corpus text, walking the candidate URLs in order and
 * returning the first one that loads.
 *
 * A failed candidate is logged with `console.warn` and the next one is tried.
 *
 * @param env - Worker environment used to choose the candidate URLs.
 * @returns The corpus text from the first candidate that loads.
 * @throws Error when no candidate could be loaded.
 */
async function loadDocsCorpus(env: Env): Promise<string> {
  const candidates = docsCorpusUrls(env);
  for (const url of candidates) {
    try {
      const corpus = await loadText(url, 1000, loadTextRetryDelaysMs);
      return corpus;
    } catch (failure) {
      const error = failure instanceof Error ? failure.message : String(failure);
      console.warn("docs corpus fetch failed", { url, error });
    }
  }
  throw new Error("Docs corpus is temporarily unavailable. Please retry in a moment.");
}

/**
 * Loads docs records, preferring the docs search index and falling back to
 * the full docs corpus when the index cannot be fetched or parsed.
 *
 * @param env - Worker environment; `DOCS_INDEX_URL` overrides the default index URL.
 * @returns The records plus `usesSearchIndex`, which is `true` only when they
 *   came from the search index.
 * @throws Error when the index fails and the corpus cannot be loaded either.
 */
async function loadDocsRecords(env: Env): Promise<DocsRecordSet> {
  const indexUrl = env.DOCS_INDEX_URL ?? docsSearchIndexUrl;
  try {
    const indexText = await loadText(indexUrl, 1000, loadTextRetryDelaysMs);
    return {
      // Relative entry URLs are resolved against the origin the index was served from.
      records: docsRecordsFromSearchIndex(indexText, new URL(indexUrl).origin),
      usesSearchIndex: true,
    };
  } catch (error) {
    // Include the URL, as the corpus loader does: parse failures raised after
    // a successful fetch would otherwise not say which index was at fault.
    console.warn("docs search index fetch failed", {
      url: indexUrl,
      error: error instanceof Error ? error.message : String(error),
    });
  }
  return {
    records: docsRecordsFromCorpus(await loadDocsCorpus(env)),
    usesSearchIndex: false,
  };
}

/**
 * Lists the docs corpus URLs to try, in order.
 *
 * When `DOCS_CORPUS_URL` is unset or equal to the built-in default, the
 * built-in default is followed by the built-in fallback. Any other configured
 * URL is used on its own.
 */
function docsCorpusUrls(env: Env): string[] {
  const configured = env.DOCS_CORPUS_URL ?? docsCorpusUrl;
  if (configured !== docsCorpusUrl) {
    return [configured];
  }
  return [docsCorpusUrl, docsCorpusFallbackUrl];
}

/**
 * Returns a promise that resolves after `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}

/**
 * Loads `url` as text (at least 2 characters) and parses it as JSON.
 *
 * The result is cast to `T` without validation, so callers must check the
 * shape themselves.
 *
 * @throws Whatever `loadText` throws, or a `SyntaxError` from `JSON.parse`.
 */
async function loadJson<T>(url: string): Promise<T> {
  const raw = await loadText(url, 2);
  const parsed: unknown = JSON.parse(raw);
  return parsed as T;
}

/**
 * Parses the docs search index JSON into search records.
 *
 * @param text - Raw JSON text of the search index.
 * @param origin - Origin used to resolve each entry's URL.
 * @returns Every entry that converts to a record, in index order.
 * @throws Error when `entries` is not an array or no entry is usable;
 *   `SyntaxError` when `text` is not valid JSON.
 */
function docsRecordsFromSearchIndex(text: string, origin: string): SearchRecord[] {
  const index = JSON.parse(text) as Partial<DocsSearchIndex>;
  const entries = index.entries;
  if (!Array.isArray(entries)) {
    throw new Error("Invalid docs search index");
  }

  const usable: SearchRecord[] = [];
  for (const entry of entries) {
    const record = docsSearchEntryToRecord(entry, origin);
    if (record) usable.push(record);
  }

  if (usable.length === 0) {
    throw new Error("Docs search index has no usable entries");
  }
  return usable;
}

/**
 * Converts one docs search index entry into a search record.
 *
 * Entries arrive straight from `JSON.parse`, so the declared shape is not
 * trusted. A malformed entry is skipped rather than allowed to throw, so that
 * one bad entry does not discard the rest of the index.
 *
 * @param entry - Raw index entry.
 * @param origin - Origin used to resolve a relative `entry.url`.
 * @returns The record, or `undefined` when the entry is not an object, lacks
 *   a non-empty string `url` or `search`, or has a URL that cannot be parsed.
 */
function docsSearchEntryToRecord(entry: DocsSearchEntry, origin: string): SearchRecord | undefined {
  if (typeof entry !== "object" || entry === null) return undefined;
  const { url: route, search, snippet } = entry;
  if (typeof route !== "string" || !route) return undefined;
  if (typeof search !== "string" || !search) return undefined;

  let url: string;
  try {
    url = new URL(route, origin).toString();
  } catch {
    // `new URL` throws a TypeError on input it cannot parse; skip the entry.
    return undefined;
  }

  const title =
    typeof entry.title === "string" && entry.title ? entry.title : titleFromRoute(route);
  return {
    kind: "docs",
    // The workspace path is derived from the raw route; an empty route after
    // stripping leading slashes maps to "index".
    path: `/docs/${flatPath(route.replace(/^\/+/, "") || "index")}.md`,
    title,
    url,
    search: [title, typeof snippet === "string" ? snippet : undefined, search]
      .filter(Boolean)
      .join("\n\n"),
  };
}

/**
 * Derives a human-readable title from the last segment of a route.
 *
 * Any `?query` or `#hash` suffix and all trailing slashes are ignored. The
 * last path segment is split on `-` and `_`, and each word gets an upper-case
 * first letter.
 *
 * @param value - Route or URL path, e.g. `/start/getting-started`.
 * @returns The title (`"Getting Started"` for the example above), or `"Docs"`
 *   when the route has no usable last segment.
 */
function titleFromRoute(value: string): string {
  // Drop the query/hash first so it can never end up inside the title.
  const suffixStart = value.search(/[?#]/u);
  const route = (suffixStart === -1 ? value : value.slice(0, suffixStart)).replace(/\/+$/u, "");
  const base = route.split("/").pop() || "Docs";
  const title = base
    .split(/[-_]+/u)
    .filter(Boolean)
    .map((part) => `${part.slice(0, 1).toUpperCase()}${part.slice(1)}`)
    .join(" ");
  // A segment made only of separators ("--") leaves no words; never return "".
  return title || "Docs";
}

function docsRecordsFromCorpus(corpus: string): SearchRecord[] {
return corpus
.split(/\n---\n/g)
Expand All @@ -116,6 +228,15 @@ function docsRecordsFromCorpus(corpus: string): SearchRecord[] {
});
}

/**
 * Builds the placeholder workspace file that stands in for docs content when
 * no docs records are available. Its text instructs the model to tell the
 * user to retry rather than invent documentation details.
 */
function docsUnavailableFile(): WorkspaceFile {
  const content =
    "# Docs unavailable\n\nThe OpenClaw docs index could not be loaded for this answer. If the question needs documentation context, tell the user Molty cannot load the docs right now and ask them to retry in a moment. Do not invent documentation details.";
  const note: WorkspaceFile = {
    path: "/workspace/docs/unavailable.md",
    kind: "docs",
    content,
  };
  return note;
}

function recordsFromJsonl(text: string, kind: SearchRecord["kind"]): SearchRecord[] {
const records: SearchRecord[] = [];
for (const line of text.split("\n")) {
Expand Down Expand Up @@ -439,3 +560,19 @@ interface WorkspaceManifest {
baseUrl?: string;
files?: Record<string, { url: string; bytes?: number; sha256?: string; kind?: string }>;
}

/** Docs records together with a flag recording which source produced them. */
interface DocsRecordSet {
/** The docs records that were loaded. */
records: SearchRecord[];
/** `true` when the records were parsed from the docs search index; `false` when they came from the docs corpus. */
usesSearchIndex: boolean;
}

/** Shape read from the parsed docs search index JSON; only `entries` is used. */
interface DocsSearchIndex {
/** Index entries; the parser rejects the index when this is not an array. */
entries?: DocsSearchEntry[];
}

/** One entry of the docs search index. Entries without `url` or `search` are skipped. */
interface DocsSearchEntry {
/** Searchable text for the page; required for the entry to become a record. */
search?: string;
/** Optional short text, placed between the title and `search` in the record's search text. */
snippet?: string;
/** Optional page title; when empty or missing a title is derived from `url`. */
title?: string;
/** Page URL, resolved against the index origin; required for the entry to become a record. */
url?: string;
}
1 change: 1 addition & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
export interface Env {
OPENAI_API_KEY: string;
DOCS_INDEX_URL?: string;
DOCS_CORPUS_URL?: string;
SOURCE_INDEX_URL?: string;
GITHUB_INDEX_URL?: string;
Expand Down
Loading