From a4a53689884730489514c66aadf66fef3646a5df Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 00:04:35 +0900 Subject: [PATCH 1/6] feat: querypad CLI (inspect + ask) for dataset understanding CLI-first "Cursor for Data" engine alongside the web app: - querypad inspect: profile a folder, infer FK relationships with confidence - querypad ask: NL question -> relationship-aware SQL -> DuckDB -> insight - engine-agnostic discovery core shared by browser (Wasm) and CLI (@duckdb/node-api) - read-only gate + code-fence stripping on AI-generated SQL; env-var BYOK keys - web: data profiling, profile drawer, copy-agent-context, multi-provider BYOK Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 11 +- .gitignore | 4 +- e2e/app.spec.ts | 58 +- fixtures/data/events.csv | 11 + fixtures/data/payments.csv | 9 + fixtures/data/users.csv | 6 + package-lock.json | 650 +++++++++++++++++- package.json | 11 +- scripts/check-version.mjs | 29 +- src/cli/ai-env.ts | 39 ++ src/cli/artifacts.ts | 123 ++++ src/cli/ask.ts | 116 ++++ src/cli/index.ts | 94 +++ src/cli/inspect.ts | 60 ++ src/cli/render.ts | 42 ++ src/components/editor/AiAssistant.tsx | 75 +- src/components/sidebar/ProfileDrawer.tsx | 177 +++++ src/components/sidebar/Sidebar.tsx | 62 +- src/components/sidebar/TableSchema.tsx | 69 +- .../workspace/CopyAgentContextButton.tsx | 70 ++ src/components/workspace/Workspace.tsx | 2 + src/lib/agent/ask-context.ts | 34 + src/lib/agent/context.ts | 160 +++++ src/lib/ai/api-key.ts | 48 +- src/lib/ai/complete.ts | 175 +++++ src/lib/ai/generate-sql.ts | 93 +-- src/lib/ai/providers.ts | 41 ++ src/lib/discovery/relationships.ts | 199 ++++++ src/lib/discovery/signals.ts | 117 ++++ src/lib/discovery/sql-safety.ts | 44 ++ src/lib/duckdb-node/connection.ts | 42 ++ src/lib/duckdb-node/load.ts | 100 +++ src/lib/duckdb-node/profile.ts | 142 ++++ src/lib/duckdb/profile.ts | 153 +++++ src/lib/duckdb/sql-utils.ts | 24 + src/stores/workspace-store.ts | 60 +- 
src/types/discovery.ts | 42 ++ src/types/index.ts | 34 + test/ask.test.ts | 122 ++++ test/discovery.test.ts | 107 +++ 40 files changed, 3314 insertions(+), 141 deletions(-) create mode 100644 fixtures/data/events.csv create mode 100644 fixtures/data/payments.csv create mode 100644 fixtures/data/users.csv create mode 100644 src/cli/ai-env.ts create mode 100644 src/cli/artifacts.ts create mode 100644 src/cli/ask.ts create mode 100644 src/cli/index.ts create mode 100644 src/cli/inspect.ts create mode 100644 src/cli/render.ts create mode 100644 src/components/sidebar/ProfileDrawer.tsx create mode 100644 src/components/workspace/CopyAgentContextButton.tsx create mode 100644 src/lib/agent/ask-context.ts create mode 100644 src/lib/agent/context.ts create mode 100644 src/lib/ai/complete.ts create mode 100644 src/lib/ai/providers.ts create mode 100644 src/lib/discovery/relationships.ts create mode 100644 src/lib/discovery/signals.ts create mode 100644 src/lib/discovery/sql-safety.ts create mode 100644 src/lib/duckdb-node/connection.ts create mode 100644 src/lib/duckdb-node/load.ts create mode 100644 src/lib/duckdb-node/profile.ts create mode 100644 src/lib/duckdb/profile.ts create mode 100644 src/lib/duckdb/sql-utils.ts create mode 100644 src/types/discovery.ts create mode 100644 test/ask.test.ts create mode 100644 test/discovery.test.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2a1467e..27325cb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,12 +6,19 @@ on: pull_request: branches: [main] +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + permissions: contents: read jobs: verify: runs-on: ubuntu-latest + container: + image: mcr.microsoft.com/playwright:v1.58.2-noble + timeout-minutes: 20 steps: - name: Checkout @@ -38,8 +45,6 @@ jobs: - name: Build run: npm run build - - name: Install Playwright browser - run: npx playwright install --with-deps chromium - - name: Run e2e tests + timeout-minutes: 
10 run: npm test -- --reporter=line diff --git a/.gitignore b/.gitignore index 4d7a133..1e798dd 100644 --- a/.gitignore +++ b/.gitignore @@ -36,6 +36,9 @@ yarn-error.log* # generated wasm files /public/duckdb/ +# querypad CLI inspection artifacts +.querypad/ + # vercel .vercel @@ -51,7 +54,6 @@ yarn-error.log* # internal docs FEEDBACK.md -ROADMAP-PHASE4.md DEMO.md docs/SHOW-HN.md diff --git a/e2e/app.spec.ts b/e2e/app.spec.ts index 35039d5..27a4f56 100644 --- a/e2e/app.spec.ts +++ b/e2e/app.spec.ts @@ -8,15 +8,15 @@ test.describe("QueryPad", () => { await expect(page.locator("text=QueryPad")).toBeVisible({ timeout: 15000 }); // Sidebar should show sample tables - await expect(page.getByRole("button", { name: "employees" })).toBeVisible({ timeout: 10000 }); - await expect(page.getByRole("button", { name: "departments" })).toBeVisible(); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 10000 }); + await expect(page.getByRole("button", { name: "departments", exact: true })).toBeVisible(); }); test("executes sample SQL query and shows results", async ({ page }) => { await page.goto("/"); // Wait for workspace to load with sample data - await expect(page.getByRole("button", { name: "employees" })).toBeVisible({ timeout: 15000 }); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); // The sample query should be prefilled — click Run button await page.getByRole("button", { name: "Run" }).click(); @@ -26,9 +26,53 @@ test.describe("QueryPad", () => { await expect(page.getByText("avg_salary", { exact: true }).last()).toBeVisible(); }); + test("shows a data profile for a sample table", async ({ page }) => { + await page.goto("/"); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); + + await page.getByRole("button", { name: "Profile employees" }).click(); + + await expect(page.getByText("employees 
profile")).toBeVisible({ timeout: 10000 }); + await expect(page.getByText("salary", { exact: true }).first()).toBeVisible(); + await expect(page.getByText(/Null \d/).first()).toBeVisible(); + await expect(page.getByText(/Distinct/).first()).toBeVisible(); + }); + + test("copies agent context with schema and query state", async ({ page, context }) => { + await context.grantPermissions(["clipboard-read", "clipboard-write"]); + await page.goto("/"); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); + + await page.getByRole("button", { name: "Run" }).click(); + await expect(page.getByText("dept_name", { exact: true }).first()).toBeVisible({ timeout: 10000 }); + + await page.getByRole("button", { name: "Copy context" }).click(); + const copied = await page.evaluate(() => navigator.clipboard.readText()); + + expect(copied).toContain("# QueryPad Context"); + expect(copied).toContain("### employees"); + expect(copied).toContain("### departments"); + expect(copied).toContain("SELECT d.dept_name"); + expect(copied).toContain("## Latest Result"); + }); + + test("can switch AI SQL assistant provider", async ({ page }) => { + await page.goto("/"); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); + + await page.getByRole("button", { name: /AI/ }).click(); + await expect(page.getByPlaceholder("Enter your Anthropic API key (sk-ant-...)")).toBeVisible(); + + await page.getByRole("button", { name: "Use OpenAI" }).click(); + await expect(page.getByPlaceholder("Enter your OpenAI API key (sk-...)")).toBeVisible(); + + await page.getByRole("button", { name: "Use Claude" }).click(); + await expect(page.getByPlaceholder("Enter your Anthropic API key (sk-ant-...)")).toBeVisible(); + }); + test("can switch between SQL and Pipeline mode", async ({ page }) => { await page.goto("/"); - await expect(page.getByRole("button", { name: "employees" })).toBeVisible({ timeout: 15000 
}); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); // Click Pipeline button await page.locator("button", { hasText: "Pipeline" }).click(); @@ -43,7 +87,7 @@ test.describe("QueryPad", () => { test("shows welcome banner with sample data", async ({ page }) => { await page.goto("/"); - await expect(page.getByRole("button", { name: "employees" })).toBeVisible({ timeout: 15000 }); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); // Welcome banner should be visible await expect( @@ -53,14 +97,14 @@ test.describe("QueryPad", () => { test("can clear workspace", async ({ page }) => { await page.goto("/"); - await expect(page.getByRole("button", { name: "employees" })).toBeVisible({ timeout: 15000 }); + await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); // Click Clear button await page.locator("button", { hasText: "Clear" }).click(); // Should show the drop zone (empty state) await expect( - page.getByRole("button", { name: "employees" }) + page.getByRole("button", { name: "employees", exact: true }) ).not.toBeVisible({ timeout: 5000 }); }); }); diff --git a/fixtures/data/events.csv b/fixtures/data/events.csv new file mode 100644 index 0000000..3fb6fd2 --- /dev/null +++ b/fixtures/data/events.csv @@ -0,0 +1,11 @@ +id,user_id,event_type +1,1,login +2,2,login +3,1,view +4,3,login +5,4,view +6,5,login +7,2,purchase +8,3,view +9,5,purchase +10,1,logout diff --git a/fixtures/data/payments.csv b/fixtures/data/payments.csv new file mode 100644 index 0000000..3595a97 --- /dev/null +++ b/fixtures/data/payments.csv @@ -0,0 +1,9 @@ +id,user_id,amount +1,1,42.00 +2,1,18.50 +3,3,99.99 +4,3,12.00 +5,5,7.25 +6,5,55.00 +7,1,30.00 +8,3,21.00 diff --git a/fixtures/data/users.csv b/fixtures/data/users.csv new file mode 100644 index 0000000..e164475 --- /dev/null +++ b/fixtures/data/users.csv @@ -0,0 +1,6 @@ 
+id,name,plan +1,Alice,paid +2,Bob,free +3,Carol,paid +4,Dan,free +5,Erin,paid diff --git a/package-lock.json b/package-lock.json index 624a0df..bbe4d93 100644 --- a/package-lock.json +++ b/package-lock.json @@ -12,6 +12,7 @@ "dependencies": { "@dagrejs/dagre": "^2.0.4", "@duckdb/duckdb-wasm": "^1.33.1-dev20.0", + "@duckdb/node-api": "^1.5.3-r.3", "@monaco-editor/react": "^4.7.0", "@tanstack/react-virtual": "^3.13.23", "@vercel/analytics": "^2.0.1", @@ -40,6 +41,7 @@ "eslint": "^9", "eslint-config-next": "16.2.0", "tailwindcss": "^4", + "tsx": "^4.22.4", "typescript": "^5" } }, @@ -429,6 +431,150 @@ "qs": "^6.14.1" } }, + "node_modules/@duckdb/node-api": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-api/-/node-api-1.5.3-r.3.tgz", + "integrity": "sha512-FzuL6sevuFfEFwkgiUMRMUAj4TaVqV//L0oo2FVZ9s9oYpLpALF9qZyQv2ucclTNQZwDCkm8+e6yLMc6t8IjlA==", + "license": "MIT", + "dependencies": { + "@duckdb/node-bindings": "1.5.3-r.3" + } + }, + "node_modules/@duckdb/node-bindings": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings/-/node-bindings-1.5.3-r.3.tgz", + "integrity": "sha512-Dphw1a9kKXZnCiWX1YCEAJsQ7WJQO2Ikgxy7m8jy0QVXqAwB9esr5NGsuEL3vMKL7velZHeZCjGOMnHZEcIsdg==", + "license": "MIT", + "dependencies": { + "detect-libc": "^2.1.2" + }, + "optionalDependencies": { + "@duckdb/node-bindings-darwin-arm64": "1.5.3-r.3", + "@duckdb/node-bindings-darwin-x64": "1.5.3-r.3", + "@duckdb/node-bindings-linux-arm64": "1.5.3-r.3", + "@duckdb/node-bindings-linux-arm64-musl": "1.5.3-r.3", + "@duckdb/node-bindings-linux-x64": "1.5.3-r.3", + "@duckdb/node-bindings-linux-x64-musl": "1.5.3-r.3", + "@duckdb/node-bindings-win32-arm64": "1.5.3-r.3", + "@duckdb/node-bindings-win32-x64": "1.5.3-r.3" + } + }, + "node_modules/@duckdb/node-bindings-darwin-arm64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-darwin-arm64/-/node-bindings-darwin-arm64-1.5.3-r.3.tgz", + 
"integrity": "sha512-ttD8QBesgzHu7Sc4qouuIGLM7PWedLW8GvFbnZEyMqk24mQz1HWFgaT0ivw6nDRaDPUQLB9QnAOq8MZUh1zWHQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@duckdb/node-bindings-darwin-x64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-darwin-x64/-/node-bindings-darwin-x64-1.5.3-r.3.tgz", + "integrity": "sha512-Vp9MYtoYf6zUWHdCmHXwUcJlHq3YaaIeULWeSiPUM1hsDflLiZKXtz5i250Ulz03VsfWBjpO4wdM99sjjrYKkg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@duckdb/node-bindings-linux-arm64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-arm64/-/node-bindings-linux-arm64-1.5.3-r.3.tgz", + "integrity": "sha512-3HLcrzQE83947JS51UVR7C9qnXQMltCOk4Dnhiz1CD+9u32DGLMgPTIIxclk7O+Q7EwfqzD8JV86Ud+LT1crcQ==", + "cpu": [ + "arm64" + ], + "libc": [ + "glibc" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@duckdb/node-bindings-linux-arm64-musl": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-arm64-musl/-/node-bindings-linux-arm64-musl-1.5.3-r.3.tgz", + "integrity": "sha512-IadRyx+98FEynKLXAk2MzReinFgduiDXgNd5Z8c5VKch+8FgBfqkEUYGOnBMMUPT8kuheKdLj23vpWXaCzOgoQ==", + "cpu": [ + "arm64" + ], + "libc": [ + "musl" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@duckdb/node-bindings-linux-x64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-x64/-/node-bindings-linux-x64-1.5.3-r.3.tgz", + "integrity": "sha512-TXndAL0ZoETq17Df6wB+SUZjLGDmOsKuDSySxB+wy6sHfpRtbDgQibyXRlajVeUkRDwSzBFC5ymy16YG0Fl4iw==", + "cpu": [ + "x64" + ], + "libc": [ + "glibc" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@duckdb/node-bindings-linux-x64-musl": { + "version": 
"1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-linux-x64-musl/-/node-bindings-linux-x64-musl-1.5.3-r.3.tgz", + "integrity": "sha512-5bulS16YhftXcarki4tvCufVslntpQDLOEF6RZ+FSMOGiv5d7SDXqklmVRy4DKW3C5ekgN7S2oYzuGL/ss9BuA==", + "cpu": [ + "x64" + ], + "libc": [ + "musl" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@duckdb/node-bindings-win32-arm64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-win32-arm64/-/node-bindings-win32-arm64-1.5.3-r.3.tgz", + "integrity": "sha512-55Vu13S0jUudiAGlNWJd7UvlW1iKjwWehD8s93jBCNm0AdE/EJN4nz5rQ0IuWzPWXpMjAYuKu00yE7NdtbTyug==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@duckdb/node-bindings-win32-x64": { + "version": "1.5.3-r.3", + "resolved": "https://registry.npmjs.org/@duckdb/node-bindings-win32-x64/-/node-bindings-win32-x64-1.5.3-r.3.tgz", + "integrity": "sha512-rlOc9ltWQNHuDq99Ah8XaD80nN1ucrSK5AcH/7ibSp9ogX/jswPYlRVE7ODFJAjnQNf8bVvs++Mp+wyGvuG7ag==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, "node_modules/@emnapi/core": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.9.1.tgz", @@ -734,6 +880,23 @@ "node": ">=12" } }, + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.28.1.tgz", + "integrity": "sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@esbuild/netbsd-x64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", @@ -750,6 +913,23 @@ "node": ">=12" } }, + "node_modules/@esbuild/openbsd-arm64": { + 
"version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.28.1.tgz", + "integrity": "sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@esbuild/openbsd-x64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", @@ -766,6 +946,23 @@ "node": ">=12" } }, + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.28.1.tgz", + "integrity": "sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@esbuild/sunos-x64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", @@ -4076,7 +4273,6 @@ "version": "2.1.2", "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", - "devOptional": true, "license": "Apache-2.0", "engines": { "node": ">=8" @@ -8256,6 +8452,458 @@ "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", "license": "0BSD" }, + "node_modules/tsx": { + "version": "4.22.4", + "resolved": "https://registry.npmjs.org/tsx/-/tsx-4.22.4.tgz", + "integrity": "sha512-X8EX+XV4QR5xCsrgxaED954zTDfY8KqlDtskKEL0cHhyS/P8b4IFOvGDQpsC9Q1XnLq915wEfwwY/zzskCtmhg==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "~0.28.0" + }, + "bin": { + "tsx": "dist/cli.mjs" + }, + "engines": { + "node": ">=18.0.0" + 
}, + "optionalDependencies": { + "fsevents": "~2.3.3" + } + }, + "node_modules/tsx/node_modules/@esbuild/aix-ppc64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.28.1.tgz", + "integrity": "sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/android-arm": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.28.1.tgz", + "integrity": "sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/android-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.28.1.tgz", + "integrity": "sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/android-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.28.1.tgz", + "integrity": "sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/darwin-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.28.1.tgz", + "integrity": 
"sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/darwin-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.28.1.tgz", + "integrity": "sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/freebsd-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.28.1.tgz", + "integrity": "sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/freebsd-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.28.1.tgz", + "integrity": "sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-arm": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.28.1.tgz", + "integrity": "sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + 
"node_modules/tsx/node_modules/@esbuild/linux-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.28.1.tgz", + "integrity": "sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-ia32": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.28.1.tgz", + "integrity": "sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-loong64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.28.1.tgz", + "integrity": "sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-mips64el": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.28.1.tgz", + "integrity": "sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-ppc64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.28.1.tgz", + "integrity": 
"sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-riscv64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.28.1.tgz", + "integrity": "sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-s390x": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.28.1.tgz", + "integrity": "sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/linux-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.28.1.tgz", + "integrity": "sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/netbsd-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.28.1.tgz", + "integrity": "sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=18" + } + }, + 
"node_modules/tsx/node_modules/@esbuild/openbsd-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.28.1.tgz", + "integrity": "sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/sunos-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.28.1.tgz", + "integrity": "sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/win32-arm64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.28.1.tgz", + "integrity": "sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/win32-ia32": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.28.1.tgz", + "integrity": "sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/@esbuild/win32-x64": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.28.1.tgz", + "integrity": "sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==", + 
"cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/tsx/node_modules/esbuild": { + "version": "0.28.1", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.28.1.tgz", + "integrity": "sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.28.1", + "@esbuild/android-arm": "0.28.1", + "@esbuild/android-arm64": "0.28.1", + "@esbuild/android-x64": "0.28.1", + "@esbuild/darwin-arm64": "0.28.1", + "@esbuild/darwin-x64": "0.28.1", + "@esbuild/freebsd-arm64": "0.28.1", + "@esbuild/freebsd-x64": "0.28.1", + "@esbuild/linux-arm": "0.28.1", + "@esbuild/linux-arm64": "0.28.1", + "@esbuild/linux-ia32": "0.28.1", + "@esbuild/linux-loong64": "0.28.1", + "@esbuild/linux-mips64el": "0.28.1", + "@esbuild/linux-ppc64": "0.28.1", + "@esbuild/linux-riscv64": "0.28.1", + "@esbuild/linux-s390x": "0.28.1", + "@esbuild/linux-x64": "0.28.1", + "@esbuild/netbsd-arm64": "0.28.1", + "@esbuild/netbsd-x64": "0.28.1", + "@esbuild/openbsd-arm64": "0.28.1", + "@esbuild/openbsd-x64": "0.28.1", + "@esbuild/openharmony-arm64": "0.28.1", + "@esbuild/sunos-x64": "0.28.1", + "@esbuild/win32-arm64": "0.28.1", + "@esbuild/win32-ia32": "0.28.1", + "@esbuild/win32-x64": "0.28.1" + } + }, "node_modules/type-check": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", diff --git a/package.json b/package.json index 13154bd..72a103e 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "querypad", "version": "0.6.0", "private": true, - "description": "Browser-native SQL playground powered by DuckDB-Wasm", + "description": "Local-first SQL scratchpad powered by DuckDB-Wasm", "author": "QueryPad 
Contributors", "license": "MIT", "repository": { @@ -10,6 +10,9 @@ "url": "https://github.com/vericontext/querypad.git" }, "homepage": "https://querypad.io", + "bin": { + "querypad": "src/cli/index.ts" + }, "keywords": [ "sql", "duckdb", @@ -24,6 +27,7 @@ "dev": "next dev", "build": "next build", "start": "next start", + "querypad": "tsx src/cli/index.ts", "lint": "eslint", "typecheck": "tsc --noEmit", "check:version": "node scripts/check-version.mjs", @@ -33,11 +37,13 @@ "partykit:deploy": "partykit deploy", "postinstall": "node scripts/copy-duckdb-wasm.mjs", "test": "playwright test", - "test:ui": "playwright test --ui" + "test:ui": "playwright test --ui", + "test:cli": "node --import tsx --test test/*.test.ts" }, "dependencies": { "@dagrejs/dagre": "^2.0.4", "@duckdb/duckdb-wasm": "^1.33.1-dev20.0", + "@duckdb/node-api": "^1.5.3-r.3", "@monaco-editor/react": "^4.7.0", "@tanstack/react-virtual": "^3.13.23", "@vercel/analytics": "^2.0.1", @@ -66,6 +72,7 @@ "eslint": "^9", "eslint-config-next": "16.2.0", "tailwindcss": "^4", + "tsx": "^4.22.4", "typescript": "^5" } } diff --git a/scripts/check-version.mjs b/scripts/check-version.mjs index 7d9cf25..ad2dcbc 100644 --- a/scripts/check-version.mjs +++ b/scripts/check-version.mjs @@ -15,10 +15,11 @@ function normalizeVersion(version) { } function fail(message) { - console.error(`Version check failed: ${message}`); + failures.push(message); process.exitCode = 1; } +const failures = []; const packageJson = readJson("package.json"); const packageLock = readJson("package-lock.json"); const changelog = readFileSync(join(root, "CHANGELOG.md"), "utf8"); @@ -45,6 +46,30 @@ if (!changelogVersion) { fail(`package.json (${packageVersion}) and CHANGELOG latest release (${changelogVersion}) differ.`); } -if (!process.exitCode) { +const humanOutput = + process.argv.includes("--human") || + process.env.npm_lifecycle_event === "check:version"; + +if (humanOutput) { + for (const failure of failures) { + console.error(`Version check 
failed: ${failure}`); + } +} + +if (failures.length === 0 && humanOutput) { console.log(`Version metadata is consistent at v${packageVersion}.`); } + +if (!humanOutput) { + if (failures.length === 0) { + console.log(JSON.stringify({ continue: true })); + } else { + console.log( + JSON.stringify({ + decision: "block", + reason: failures.map((failure) => `Version check failed: ${failure}`).join("\n"), + }) + ); + } + process.exitCode = 0; +} diff --git a/src/cli/ai-env.ts b/src/cli/ai-env.ts new file mode 100644 index 0000000..de12703 --- /dev/null +++ b/src/cli/ai-env.ts @@ -0,0 +1,39 @@ +import type { AiProvider } from "../lib/ai/providers"; +import { DEFAULT_AI_PROVIDER, isAiProvider } from "../lib/ai/providers"; + +export interface AiCredentials { + provider: AiProvider; + apiKey: string; +} + +const ENV_KEYS: Record<AiProvider, string> = { + anthropic: "ANTHROPIC_API_KEY", + openai: "OPENAI_API_KEY", +}; + +/** + * Resolve the AI provider + API key for the CLI from flags and environment. + * Precedence: explicit `provider` arg → `QUERYPAD_AI_PROVIDER` → default (anthropic). + * The key comes from the provider-specific env var. + */ +export function resolveAiCredentials(provider?: string): AiCredentials { + const fromEnv = process.env.QUERYPAD_AI_PROVIDER; + const selected = provider ?? fromEnv ?? DEFAULT_AI_PROVIDER; + + if (!isAiProvider(selected)) { + throw new Error( + `Unknown AI provider "${selected}". Use "anthropic" or "openai".` + ); + } + + const envVar = ENV_KEYS[selected]; + const apiKey = process.env[envVar]; + if (!apiKey) { + throw new Error( + `Missing API key for ${selected}. Set ${envVar} in your environment, ` + + `e.g. \`${envVar}=sk-... 
querypad ask "..."\`.` + ); + } + + return { provider: selected, apiKey }; +} diff --git a/src/cli/artifacts.ts b/src/cli/artifacts.ts new file mode 100644 index 0000000..0504313 --- /dev/null +++ b/src/cli/artifacts.ts @@ -0,0 +1,123 @@ +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import path from "node:path"; +import type { TableProfile } from "../types"; +import type { DiscoveryReport, Relationship } from "../types/discovery"; + +const ARTIFACT_DIR = ".querypad"; + +export interface CachedArtifacts { + relationships: Relationship[] | null; + profiles: TableProfile[] | null; +} + +async function readJson(filePath: string): Promise { + try { + return JSON.parse(await readFile(filePath, "utf8")) as T; + } catch { + return null; + } +} + +/** Read previously written `.querypad/` artifacts for context reuse (null when absent). */ +export async function readArtifacts(folder: string): Promise { + const dir = path.resolve(folder, ARTIFACT_DIR); + const relDoc = await readJson<{ relationships: Relationship[] }>( + path.join(dir, "relationships.json") + ); + const schemaDoc = await readJson<{ tables: TableProfile[] }>(path.join(dir, "schema.json")); + return { + relationships: relDoc?.relationships ?? null, + profiles: schemaDoc?.tables ?? null, + }; +} + +export interface WrittenArtifacts { + dir: string; + schemaPath: string; + relationshipsPath: string; + summaryPath: string; +} + +/** Write schema.json, relationships.json and inspect-summary.md under /.querypad/. 
*/ +export async function writeArtifacts( + folder: string, + report: DiscoveryReport, + skipped: string[] +): Promise { + const dir = path.resolve(folder, ARTIFACT_DIR); + await mkdir(dir, { recursive: true }); + + const schemaPath = path.join(dir, "schema.json"); + const relationshipsPath = path.join(dir, "relationships.json"); + const summaryPath = path.join(dir, "inspect-summary.md"); + + await writeFile( + schemaPath, + JSON.stringify({ generatedAt: report.generatedAt, tables: report.profiles }, null, 2) + ); + await writeFile( + relationshipsPath, + JSON.stringify( + { generatedAt: report.generatedAt, relationships: report.relationships }, + null, + 2 + ) + ); + await writeFile(summaryPath, buildSummary(report, skipped)); + + return { dir, schemaPath, relationshipsPath, summaryPath }; +} + +/** Human- and agent-readable markdown overview of the inspection. */ +export function buildSummary(report: DiscoveryReport, skipped: string[]): string { + const lines: string[] = ["# QueryPad Inspection", ""]; + lines.push(`Generated: ${new Date(report.generatedAt).toISOString()}`, ""); + + lines.push(`## Tables (${report.profiles.length})`, ""); + for (const table of report.profiles) { + lines.push( + `### ${table.tableName}`, + `Rows: ${table.rowCount.toLocaleString()} · Columns: ${table.columnCount}`, + "", + "| Column | Type | Null | Distinct |", + "| --- | --- | --- | --- |" + ); + for (const column of table.columns) { + const nullPercent = `${column.nullPercent.toFixed(column.nullPercent >= 10 ? 0 : 1)}%`; + lines.push( + `| ${column.name} | ${column.type} | ${nullPercent} | ${column.distinctCount?.toLocaleString() ?? 
"n/a"} |` + ); + } + lines.push(""); + } + + lines.push(`## Relationships (${report.relationships.length})`, ""); + if (report.relationships.length === 0) { + lines.push("No relationships inferred above the confidence threshold.", ""); + } else { + lines.push("| Foreign | References | Confidence | Cardinality | Overlap |"); + lines.push("| --- | --- | --- | --- | --- |"); + for (const rel of report.relationships) { + const from = `${rel.from.table}.${rel.from.column}`; + const to = `${rel.to.table}.${rel.to.column}`; + const overlap = `${Math.round(rel.signals.valueOverlap * 100)}%`; + lines.push(`| ${from} | ${to} | ${rel.confidence}% | ${rel.cardinality} | ${overlap} |`); + } + lines.push(""); + } + + if (skipped.length > 0) { + lines.push(`## Skipped files (${skipped.length})`, ""); + lines.push(skipped.map((name) => `- ${name} (unsupported type)`).join("\n"), ""); + } + + lines.push( + "## Next steps", + "", + "- Review inferred relationships and adjust as needed.", + "- Feed `.querypad/schema.json` + `.querypad/relationships.json` to an AI agent to reason about the dataset.", + "" + ); + + return lines.join("\n"); +} diff --git a/src/cli/ask.ts b/src/cli/ask.ts new file mode 100644 index 0000000..fdee836 --- /dev/null +++ b/src/cli/ask.ts @@ -0,0 +1,116 @@ +import { complete } from "../lib/ai/complete"; +import { SQL_SYSTEM_PROMPT, buildSqlInput } from "../lib/ai/generate-sql"; +import { buildAskContext } from "../lib/agent/ask-context"; +import { discoverRelationships } from "../lib/discovery/relationships"; +import { isReadOnlyQuery, stripSqlFences } from "../lib/discovery/sql-safety"; +import { createNodeDb, type QueryResultRows } from "../lib/duckdb-node/connection"; +import { loadFolder } from "../lib/duckdb-node/load"; +import { profileTable } from "../lib/duckdb-node/profile"; +import type { Relationship } from "../types/discovery"; +import { resolveAiCredentials } from "./ai-env"; +import { readArtifacts } from "./artifacts"; +import { renderTable } 
from "./render"; + +const ANALYST_SYSTEM_PROMPT = `You are a data analyst. Given a question, the SQL that was run, and a sample of the result rows, state the answer as a concise 1-3 sentence finding. No preamble, do not restate the SQL, do not apologize.`; + +/** Injectable AI surface so the pipeline can be tested without network calls. */ +export interface AskAi { + generateSql(input: { context: string; question: string }): Promise; + generateInsight(input: { question: string; sql: string; sample: string }): Promise; +} + +export interface RunAskOptions { + question: string; + folder: string; + provider?: string; + showSql?: boolean; + /** Injected in tests; built from env credentials otherwise. */ + ai?: AskAi; + log?: (line: string) => void; +} + +export interface AskResult { + sql: string; + result: QueryResultRows | null; + insight: string | null; +} + +function realAi(provider: string | undefined): AskAi { + const creds = resolveAiCredentials(provider); + return { + generateSql: ({ context, question }) => + complete({ + provider: creds.provider, + apiKey: creds.apiKey, + system: SQL_SYSTEM_PROMPT, + input: buildSqlInput(context, question), + }), + generateInsight: ({ question, sql, sample }) => + complete({ + provider: creds.provider, + apiKey: creds.apiKey, + system: ANALYST_SYSTEM_PROMPT, + input: `Question: ${question}\n\nSQL:\n${sql}\n\nResult sample:\n${sample}`, + maxTokens: 300, + }), + }; +} + +export async function runAsk(options: RunAskOptions): Promise { + const log = options.log ?? ((line: string) => console.log(line)); + const ai = options.ai ?? realAi(options.provider); + + const db = await createNodeDb(); + try { + const { tables } = await loadFolder(options.folder, db.runner); + if (tables.length === 0) { + throw new Error( + `No supported data files found in ${options.folder} ` + + "(.parquet, .csv, .tsv, .json, .jsonl, .ndjson)." + ); + } + + // Prefer cached .querypad/ artifacts; otherwise profile + discover on the fly. 
+ const cached = await readArtifacts(options.folder); + let relationships: Relationship[]; + if (cached.relationships) { + relationships = cached.relationships; + } else { + const now = Date.now(); + const profiles = + cached.profiles ?? + (await Promise.all(tables.map((table) => profileTable(table, db.runner, now)))); + relationships = await discoverRelationships(profiles, db.runner); + } + + const context = buildAskContext({ tables, relationships }); + const sql = stripSqlFences(await ai.generateSql({ context, question: options.question })); + + log("-- SQL"); + log(sql); + + if (options.showSql) { + return { sql, result: null, insight: null }; + } + + if (!isReadOnlyQuery(sql)) { + throw new Error(`Refusing to execute non-read-only SQL:\n${sql}`); + } + + const result = await db.query(sql); + log(""); + log(renderTable(result)); + + const insight = await ai.generateInsight({ + question: options.question, + sql, + sample: renderTable(result, 20), + }); + log(""); + log(`Insight: ${insight.trim()}`); + + return { sql, result, insight }; + } finally { + db.close(); + } +} diff --git a/src/cli/index.ts b/src/cli/index.ts new file mode 100644 index 0000000..9d3903f --- /dev/null +++ b/src/cli/index.ts @@ -0,0 +1,94 @@ +#!/usr/bin/env -S npx tsx +import { runAsk } from "./ask"; +import { runInspect } from "./inspect"; + +const HELP = `querypad — local-first dataset understanding + +Usage: + querypad inspect [folder] Profile a folder of data files and infer relationships + (writes .querypad/ artifacts). Defaults to the current directory. + querypad ask "" [folder] + Answer a natural-language question: generate SQL using the + inferred relationships, run it, and explain the result. + querypad help Show this help + +Options for ask: + --provider AI provider (default: anthropic, or QUERYPAD_AI_PROVIDER) + --show-sql Print the generated SQL without executing + +Environment: ANTHROPIC_API_KEY or OPENAI_API_KEY for the chosen provider. 
+Supported file types: .parquet, .csv, .tsv, .json, .jsonl, .ndjson +`; + +/** Split positional args from flags. Returns { positionals, flags }. */ +function parseArgs(args: string[]): { + positionals: string[]; + flags: Record; +} { + const positionals: string[] = []; + const flags: Record = {}; + for (let i = 0; i < args.length; i += 1) { + const arg = args[i]; + if (arg.startsWith("--")) { + const name = arg.slice(2); + const next = args[i + 1]; + if (next !== undefined && !next.startsWith("--")) { + flags[name] = next; + i += 1; + } else { + flags[name] = true; + } + } else { + positionals.push(arg); + } + } + return { positionals, flags }; +} + +async function main(argv: string[]): Promise { + const [command, ...rest] = argv; + + switch (command) { + case undefined: + case "help": + case "-h": + case "--help": + console.log(HELP); + return 0; + case "inspect": { + const { positionals } = parseArgs(rest); + const folder = positionals[0] ?? "."; + await runInspect(folder, Date.now()); + return 0; + } + case "ask": { + const { positionals, flags } = parseArgs(rest); + const question = positionals[0]; + if (!question) { + console.error('Usage: querypad ask "" [folder]\n'); + console.error(HELP); + return 1; + } + await runAsk({ + question, + folder: positionals[1] ?? ".", + provider: typeof flags.provider === "string" ? flags.provider : undefined, + showSql: flags["show-sql"] === true, + }); + return 0; + } + default: + console.error(`Unknown command: ${command}\n`); + console.error(HELP); + return 1; + } +} + +main(process.argv.slice(2)) + .then((code) => { + process.exitCode = code; + }) + .catch((err) => { + console.error(err instanceof Error ? 
err.message : err); + process.exitCode = 1; + }); diff --git a/src/cli/inspect.ts b/src/cli/inspect.ts new file mode 100644 index 0000000..45c8997 --- /dev/null +++ b/src/cli/inspect.ts @@ -0,0 +1,60 @@ +import path from "node:path"; +import { discoverRelationships } from "../lib/discovery/relationships"; +import { createNodeDb } from "../lib/duckdb-node/connection"; +import { loadFolder } from "../lib/duckdb-node/load"; +import { profileTable } from "../lib/duckdb-node/profile"; +import type { DiscoveryReport } from "../types/discovery"; +import { writeArtifacts } from "./artifacts"; + +/** + * `querypad inspect `: load every supported file in the folder, profile it, + * infer relationships, and write `.querypad/` artifacts. Returns the report. + */ +export async function runInspect(folder: string, now: number): Promise { + const resolved = path.resolve(folder); + const db = await createNodeDb(); + try { + console.log(`Inspecting ${resolved} ...`); + const { tables, skipped } = await loadFolder(resolved, db.runner); + + if (tables.length === 0) { + console.error( + "No supported data files found (.parquet, .csv, .tsv, .json, .jsonl, .ndjson)." + ); + const report: DiscoveryReport = { generatedAt: now, profiles: [], relationships: [] }; + await writeArtifacts(resolved, report, skipped); + return report; + } + + console.log(`Loaded ${tables.length} table(s). 
Profiling ...`); + const profiles = []; + for (const table of tables) { + profiles.push(await profileTable(table, db.runner, now)); + } + + console.log("Discovering relationships ..."); + const relationships = await discoverRelationships(profiles, db.runner); + + const report: DiscoveryReport = { generatedAt: now, profiles, relationships }; + const artifacts = await writeArtifacts(resolved, report, skipped); + + console.log(""); + console.log(`Tables: ${profiles.length}`); + console.log(`Relationships: ${relationships.length}`); + for (const rel of relationships) { + console.log( + ` ${rel.from.table}.${rel.from.column} ↳ ${rel.to.table}.${rel.to.column}` + + ` (${rel.confidence}%, ${rel.cardinality})` + ); + } + if (skipped.length > 0) { + console.log(`Skipped: ${skipped.length} unsupported file(s)`); + } + console.log(""); + console.log(`Wrote artifacts to ${artifacts.dir}`); + + return report; + } finally { + db.close(); + } +} diff --git a/src/cli/render.ts b/src/cli/render.ts new file mode 100644 index 0000000..d2bf892 --- /dev/null +++ b/src/cli/render.ts @@ -0,0 +1,42 @@ +import type { QueryResultRows } from "../lib/duckdb-node/connection"; + +const DEFAULT_ROW_CAP = 50; + +function formatCell(value: unknown): string { + if (value === null || value === undefined) return "NULL"; + if (typeof value === "bigint") return value.toString(); + if (typeof value === "number") return String(value); + if (typeof value === "boolean") return String(value); + if (value instanceof Date) return value.toISOString(); + if (typeof value === "object") { + if ("valueOf" in value) { + const unwrapped = (value as { valueOf(): unknown }).valueOf(); + if (unwrapped !== value) return formatCell(unwrapped); + } + return JSON.stringify(value); + } + return String(value); +} + +/** Render a query result as a fixed-width text table, capped at `rowCap` rows. 
*/ +export function renderTable(result: QueryResultRows, rowCap = DEFAULT_ROW_CAP): string { + const { columns, rows } = result; + if (columns.length === 0) return "(no columns)"; + if (rows.length === 0) return "(0 rows)"; + + const shown = rows.slice(0, rowCap); + const cells = shown.map((row) => columns.map((col) => formatCell(row[col]))); + const widths = columns.map((col, i) => + Math.max(col.length, ...cells.map((row) => row[i].length)) + ); + + const pad = (text: string, width: number) => text.padEnd(width); + const header = columns.map((col, i) => pad(col, widths[i])).join(" "); + const divider = widths.map((w) => "-".repeat(w)).join(" "); + const body = cells.map((row) => row.map((cell, i) => pad(cell, widths[i])).join(" ")); + + const lines = [header, divider, ...body]; + const omitted = rows.length - shown.length; + if (omitted > 0) lines.push(`… ${omitted.toLocaleString()} more row(s) not shown`); + return lines.join("\n"); +} diff --git a/src/components/editor/AiAssistant.tsx b/src/components/editor/AiAssistant.tsx index 264d3a8..7d4c0d5 100644 --- a/src/components/editor/AiAssistant.tsx +++ b/src/components/editor/AiAssistant.tsx @@ -4,7 +4,18 @@ import { useState, useRef, useCallback, useEffect } from "react"; import { useWorkspaceStore } from "@/stores/workspace-store"; import { buildSchemaContext } from "@/lib/ai/schema-context"; import { generateSql } from "@/lib/ai/generate-sql"; -import { getApiKey, setApiKey, clearApiKey } from "@/lib/ai/api-key"; +import { + getAiProvider, + setAiProvider, + getApiKey, + setApiKey, + clearApiKey, +} from "@/lib/ai/api-key"; +import { + AI_PROVIDER_OPTIONS, + getAiProviderConfig, + type AiProvider, +} from "@/lib/ai/providers"; interface AiAssistantProps { onAccept: (sql: string) => void; @@ -22,15 +33,19 @@ export default function AiAssistant({ const [generatedSql, setGeneratedSql] = useState(""); const [isGenerating, setIsGenerating] = useState(false); const [error, setError] = useState(null); + const 
[provider, setProviderState] = useState("anthropic"); const [apiKey, setApiKeyState] = useState(null); const [keyInput, setKeyInput] = useState(""); const [showKeyForm, setShowKeyForm] = useState(false); const inputRef = useRef(null); const keyInputRef = useRef(null); const abortRef = useRef(false); + const providerConfig = getAiProviderConfig(provider); useEffect(() => { - setApiKeyState(getApiKey()); + const storedProvider = getAiProvider(); + setProviderState(storedProvider); + setApiKeyState(getApiKey(storedProvider)); }, []); useEffect(() => { @@ -44,12 +59,12 @@ export default function AiAssistant({ const handleSaveKey = useCallback(() => { const trimmed = keyInput.trim(); if (!trimmed) return; - setApiKey(trimmed); + setApiKey(provider, trimmed); setApiKeyState(trimmed); setKeyInput(""); setShowKeyForm(false); setError(null); - }, [keyInput]); + }, [keyInput, provider]); const handleChangeKey = useCallback(() => { setShowKeyForm(true); @@ -57,6 +72,17 @@ export default function AiAssistant({ setError(null); }, []); + const handleProviderChange = useCallback((nextProvider: AiProvider) => { + const nextKey = getApiKey(nextProvider); + setProviderState(nextProvider); + setAiProvider(nextProvider); + setApiKeyState(nextKey); + setShowKeyForm(!nextKey); + setKeyInput(""); + setGeneratedSql(""); + setError(null); + }, []); + const handleGenerate = useCallback(async () => { if (!prompt.trim() || isGenerating || !apiKey) return; setIsGenerating(true); @@ -67,7 +93,7 @@ export default function AiAssistant({ try { const schema = buildSchemaContext(tables); let sql = ""; - for await (const chunk of generateSql(apiKey, prompt, schema)) { + for await (const chunk of generateSql({ provider, apiKey, prompt, schema })) { if (abortRef.current) break; sql += chunk; setGeneratedSql(sql); @@ -76,13 +102,14 @@ export default function AiAssistant({ const message = err instanceof Error ? 
err.message : String(err); setError(message); if (message.includes("Invalid API key")) { - clearApiKey(); + clearApiKey(provider); setApiKeyState(null); + setShowKeyForm(true); } } finally { setIsGenerating(false); } - }, [prompt, tables, isGenerating, apiKey]); + }, [prompt, tables, isGenerating, apiKey, provider]); const handleKeyDown = useCallback( (e: React.KeyboardEvent) => { @@ -111,6 +138,25 @@ export default function AiAssistant({ [handleSaveKey, onClose] ); + const providerSelector = ( +
+ {AI_PROVIDER_OPTIONS.map((option) => ( + + ))} +
+ ); + // Show key input form if (!apiKey || showKeyForm) { return ( @@ -119,13 +165,14 @@ export default function AiAssistant({ + {providerSelector} setKeyInput(e.target.value)} onKeyDown={handleKeyInputKeyDown} - placeholder="Enter your Anthropic API key (sk-ant-...)" + placeholder={providerConfig.keyPlaceholder} className="flex-1 px-2 py-1 text-sm border border-purple-200 rounded bg-white focus:outline-none focus:ring-2 focus:ring-purple-400 focus:border-transparent font-mono" /> diff --git a/src/components/sidebar/ProfileDrawer.tsx b/src/components/sidebar/ProfileDrawer.tsx new file mode 100644 index 0000000..8e7f27d --- /dev/null +++ b/src/components/sidebar/ProfileDrawer.tsx @@ -0,0 +1,177 @@ +"use client"; + +import { useEffect } from "react"; +import { useWorkspaceStore } from "@/stores/workspace-store"; +import type { ColumnProfile, TableProfileState } from "@/types"; + +const IDLE_PROFILE_STATE: TableProfileState = { + status: "idle", + profile: null, + error: null, +}; + +function formatNumber(value: number | null): string { + if (value === null || !Number.isFinite(value)) return "n/a"; + return new Intl.NumberFormat("en-US", { + maximumFractionDigits: Math.abs(value) >= 100 ? 0 : 2, + }).format(value); +} + +function formatScalar(value: string | number | null): string { + if (value === null || value === "") return "n/a"; + if (typeof value === "number") return formatNumber(value); + return value.length > 48 ? `${value.slice(0, 45)}...` : value; +} + +function formatPercent(value: number): string { + return `${value >= 10 ? value.toFixed(0) : value.toFixed(1)}%`; +} + +function describeColumn(profile: ColumnProfile): string { + if (profile.kind === "numeric") { + const range = + profile.min !== null || profile.max !== null + ? 
`${formatScalar(profile.min)} to ${formatScalar(profile.max)}` + : "n/a"; + return `range ${range}, avg ${formatNumber(profile.avg)}`; + } + + if (profile.kind === "date") { + return `range ${formatScalar(profile.min)} to ${formatScalar(profile.max)}`; + } + + if (profile.topValues.length > 0) { + return profile.topValues + .map((item) => `${item.value || "n/a"} (${item.count.toLocaleString()})`) + .join(", "); + } + + return "No non-null values"; +} + +interface ProfileDrawerProps { + tableName: string; + onClose: () => void; +} + +export default function ProfileDrawer({ tableName, onClose }: ProfileDrawerProps) { + const table = useWorkspaceStore((s) => s.tables.find((t) => t.name === tableName)); + const profileState = useWorkspaceStore((s) => s.tableProfiles[tableName]) ?? IDLE_PROFILE_STATE; + const loadTableProfile = useWorkspaceStore((s) => s.loadTableProfile); + + useEffect(() => { + if (table && profileState.status === "idle") { + void loadTableProfile(tableName); + } + }, [loadTableProfile, profileState.status, table, tableName]); + + if (!table) return null; + + return ( + + ); +} diff --git a/src/components/sidebar/Sidebar.tsx b/src/components/sidebar/Sidebar.tsx index 1b316a3..8fd1f1a 100644 --- a/src/components/sidebar/Sidebar.tsx +++ b/src/components/sidebar/Sidebar.tsx @@ -1,16 +1,22 @@ "use client"; -import { useCallback } from "react"; +import { useCallback, useState } from "react"; import { useWorkspaceStore } from "@/stores/workspace-store"; import { SAMPLE_TABLE_NAMES } from "@/lib/constants"; import TableSchema from "./TableSchema"; +import ProfileDrawer from "./ProfileDrawer"; import DropZone from "@/components/dropzone/DropZone"; export default function Sidebar() { const tables = useWorkspaceStore((s) => s.tables); const removeTable = useWorkspaceStore((s) => s.removeTable); + const [profileTableName, setProfileTableName] = useState(null); const onlySampleTables = tables.length > 0 && tables.every((t) => SAMPLE_TABLE_NAMES.has(t.name)); + 
const visibleProfileTableName = + profileTableName && tables.some((t) => t.name === profileTableName) + ? profileTableName + : null; const handleFilesAdded = useCallback(() => { if (onlySampleTables) { @@ -20,23 +26,47 @@ export default function Sidebar() { } }, [onlySampleTables, removeTable]); + const handleRemove = useCallback( + (name: string) => { + if (profileTableName === name) setProfileTableName(null); + removeTable(name); + }, + [profileTableName, removeTable] + ); + return ( -
-
-

- Tables -

- -
-
- {tables.length === 0 ? ( -

- No tables loaded -

- ) : ( - tables.map((t) => ) - )} +
+
+
+

+ Tables +

+ +
+
+ {tables.length === 0 ? ( +

+ No tables loaded +

+ ) : ( + tables.map((t) => ( + + )) + )} +
+ {visibleProfileTableName && ( + setProfileTableName(null)} + /> + )}
); } diff --git a/src/components/sidebar/TableSchema.tsx b/src/components/sidebar/TableSchema.tsx index f76d0ad..792f711 100644 --- a/src/components/sidebar/TableSchema.tsx +++ b/src/components/sidebar/TableSchema.tsx @@ -6,39 +6,66 @@ import type { TableInfo } from "@/types"; interface TableSchemaProps { table: TableInfo; onRemove: (name: string) => void; + onOpenProfile: (name: string) => void; + profileActive: boolean; } -export default function TableSchema({ table, onRemove }: TableSchemaProps) { +export default function TableSchema({ + table, + onRemove, + onOpenProfile, + profileActive, +}: TableSchemaProps) { const [expanded, setExpanded] = useState(true); return (
- + + + +
{expanded && (
{table.columns.map((col) => ( diff --git a/src/components/workspace/CopyAgentContextButton.tsx b/src/components/workspace/CopyAgentContextButton.tsx new file mode 100644 index 0000000..9a57989 --- /dev/null +++ b/src/components/workspace/CopyAgentContextButton.tsx @@ -0,0 +1,70 @@ +"use client"; + +import { useMemo, useState } from "react"; +import { buildAgentContext } from "@/lib/agent/context"; +import { useWorkspaceStore } from "@/stores/workspace-store"; + +async function copyText(text: string): Promise { + if (navigator.clipboard?.writeText) { + await navigator.clipboard.writeText(text); + return; + } + + const textarea = document.createElement("textarea"); + textarea.value = text; + textarea.setAttribute("readonly", "true"); + textarea.style.position = "fixed"; + textarea.style.left = "-9999px"; + document.body.appendChild(textarea); + textarea.select(); + document.execCommand("copy"); + document.body.removeChild(textarea); +} + +export default function CopyAgentContextButton() { + const tables = useWorkspaceStore((s) => s.tables); + const tableProfiles = useWorkspaceStore((s) => s.tableProfiles); + const activeTab = useWorkspaceStore((s) => + s.tabs.find((tab) => tab.id === s.activeTabId) + ); + const [copied, setCopied] = useState(false); + + const contextText = useMemo( + () => + buildAgentContext({ + tables, + tableProfiles, + activeQuery: activeTab?.query ?? "", + activeResult: activeTab?.result ?? null, + activeError: activeTab?.error ?? 
null, + }), + [activeTab?.error, activeTab?.query, activeTab?.result, tableProfiles, tables] + ); + + const disabled = + tables.length === 0 && + !activeTab?.query.trim() && + !activeTab?.result && + !activeTab?.error; + + const handleCopy = async () => { + if (disabled) return; + await copyText(contextText); + setCopied(true); + window.setTimeout(() => setCopied(false), 1600); + }; + + return ( + + ); +} diff --git a/src/components/workspace/Workspace.tsx b/src/components/workspace/Workspace.tsx index 5608e8f..123a0aa 100644 --- a/src/components/workspace/Workspace.tsx +++ b/src/components/workspace/Workspace.tsx @@ -18,6 +18,7 @@ import ShareButton from "@/components/share/ShareButton"; import DropZone from "@/components/dropzone/DropZone"; import RoomBar from "@/components/collaboration/RoomBar"; import JoinDialog from "@/components/collaboration/JoinDialog"; +import CopyAgentContextButton from "./CopyAgentContextButton"; const PipelineEditor = dynamic( () => import("@/components/pipeline/PipelineEditor"), @@ -356,6 +357,7 @@ GROUP BY d.dept_name`; > Clear + { + const from = `${rel.from.table}.${rel.from.column}`; + const to = `${rel.to.table}.${rel.to.column}`; + return `- ${from} -> ${to} (${rel.cardinality}, ${rel.confidence}% confidence)`; + }); + return lines.join("\n"); +} + +/** + * Build the context block handed to the AI Analyst: table schemas plus the inferred + * relationships, so generated SQL joins on the correct keys. 
+ */ +export function buildAskContext({ tables, relationships }: AskContextInput): string { + return [ + buildSchemaContext(tables), + "", + "Known relationships (use these for JOINs):", + renderRelationships(relationships), + ].join("\n"); +} diff --git a/src/lib/agent/context.ts b/src/lib/agent/context.ts new file mode 100644 index 0000000..81c0a64 --- /dev/null +++ b/src/lib/agent/context.ts @@ -0,0 +1,160 @@ +import type { + ColumnProfile, + QueryError, + QueryResult, + TableInfo, + TableProfileState, +} from "@/types"; + +const RESULT_ROW_LIMIT = 20; +const RESULT_COLUMN_LIMIT = 12; + +function formatScalar(value: unknown): string { + if (value === null || value === undefined) return "NULL"; + if (typeof value === "number") { + return Number.isFinite(value) + ? new Intl.NumberFormat("en-US", { maximumFractionDigits: 4 }).format(value) + : String(value); + } + if (typeof value === "bigint") return value.toString(); + if (typeof value === "boolean") return String(value); + if (value instanceof Date) return value.toISOString(); + if (typeof value === "object" && "valueOf" in value) { + const unwrapped = (value as { valueOf(): unknown }).valueOf(); + if (unwrapped !== value) return formatScalar(unwrapped); + } + return String(value); +} + +function escapeMarkdownCell(value: unknown): string { + return formatScalar(value).replaceAll("|", "\\|").replace(/\s+/g, " ").trim(); +} + +function profileDetail(column: ColumnProfile): string { + if (column.kind === "numeric") { + return `range ${formatScalar(column.min)} to ${formatScalar(column.max)}, avg ${formatScalar(column.avg)}`; + } + if (column.kind === "date") { + return `range ${formatScalar(column.min)} to ${formatScalar(column.max)}`; + } + if (column.topValues.length > 0) { + return column.topValues + .map((item) => `${escapeMarkdownCell(item.value)} (${item.count})`) + .join(", "); + } + return "no non-null values"; +} + +function renderResult(result: QueryResult): string { + if (result.columns.length === 0) 
return "The latest statement returned no columns."; + + const columns = result.columns.slice(0, RESULT_COLUMN_LIMIT); + const rows = result.rows.slice(0, RESULT_ROW_LIMIT); + const omittedColumns = result.columns.length - columns.length; + const omittedRows = result.rowCount - rows.length; + const header = `| ${columns.map(escapeMarkdownCell).join(" | ")} |`; + const divider = `| ${columns.map(() => "---").join(" | ")} |`; + const body = rows + .map((row) => `| ${columns.map((column) => escapeMarkdownCell(row[column])).join(" | ")} |`) + .join("\n"); + + const notes = [ + omittedRows > 0 ? `${omittedRows.toLocaleString()} more rows not shown` : null, + omittedColumns > 0 ? `${omittedColumns.toLocaleString()} more columns not shown` : null, + ].filter(Boolean); + + return [ + `Rows: ${result.rowCount.toLocaleString()}`, + `Execution: ${result.executionTimeMs}ms`, + "", + header, + divider, + body, + notes.length > 0 ? `\n${notes.join("; ")}.` : "", + ] + .filter((part) => part !== "") + .join("\n"); +} + +interface BuildAgentContextInput { + tables: TableInfo[]; + tableProfiles: Record; + activeQuery: string; + activeResult: QueryResult | null; + activeError: QueryError | null; +} + +export function buildAgentContext({ + tables, + tableProfiles, + activeQuery, + activeResult, + activeError, +}: BuildAgentContextInput): string { + const sections: string[] = ["# QueryPad Context"]; + + sections.push( + [ + "## Tables", + tables.length === 0 + ? "No tables loaded." 
+ : tables + .map((table) => { + const columns = table.columns + .map((column) => `- ${column.name}: ${column.type}`) + .join("\n"); + return `### ${table.name}\nRows: ${table.rowCount.toLocaleString()}\n\n${columns}`; + }) + .join("\n\n"), + ].join("\n") + ); + + const profileSections = tables.map((table) => { + const state = tableProfiles[table.name]; + if (state?.status !== "ready" || !state.profile) { + return `### ${table.name}\nProfile not generated.`; + } + + const rows = state.profile.columns + .map((column) => { + const nullPercent = `${column.nullPercent.toFixed(column.nullPercent >= 10 ? 0 : 1)}%`; + return `| ${escapeMarkdownCell(column.name)} | ${escapeMarkdownCell(column.type)} | ${nullPercent} | ${column.distinctCount?.toLocaleString() ?? "n/a"} | ${profileDetail(column)} |`; + }) + .join("\n"); + + return [ + `### ${table.name}`, + `Generated: ${new Date(state.profile.generatedAt).toISOString()}`, + "", + "| Column | Type | Null | Distinct | Profile |", + "| --- | --- | --- | --- | --- |", + rows, + ].join("\n"); + }); + + sections.push(["## Data Profiles", profileSections.join("\n\n")].join("\n")); + + sections.push( + [ + "## Active SQL", + activeQuery.trim() ? 
`\`\`\`sql\n${activeQuery.trim()}\n\`\`\`` : "No active SQL query.", + ].join("\n") + ); + + if (activeError) { + sections.push(["## Latest Query Error", activeError.message].join("\n")); + } else if (activeResult) { + sections.push(["## Latest Result", renderResult(activeResult)].join("\n")); + } else { + sections.push(["## Latest Result", "No query result yet."].join("\n")); + } + + sections.push( + [ + "## Task", + "Use this context to inspect the data, write DuckDB SQL, explain results, or suggest the next analysis step.", + ].join("\n") + ); + + return sections.join("\n\n"); +} diff --git a/src/lib/ai/api-key.ts b/src/lib/ai/api-key.ts index 6912f65..75bc591 100644 --- a/src/lib/ai/api-key.ts +++ b/src/lib/ai/api-key.ts @@ -1,14 +1,48 @@ -const STORAGE_KEY = "querypad:anthropic-api-key"; +import { + DEFAULT_AI_PROVIDER, + type AiProvider, + isAiProvider, +} from "./providers"; -export function getApiKey(): string | null { +const PROVIDER_STORAGE_KEY = "querypad:ai:provider"; +const LEGACY_ANTHROPIC_KEY = "querypad:anthropic-api-key"; +const STORAGE_KEYS: Record = { + anthropic: "querypad:ai:anthropic-api-key", + openai: "querypad:ai:openai-api-key", +}; + +export function getAiProvider(): AiProvider { + if (typeof window === "undefined") return DEFAULT_AI_PROVIDER; + const stored = localStorage.getItem(PROVIDER_STORAGE_KEY); + return isAiProvider(stored) ? 
stored : DEFAULT_AI_PROVIDER; +} + +export function setAiProvider(provider: AiProvider): void { + localStorage.setItem(PROVIDER_STORAGE_KEY, provider); +} + +export function getApiKey(provider: AiProvider): string | null { if (typeof window === "undefined") return null; - return localStorage.getItem(STORAGE_KEY); + + const key = localStorage.getItem(STORAGE_KEYS[provider]); + if (key) return key; + + if (provider === "anthropic") { + const legacyKey = localStorage.getItem(LEGACY_ANTHROPIC_KEY); + if (legacyKey) { + localStorage.setItem(STORAGE_KEYS.anthropic, legacyKey); + localStorage.removeItem(LEGACY_ANTHROPIC_KEY); + return legacyKey; + } + } + + return null; } -export function setApiKey(key: string): void { - localStorage.setItem(STORAGE_KEY, key); +export function setApiKey(provider: AiProvider, key: string): void { + localStorage.setItem(STORAGE_KEYS[provider], key); } -export function clearApiKey(): void { - localStorage.removeItem(STORAGE_KEY); +export function clearApiKey(provider: AiProvider): void { + localStorage.removeItem(STORAGE_KEYS[provider]); } diff --git a/src/lib/ai/complete.ts b/src/lib/ai/complete.ts new file mode 100644 index 0000000..2584b8b --- /dev/null +++ b/src/lib/ai/complete.ts @@ -0,0 +1,175 @@ +import type { AiProvider } from "./providers"; +import { getAiProviderConfig } from "./providers"; + +export interface CompleteOptions { + provider: AiProvider; + apiKey: string; + /** System prompt / instructions. */ + system: string; + /** User input (already-assembled prompt). */ + input: string; + /** Max output tokens (default 1024). */ + maxTokens?: number; +} + +function httpError(status: number, body: string): Error { + if (status === 401) { + return new Error("Invalid API key. Please check your key and try again."); + } + if (status === 429) { + return new Error("Rate limit exceeded. 
Please wait a moment and try again."); + } + return new Error(body || `HTTP ${status}`); +} + +async function* streamAnthropic({ + apiKey, + system, + input, + maxTokens = 1024, +}: CompleteOptions): AsyncGenerator { + const config = getAiProviderConfig("anthropic"); + const response = await fetch("https://api.anthropic.com/v1/messages", { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-api-key": apiKey, + "anthropic-version": "2023-06-01", + "anthropic-dangerous-direct-browser-access": "true", + }, + body: JSON.stringify({ + model: config.model, + max_tokens: maxTokens, + stream: true, + system, + messages: [{ role: "user", content: input }], + }), + }); + + if (!response.ok) throw httpError(response.status, await response.text()); + + const reader = response.body?.getReader(); + if (!reader) throw new Error("No response body"); + + const decoder = new TextDecoder(); + let buffer = ""; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + + for (const line of lines) { + if (!line.startsWith("data: ")) continue; + const data = line.slice(6); + if (data === "[DONE]") return; + + try { + const event = JSON.parse(data); + if (event.type === "content_block_delta" && event.delta?.type === "text_delta") { + yield event.delta.text; + } + } catch { + // skip non-JSON lines + } + } + } +} + +async function* streamOpenAi({ + apiKey, + system, + input, + maxTokens = 1024, +}: CompleteOptions): AsyncGenerator { + const config = getAiProviderConfig("openai"); + const response = await fetch("https://api.openai.com/v1/responses", { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify({ + model: config.model, + instructions: system, + input, + max_output_tokens: maxTokens, + reasoning: { effort: "low" }, + stream: true, + 
store: false, + text: { + format: { type: "text" }, + verbosity: "low", + }, + }), + }); + + if (!response.ok) { + const body = await response.text(); + if (response.status !== 401 && response.status !== 429) { + try { + const parsed = JSON.parse(body) as { error?: { message?: string } }; + if (parsed.error?.message) throw new Error(parsed.error.message); + } catch { + // fall through to generic error below + } + } + throw httpError(response.status, body); + } + + const reader = response.body?.getReader(); + if (!reader) throw new Error("No response body"); + + const decoder = new TextDecoder(); + let buffer = ""; + + while (true) { + const { done, value } = await reader.read(); + if (done) break; + + buffer += decoder.decode(value, { stream: true }); + const events = buffer.split("\n\n"); + buffer = events.pop() || ""; + + for (const eventText of events) { + const dataLine = eventText.split("\n").find((line) => line.startsWith("data: ")); + if (!dataLine) continue; + + const data = dataLine.slice(6); + if (data === "[DONE]") return; + + let event; + try { + event = JSON.parse(data); + } catch { + continue; + } + + if (event.type === "response.output_text.delta") { + yield event.delta; + } + if (event.type === "response.failed") { + throw new Error(event.response?.error?.message || "OpenAI response failed."); + } + } + } +} + +/** Provider-routed streaming completion. Yields text deltas as they arrive. */ +export async function* streamComplete(options: CompleteOptions): AsyncGenerator { + if (options.provider === "openai") { + yield* streamOpenAi(options); + return; + } + yield* streamAnthropic(options); +} + +/** Convenience: collect a streamed completion into a single string. 
*/ +export async function complete(options: CompleteOptions): Promise { + let out = ""; + for await (const chunk of streamComplete(options)) out += chunk; + return out; +} diff --git a/src/lib/ai/generate-sql.ts b/src/lib/ai/generate-sql.ts index 8597e90..866939a 100644 --- a/src/lib/ai/generate-sql.ts +++ b/src/lib/ai/generate-sql.ts @@ -1,73 +1,30 @@ -const SYSTEM_PROMPT = `You are a DuckDB SQL expert. Generate only valid DuckDB SQL queries based on the user's natural language request and the provided table schema. Output ONLY the SQL query — no explanations, no markdown fences, no comments. If the request is ambiguous, make reasonable assumptions.`; +import type { AiProvider } from "./providers"; +import { streamComplete } from "./complete"; -export async function* generateSql( - apiKey: string, - prompt: string, - schema: string -): AsyncGenerator { - const response = await fetch("https://api.anthropic.com/v1/messages", { - method: "POST", - headers: { - "Content-Type": "application/json", - "x-api-key": apiKey, - "anthropic-version": "2023-06-01", - "anthropic-dangerous-direct-browser-access": "true", - }, - body: JSON.stringify({ - model: "claude-sonnet-4-20250514", - max_tokens: 1024, - stream: true, - system: SYSTEM_PROMPT, - messages: [ - { - role: "user", - content: `Table schema:\n${schema || "No tables loaded."}\n\nRequest: ${prompt}`, - }, - ], - }), - }); - - if (!response.ok) { - if (response.status === 401) { - throw new Error("Invalid API key. Please check your key and try again."); - } - if (response.status === 429) { - throw new Error("Rate limit exceeded. Please wait a moment and try again."); - } - const text = await response.text(); - throw new Error(text || `HTTP ${response.status}`); - } - - const reader = response.body?.getReader(); - if (!reader) throw new Error("No response body"); - - const decoder = new TextDecoder(); - let buffer = ""; +export const SQL_SYSTEM_PROMPT = `You are a DuckDB SQL expert. 
Generate only valid DuckDB SQL queries based on the user's natural language request and the provided table schema. Output ONLY the SQL query — no explanations, no markdown fences, no comments. If the request is ambiguous, make reasonable assumptions.`; - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - buffer += decoder.decode(value, { stream: true }); - const lines = buffer.split("\n"); - buffer = lines.pop() || ""; +interface GenerateSqlOptions { + provider: AiProvider; + apiKey: string; + prompt: string; + schema: string; +} - for (const line of lines) { - if (!line.startsWith("data: ")) continue; - const data = line.slice(6); - if (data === "[DONE]") return; +/** Build the user-message body shared by both providers. */ +export function buildSqlInput(schema: string, prompt: string): string { + return `Table schema:\n${schema || "No tables loaded."}\n\nRequest: ${prompt}`; +} - try { - const event = JSON.parse(data); - if ( - event.type === "content_block_delta" && - event.delta?.type === "text_delta" - ) { - yield event.delta.text; - } - } catch { - // skip non-JSON lines - } - } - } +export async function* generateSql({ + provider, + apiKey, + prompt, + schema, +}: GenerateSqlOptions): AsyncGenerator { + yield* streamComplete({ + provider, + apiKey, + system: SQL_SYSTEM_PROMPT, + input: buildSqlInput(schema, prompt), + }); } diff --git a/src/lib/ai/providers.ts b/src/lib/ai/providers.ts new file mode 100644 index 0000000..45fb8d5 --- /dev/null +++ b/src/lib/ai/providers.ts @@ -0,0 +1,41 @@ +export type AiProvider = "anthropic" | "openai"; + +export interface AiProviderConfig { + id: AiProvider; + label: string; + modelLabel: string; + model: string; + keyPlaceholder: string; + keyUrl: string; +} + +export const DEFAULT_AI_PROVIDER: AiProvider = "anthropic"; + +export const AI_PROVIDER_CONFIGS: Record = { + anthropic: { + id: "anthropic", + label: "Claude", + modelLabel: "Sonnet 4.6", + model: "claude-sonnet-4-6", + 
keyPlaceholder: "Enter your Anthropic API key (sk-ant-...)", + keyUrl: "https://console.anthropic.com/settings/keys", + }, + openai: { + id: "openai", + label: "OpenAI", + modelLabel: "GPT-5.5", + model: "gpt-5.5", + keyPlaceholder: "Enter your OpenAI API key (sk-...)", + keyUrl: "https://platform.openai.com/api-keys", + }, +}; + +export const AI_PROVIDER_OPTIONS = Object.values(AI_PROVIDER_CONFIGS); + +export function isAiProvider(value: string | null): value is AiProvider { + return value === "anthropic" || value === "openai"; +} + +export function getAiProviderConfig(provider: AiProvider): AiProviderConfig { + return AI_PROVIDER_CONFIGS[provider]; +} diff --git a/src/lib/discovery/relationships.ts b/src/lib/discovery/relationships.ts new file mode 100644 index 0000000..b4259a8 --- /dev/null +++ b/src/lib/discovery/relationships.ts @@ -0,0 +1,199 @@ +import type { ColumnProfile, TableProfile } from "../../types"; +import type { Relationship } from "../../types/discovery"; +import { quoteIdent } from "../duckdb/sql-utils"; +import { + NAME_SIMILARITY_FLOOR, + OVERLAP_FLOOR, + STRONG_NAME_SIMILARITY, + cardinalityShapeScore, + confidence, + isTypeCompatible, + nameSimilarity, + typeMatchScore, +} from "./signals"; + +/** + * Engine-agnostic query interface. Both the browser (DuckDB-Wasm) and the Node CLI + * (`@duckdb/node-api`) can supply a runner; the discovery logic never touches a + * concrete connection. + */ +export type QueryRunner = (sql: string) => Promise[]>; + +/** Minimum blended confidence for an edge to be reported. */ +const CONFIDENCE_FLOOR = 50; + +function toCount(value: unknown): number { + if (typeof value === "bigint") return Number(value); + if (typeof value === "number") return value; + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : 0; +} + +/** A column is a primary-key candidate when it is unique and has no nulls. 
*/ +function isKeyCandidate(column: ColumnProfile, rowCount: number): boolean { + return ( + rowCount > 0 && + column.distinctCount !== null && + column.distinctCount === rowCount && + column.nullCount === 0 + ); +} + +/** A column is "unique" if its distinct count equals the row count. */ +function isUnique(column: ColumnProfile, rowCount: number): boolean { + return rowCount > 0 && column.distinctCount !== null && column.distinctCount === rowCount; +} + +interface Candidate { + keyTable: string; + keyColumn: ColumnProfile; + foreignTable: string; + foreignColumn: ColumnProfile; + foreignRowCount: number; + nameScore: number; +} + +/** + * Discover directed foreign-key relationships across a set of profiled tables. + * Returns edges `from.column ↳ to.column` sorted by descending confidence. + */ +export async function discoverRelationships( + profiles: TableProfile[], + runner: QueryRunner +): Promise { + // 1. Generate pruned candidates (name + type filtered) to avoid O(n²) overlap scans. + const candidates: Candidate[] = []; + for (const keyProfile of profiles) { + for (const keyColumn of keyProfile.columns) { + if (!isKeyCandidate(keyColumn, keyProfile.rowCount)) continue; + + for (const foreignProfile of profiles) { + if (foreignProfile.tableName === keyProfile.tableName) continue; + for (const foreignColumn of foreignProfile.columns) { + if (!isTypeCompatible(foreignColumn.kind, keyColumn.kind)) continue; + const nameScore = nameSimilarity( + foreignColumn.name, + keyColumn.name, + keyProfile.tableName + ); + if (nameScore < NAME_SIMILARITY_FLOOR) continue; + + // A column that is its own table's surrogate primary key (unique, non-null) + // is only a foreign key when its name explicitly references the target; + // otherwise overlapping integer id ranges produce false positives. 
+ if ( + isKeyCandidate(foreignColumn, foreignProfile.rowCount) && + nameScore < STRONG_NAME_SIMILARITY + ) { + continue; + } + candidates.push({ + keyTable: keyProfile.tableName, + keyColumn, + foreignTable: foreignProfile.tableName, + foreignColumn, + foreignRowCount: foreignProfile.rowCount, + nameScore, + }); + } + } + } + } + + // 2. Run the decisive value-overlap query for each surviving candidate. + const edges: Relationship[] = []; + for (const candidate of candidates) { + const overlap = await measureOverlap(candidate, runner); + if (overlap < OVERLAP_FLOOR) continue; + + const signals = { + valueOverlap: overlap, + nameSimilarity: candidate.nameScore, + typeMatch: typeMatchScore( + candidate.foreignColumn.type, + candidate.keyColumn.type, + candidate.foreignColumn.kind, + candidate.keyColumn.kind + ), + cardinalityShape: cardinalityShapeScore( + true, + isUnique(candidate.foreignColumn, candidate.foreignRowCount) + ), + }; + const score = confidence(signals); + if (score < CONFIDENCE_FLOOR) continue; + + edges.push({ + from: { table: candidate.foreignTable, column: candidate.foreignColumn.name }, + to: { table: candidate.keyTable, column: candidate.keyColumn.name }, + confidence: score, + cardinality: signals.cardinalityShape === 0.8 ? "one-to-one" : "many-to-one", + signals, + }); + } + + // 3. A foreign column references a single table: keep only its strongest target + // (disambiguates spurious overlaps against unrelated id ranges), then collapse + // mirrored one-to-one directions. + return dedupeEdges(bestPerForeignColumn(edges)); +} + +/** Keep at most one edge per foreign column — the highest-confidence target. 
*/ +function bestPerForeignColumn(edges: Relationship[]): Relationship[] { + const best = new Map(); + for (const edge of edges) { + const k = `${edge.from.table}.${edge.from.column}`; + const existing = best.get(k); + const better = + !existing || + edge.confidence > existing.confidence || + (edge.confidence === existing.confidence && + edge.signals.nameSimilarity > existing.signals.nameSimilarity); + if (better) best.set(k, edge); + } + return [...best.values()]; +} + +async function measureOverlap(candidate: Candidate, runner: QueryRunner): Promise { + const foreign = quoteIdent(candidate.foreignTable); + const fc = quoteIdent(candidate.foreignColumn.name); + const key = quoteIdent(candidate.keyTable); + const kc = quoteIdent(candidate.keyColumn.name); + + // Cast both sides to VARCHAR so numeric-vs-text key conventions still match. + const sql = ` + SELECT + COUNT(DISTINCT CAST(f.${fc} AS VARCHAR)) AS distinct_fk, + COUNT(DISTINCT CASE WHEN k.key IS NOT NULL THEN CAST(f.${fc} AS VARCHAR) END) AS matched_fk + FROM ${foreign} f + LEFT JOIN (SELECT DISTINCT CAST(${kc} AS VARCHAR) AS key FROM ${key}) k + ON CAST(f.${fc} AS VARCHAR) = k.key + WHERE f.${fc} IS NOT NULL + `; + + try { + const rows = await runner(sql); + const row = rows[0] ?? {}; + const distinct = toCount(row.distinct_fk); + const matched = toCount(row.matched_fk); + return distinct === 0 ? 
0 : matched / distinct; + } catch { + return 0; + } +} + +function edgeKey(rel: Relationship): string { + const a = `${rel.from.table}.${rel.from.column}`; + const b = `${rel.to.table}.${rel.to.column}`; + return [a, b].sort().join("::"); +} + +function dedupeEdges(edges: Relationship[]): Relationship[] { + const best = new Map(); + for (const edge of edges) { + const k = edgeKey(edge); + const existing = best.get(k); + if (!existing || edge.confidence > existing.confidence) best.set(k, edge); + } + return [...best.values()].sort((a, b) => b.confidence - a.confidence); +} diff --git a/src/lib/discovery/signals.ts b/src/lib/discovery/signals.ts new file mode 100644 index 0000000..e4a71e4 --- /dev/null +++ b/src/lib/discovery/signals.ts @@ -0,0 +1,117 @@ +import type { ProfileColumnKind } from "../../types"; +import type { RelationshipSignals } from "../../types/discovery"; + +/** + * Pure scoring helpers for relationship discovery. No DuckDB / IO — everything here + * is deterministic over plain inputs so it can be unit-tested in isolation. + */ + +/** Minimum name similarity for a column pair to be worth a (costly) value-overlap query. */ +export const NAME_SIMILARITY_FLOOR = 0.15; + +/** Name similarity at/above which a name is considered an explicit reference to the target. */ +export const STRONG_NAME_SIMILARITY = 0.9; + +/** Minimum value overlap for an edge to be emitted at all. */ +export const OVERLAP_FLOOR = 0.5; + +// Confidence weights — value overlap dominates, name similarity is the next strongest. +const WEIGHTS = { + valueOverlap: 0.55, + nameSimilarity: 0.25, + typeMatch: 0.1, + cardinalityShape: 0.1, +} as const; + +/** Split an identifier into lowercase tokens across snake_case and camelCase boundaries. 
*/ +export function splitTokens(name: string): string[] { + return name + .replace(/([a-z0-9])([A-Z])/g, "$1 $2") + .split(/[^a-zA-Z0-9]+/) + .map((token) => token.toLowerCase()) + .filter(Boolean); +} + +/** Very small, dependency-free singularizer good enough for table-name heuristics. */ +export function singularize(word: string): string { + const lower = word.toLowerCase(); + if (lower.endsWith("ies") && lower.length > 3) return `${lower.slice(0, -3)}y`; + if (lower.endsWith("ses") && lower.length > 3) return lower.slice(0, -2); + if (lower.endsWith("s") && !lower.endsWith("ss") && lower.length > 1) return lower.slice(0, -1); + return lower; +} + +function jaccard(a: string[], b: string[]): number { + if (a.length === 0 || b.length === 0) return 0; + const setA = new Set(a); + const setB = new Set(b); + let intersection = 0; + for (const token of setA) if (setB.has(token)) intersection += 1; + const union = new Set([...a, ...b]).size; + return union === 0 ? 0 : intersection / union; +} + +/** + * Name similarity between a foreign column and the key it might reference. + * Rewards the canonical `_` and exact-name conventions, + * and falls back to token overlap against both the key column and its table. 
+ */ +export function nameSimilarity( + foreignColumn: string, + keyColumn: string, + keyTable: string +): number { + const foreign = foreignColumn.toLowerCase(); + const key = keyColumn.toLowerCase(); + const table = singularize(keyTable); + + // events.user_id ↳ users.id → "user_id" === "user" + "_" + "id" + if (foreign === `${table}_${key}`) return 1; + // payments.customer_id ↳ subscriptions.customer_id → identical column names + if (foreign === key && key !== "id") return 1; + // orders.user ↳ users.id + if (foreign === table) return 0.9; + + const foreignTokens = splitTokens(foreignColumn); + const keyTokens = splitTokens(`${keyTable} ${keyColumn}`); + const overlap = jaccard(foreignTokens, keyTokens); + + // A bare shared "id" token is weak; require the table name to also appear. + if (foreignTokens.includes(table)) return Math.max(overlap, 0.7); + return overlap; +} + +/** Are two column kinds joinable at all? */ +export function isTypeCompatible(a: ProfileColumnKind, b: ProfileColumnKind): boolean { + if (a === b) return true; + // numbers stored as text vs numeric still routinely join after casting + return (a === "numeric" && b === "text") || (a === "text" && b === "numeric"); +} + +/** 1 for identical type strings, 0.85 for same-kind, 0.4 for compatible cross-kind. */ +export function typeMatchScore( + foreignType: string, + keyType: string, + foreignKind: ProfileColumnKind, + keyKind: ProfileColumnKind +): number { + if (foreignType.toUpperCase() === keyType.toUpperCase()) return 1; + if (foreignKind === keyKind) return 0.85; + return isTypeCompatible(foreignKind, keyKind) ? 0.4 : 0; +} + +/** 1 for a clean many-to-one; 0.8 when both sides are unique (one-to-one). */ +export function cardinalityShapeScore(keyUnique: boolean, foreignUnique: boolean): number { + if (!keyUnique) return 0; + return foreignUnique ? 0.8 : 1; +} + +/** Blend the individual signals into a 0..100 confidence score. 
*/ +export function confidence(signals: RelationshipSignals): number { + const weighted = + WEIGHTS.valueOverlap * signals.valueOverlap + + WEIGHTS.nameSimilarity * signals.nameSimilarity + + WEIGHTS.typeMatch * signals.typeMatch + + WEIGHTS.cardinalityShape * signals.cardinalityShape; + return Math.round(weighted * 100); +} diff --git a/src/lib/discovery/sql-safety.ts b/src/lib/discovery/sql-safety.ts new file mode 100644 index 0000000..5ae1c66 --- /dev/null +++ b/src/lib/discovery/sql-safety.ts @@ -0,0 +1,44 @@ +/** + * Pure guards applied to AI-generated SQL before execution. No IO — unit-testable. + */ + +/** Statement kinds we allow `ask` to execute (read-only). */ +const READ_ONLY_LEADERS = ["SELECT", "WITH", "EXPLAIN", "DESCRIBE", "PRAGMA", "SHOW", "TABLE", "FROM"]; + +/** Strip markdown code fences (```sql … ```) and surrounding whitespace from model output. */ +export function stripSqlFences(text: string): string { + let sql = text.trim(); + if (sql.startsWith("```")) { + sql = sql.replace(/^```[a-zA-Z]*\n?/, "").replace(/```$/, ""); + } + return sql.trim(); +} + +/** Remove SQL line/block comments so the leading keyword check can't be bypassed. */ +function stripComments(sql: string): string { + return sql + .replace(/\/\*[\s\S]*?\*\//g, " ") + .replace(/--[^\n]*/g, " ") + .trim(); +} + +/** + * True only when every statement is read-only. Rejects DDL/DML (DROP, DELETE, + * UPDATE, INSERT, ALTER, CREATE, ATTACH, COPY, …) so a slipped-through statement + * can't mutate state. + */ +export function isReadOnlyQuery(sql: string): boolean { + const cleaned = stripComments(stripSqlFences(sql)); + if (!cleaned) return false; + + const statements = cleaned + .split(";") + .map((s) => s.trim()) + .filter(Boolean); + if (statements.length === 0) return false; + + return statements.every((statement) => { + const leader = statement.split(/[\s(]+/, 1)[0]?.toUpperCase() ?? 
""; + return READ_ONLY_LEADERS.includes(leader); + }); +} diff --git a/src/lib/duckdb-node/connection.ts b/src/lib/duckdb-node/connection.ts new file mode 100644 index 0000000..136f616 --- /dev/null +++ b/src/lib/duckdb-node/connection.ts @@ -0,0 +1,42 @@ +import { DuckDBInstance, type DuckDBConnection } from "@duckdb/node-api"; +import type { QueryRunner } from "../discovery/relationships"; + +export interface QueryResultRows { + columns: string[]; + rows: Record[]; +} + +export interface NodeDb { + /** Engine-agnostic runner consumed by the discovery + profiling code. */ + runner: QueryRunner; + /** Like `runner`, but also returns ordered column names (for result display). */ + query: (sql: string) => Promise; + connection: DuckDBConnection; + close: () => void; +} + +/** Create an in-memory Node DuckDB instance and expose it as a QueryRunner. */ +export async function createNodeDb(): Promise { + const instance = await DuckDBInstance.create(":memory:"); + const connection = await instance.connect(); + + const runner: QueryRunner = async (sql) => { + const reader = await connection.runAndReadAll(sql); + return reader.getRowObjects() as Record[]; + }; + + const query = async (sql: string): Promise => { + const reader = await connection.runAndReadAll(sql); + return { + columns: reader.columnNames(), + rows: reader.getRowObjects() as Record[], + }; + }; + + return { + runner, + query, + connection, + close: () => connection.closeSync(), + }; +} diff --git a/src/lib/duckdb-node/load.ts b/src/lib/duckdb-node/load.ts new file mode 100644 index 0000000..b7df4e9 --- /dev/null +++ b/src/lib/duckdb-node/load.ts @@ -0,0 +1,100 @@ +import { readdir } from "node:fs/promises"; +import path from "node:path"; +import type { ColumnInfo, TableInfo } from "../../types"; +import type { QueryRunner } from "../discovery/relationships"; +import { fileExtension, sanitizeTableName } from "../utils"; +import { quoteIdent } from "../duckdb/sql-utils"; + +/** File types DuckDB can read 
directly from disk in Node. */ +const SUPPORTED_EXTENSIONS = new Set(["parquet", "csv", "tsv", "json", "jsonl", "ndjson"]); + +export interface LoadFolderResult { + tables: TableInfo[]; + /** Files present in the folder that were skipped (unsupported type). */ + skipped: string[]; +} + +function escapeLiteral(value: string): string { + return value.replaceAll("'", "''"); +} + +function readFunction(ext: string, absolutePath: string): string | null { + const literal = `'${escapeLiteral(absolutePath)}'`; + switch (ext) { + case "parquet": + return `read_parquet(${literal})`; + case "csv": + case "tsv": + return `read_csv_auto(${literal})`; + case "json": + case "jsonl": + case "ndjson": + return `read_json_auto(${literal})`; + default: + return null; + } +} + +/** Ensure a table name is unique within this load (e.g. users.csv vs users.parquet). */ +function uniqueName(base: string, taken: Set): string { + if (!taken.has(base)) { + taken.add(base); + return base; + } + let suffix = 2; + while (taken.has(`${base}_${suffix}`)) suffix += 1; + const name = `${base}_${suffix}`; + taken.add(name); + return name; +} + +async function describeTable(name: string, runner: QueryRunner): Promise { + const rows = await runner(`DESCRIBE ${quoteIdent(name)}`); + return rows.map((row) => ({ + name: String(row.column_name), + type: String(row.column_type), + })); +} + +async function countRows(name: string, runner: QueryRunner): Promise { + const rows = await runner(`SELECT COUNT(*) AS cnt FROM ${quoteIdent(name)}`); + return Number(rows[0]?.cnt ?? 0); +} + +/** + * Scan a folder (top level) and register every supported data file as a DuckDB table. + * Returns the loaded tables plus any files that were skipped. 
+ */ +export async function loadFolder( + folder: string, + runner: QueryRunner +): Promise { + const entries = await readdir(folder, { withFileTypes: true }); + const tables: TableInfo[] = []; + const skipped: string[] = []; + const taken = new Set(); + + for (const entry of entries) { + if (!entry.isFile()) continue; + const ext = fileExtension(entry.name); + if (!SUPPORTED_EXTENSIONS.has(ext)) { + skipped.push(entry.name); + continue; + } + + const absolutePath = path.resolve(folder, entry.name); + const readFn = readFunction(ext, absolutePath); + if (!readFn) { + skipped.push(entry.name); + continue; + } + + const name = uniqueName(sanitizeTableName(entry.name), taken); + await runner(`CREATE OR REPLACE TABLE ${quoteIdent(name)} AS SELECT * FROM ${readFn}`); + const columns = await describeTable(name, runner); + const rowCount = await countRows(name, runner); + tables.push({ name, columns, rowCount }); + } + + return { tables, skipped }; +} diff --git a/src/lib/duckdb-node/profile.ts b/src/lib/duckdb-node/profile.ts new file mode 100644 index 0000000..e6320e4 --- /dev/null +++ b/src/lib/duckdb-node/profile.ts @@ -0,0 +1,142 @@ +import type { + ColumnInfo, + ColumnProfile, + ProfileTopValue, + TableInfo, + TableProfile, +} from "../../types"; +import type { QueryRunner } from "../discovery/relationships"; +import { classifyType, quoteIdent } from "../duckdb/sql-utils"; + +function toNumber(value: unknown): number | null { + if (typeof value === "bigint") return Number(value); + if (typeof value === "number") return Number.isFinite(value) ? value : null; + if (typeof value === "string" && value.trim() !== "") { + const parsed = Number(value); + return Number.isFinite(parsed) ? parsed : null; + } + return null; +} + +function toScalar(value: unknown): string | number | null { + if (value === null || value === undefined) return null; + if (typeof value === "bigint") return Number(value); + if (typeof value === "number") return Number.isFinite(value) ? 
value : String(value); + if (typeof value === "string") return value; + if (value instanceof Date) return value.toISOString(); + return String(value); +} + +async function readSingleRow( + runner: QueryRunner, + sql: string +): Promise | null> { + const rows = await runner(sql); + return rows[0] ?? null; +} + +async function readTopValues( + runner: QueryRunner, + tableName: string, + columnName: string +): Promise { + const table = quoteIdent(tableName); + const column = quoteIdent(columnName); + const rows = await runner(` + SELECT CAST(${column} AS VARCHAR) AS value, COUNT(*) AS value_count + FROM ${table} + WHERE ${column} IS NOT NULL + GROUP BY 1 + ORDER BY value_count DESC, value ASC + LIMIT 5 + `); + return rows.map((row) => ({ + value: String(toScalar(row.value) ?? ""), + count: toNumber(row.value_count) ?? 0, + })); +} + +async function profileColumn( + runner: QueryRunner, + tableName: string, + rowCount: number, + column: ColumnInfo +): Promise { + const table = quoteIdent(tableName); + const columnName = quoteIdent(column.name); + const kind = classifyType(column.type); + + let nullCount = 0; + let distinctCount: number | null = null; + try { + const base = await readSingleRow( + runner, + `SELECT COUNT(*) - COUNT(${columnName}) AS null_count, COUNT(DISTINCT ${columnName}) AS distinct_count FROM ${table}` + ); + nullCount = toNumber(base?.null_count) ?? 
0; + distinctCount = toNumber(base?.distinct_count); + } catch (err) { + console.error(`Failed to profile ${tableName}.${column.name}:`, err); + } + + let min: string | number | null = null; + let max: string | number | null = null; + let avg: number | null = null; + if (kind === "numeric") { + const stats = await readSingleRow( + runner, + `SELECT MIN(${columnName}) AS min_value, MAX(${columnName}) AS max_value, AVG(${columnName}) AS avg_value FROM ${table} WHERE ${columnName} IS NOT NULL` + ); + min = toScalar(stats?.min_value); + max = toScalar(stats?.max_value); + avg = toNumber(stats?.avg_value); + } else if (kind === "date") { + const stats = await readSingleRow( + runner, + `SELECT MIN(${columnName}) AS min_value, MAX(${columnName}) AS max_value FROM ${table} WHERE ${columnName} IS NOT NULL` + ); + min = toScalar(stats?.min_value); + max = toScalar(stats?.max_value); + } + + let topValues: ProfileTopValue[] = []; + if (kind === "text" || kind === "boolean" || kind === "other") { + try { + topValues = await readTopValues(runner, tableName, column.name); + } catch (err) { + console.error(`Failed to read top values for ${tableName}.${column.name}:`, err); + } + } + + return { + name: column.name, + type: column.type, + kind, + nullCount, + nullPercent: rowCount === 0 ? 0 : (nullCount / rowCount) * 100, + distinctCount, + min, + max, + avg, + topValues, + }; +} + +/** Profile a single loaded table using the Node DuckDB runner. 
*/ +export async function profileTable( + table: TableInfo, + runner: QueryRunner, + now: number +): Promise { + const columns: ColumnProfile[] = []; + for (const column of table.columns) { + columns.push(await profileColumn(runner, table.name, table.rowCount, column)); + } + return { + tableName: table.name, + rowCount: table.rowCount, + columnCount: table.columns.length, + generatedAt: now, + columns, + }; +} diff --git a/src/lib/duckdb/profile.ts b/src/lib/duckdb/profile.ts new file mode 100644 index 0000000..5d5ca94 --- /dev/null +++ b/src/lib/duckdb/profile.ts @@ -0,0 +1,153 @@ +import type { + ColumnInfo, + ColumnProfile, + ProfileTopValue, + TableInfo, + TableProfile, +} from "@/types"; +import { getConnection } from "./instance"; +import { classifyType, quoteIdent } from "./sql-utils"; + +function unwrapDuckValue(value: unknown): unknown { + if (value === null || value === undefined) return null; + if (typeof value === "bigint") return Number(value); + if (value instanceof Date) return value.toISOString(); + if (typeof value === "object" && "valueOf" in value) { + const unwrapped = (value as { valueOf(): unknown }).valueOf(); + if (unwrapped !== value) return unwrapDuckValue(unwrapped); + } + return value; +} + +function toNumber(value: unknown): number | null { + const unwrapped = unwrapDuckValue(value); + if (typeof unwrapped === "number" && Number.isFinite(unwrapped)) return unwrapped; + if (typeof unwrapped === "string" && unwrapped.trim() !== "") { + const parsed = Number(unwrapped); + return Number.isFinite(parsed) ? parsed : null; + } + return null; +} + +function toProfileScalar(value: unknown): string | number | null { + const unwrapped = unwrapDuckValue(value); + if (unwrapped === null || unwrapped === undefined) return null; + if (typeof unwrapped === "number") return Number.isFinite(unwrapped) ? 
unwrapped : String(unwrapped); + if (typeof unwrapped === "string") return unwrapped; + if (typeof unwrapped === "boolean") return String(unwrapped); + return String(unwrapped); +} + +async function readSingleRow(sql: string): Promise | null> { + const conn = await getConnection(); + const result = await conn.query(sql); + return (result.toArray()[0] as Record | undefined) ?? null; +} + +async function readTopValues(tableName: string, columnName: string): Promise { + const table = quoteIdent(tableName); + const column = quoteIdent(columnName); + const conn = await getConnection(); + const result = await conn.query(` + SELECT CAST(${column} AS VARCHAR) AS value, COUNT(*) AS value_count + FROM ${table} + WHERE ${column} IS NOT NULL + GROUP BY 1 + ORDER BY value_count DESC, value ASC + LIMIT 5 + `); + + return result.toArray().map((row) => ({ + value: String(toProfileScalar((row as Record).value) ?? ""), + count: toNumber((row as Record).value_count) ?? 0, + })); +} + +async function profileColumn( + tableName: string, + rowCount: number, + column: ColumnInfo +): Promise { + const table = quoteIdent(tableName); + const columnName = quoteIdent(column.name); + const kind = classifyType(column.type); + + let nullCount = 0; + let distinctCount: number | null = null; + try { + const base = await readSingleRow(` + SELECT + COUNT(*) - COUNT(${columnName}) AS null_count, + COUNT(DISTINCT ${columnName}) AS distinct_count + FROM ${table} + `); + nullCount = toNumber(base?.null_count) ?? 
0; + distinctCount = toNumber(base?.distinct_count); + } catch (err) { + console.error(`Failed to profile ${tableName}.${column.name}:`, err); + } + + let min: string | number | null = null; + let max: string | number | null = null; + let avg: number | null = null; + if (kind === "numeric") { + const stats = await readSingleRow(` + SELECT + MIN(${columnName}) AS min_value, + MAX(${columnName}) AS max_value, + AVG(${columnName}) AS avg_value + FROM ${table} + WHERE ${columnName} IS NOT NULL + `); + min = toProfileScalar(stats?.min_value); + max = toProfileScalar(stats?.max_value); + avg = toNumber(stats?.avg_value); + } else if (kind === "date") { + const stats = await readSingleRow(` + SELECT + MIN(${columnName}) AS min_value, + MAX(${columnName}) AS max_value + FROM ${table} + WHERE ${columnName} IS NOT NULL + `); + min = toProfileScalar(stats?.min_value); + max = toProfileScalar(stats?.max_value); + } + + let topValues: ProfileTopValue[] = []; + if (kind === "text" || kind === "boolean" || kind === "other") { + try { + topValues = await readTopValues(tableName, column.name); + } catch (err) { + console.error(`Failed to read top values for ${tableName}.${column.name}:`, err); + } + } + + return { + name: column.name, + type: column.type, + kind, + nullCount, + nullPercent: rowCount === 0 ? 
import type { ProfileColumnKind } from "@/types";

/**
 * Quote a SQL identifier (table or column name) so it can be interpolated
 * into a statement.
 *
 * The name is wrapped in double quotes and every embedded double quote is
 * doubled, so a quote inside the name cannot terminate the identifier early.
 *
 * @param identifier - Raw, unquoted table or column name.
 * @returns The identifier wrapped in double quotes.
 */
export function quoteIdent(identifier: string): string {
  return `"${identifier.replaceAll('"', '""')}"`;
}

// All patterns are matched as substrings of the upper-cased type name, so
// parameterized or nested spellings such as DECIMAL(18,2) or BIGINT[] match too.

// "INT" alone covers every integer spelling (TINYINT, UBIGINT, HUGEINT, INTEGER, ...).
// The negative lookahead keeps it from matching the first three letters of
// INTERVAL, which must fall through to the date bucket below.
const NUMERIC_TYPE = /(INT(?!ERVAL)|DOUBLE|FLOAT|DECIMAL|NUMERIC|REAL)/;
const DATE_TYPE = /(TIMESTAMP|DATE|TIME|INTERVAL)/;
const BOOLEAN_TYPE = /BOOL/;
const TEXT_TYPE = /(CHAR|VARCHAR|TEXT|STRING|UUID|JSON)/;

/**
 * Bucket a DuckDB column type into a coarse kind used for profiling and discovery.
 *
 * Matching is case-insensitive and the buckets are tried in a fixed order
 * (numeric, date, boolean, text); the first bucket whose pattern occurs in the
 * type name wins, and anything unmatched is reported as `"other"`.
 *
 * @param type - Column type name, e.g. `"BIGINT"` or `"TIMESTAMP WITH TIME ZONE"`.
 * @returns The kind the type belongs to.
 */
export function classifyType(type: string): ProfileColumnKind {
  const normalized = type.toUpperCase();
  if (NUMERIC_TYPE.test(normalized)) {
    return "numeric";
  }
  if (DATE_TYPE.test(normalized)) {
    return "date";
  }
  if (BOOLEAN_TYPE.test(normalized)) {
    return "boolean";
  }
  if (TEXT_TYPE.test(normalized)) {
    return "text";
  }
  return "other";
}
"@/types/plugin"; import { @@ -43,8 +43,11 @@ interface WorkspaceState { // Tables tables: TableInfo[]; fileEntries: FileEntry[]; + tableProfiles: Record; addTable: (table: TableInfo, fileName: string, data: Uint8Array) => void; removeTable: (name: string) => void; + loadTableProfile: (name: string) => Promise; + clearTableProfile: (name: string) => void; // Clear clearWorkspace: () => Promise; @@ -168,6 +171,7 @@ export const useWorkspaceStore = create((set, get) => ({ tables: [], fileEntries: [], + tableProfiles: {}, addTable: (table, fileName, data) => set((state) => ({ tables: [...state.tables.filter((t) => t.name !== table.name), table], @@ -175,11 +179,18 @@ export const useWorkspaceStore = create((set, get) => ({ ...state.fileEntries.filter((f) => f.name !== table.name), { name: table.name, fileName, data: new Uint8Array(data) }, ], + tableProfiles: { + ...state.tableProfiles, + [table.name]: { status: "idle", profile: null, error: null }, + }, })), removeTable: async (name) => { set((state) => ({ tables: state.tables.filter((t) => t.name !== name), fileEntries: state.fileEntries.filter((f) => f.name !== name), + tableProfiles: Object.fromEntries( + Object.entries(state.tableProfiles).filter(([tableName]) => tableName !== name) + ), })); try { const conn = await getConnection(); @@ -205,12 +216,59 @@ export const useWorkspaceStore = create((set, get) => ({ set({ tables: [], fileEntries: [], + tableProfiles: {}, tabs: [tab], activeTabId: tab.id, shareUrl: null, }); }, + loadTableProfile: async (name) => { + const table = get().tables.find((t) => t.name === name); + if (!table) return; + + const current = get().tableProfiles[name]; + if (current?.status === "loading") return; + + set((state) => ({ + tableProfiles: { + ...state.tableProfiles, + [name]: { status: "loading", profile: current?.profile ?? 
null, error: null }, + }, + })); + + try { + const { profileTable } = await import("@/lib/duckdb/profile"); + const profile = await profileTable(table); + if (!get().tables.some((t) => t.name === name)) return; + set((state) => ({ + tableProfiles: { + ...state.tableProfiles, + [name]: { status: "ready", profile, error: null }, + }, + })); + } catch (err) { + if (!get().tables.some((t) => t.name === name)) return; + set((state) => ({ + tableProfiles: { + ...state.tableProfiles, + [name]: { + status: "error", + profile: null, + error: err instanceof Error ? err.message : String(err), + }, + }, + })); + } + }, + + clearTableProfile: (name) => + set((state) => ({ + tableProfiles: Object.fromEntries( + Object.entries(state.tableProfiles).filter(([tableName]) => tableName !== name) + ), + })), + tabs: [initialTab], activeTabId: initialTab.id, diff --git a/src/types/discovery.ts b/src/types/discovery.ts new file mode 100644 index 0000000..ba9e770 --- /dev/null +++ b/src/types/discovery.ts @@ -0,0 +1,42 @@ +import type { TableProfile } from "./index"; + +/** One endpoint of a relationship: a specific column in a specific table. */ +export interface ColumnRef { + table: string; + column: string; +} + +/** Individual scoring signals behind a relationship's confidence (0..1 each). */ +export interface RelationshipSignals { + /** Fraction of distinct foreign values found in the referenced key (the decisive signal). */ + valueOverlap: number; + /** Name-similarity between the foreign column and its key / referenced table. */ + nameSimilarity: number; + /** 1 when types match exactly, lower for compatible-but-different types. */ + typeMatch: number; + /** 1 for a clean many-to-one shape (key side unique, foreign side not). */ + cardinalityShape: number; +} + +/** Cardinality of the relationship as observed in the data. 
*/ +export type RelationshipCardinality = "one-to-one" | "many-to-one"; + +/** + * A directed foreign-key style relationship: `from` (the foreign column) references + * `to` (the unique key column). Read as `from.column ↳ to.column`. + */ +export interface Relationship { + from: ColumnRef; + to: ColumnRef; + /** 0..100, blended from `signals`. */ + confidence: number; + cardinality: RelationshipCardinality; + signals: RelationshipSignals; +} + +/** Full output of a folder inspection, serialized to .querypad/ artifacts. */ +export interface DiscoveryReport { + generatedAt: number; + profiles: TableProfile[]; + relationships: Relationship[]; +} diff --git a/src/types/index.ts b/src/types/index.ts index 89ffeba..5ded149 100644 --- a/src/types/index.ts +++ b/src/types/index.ts @@ -9,6 +9,40 @@ export interface TableInfo { rowCount: number; } +export type ProfileColumnKind = "numeric" | "date" | "text" | "boolean" | "other"; + +export interface ProfileTopValue { + value: string; + count: number; +} + +export interface ColumnProfile { + name: string; + type: string; + kind: ProfileColumnKind; + nullCount: number; + nullPercent: number; + distinctCount: number | null; + min: string | number | null; + max: string | number | null; + avg: number | null; + topValues: ProfileTopValue[]; +} + +export interface TableProfile { + tableName: string; + rowCount: number; + columnCount: number; + generatedAt: number; + columns: ColumnProfile[]; +} + +export interface TableProfileState { + status: "idle" | "loading" | "ready" | "error"; + profile: TableProfile | null; + error: string | null; +} + export interface QueryResult { columns: string[]; columnTypes: string[]; diff --git a/test/ask.test.ts b/test/ask.test.ts new file mode 100644 index 0000000..57d6d25 --- /dev/null +++ b/test/ask.test.ts @@ -0,0 +1,122 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { isReadOnlyQuery, stripSqlFences } from "../src/lib/discovery/sql-safety"; +import { 
buildAskContext } from "../src/lib/agent/ask-context"; +import { runAsk, type AskAi } from "../src/cli/ask"; +import type { Relationship } from "../src/types/discovery"; + +const REL: Relationship = { + from: { table: "payments", column: "user_id" }, + to: { table: "users", column: "id" }, + confidence: 100, + cardinality: "many-to-one", + signals: { valueOverlap: 1, nameSimilarity: 1, typeMatch: 1, cardinalityShape: 1 }, +}; + +// ---- Pure safety unit tests --------------------------------------------------- + +test("stripSqlFences removes ```sql fences and bare fences", () => { + assert.equal(stripSqlFences("```sql\nSELECT 1\n```"), "SELECT 1"); + assert.equal(stripSqlFences("```\nSELECT 1\n```"), "SELECT 1"); + assert.equal(stripSqlFences(" SELECT 1 "), "SELECT 1"); +}); + +test("isReadOnlyQuery allows reads and rejects writes/DDL", () => { + assert.ok(isReadOnlyQuery("SELECT * FROM users")); + assert.ok(isReadOnlyQuery("WITH t AS (SELECT 1) SELECT * FROM t")); + assert.ok(isReadOnlyQuery("```sql\nSELECT 1\n```")); + for (const bad of [ + "DROP TABLE users", + "DELETE FROM users", + "UPDATE users SET x = 1", + "INSERT INTO users VALUES (1)", + "ALTER TABLE users ADD c INT", + "CREATE TABLE t (a INT)", + "ATTACH 'x.db'", + "COPY users TO 'x.csv'", + ]) { + assert.ok(!isReadOnlyQuery(bad), `should reject: ${bad}`); + } +}); + +test("isReadOnlyQuery cannot be bypassed with a leading comment", () => { + assert.ok(!isReadOnlyQuery("-- harmless\nDROP TABLE users")); + assert.ok(!isReadOnlyQuery("/* c */ DELETE FROM users")); +}); + +test("buildAskContext includes the inferred relationship lines", () => { + const context = buildAskContext({ + tables: [ + { name: "users", columns: [{ name: "id", type: "BIGINT" }], rowCount: 5 }, + { + name: "payments", + columns: [ + { name: "id", type: "BIGINT" }, + { name: "user_id", type: "BIGINT" }, + ], + rowCount: 8, + }, + ], + relationships: [REL], + }); + assert.match(context, /Known relationships/); + assert.match(context, 
/payments\.user_id -> users\.id/); +}); + +// ---- Pipeline integration with a stubbed AI (no network) ---------------------- + +const JOIN_SQL = + "SELECT u.plan, COUNT(*) AS payment_count, CAST(SUM(p.amount) AS DOUBLE) AS total " + + "FROM payments p JOIN users u ON p.user_id = u.id GROUP BY u.plan ORDER BY u.plan"; + +function stubAi(sql: string): AskAi { + return { + generateSql: async () => sql, + generateInsight: async () => "All payments come from paid-plan users.", + }; +} + +test("runAsk executes generated SQL over fixtures and returns results", async () => { + const lines: string[] = []; + const result = await runAsk({ + question: "total payment amount by user plan", + folder: "fixtures/data", + // Fenced to also exercise stripSqlFences end-to-end. + ai: stubAi("```sql\n" + JOIN_SQL + "\n```"), + log: (line) => lines.push(line), + }); + + assert.deepEqual(result.result?.columns, ["plan", "payment_count", "total"]); + assert.equal(result.result?.rows.length, 1); + const row = result.result!.rows[0]; + assert.equal(String(row.plan), "paid"); + assert.equal(Number(row.payment_count), 8); + assert.ok(Math.abs(Number(row.total) - 285.74) < 0.01); + assert.equal(result.insight, "All payments come from paid-plan users."); + assert.ok(lines.join("\n").includes("Insight:")); +}); + +test("runAsk refuses to execute non-read-only generated SQL", async () => { + await assert.rejects( + runAsk({ + question: "delete everything", + folder: "fixtures/data", + ai: stubAi("DROP TABLE users"), + log: () => {}, + }), + /non-read-only/ + ); +}); + +test("runAsk --show-sql returns SQL without executing", async () => { + const result = await runAsk({ + question: "anything", + folder: "fixtures/data", + showSql: true, + ai: stubAi("```sql\nSELECT 1\n```"), + log: () => {}, + }); + assert.equal(result.sql, "SELECT 1"); + assert.equal(result.result, null); + assert.equal(result.insight, null); +}); diff --git a/test/discovery.test.ts b/test/discovery.test.ts new file mode 100644 
index 0000000..6ffade2 --- /dev/null +++ b/test/discovery.test.ts @@ -0,0 +1,107 @@ +import assert from "node:assert/strict"; +import path from "node:path"; +import test from "node:test"; +import { + cardinalityShapeScore, + confidence, + isTypeCompatible, + nameSimilarity, + singularize, + splitTokens, + typeMatchScore, +} from "../src/lib/discovery/signals"; +import { discoverRelationships } from "../src/lib/discovery/relationships"; +import { createNodeDb } from "../src/lib/duckdb-node/connection"; +import { loadFolder } from "../src/lib/duckdb-node/load"; +import { profileTable } from "../src/lib/duckdb-node/profile"; + +// ---- Pure signal unit tests --------------------------------------------------- + +test("splitTokens handles snake_case and camelCase", () => { + assert.deepEqual(splitTokens("user_id"), ["user", "id"]); + assert.deepEqual(splitTokens("customerId"), ["customer", "id"]); +}); + +test("singularize covers common plural forms", () => { + assert.equal(singularize("users"), "user"); + assert.equal(singularize("companies"), "company"); + assert.equal(singularize("addresses"), "address"); +}); + +test("nameSimilarity rewards canonical FK conventions", () => { + // events.user_id ↳ users.id + assert.equal(nameSimilarity("user_id", "id", "users"), 1); + // payments.customer_id ↳ subscriptions.customer_id + assert.equal(nameSimilarity("customer_id", "customer_id", "subscriptions"), 1); +}); + +test("nameSimilarity stays weak for bare surrogate ids", () => { + // events.id vs payments.id — only the shared "id" token + assert.ok(nameSimilarity("id", "id", "payments") < 0.6); + // payments.user_id vs events.id — wrong table reference + assert.ok(nameSimilarity("user_id", "id", "events") < 0.6); +}); + +test("type compatibility and match scoring", () => { + assert.ok(isTypeCompatible("numeric", "numeric")); + assert.ok(isTypeCompatible("numeric", "text")); + assert.ok(!isTypeCompatible("numeric", "date")); + assert.equal(typeMatchScore("BIGINT", "BIGINT", 
"numeric", "numeric"), 1); + assert.equal(typeMatchScore("INTEGER", "BIGINT", "numeric", "numeric"), 0.85); +}); + +test("cardinality shape distinguishes many-to-one from one-to-one", () => { + assert.equal(cardinalityShapeScore(true, false), 1); + assert.equal(cardinalityShapeScore(true, true), 0.8); + assert.equal(cardinalityShapeScore(false, false), 0); +}); + +test("confidence is 100 for a perfect FK and lower for a weak name", () => { + assert.equal( + confidence({ valueOverlap: 1, nameSimilarity: 1, typeMatch: 1, cardinalityShape: 1 }), + 100 + ); + const weakName = confidence({ + valueOverlap: 1, + nameSimilarity: 0.33, + typeMatch: 1, + cardinalityShape: 1, + }); + assert.ok(weakName < 90 && weakName > 50); +}); + +// ---- Engine integration test (real Node DuckDB over fixtures) ------------------ + +test("inspect fixtures yields exactly the two true relationships", async () => { + const folder = path.resolve(process.cwd(), "fixtures/data"); + const db = await createNodeDb(); + try { + const { tables } = await loadFolder(folder, db.runner); + assert.equal(tables.length, 3); + + const now = 1_700_000_000_000; + const profiles = []; + for (const table of tables) { + profiles.push(await profileTable(table, db.runner, now)); + } + + const relationships = await discoverRelationships(profiles, db.runner); + const edges = relationships.map( + (rel) => `${rel.from.table}.${rel.from.column}->${rel.to.table}.${rel.to.column}` + ); + + assert.deepEqual( + new Set(edges), + new Set(["payments.user_id->users.id", "events.user_id->users.id"]) + ); + for (const rel of relationships) { + assert.equal(rel.confidence, 100); + assert.equal(rel.cardinality, "many-to-one"); + } + // No spurious edges into surrogate id columns of other tables. 
+ assert.ok(!edges.some((edge) => edge.endsWith("events.id"))); + assert.ok(!edges.some((edge) => edge.endsWith("payments.id"))); + } finally { + db.close(); + } +}); From e6abad905653107f8edd461495102d409711bd48 Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 00:04:40 +0900 Subject: [PATCH 2/6] docs: pivot positioning to "Cursor for Data" Reframe README, roadmap, contributor + agent guides, and changelog around dataset understanding (inspect + ask) instead of an AI SQL editor; publish ROADMAP.md. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 23 ++++++- CHANGELOG.md | 29 ++++++++ CONTRIBUTING.md | 35 ++++++++-- README.md | 147 ++++++++++++++++++++++++++++++++++++----- ROADMAP.md | 172 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 383 insertions(+), 23 deletions(-) create mode 100644 ROADMAP.md diff --git a/AGENTS.md b/AGENTS.md index 7986d6f..904df49 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,8 +4,27 @@ This version has breaking changes — APIs, conventions, and file structure may all differ from your training data. Read the relevant guide in `node_modules/next/dist/docs/` before writing any code. Heed deprecation notices. +## Product direction + +QueryPad is pivoting from "AI-powered SQL editor" to **Cursor for Data** — a +local-first AI workspace that understands datasets (discovers relationships, builds +semantic models) before generating SQL. See `ROADMAP.md` for the layered plan. + +## Two surfaces, one core + +- **Web app** (`src/app`, `src/components`) runs DuckDB-Wasm in the browser. +- **CLI** (`src/cli`, `src/lib/duckdb-node`) runs native `@duckdb/node-api` in Node. +- **Shared, engine-agnostic core** (`src/lib/discovery`, `src/lib/duckdb/sql-utils.ts`) + is consumed by both via a `QueryRunner` abstraction. +- Node-only code (`src/lib/duckdb-node`, `src/cli`) must never be imported by app code, + or the native addon leaks into the browser bundle. `npm run check`'s build step + verifies this. 
+ ## Release and verification - Keep `package.json`, `package-lock.json`, and the latest `CHANGELOG.md` release version in sync. -- Run `npm run check` after code/config changes, and `npm test` when UI behavior or e2e-covered flows change. -- Do not commit demo video artifacts; use them as release/README upload assets. +- Run `npm run check` after code/config changes. +- Run `npm test` when UI behavior or e2e-covered flows change. +- Run `npm run test:cli` when discovery/CLI logic changes. +- Do not commit demo video artifacts or `.querypad/` inspection output; use the videos + as release/README upload assets. diff --git a/CHANGELOG.md b/CHANGELOG.md index de4f331..f91fd0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,35 @@ QueryPad is a web app, not an npm package. Version numbers mark GitHub release milestones and public product updates. +## Unreleased + +### CLI: Dataset Understanding + +- New `querypad inspect <folder>` command that profiles a folder of data files and + infers foreign-key relationships with confidence scores +- New `querypad ask "<question>" <folder>` command (AI Analyst): generates SQL using the + inferred relationships as context, runs it on DuckDB, and explains the result +- Generated SQL is read-only-gated (only SELECT/WITH/EXPLAIN/… execute) and code-fence stripped +- CLI AI keys come from `ANTHROPIC_API_KEY` / `OPENAI_API_KEY`; provider via `--provider` +- Writes `.querypad/` artifacts (`schema.json`, `relationships.json`, `inspect-summary.md`) + for AI agents such as Claude Code to reason about the dataset +- Engine-agnostic discovery core shared between the browser app and the Node CLI +- Runs on a native Node DuckDB engine (`@duckdb/node-api`), separate from the browser Wasm engine + +### Multi-Provider BYOK + +- OpenAI BYOK support for the Cmd+K AI SQL assistant via the Responses API +- Provider selector for Claude and OpenAI with independent browser-local keys +- Updated the default Claude model to `claude-sonnet-4-6` +- Added `gpt-5.5` as the 
default OpenAI model + +### Data Profile & Agent Context + +- On-demand data profile drawer for loaded tables +- Column-level nulls, distinct counts, numeric ranges, averages, and top values +- Copy Agent Context action for Codex, Claude Code, or other coding agents +- README positioning updated around local-first OSS and the hosted demo + ## v0.6 — Open-Source Release - Vercel Analytics integration diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2cc284e..b4a9384 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -13,15 +13,38 @@ npm run dev The app will be available at `http://localhost:3000`. +QueryPad has two surfaces sharing one engine-agnostic understanding core: a **web +app** (DuckDB-Wasm in the browser) and a **CLI** (native `@duckdb/node-api` in Node). +The product direction is **Cursor for Data** — understanding datasets (discovering +relationships, building semantic models) before generating SQL. See +[ROADMAP.md](ROADMAP.md). + ## Project Structure ``` src/ - app/ # Next.js app router pages and API routes - components/ # React components - lib/ # Core logic (DuckDB, AI, utilities) - stores/ # Zustand state management - types/ # TypeScript type definitions + app/ # Next.js app router pages + components/ # React components (web app) + cli/ # querypad CLI: index.ts (dispatch), inspect.ts, artifacts.ts + lib/ + discovery/ # engine-agnostic core: signals.ts (pure), relationships.ts + duckdb/ # browser DuckDB-Wasm: profile.ts, sql-utils.ts (shared) + duckdb-node/ # Node DuckDB: connection.ts, load.ts, profile.ts + ai/ # SQL generation, providers, BYOK key storage + stores/ # Zustand state management + types/ # TypeScript type definitions (incl. discovery.ts) +test/ # Node test runner specs for discovery/CLI +fixtures/data/ # sample related files for CLI inspection +``` + +> Node-only code (`src/cli`, `src/lib/duckdb-node`) must not be imported by app code — +> it would pull the native DuckDB addon into the browser bundle. 
+ +## Running the CLI + +```bash +npm run querypad -- inspect ./fixtures/data +# writes ./fixtures/data/.querypad/ (gitignored) ``` ## How to Contribute @@ -30,7 +53,7 @@ src/ 2. Create a feature branch (`git checkout -b feat/my-feature`) 3. Make your changes 4. Run the local checks (`npm run check`) -5. Run the e2e tests (`npm test`) +5. Run the e2e tests (`npm test`) for UI changes, and `npm run test:cli` for discovery/CLI changes 6. Commit your changes 7. Push to your fork and open a Pull Request diff --git a/README.md b/README.md index 8dd77b2..1a91d7d 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,137 @@ # QueryPad -> **Drop a file. Query with SQL. Visualize and share. All in your browser.** +> **Cursor for Data — a local-first AI workspace that understands your datasets, not just runs SQL on them.** -A browser-native SQL playground powered by DuckDB-Wasm. Drag & drop CSV, Parquet, JSON, or Excel files — write SQL, visualize results, and share with a link. No server, no account, no install. +QueryPad points an AI at a folder of CSV/Parquet/JSON files, profiles them, +discovers how they connect, and helps you analyze them with DuckDB — locally, +with no server-side data processing, no account, and no install. + +The execution layer is solved (DuckDB does it well). The unsolved problem is that +**people don't understand their data**: which tables exist, what each field means, +how datasets connect, which join is correct. QueryPad is built to answer those +questions first, then generate and run the SQL.

- Try it now + Try the web app

-## Why QueryPad? +## Two surfaces, one understanding engine + +QueryPad ships as a **CLI** for dataset understanding and a **browser app** for +interactive analysis. Both share the same engine-agnostic discovery core; only the +DuckDB binding differs (native `@duckdb/node-api` for the CLI, DuckDB-Wasm for the web). + +```text + ┌─────────────────────────┐ + folder of → │ Discovery core │ → .querypad/ artifacts + data files │ profile → relationships│ (schema + relationships) + │ → semantic model │ + └───────────┬─────────────┘ + CLI (Node) │ Web (Wasm) + querypad │ querypad.io + inspect │ drop & query +``` + +## CLI: dataset understanding + +```bash +querypad inspect ./data +``` + +Scans a folder, profiles every file, and infers foreign-key relationships with +confidence scores: + +```text +Tables: 3 +Relationships: 2 + payments.user_id ↳ users.id (100%, many-to-one) + events.user_id ↳ users.id (100%, many-to-one) + +Wrote artifacts to ./data/.querypad +``` + +It writes machine-readable artifacts that an AI agent (Claude Code, Cursor, …) can +read to reason about the dataset instead of guessing at pandas: + +```text +.querypad/ + schema.json # tables, columns, types, per-column profiles + relationships.json # inferred joins with confidence + signals + inspect-summary.md # human- and agent-readable overview +``` + +```text +Claude Code + QueryPad + DuckDB +``` + +## How relationship discovery works + +For every table, QueryPad computes a statistical profile (row count, null %, +distinct count, ranges, top values). It then identifies primary-key candidates +(unique, non-null), prunes likely foreign-key pairs by **name similarity** and +**type compatibility**, and runs a **value-overlap** query for each survivor. 
A +confidence score blends four signals — value overlap (dominant), name similarity, +type match, and cardinality shape — and competition disambiguation keeps a foreign +column pointed at its single strongest target, so overlapping integer id ranges +don't produce false positives. + +## Product layers -You have a CSV. You want to run a quick SQL query. You don't want to spin up a database, open a notebook, or install anything. +| Layer | What it does | Status | +|-------|--------------|--------| +| **1 — Dataset Discovery** | Scan folders; detect schema, types, statistics, uniqueness, cardinality | ✅ Built (`profile`) | +| **2 — Relationship Discovery** | Infer joins automatically with confidence scores | ✅ Built (`inspect`) | +| **3 — Semantic Model** | Roll relationships into named business entities (`Customer ├ Payment ├ Event`) | 🚧 Roadmap | +| **4 — AI Analyst** | Natural-language questions → SQL → execution → insight (`ask`) | ✅ Built (`ask`) | -QueryPad lets you drop files and start querying in seconds. Everything runs in your browser — your data never leaves your machine. +See [ROADMAP.md](ROADMAP.md) for the full plan. -## Features +## CLI: ask a question + +```bash +export ANTHROPIC_API_KEY=sk-ant-... # or OPENAI_API_KEY with --provider openai +querypad ask "total payment amount by user plan" ./data +``` + +`ask` builds context from the inferred relationships (so the generated SQL joins on the +right keys), runs it on DuckDB, and explains the result: + +```text +-- SQL +SELECT u.plan, COUNT(*) AS payment_count, SUM(p.amount) AS total +FROM payments p JOIN users u ON p.user_id = u.id +GROUP BY u.plan ORDER BY u.plan + +plan payment_count total +---- ------------- ------ +paid 8 285.74 + +Insight: All payments come from paid-plan users. +``` + +Generated SQL is read-only-gated (only `SELECT`/`WITH`/… execute) and the DB is in-memory, +so source files are never modified. Use `--show-sql` to preview the SQL without running it. 
+ +## Web app: interactive analysis + +The browser app at [querypad.io](https://querypad.io) is the same OSS app running +client-side. Your data stays on your machine unless you explicitly share or collaborate. - **Drag & drop anything** — CSV, Parquet, JSON, Excel — drop multiple formats at once and JOIN them -- **DuckDB-Wasm SQL** — Full analytical SQL in the browser (JOIN, GROUP BY, window functions, etc.) -- **Inline charts** — One-click Bar, Line, Scatter, Pie charts from query results +- **DuckDB-Wasm SQL** — Full analytical SQL in the browser (JOIN, GROUP BY, window functions, …) +- **Data profiles** — Column-level nulls, distinct counts, ranges, averages, and top values +- **Agent context** — Copy schema, profiles, active SQL, and latest results for Claude Code or Codex +- **AI SQL assistant** — Cmd+K for natural language to SQL with Claude or OpenAI BYOK +- **Inline charts** — One-click Bar, Line, Scatter, Pie from query results - **URL sharing** — Compress data + query into a single shareable link -- **AI SQL assistant** — Cmd+K for natural language to SQL (BYOK: bring your own Anthropic API key, runs entirely in browser) - **Sample data on first visit** — Start exploring immediately, drop your own files when ready -### Advanced Features +
+More web app features - **Monaco Editor** — Table/column autocomplete, syntax highlighting, Cmd+Enter to run - **Virtualized table** — Smooth rendering up to 10,000 rows @@ -40,8 +144,12 @@ QueryPad lets you drop files and start querying in seconds. Everything runs in y - **Real-time collaboration** — PartyKit + Y.js CRDT with remote cursors - **File size guardrails** — 100 MB per file limit with clear warnings +
+ ## Quick Start +**Web app:** + ```bash npm install npm run dev @@ -49,13 +157,21 @@ npm run dev Open `http://localhost:3000`. Sample data is automatically loaded on first visit. +**CLI:** + +```bash +npm install +npm run querypad -- inspect ./fixtures/data # discover relationships +ANTHROPIC_API_KEY=sk-ant-... npm run querypad -- ask "payments by plan" ./fixtures/data +``` + ## Tech Stack | Area | Technology | |------|-----------| -| SQL Engine | DuckDB-Wasm | +| Query Engine | DuckDB-Wasm (web) · `@duckdb/node-api` (CLI) | | Framework | Next.js + TypeScript + Tailwind CSS v4 | -| AI | Anthropic Claude API (BYOK, browser-direct) | +| AI | Anthropic Claude + OpenAI BYOK | | Editor | Monaco Editor | | State | Zustand | | Charts | Recharts | @@ -64,13 +180,14 @@ Open `http://localhost:3000`. Sample data is automatically loaded on first visit ## Releases -QueryPad is a web app, not an npm package. Version numbers mark GitHub +QueryPad is a local-first tool, not a hosted SaaS. Version numbers mark GitHub release milestones and public product updates. See [CHANGELOG.md](CHANGELOG.md) for release notes. ## Contributing -Contributions are welcome! Feel free to open issues and pull requests. +Contributions are welcome! Feel free to open issues and pull requests. See +[CONTRIBUTING.md](CONTRIBUTING.md). ## License diff --git a/ROADMAP.md b/ROADMAP.md new file mode 100644 index 0000000..4d86403 --- /dev/null +++ b/ROADMAP.md @@ -0,0 +1,172 @@ +# QueryPad Roadmap — Cursor for Data + +QueryPad is pivoting from "AI-powered SQL editor" to **Cursor for Data**: a +local-first AI workspace that understands folders of CSV/Parquet files, discovers +relationships, builds semantic models, and answers business questions using DuckDB. + +The execution layer is solved — DuckDB does it well. The unsolved problem is +**dataset understanding**: which tables exist, what each field means, how datasets +connect, which join is correct. 
That is the bottleneck this roadmap attacks, and +the reason we build the understanding engine **CLI-first** before investing in UI. + +> Cursor understands code → generates code → edits code → runs code. +> QueryPad understands datasets → infers relationships → generates SQL → executes analysis → explains findings. +> +> The semantic model is the AST for data. The relationship graph is the codebase graph. + +## The four layers + +```text +Layer 1 Dataset Discovery → profile files: schema, stats, uniqueness, cardinality +Layer 2 Relationship Disc. → infer joins automatically, with confidence scores +Layer 3 Semantic Model → roll relationships into named business entities +Layer 4 AI Analyst → question → semantic model → SQL → execution → insight +``` + +| Layer | Deliverable | Status | +|-------|-------------|--------| +| 1 — Dataset Discovery | Folder scan + per-column profiles (`profileTable`, `loadFolder`) | ✅ Built | +| 2 — Relationship Discovery | Confidence-scored FK inference (`discoverRelationships`, `querypad inspect`) | ✅ Built | +| 4 — AI Analyst | `querypad ask`: NL → SQL (relationship-aware) → execution → insight | ✅ Built | +| 3 — Semantic Model | Entity rollup → `.querypad/semantic-model.yaml` | 🚧 Next | +| UI — AI Verification | Lightweight web UI to accept/reject/edit inferred relationships | 🚧 Planned | +| `querypad explain` | Justify confidence from stored `RelationshipSignals` | 🚧 Planned | + +## Built today + +Two CLI commands ship: `querypad inspect` (Layers 1–2) and `querypad ask` (Layer 4). + +```bash +querypad inspect ./data +``` + +```text +Tables: 3 +Relationships: 2 + payments.user_id ↳ users.id (100%, many-to-one) + events.user_id ↳ users.id (100%, many-to-one) +Wrote artifacts to ./data/.querypad +``` + +```bash +ANTHROPIC_API_KEY=sk-ant-... 
querypad ask "payments by plan" ./data +``` + +```text +-- SQL +SELECT u.plan, COUNT(*) AS payment_count, SUM(p.amount) AS total +FROM payments p JOIN users u ON p.user_id = u.id GROUP BY u.plan +... +Insight: All payments come from paid-plan users. +``` + +Architecture (engine-agnostic core, two DuckDB bindings): + +```text +src/lib/discovery/ signals.ts · relationships.ts · sql-safety.ts (read-only gate) +src/lib/ai/ complete.ts (shared streaming) · generate-sql.ts · providers.ts +src/lib/duckdb-node/ connection.ts · load.ts · profile.ts (native @duckdb/node-api) +src/lib/duckdb/ sql-utils.ts (shared) · profile.ts (browser DuckDB-Wasm) +src/cli/ index.ts (dispatch) · inspect.ts · ask.ts · artifacts.ts +``` + +Relationship discovery: profile each table → find primary-key candidates (unique, +non-null) → prune FK pairs by name similarity + type compatibility → run a +value-overlap query per survivor → blend four signals (value overlap, name +similarity, type match, cardinality shape) into a 0–100% confidence → keep each +foreign column's single strongest target (competition disambiguation) so +overlapping id ranges don't yield false positives. + +Artifacts written to `.querypad/`: + +```text +schema.json tables, columns, types, per-column profiles +relationships.json inferred joins with confidence + per-signal breakdown +inspect-summary.md human- and agent-readable overview +``` + +## Layer 3 — Semantic Model (next) + +Roll inferred relationships into named business entities, stored as the source of truth. + +```yaml +# .querypad/semantic-model.yaml +entities: + - name: Customer + table: users + has_many: [Payment, Event] + - name: Payment + table: payments + belongs_to: Customer +``` + +- Derive entity names from table names (singularize) and the relationship graph. +- Persist as `semantic-model.yaml`; let users override/curate it. +- Surface conflicts (ambiguous joins, multiple FK candidates) for resolution. 
+ +## Layer 4 — AI Analyst (built) + +```bash +querypad ask "show 7-day retention for paid users" ./data +``` + +```text +Question → inferred relationships as context → SQL generation → DuckDB execution → insight +``` + +- Reuses the AI layer (`src/lib/ai/complete.ts`, Claude + OpenAI). CLI keys come from + `ANTHROPIC_API_KEY` / `OPENAI_API_KEY`; provider via `--provider`. +- Feeds the inferred relationships (`buildAskContext`) so generated SQL joins on the + right keys. Once Layer 3 lands, `ask` will prefer the semantic model as context. +- Generated SQL is read-only-gated (`isReadOnlyQuery`) and code-fence stripped before + execution; the in-memory DB is reloaded from files each run, so sources are never touched. +- `--show-sql` previews the SQL without executing. + +Still planned: `querypad explain` renders the stored per-signal breakdown to justify +each inferred relationship and surface potential conflicts. + +## UI — AI Verification (planned) + +After the CLI proves the understanding engine, build a **lightweight local web UI** +— not Tableau, not Metabase. Its purpose is **AI verification**, not dashboard building. + +```text +Left Center Right +───── ────── ────── +Tables Chat Generated SQL +Relationships Results +``` + +The defining interaction is validating AI assumptions: + +```text +Detected relationship + users.id ↳ payments.user_id Confidence 97% + [Accept] [Reject] [Edit] +``` + +This reuses the shared `src/lib/discovery` core, surfacing the same edges the CLI +emits. The existing browser app (Monaco, charts, pipelines, sharing) remains the +interactive-analysis surface; the verification view is additive. + +## Claude Code integration + +`querypad inspect` makes the dataset legible to coding agents. 
Instead of guessing +with pandas, Claude Code reads `.querypad/schema.json` + `relationships.json` and +reasons about the data directly: + +```text +Claude Code + QueryPad + DuckDB +``` + +A future MCP server can expose the same engine (`inspect`, `ask`, `describe`) as +typed tools for agent workflows — a natural follow-on once Layers 3–4 land. + +## Principles + +- **Use DuckDB.** Do not build a database or a query engine. +- **Understanding before UI.** Relationship inference and semantic modeling are the + bottleneck; a dashboard built before solving them is just another BI tool. +- **Local-first.** Computation and storage stay on the user's machine; AI is BYOK. +- **Agent-native.** Artifacts are structured, typed, and token-efficient so agents + can consume them directly. From ce7d24c66a64b936e6b88449a0fabea6565fda6d Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 01:16:39 +0900 Subject: [PATCH 3/6] feat: querypad semantic model + explain - inspect rolls inferred relationships into named entities (User has_many Payment, ...) 
and writes .querypad/semantic-model.yaml; ask feeds those entities as domain context - querypad explain: justify each inferred relationship from its stored signals (value overlap, name match, type, cardinality) with caveats to verify Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cli/artifacts.ts | 29 +++++++- src/cli/ask.ts | 8 +- src/cli/explain.ts | 48 ++++++++++++ src/cli/index.ts | 7 ++ src/cli/inspect.ts | 21 +++++- src/lib/agent/ask-context.ts | 38 ++++++++-- src/lib/discovery/explain.ts | 103 ++++++++++++++++++++++++++ src/lib/discovery/semantic-model.ts | 109 ++++++++++++++++++++++++++++ src/types/discovery.ts | 21 ++++++ test/explain.test.ts | 70 ++++++++++++++++++ test/semantic.test.ts | 95 ++++++++++++++++++++++++ 11 files changed, 536 insertions(+), 13 deletions(-) create mode 100644 src/cli/explain.ts create mode 100644 src/lib/discovery/explain.ts create mode 100644 src/lib/discovery/semantic-model.ts create mode 100644 test/explain.test.ts create mode 100644 test/semantic.test.ts diff --git a/src/cli/artifacts.ts b/src/cli/artifacts.ts index 0504313..30af621 100644 --- a/src/cli/artifacts.ts +++ b/src/cli/artifacts.ts @@ -1,5 +1,6 @@ import { mkdir, readFile, writeFile } from "node:fs/promises"; import path from "node:path"; +import { renderSemanticYaml } from "../lib/discovery/semantic-model"; import type { TableProfile } from "../types"; import type { DiscoveryReport, Relationship } from "../types/discovery"; @@ -35,10 +36,11 @@ export interface WrittenArtifacts { dir: string; schemaPath: string; relationshipsPath: string; + semanticModelPath: string; summaryPath: string; } -/** Write schema.json, relationships.json and inspect-summary.md under /.querypad/. */ +/** Write schema.json, relationships.json, semantic-model.yaml and inspect-summary.md. 
*/ export async function writeArtifacts( folder: string, report: DiscoveryReport, @@ -49,6 +51,7 @@ export async function writeArtifacts( const schemaPath = path.join(dir, "schema.json"); const relationshipsPath = path.join(dir, "relationships.json"); + const semanticModelPath = path.join(dir, "semantic-model.yaml"); const summaryPath = path.join(dir, "inspect-summary.md"); await writeFile( @@ -63,9 +66,10 @@ export async function writeArtifacts( 2 ) ); + await writeFile(semanticModelPath, renderSemanticYaml(report.semanticModel)); await writeFile(summaryPath, buildSummary(report, skipped)); - return { dir, schemaPath, relationshipsPath, summaryPath }; + return { dir, schemaPath, relationshipsPath, semanticModelPath, summaryPath }; } /** Human- and agent-readable markdown overview of the inspection. */ @@ -106,6 +110,23 @@ export function buildSummary(report: DiscoveryReport, skipped: string[]): string lines.push(""); } + const entities = report.semanticModel.entities; + lines.push(`## Entities (${entities.length})`, ""); + if (entities.length === 0) { + lines.push("No entities derived.", ""); + } else { + for (const entity of entities) { + const assoc = [ + entity.belongsTo.length > 0 ? `belongs_to: ${entity.belongsTo.join(", ")}` : null, + entity.hasMany.length > 0 ? `has_many: ${entity.hasMany.join(", ")}` : null, + entity.hasOne.length > 0 ? `has_one: ${entity.hasOne.join(", ")}` : null, + ].filter(Boolean); + const detail = assoc.length > 0 ? 
` — ${assoc.join("; ")}` : ""; + lines.push(`- ${entity.name} (${entity.table})${detail}`); + } + lines.push(""); + } + if (skipped.length > 0) { lines.push(`## Skipped files (${skipped.length})`, ""); lines.push(skipped.map((name) => `- ${name} (unsupported type)`).join("\n"), ""); @@ -114,8 +135,8 @@ export function buildSummary(report: DiscoveryReport, skipped: string[]): string lines.push( "## Next steps", "", - "- Review inferred relationships and adjust as needed.", - "- Feed `.querypad/schema.json` + `.querypad/relationships.json` to an AI agent to reason about the dataset.", + "- Review inferred relationships and entities, and adjust as needed.", + "- Feed `.querypad/schema.json`, `relationships.json`, and `semantic-model.yaml` to an AI agent to reason about the dataset.", "" ); diff --git a/src/cli/ask.ts b/src/cli/ask.ts index fdee836..8dc3337 100644 --- a/src/cli/ask.ts +++ b/src/cli/ask.ts @@ -2,6 +2,7 @@ import { complete } from "../lib/ai/complete"; import { SQL_SYSTEM_PROMPT, buildSqlInput } from "../lib/ai/generate-sql"; import { buildAskContext } from "../lib/agent/ask-context"; import { discoverRelationships } from "../lib/discovery/relationships"; +import { buildSemanticModel } from "../lib/discovery/semantic-model"; import { isReadOnlyQuery, stripSqlFences } from "../lib/discovery/sql-safety"; import { createNodeDb, type QueryResultRows } from "../lib/duckdb-node/connection"; import { loadFolder } from "../lib/duckdb-node/load"; @@ -83,7 +84,12 @@ export async function runAsk(options: RunAskOptions): Promise { relationships = await discoverRelationships(profiles, db.runner); } - const context = buildAskContext({ tables, relationships }); + const semanticModel = buildSemanticModel( + tables.map((table) => table.name), + relationships, + Date.now() + ); + const context = buildAskContext({ tables, relationships, semanticModel }); const sql = stripSqlFences(await ai.generateSql({ context, question: options.question })); log("-- SQL"); diff --git 
a/src/cli/explain.ts b/src/cli/explain.ts new file mode 100644 index 0000000..0e59d75 --- /dev/null +++ b/src/cli/explain.ts @@ -0,0 +1,48 @@ +import { buildExplanation } from "../lib/discovery/explain"; +import { readArtifacts } from "./artifacts"; + +export interface RunExplainOptions { + log?: (line: string) => void; +} + +/** + * `querypad explain [folder]`: read the inferred relationships from `.querypad/` and + * justify each one from its stored signals, plus caveats to verify. Pure consumer of + * artifacts — run `querypad inspect` first. Returns a process exit code. + */ +export async function runExplain(folder: string, options: RunExplainOptions = {}): Promise { + const log = options.log ?? ((line: string) => console.log(line)); + + const { relationships, profiles } = await readArtifacts(folder); + if (!relationships) { + log( + `No .querypad/relationships.json found in ${folder}. ` + + "Run `querypad inspect ` first." + ); + return 1; + } + + const tableNames = profiles?.map((profile) => profile.tableName) ?? 
[]; + const report = buildExplanation(relationships, tableNames); + + log(`Relationships (${report.relationships.length})`); + if (report.relationships.length === 0) { + log(" No relationships were inferred."); + } + for (const { relationship, reasons } of report.relationships) { + const from = `${relationship.from.table}.${relationship.from.column}`; + const to = `${relationship.to.table}.${relationship.to.column}`; + log(""); + log(`${from} ↳ ${to} — ${relationship.confidence}% (${relationship.cardinality})`); + for (const reason of reasons) log(` • ${reason}`); + } + + log(""); + log(`Caveats (${report.caveats.length})`); + if (report.caveats.length === 0) { + log(" None."); + } + for (const caveat of report.caveats) log(` • ${caveat}`); + + return 0; +} diff --git a/src/cli/index.ts b/src/cli/index.ts index 9d3903f..b94b1f4 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -1,5 +1,6 @@ #!/usr/bin/env -S npx tsx import { runAsk } from "./ask"; +import { runExplain } from "./explain"; import { runInspect } from "./inspect"; const HELP = `querypad — local-first dataset understanding @@ -10,6 +11,8 @@ Usage: querypad ask "" [folder] Answer a natural-language question: generate SQL using the inferred relationships, run it, and explain the result. + querypad explain [folder] Justify each inferred relationship from its signals, + with caveats to verify (reads .querypad/; run inspect first). querypad help Show this help Options for ask: @@ -77,6 +80,10 @@ async function main(argv: string[]): Promise { }); return 0; } + case "explain": { + const { positionals } = parseArgs(rest); + return runExplain(positionals[0] ?? 
"."); + } default: console.error(`Unknown command: ${command}\n`); console.error(HELP); diff --git a/src/cli/inspect.ts b/src/cli/inspect.ts index 45c8997..67fb1dd 100644 --- a/src/cli/inspect.ts +++ b/src/cli/inspect.ts @@ -1,5 +1,6 @@ import path from "node:path"; import { discoverRelationships } from "../lib/discovery/relationships"; +import { buildSemanticModel } from "../lib/discovery/semantic-model"; import { createNodeDb } from "../lib/duckdb-node/connection"; import { loadFolder } from "../lib/duckdb-node/load"; import { profileTable } from "../lib/duckdb-node/profile"; @@ -21,7 +22,12 @@ export async function runInspect(folder: string, now: number): Promise profile.tableName), + relationships, + now + ); - const report: DiscoveryReport = { generatedAt: now, profiles, relationships }; + const report: DiscoveryReport = { generatedAt: now, profiles, relationships, semanticModel }; const artifacts = await writeArtifacts(resolved, report, skipped); console.log(""); @@ -47,6 +58,12 @@ export async function runInspect(folder: string, now: number): Promise 0 ? ` → ${links.join(", ")}` : ""; + console.log(` ${entity.name} (${entity.table})${detail}`); + } if (skipped.length > 0) { console.log(`Skipped: ${skipped.length} unsupported file(s)`); } diff --git a/src/lib/agent/ask-context.ts b/src/lib/agent/ask-context.ts index 9f43fc9..8a43e94 100644 --- a/src/lib/agent/ask-context.ts +++ b/src/lib/agent/ask-context.ts @@ -1,10 +1,11 @@ import type { TableInfo } from "../../types"; -import type { Relationship } from "../../types/discovery"; +import type { Relationship, SemanticModel } from "../../types/discovery"; import { buildSchemaContext } from "../ai/schema-context"; export interface AskContextInput { tables: TableInfo[]; relationships: Relationship[]; + semanticModel?: SemanticModel; } /** Render the inferred join graph as guidance for the SQL-generating model. 
*/ @@ -20,15 +21,40 @@ function renderRelationships(relationships: Relationship[]): string { return lines.join("\n"); } +/** Render the business entities so the model can reason in domain terms. */ +function renderEntities(model: SemanticModel): string { + const lines = model.entities.map((entity) => { + const assoc = [ + entity.belongsTo.length > 0 ? `belongs_to ${entity.belongsTo.join(", ")}` : null, + entity.hasMany.length > 0 ? `has_many ${entity.hasMany.join(", ")}` : null, + entity.hasOne.length > 0 ? `has_one ${entity.hasOne.join(", ")}` : null, + ].filter(Boolean); + const detail = assoc.length > 0 ? ` ${assoc.join(", ")}` : ""; + return `- ${entity.name} (table ${entity.table})${detail}`; + }); + return lines.join("\n"); +} + /** - * Build the context block handed to the AI Analyst: table schemas plus the inferred - * relationships, so generated SQL joins on the correct keys. + * Build the context block handed to the AI Analyst: table schemas, the inferred + * relationships (so generated SQL joins on the correct keys), and — when available — + * the semantic model's business entities for domain-level framing. 
*/ -export function buildAskContext({ tables, relationships }: AskContextInput): string { - return [ +export function buildAskContext({ + tables, + relationships, + semanticModel, +}: AskContextInput): string { + const sections = [ buildSchemaContext(tables), "", "Known relationships (use these for JOINs):", renderRelationships(relationships), - ].join("\n"); + ]; + + if (semanticModel && semanticModel.entities.length > 0) { + sections.push("", "Business entities:", renderEntities(semanticModel)); + } + + return sections.join("\n"); } diff --git a/src/lib/discovery/explain.ts b/src/lib/discovery/explain.ts new file mode 100644 index 0000000..40bceaf --- /dev/null +++ b/src/lib/discovery/explain.ts @@ -0,0 +1,103 @@ +import type { Relationship } from "../../types/discovery"; + +/** + * Turn a relationship's stored signals into a human-readable justification. Pure — + * a relationship graph in, prose explanation out. No IO, no re-running discovery. + */ + +export interface ExplainedRelationship { + relationship: Relationship; + reasons: string[]; +} + +export interface ExplainReport { + relationships: ExplainedRelationship[]; + caveats: string[]; +} + +/** Confidence at/above which an edge is considered trustworthy without manual review. 
*/ +const LOW_CONFIDENCE = 75; + +function ref(rel: Relationship): { from: string; to: string } { + return { + from: `${rel.from.table}.${rel.from.column}`, + to: `${rel.to.table}.${rel.to.column}`, + }; +} + +function reasonsFor(rel: Relationship): string[] { + const { from, to } = ref(rel); + const s = rel.signals; + const reasons: string[] = []; + + reasons.push( + `${Math.round(s.valueOverlap * 100)}% of distinct ${from} values are present in ${to}` + ); + + if (s.nameSimilarity >= 0.9) { + reasons.push("column name strongly matches the target"); + } else if (s.nameSimilarity >= 0.5) { + reasons.push("column name partially matches the target"); + } else { + reasons.push("weak name match"); + } + + if (s.typeMatch >= 1) { + reasons.push("exact type match"); + } else if (s.typeMatch >= 0.85) { + reasons.push("same type family"); + } else if (s.typeMatch > 0) { + reasons.push("compatible types"); + } else { + reasons.push("type mismatch"); + } + + reasons.push( + rel.cardinality === "one-to-one" + ? 
"one-to-one (both sides unique)" + : "many-to-one (target key is unique)" + ); + + return reasons; +} + +function caveatsFor(relationships: Relationship[], tableNames: string[]): string[] { + const caveats: string[] = []; + + for (const rel of relationships) { + const { from, to } = ref(rel); + if (rel.confidence < LOW_CONFIDENCE) { + caveats.push(`${from} ↳ ${to} is low-confidence (${rel.confidence}%) — verify manually`); + } + if (rel.signals.nameSimilarity < 0.5 && rel.signals.valueOverlap >= 0.9) { + caveats.push( + `${from} ↳ ${to} matched mainly on data overlap with weak name evidence — possibly coincidental` + ); + } + } + + const linked = new Set(); + for (const rel of relationships) { + linked.add(rel.from.table); + linked.add(rel.to.table); + } + const orphans = tableNames.filter((name) => !linked.has(name)); + if (orphans.length > 0) { + caveats.push(`Tables with no inferred relationships: ${orphans.join(", ")}`); + } + + return caveats; +} + +export function buildExplanation( + relationships: Relationship[], + tableNames: string[] +): ExplainReport { + return { + relationships: relationships.map((relationship) => ({ + relationship, + reasons: reasonsFor(relationship), + })), + caveats: caveatsFor(relationships, tableNames), + }; +} diff --git a/src/lib/discovery/semantic-model.ts b/src/lib/discovery/semantic-model.ts new file mode 100644 index 0000000..12378bb --- /dev/null +++ b/src/lib/discovery/semantic-model.ts @@ -0,0 +1,109 @@ +import type { Relationship, SemanticEntity, SemanticModel } from "../../types/discovery"; +import { singularize, splitTokens } from "./signals"; + +/** + * Roll inferred relationships into named business entities. Pure — tables + + * relationships in, semantic model out. No DuckDB / IO. + */ + +function capitalize(token: string): string { + return token.length === 0 ? 
token : token[0].toUpperCase() + token.slice(1); +} + +function pascalCase(value: string): string { + return splitTokens(value).map(capitalize).join("") || value; +} + +/** Derive a PascalCase singular entity name, e.g. `order_items` → `OrderItem`. */ +export function entityName(table: string): string { + return pascalCase(singularize(table)) || pascalCase(table) || table; +} + +function pushUnique(list: string[], value: string): void { + if (!list.includes(value)) list.push(value); +} + +/** Assign one entity name per table, resolving collisions deterministically. */ +function nameEntities(tableNames: string[]): Map { + const byTable = new Map(); + const used = new Set(); + for (const table of tableNames) { + let name = entityName(table); + if (used.has(name)) { + // Collision (e.g. "users" and "user"): fall back to the full table PascalCase, + // then a numeric suffix. + const base = pascalCase(table) || name; + name = base; + let suffix = 2; + while (used.has(name)) { + name = `${base}${suffix}`; + suffix += 1; + } + } + used.add(name); + byTable.set(table, name); + } + return byTable; +} + +export function buildSemanticModel( + tableNames: string[], + relationships: Relationship[], + now: number +): SemanticModel { + const names = nameEntities(tableNames); + + const entities: SemanticEntity[] = tableNames.map((table) => ({ + name: names.get(table)!, + table, + belongsTo: [], + hasMany: [], + hasOne: [], + })); + const byTable = new Map(entities.map((entity) => [entity.table, entity])); + + for (const rel of relationships) { + const fromEntity = byTable.get(rel.from.table); + const toEntity = byTable.get(rel.to.table); + if (!fromEntity || !toEntity) continue; + + pushUnique(fromEntity.belongsTo, toEntity.name); + if (rel.cardinality === "one-to-one") { + pushUnique(toEntity.hasOne, fromEntity.name); + } else { + pushUnique(toEntity.hasMany, fromEntity.name); + } + } + + return { generatedAt: now, entities }; +} + +/** Deterministically render the semantic model 
as YAML. */ +export function renderSemanticYaml(model: SemanticModel): string { + const lines: string[] = [ + "# QueryPad semantic model", + `generated_at: ${new Date(model.generatedAt).toISOString()}`, + "entities:", + ]; + + if (model.entities.length === 0) { + lines.push(" []"); + return lines.join("\n") + "\n"; + } + + const assoc = (key: string, values: string[]) => { + if (values.length === 0) return; + lines.push(` ${key}:`); + for (const value of values) lines.push(` - ${value}`); + }; + + for (const entity of model.entities) { + lines.push(` - name: ${entity.name}`); + lines.push(` table: ${entity.table}`); + assoc("belongs_to", entity.belongsTo); + assoc("has_many", entity.hasMany); + assoc("has_one", entity.hasOne); + } + + return lines.join("\n") + "\n"; +} diff --git a/src/types/discovery.ts b/src/types/discovery.ts index ba9e770..577db6e 100644 --- a/src/types/discovery.ts +++ b/src/types/discovery.ts @@ -34,9 +34,30 @@ export interface Relationship { signals: RelationshipSignals; } +/** A named business entity derived from a table and its relationships. */ +export interface SemanticEntity { + /** PascalCase singular, e.g. "User". */ + name: string; + /** Source table, e.g. "users". */ + table: string; + /** Entity names this entity references (foreign-key side). */ + belongsTo: string[]; + /** Entity names that reference this one with a many-to-one relationship. */ + hasMany: string[]; + /** Entity names that reference this one with a one-to-one relationship. */ + hasOne: string[]; +} + +/** The semantic model: business entities rolled up from tables + relationships. */ +export interface SemanticModel { + generatedAt: number; + entities: SemanticEntity[]; +} + /** Full output of a folder inspection, serialized to .querypad/ artifacts. 
*/ export interface DiscoveryReport { generatedAt: number; profiles: TableProfile[]; relationships: Relationship[]; + semanticModel: SemanticModel; } diff --git a/test/explain.test.ts b/test/explain.test.ts new file mode 100644 index 0000000..317684c --- /dev/null +++ b/test/explain.test.ts @@ -0,0 +1,70 @@ +import assert from "node:assert/strict"; +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import path from "node:path"; +import test from "node:test"; +import { buildExplanation } from "../src/lib/discovery/explain"; +import { runExplain } from "../src/cli/explain"; +import type { Relationship, RelationshipSignals } from "../src/types/discovery"; + +function rel( + fromTable: string, + toTable: string, + confidence: number, + signals: Partial = {} +): Relationship { + return { + from: { table: fromTable, column: "user_id" }, + to: { table: toTable, column: "id" }, + confidence, + cardinality: "many-to-one", + signals: { + valueOverlap: 1, + nameSimilarity: 1, + typeMatch: 1, + cardinalityShape: 1, + ...signals, + }, + }; +} + +test("buildExplanation gives strong reasons and no caveats for a clean edge", () => { + const report = buildExplanation([rel("payments", "users", 100)], ["payments", "users"]); + assert.equal(report.relationships.length, 1); + const reasons = report.relationships[0].reasons; + assert.ok(reasons.some((r) => r.includes("100% of distinct payments.user_id"))); + assert.ok(reasons.includes("column name strongly matches the target")); + assert.ok(reasons.includes("exact type match")); + assert.ok(reasons.includes("many-to-one (target key is unique)")); + assert.deepEqual(report.caveats, []); +}); + +test("buildExplanation flags low-confidence edges", () => { + const report = buildExplanation([rel("payments", "users", 60)], ["payments", "users"]); + assert.ok(report.caveats.some((c) => c.includes("low-confidence (60%)"))); +}); + +test("buildExplanation flags high-overlap weak-name edges as coincidental", () => { 
+ const report = buildExplanation( + [rel("payments", "events", 80, { nameSimilarity: 0.3, valueOverlap: 1 })], + ["payments", "events"] + ); + assert.ok(report.caveats.some((c) => c.includes("possibly coincidental"))); +}); + +test("buildExplanation reports orphan tables", () => { + const report = buildExplanation([rel("payments", "users", 100)], [ + "payments", + "users", + "logs", + ]); + assert.ok(report.caveats.some((c) => c.includes("no inferred relationships: logs"))); +}); + +test("runExplain guides the user when artifacts are missing", async () => { + const dir = await mkdtemp(path.join(tmpdir(), "querypad-explain-")); + const lines: string[] = []; + const code = await runExplain(dir, { log: (line) => lines.push(line) }); + assert.equal(code, 1); + assert.ok(lines.join("\n").includes("querypad inspect")); +}); diff --git a/test/semantic.test.ts b/test/semantic.test.ts new file mode 100644 index 0000000..f4f01cd --- /dev/null +++ b/test/semantic.test.ts @@ -0,0 +1,95 @@ +import assert from "node:assert/strict"; +import test from "node:test"; +import { + buildSemanticModel, + entityName, + renderSemanticYaml, +} from "../src/lib/discovery/semantic-model"; +import { buildAskContext } from "../src/lib/agent/ask-context"; +import type { Relationship } from "../src/types/discovery"; + +function rel( + fromTable: string, + fromCol: string, + toTable: string, + toCol: string, + cardinality: Relationship["cardinality"] = "many-to-one" +): Relationship { + return { + from: { table: fromTable, column: fromCol }, + to: { table: toTable, column: toCol }, + confidence: 100, + cardinality, + signals: { valueOverlap: 1, nameSimilarity: 1, typeMatch: 1, cardinalityShape: 1 }, + }; +} + +test("entityName singularizes and PascalCases", () => { + assert.equal(entityName("users"), "User"); + assert.equal(entityName("payments"), "Payment"); + assert.equal(entityName("order_items"), "OrderItem"); + assert.equal(entityName("companies"), "Company"); +}); + 
+test("buildSemanticModel derives associations from relationships", () => { + const model = buildSemanticModel( + ["users", "payments", "events"], + [rel("payments", "user_id", "users", "id"), rel("events", "user_id", "users", "id")], + 1 + ); + const byName = Object.fromEntries(model.entities.map((e) => [e.name, e])); + + assert.deepEqual(Object.keys(byName).sort(), ["Event", "Payment", "User"]); + assert.deepEqual(byName.User.hasMany.sort(), ["Event", "Payment"]); + assert.deepEqual(byName.Payment.belongsTo, ["User"]); + assert.deepEqual(byName.Event.belongsTo, ["User"]); + assert.deepEqual(byName.User.belongsTo, []); +}); + +test("buildSemanticModel uses has_one for one-to-one", () => { + const model = buildSemanticModel( + ["users", "profiles"], + [rel("profiles", "user_id", "users", "id", "one-to-one")], + 1 + ); + const user = model.entities.find((e) => e.name === "User")!; + assert.deepEqual(user.hasOne, ["Profile"]); + assert.deepEqual(user.hasMany, []); +}); + +test("entity name collisions are de-duplicated", () => { + // "users" and "user" both singularize to "User". 
+ const model = buildSemanticModel(["users", "user"], [], 1); + const names = model.entities.map((e) => e.name); + assert.equal(new Set(names).size, names.length, `names should be unique: ${names}`); +}); + +test("renderSemanticYaml emits entities with associations", () => { + const model = buildSemanticModel( + ["users", "payments"], + [rel("payments", "user_id", "users", "id")], + 1700000000000 + ); + const yaml = renderSemanticYaml(model); + assert.match(yaml, /entities:/); + assert.match(yaml, /- name: User\n {4}table: users\n {4}has_many:\n {6}- Payment/); + assert.match(yaml, /- name: Payment\n {4}table: payments\n {4}belongs_to:\n {6}- User/); +}); + +test("buildAskContext includes the business entities block", () => { + const model = buildSemanticModel( + ["users", "payments"], + [rel("payments", "user_id", "users", "id")], + 1 + ); + const context = buildAskContext({ + tables: [ + { name: "users", columns: [{ name: "id", type: "BIGINT" }], rowCount: 5 }, + { name: "payments", columns: [{ name: "user_id", type: "BIGINT" }], rowCount: 8 }, + ], + relationships: [rel("payments", "user_id", "users", "id")], + semanticModel: model, + }); + assert.match(context, /Business entities:/); + assert.match(context, /User \(table users\) has_many Payment/); +}); From ccf7a1f384bbdcafee2b4aa4f7be0f1b31937586 Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 01:16:39 +0900 Subject: [PATCH 4/6] docs: document semantic model + explain Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 4 ++++ README.md | 35 ++++++++++++++++++++++++++++++++++- ROADMAP.md | 47 +++++++++++++++++++++++++++++++---------------- 3 files changed, 69 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f91fd0f..1343f49 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ milestones and public product updates. 
infers foreign-key relationships with confidence scores - New `querypad ask "" ` command (AI Analyst): generates SQL using the inferred relationships as context, runs it on DuckDB, and explains the result +- `inspect` now builds a semantic model (named business entities with belongs_to/has_many) + and writes `.querypad/semantic-model.yaml`; `ask` feeds those entities as context too +- New `querypad explain ` command: justifies each inferred relationship from its + signals (value overlap, name match, type, cardinality) and lists caveats to verify - Generated SQL is read-only-gated (only SELECT/WITH/EXPLAIN/… execute) and code-fence stripped - CLI AI keys come from `ANTHROPIC_API_KEY` / `OPENAI_API_KEY`; provider via `--provider` - Writes `.querypad/` artifacts (`schema.json`, `relationships.json`, `inspect-summary.md`) diff --git a/README.md b/README.md index 1a91d7d..4eab05d 100644 --- a/README.md +++ b/README.md @@ -61,9 +61,23 @@ read to reason about the dataset instead of guessing at pandas: .querypad/ schema.json # tables, columns, types, per-column profiles relationships.json # inferred joins with confidence + signals + semantic-model.yaml # named business entities (belongs_to / has_many) inspect-summary.md # human- and agent-readable overview ``` +`inspect` also rolls the relationships into a semantic model of named entities: + +```yaml +# .querypad/semantic-model.yaml +entities: + - name: User + table: users + has_many: [Payment, Event] + - name: Payment + table: payments + belongs_to: [User] +``` + ```text Claude Code + QueryPad + DuckDB ``` @@ -85,7 +99,7 @@ don't produce false positives. 
|-------|--------------|--------| | **1 — Dataset Discovery** | Scan folders; detect schema, types, statistics, uniqueness, cardinality | ✅ Built (`profile`) | | **2 — Relationship Discovery** | Infer joins automatically with confidence scores | ✅ Built (`inspect`) | -| **3 — Semantic Model** | Roll relationships into named business entities (`Customer ├ Payment ├ Event`) | 🚧 Roadmap | +| **3 — Semantic Model** | Roll relationships into named business entities (`User ├ Payment ├ Event`) | ✅ Built (`inspect`) | | **4 — AI Analyst** | Natural-language questions → SQL → execution → insight (`ask`) | ✅ Built (`ask`) | See [ROADMAP.md](ROADMAP.md) for the full plan. @@ -116,6 +130,25 @@ Insight: All payments come from paid-plan users. Generated SQL is read-only-gated (only `SELECT`/`WITH`/… execute) and the DB is in-memory, so source files are never modified. Use `--show-sql` to preview the SQL without running it. +## CLI: explain why + +```bash +querypad explain ./data +``` + +Justifies each inferred relationship from its stored signals, and lists caveats to verify: + +```text +payments.user_id ↳ users.id — 100% (many-to-one) + • 100% of distinct payments.user_id values are present in users.id + • column name strongly matches the target + • exact type match + • many-to-one (target key is unique) + +Caveats (0) + None. 
+``` + ## Web app: interactive analysis The browser app at [querypad.io](https://querypad.io) is the same OSS app running diff --git a/ROADMAP.md b/ROADMAP.md index 4d86403..df667bb 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -27,10 +27,11 @@ Layer 4 AI Analyst → question → semantic model → SQL → execut |-------|-------------|--------| | 1 — Dataset Discovery | Folder scan + per-column profiles (`profileTable`, `loadFolder`) | ✅ Built | | 2 — Relationship Discovery | Confidence-scored FK inference (`discoverRelationships`, `querypad inspect`) | ✅ Built | +| 3 — Semantic Model | Entity rollup → `.querypad/semantic-model.yaml` (`buildSemanticModel`) | ✅ Built | | 4 — AI Analyst | `querypad ask`: NL → SQL (relationship-aware) → execution → insight | ✅ Built | -| 3 — Semantic Model | Entity rollup → `.querypad/semantic-model.yaml` | 🚧 Next | -| UI — AI Verification | Lightweight web UI to accept/reject/edit inferred relationships | 🚧 Planned | -| `querypad explain` | Justify confidence from stored `RelationshipSignals` | 🚧 Planned | +| `querypad explain` | Justify each relationship from stored `RelationshipSignals` + caveats | ✅ Built | +| UI — AI Verification | Lightweight web UI to accept/reject/edit inferred relationships | 🚧 Next | +| MCP server | Expose `inspect`/`ask`/`explain` as typed agent tools | 🚧 Planned | ## Built today @@ -63,11 +64,11 @@ Insight: All payments come from paid-plan users. 
Architecture (engine-agnostic core, two DuckDB bindings): ```text -src/lib/discovery/ signals.ts · relationships.ts · sql-safety.ts (read-only gate) +src/lib/discovery/ signals.ts · relationships.ts · semantic-model.ts · explain.ts · sql-safety.ts src/lib/ai/ complete.ts (shared streaming) · generate-sql.ts · providers.ts src/lib/duckdb-node/ connection.ts · load.ts · profile.ts (native @duckdb/node-api) src/lib/duckdb/ sql-utils.ts (shared) · profile.ts (browser DuckDB-Wasm) -src/cli/ index.ts (dispatch) · inspect.ts · ask.ts · artifacts.ts +src/cli/ index.ts (dispatch) · inspect.ts · ask.ts · explain.ts · artifacts.ts ``` Relationship discovery: profile each table → find primary-key candidates (unique, @@ -82,26 +83,35 @@ Artifacts written to `.querypad/`: ```text schema.json tables, columns, types, per-column profiles relationships.json inferred joins with confidence + per-signal breakdown +semantic-model.yaml named business entities (belongs_to / has_many / has_one) inspect-summary.md human- and agent-readable overview ``` -## Layer 3 — Semantic Model (next) +## Layer 3 — Semantic Model (built) -Roll inferred relationships into named business entities, stored as the source of truth. +Rolls inferred relationships into named business entities, stored as the source of truth. ```yaml # .querypad/semantic-model.yaml entities: - - name: Customer + - name: User table: users - has_many: [Payment, Event] + has_many: + - Payment + - Event - name: Payment table: payments - belongs_to: Customer + belongs_to: + - User ``` -- Derive entity names from table names (singularize) and the relationship graph. -- Persist as `semantic-model.yaml`; let users override/curate it. +- Entity names are derived mechanically (`buildSemanticModel`): singularize → PascalCase + (`users` → `User`, `order_items` → `OrderItem`), with deterministic collision handling. + This keeps `inspect` key-free and deterministic. 
+- Associations come from the relationship graph: FK side `belongs_to`, PK side `has_many` + (or `has_one` for one-to-one). +- `ask` feeds the entities into its context so generated SQL is reasoned in domain terms. +- Future: AI/user-curated renames (e.g. `users` → `Customer`) over the mechanical defaults. - Surface conflicts (ambiguous joins, multiple FK candidates) for resolution. ## Layer 4 — AI Analyst (built) @@ -116,14 +126,19 @@ Question → inferred relationships as context → SQL generation → DuckDB exe - Reuses the AI layer (`src/lib/ai/complete.ts`, Claude + OpenAI). CLI keys come from `ANTHROPIC_API_KEY` / `OPENAI_API_KEY`; provider via `--provider`. -- Feeds the inferred relationships (`buildAskContext`) so generated SQL joins on the - right keys. Once Layer 3 lands, `ask` will prefer the semantic model as context. +- Feeds the inferred relationships and the semantic model's entities (`buildAskContext`) + so generated SQL joins on the right keys and is reasoned in domain terms. - Generated SQL is read-only-gated (`isReadOnlyQuery`) and code-fence stripped before execution; the in-memory DB is reloaded from files each run, so sources are never touched. - `--show-sql` previews the SQL without executing. -Still planned: `querypad explain` renders the stored per-signal breakdown to justify -each inferred relationship and surface potential conflicts. +## `querypad explain` (built) + +`querypad explain ` reads `.querypad/relationships.json` and renders the stored +per-signal breakdown (`buildExplanation`) as a justification for each inferred relationship: +value overlap, name match, type match, and cardinality. It also surfaces caveats — +low-confidence edges, high-overlap/weak-name matches that may be coincidental, and tables +with no inferred relationships. Pure consumer of artifacts (no DuckDB / AI); run `inspect` first. 
## UI — AI Verification (planned) From c6f5c08b50923ffc5125592843ea80389199439f Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 01:39:55 +0900 Subject: [PATCH 5/6] feat: web relationship verification UI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Sidebar Relationships panel runs discovery in the browser (DuckDB-Wasm) and lets the user Accept/Reject/Edit inferred joins; verdicts + edits persist to IndexedDB - reuses the engine-agnostic src/lib/discovery core (relationshipKey, buildExplanation) — no logic duplicated between CLI and web - add a browser QueryRunner adapter; run npm run test:cli in CI Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/ci.yml | 3 + e2e/app.spec.ts | 24 ++ src/components/sidebar/RelationshipsPanel.tsx | 308 ++++++++++++++++++ src/components/sidebar/Sidebar.tsx | 19 ++ src/lib/discovery/relationships.ts | 5 + src/lib/duckdb/browser-runner.ts | 14 + src/lib/persistence/indexeddb.ts | 6 + src/stores/workspace-store.ts | 128 +++++++- src/types/discovery.ts | 10 + test/discovery.test.ts | 21 +- 10 files changed, 536 insertions(+), 2 deletions(-) create mode 100644 src/components/sidebar/RelationshipsPanel.tsx create mode 100644 src/lib/duckdb/browser-runner.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 27325cb..a56383b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,9 @@ jobs: - name: Build run: npm run build + - name: Run CLI tests + run: npm run test:cli + - name: Run e2e tests timeout-minutes: 10 run: npm test -- --reporter=line diff --git a/e2e/app.spec.ts b/e2e/app.spec.ts index 27a4f56..d43a357 100644 --- a/e2e/app.spec.ts +++ b/e2e/app.spec.ts @@ -56,6 +56,30 @@ test.describe("QueryPad", () => { expect(copied).toContain("## Latest Result"); }); + test("discovers and verifies relationships", async ({ page }) => { + await page.goto("/"); + await expect(page.getByRole("button", { name: "employees", 
exact: true })).toBeVisible({ timeout: 15000 }); + + // Open the relationships verification panel + await page.getByRole("button", { name: "Relationships" }).click(); + await expect(page.getByRole("heading", { name: "Relationships" })).toBeVisible(); + + // Discovery completes (ready subtitle shows the inferred count) + await expect(page.getByText(/inferred/)).toBeVisible({ timeout: 15000 }); + + // Sample data has an employees -> departments relationship + const accept = page.getByRole("button", { name: "Accept" }).first(); + await expect(accept).toBeVisible({ timeout: 10000 }); + + // "Why?" reveals the signal-based justification (reused buildExplanation) + await page.getByRole("button", { name: "Why?" }).first().click(); + await expect(page.getByText(/values are present in/).first()).toBeVisible(); + + // Accept the relationship + await accept.click(); + await expect(page.getByRole("button", { name: "Accept" }).first()).toBeVisible(); + }); + test("can switch AI SQL assistant provider", async ({ page }) => { await page.goto("/"); await expect(page.getByRole("button", { name: "employees", exact: true })).toBeVisible({ timeout: 15000 }); diff --git a/src/components/sidebar/RelationshipsPanel.tsx b/src/components/sidebar/RelationshipsPanel.tsx new file mode 100644 index 0000000..a3a0a8a --- /dev/null +++ b/src/components/sidebar/RelationshipsPanel.tsx @@ -0,0 +1,308 @@ +"use client"; + +import { useEffect, useMemo, useState } from "react"; +import { useWorkspaceStore } from "@/stores/workspace-store"; +import { relationshipKey } from "@/lib/discovery/relationships"; +import { buildExplanation } from "@/lib/discovery/explain"; +import type { TableInfo } from "@/types"; +import type { Relationship, RelationshipVerdict } from "@/types/discovery"; + +function confidenceClasses(confidence: number): string { + if (confidence >= 85) return "text-green-700 bg-green-50"; + if (confidence >= 60) return "text-amber-700 bg-amber-50"; + return "text-gray-600 bg-gray-100"; +} + 
+interface RelationshipCardProps { + rel: Relationship; + tables: TableInfo[]; + tableNames: string[]; + verdict: RelationshipVerdict | undefined; + edited: boolean; + onVerdict: (verdict: RelationshipVerdict | null) => void; + onEdit: (next: Relationship) => void; +} + +function columnsOf(tables: TableInfo[], table: string): string[] { + return tables.find((t) => t.name === table)?.columns.map((c) => c.name) ?? []; +} + +function RelationshipCard({ + rel, + tables, + tableNames, + verdict, + edited, + onVerdict, + onEdit, +}: RelationshipCardProps) { + const [expanded, setExpanded] = useState(false); + const [editing, setEditing] = useState(false); + const [fromColumn, setFromColumn] = useState(rel.from.column); + const [toTable, setToTable] = useState(rel.to.table); + const [toColumn, setToColumn] = useState(rel.to.column); + + const reasons = useMemo( + () => buildExplanation([rel], tableNames).relationships[0]?.reasons ?? [], + [rel, tableNames] + ); + + const from = `${rel.from.table}.${rel.from.column}`; + const to = `${rel.to.table}.${rel.to.column}`; + + const startEdit = () => { + setFromColumn(rel.from.column); + setToTable(rel.to.table); + setToColumn(rel.to.column); + setEditing(true); + }; + + const saveEdit = () => { + onEdit({ + from: { table: rel.from.table, column: fromColumn }, + to: { table: toTable, column: toColumn }, + confidence: 100, + cardinality: rel.cardinality, + signals: rel.signals, + }); + setEditing(false); + }; + + const ring = + verdict === "accepted" + ? "border-green-300 bg-green-50/40" + : verdict === "rejected" + ? "border-gray-200 bg-gray-50 opacity-60" + : "border-gray-200"; + + return ( +
+
+
+

+ {from} ↳ {to} +

+
+ + {rel.confidence}% + + {rel.cardinality} + {edited && edited} +
+
+
+ + {editing ? ( +
+
+ {rel.from.table}. + + +
+
+ + . + +
+
+ + +
+
+ ) : ( + <> + {expanded && reasons.length > 0 && ( +
    + {reasons.map((reason, i) => ( +
  • • {reason}
  • + ))} +
+ )} +
+ + + + +
+ + )} +
+ ); +} + +interface RelationshipsPanelProps { + onClose: () => void; +} + +export default function RelationshipsPanel({ onClose }: RelationshipsPanelProps) { + const discovery = useWorkspaceStore((s) => s.discovery); + const tables = useWorkspaceStore((s) => s.tables); + const verdicts = useWorkspaceStore((s) => s.relationshipVerdicts); + const overrides = useWorkspaceStore((s) => s.relationshipOverrides); + const discoverRelationships = useWorkspaceStore((s) => s.discoverRelationships); + const setRelationshipVerdict = useWorkspaceStore((s) => s.setRelationshipVerdict); + const editRelationship = useWorkspaceStore((s) => s.editRelationship); + + useEffect(() => { + if (discovery.status === "idle") void discoverRelationships(); + }, [discovery.status, discoverRelationships]); + + const tableNames = useMemo(() => tables.map((t) => t.name), [tables]); + const overrideKeys = useMemo( + () => new Set(overrides.map((rel) => relationshipKey(rel))), + [overrides] + ); + + return ( + + ); +} diff --git a/src/components/sidebar/Sidebar.tsx b/src/components/sidebar/Sidebar.tsx index 8fd1f1a..2ba0f68 100644 --- a/src/components/sidebar/Sidebar.tsx +++ b/src/components/sidebar/Sidebar.tsx @@ -5,12 +5,14 @@ import { useWorkspaceStore } from "@/stores/workspace-store"; import { SAMPLE_TABLE_NAMES } from "@/lib/constants"; import TableSchema from "./TableSchema"; import ProfileDrawer from "./ProfileDrawer"; +import RelationshipsPanel from "./RelationshipsPanel"; import DropZone from "@/components/dropzone/DropZone"; export default function Sidebar() { const tables = useWorkspaceStore((s) => s.tables); const removeTable = useWorkspaceStore((s) => s.removeTable); const [profileTableName, setProfileTableName] = useState(null); + const [showRelationships, setShowRelationships] = useState(false); const onlySampleTables = tables.length > 0 && tables.every((t) => SAMPLE_TABLE_NAMES.has(t.name)); const visibleProfileTableName = @@ -43,6 +45,20 @@ export default function Sidebar() {
+
{tables.length === 0 ? (

@@ -67,6 +83,9 @@ export default function Sidebar() { onClose={() => setProfileTableName(null)} /> )} + {showRelationships && ( + setShowRelationships(false)} /> + )}

); } diff --git a/src/lib/discovery/relationships.ts b/src/lib/discovery/relationships.ts index b4259a8..ab0ca81 100644 --- a/src/lib/discovery/relationships.ts +++ b/src/lib/discovery/relationships.ts @@ -19,6 +19,11 @@ import { */ export type QueryRunner = (sql: string) => Promise[]>; +/** Stable directional identity for a relationship (foreign endpoint → key endpoint). */ +export function relationshipKey(rel: Relationship): string { + return `${rel.from.table}.${rel.from.column}->${rel.to.table}.${rel.to.column}`; +} + /** Minimum blended confidence for an edge to be reported. */ const CONFIDENCE_FLOOR = 50; diff --git a/src/lib/duckdb/browser-runner.ts b/src/lib/duckdb/browser-runner.ts new file mode 100644 index 0000000..ba08830 --- /dev/null +++ b/src/lib/duckdb/browser-runner.ts @@ -0,0 +1,14 @@ +import type { QueryRunner } from "@/lib/discovery/relationships"; +import { getConnection } from "./instance"; + +/** + * A browser `QueryRunner` backed by DuckDB-Wasm — lets the engine-agnostic discovery + * core (`discoverRelationships`) run in the browser, mirroring the Node CLI's runner. 
+ */ +export function createBrowserQueryRunner(): QueryRunner { + return async (sql: string) => { + const conn = await getConnection(); + const result = await conn.query(sql); + return result.toArray() as Record[]; + }; +} diff --git a/src/lib/persistence/indexeddb.ts b/src/lib/persistence/indexeddb.ts index ddc8914..47c82cf 100644 --- a/src/lib/persistence/indexeddb.ts +++ b/src/lib/persistence/indexeddb.ts @@ -3,6 +3,7 @@ import { get, set, del } from "idb-keyval"; const WORKSPACE_KEY = "querypad-workspace"; import type { Pipeline } from "@/types/pipeline"; +import type { Relationship, RelationshipVerdict } from "@/types/discovery"; interface PersistedTab { id: string; @@ -21,6 +22,9 @@ export interface PersistedWorkspace { activePipelineId?: string | null; viewMode?: "sql" | "pipeline"; pluginUrls?: string[]; + // Relationship verification (user-curated graph) + relationshipVerdicts?: Record; + relationshipOverrides?: Relationship[]; } export async function saveWorkspace( @@ -40,6 +44,8 @@ export async function saveWorkspace( activePipelineId: workspace.activePipelineId, viewMode: workspace.viewMode, pluginUrls: workspace.pluginUrls, + relationshipVerdicts: workspace.relationshipVerdicts, + relationshipOverrides: workspace.relationshipOverrides, }; await set(WORKSPACE_KEY, clone); } diff --git a/src/stores/workspace-store.ts b/src/stores/workspace-store.ts index e580495..eecd292 100644 --- a/src/stores/workspace-store.ts +++ b/src/stores/workspace-store.ts @@ -1,5 +1,10 @@ import { create } from "zustand"; import type { TableInfo, EditorTab, TableProfileState } from "@/types"; +import type { + Relationship, + RelationshipDiscoveryState, + RelationshipVerdict, +} from "@/types/discovery"; import type { Pipeline, PipelineStep, PipelineExecutionResult } from "@/types/pipeline"; import type { LoadedPlugin } from "@/types/plugin"; import { @@ -8,6 +13,11 @@ import { clearPersistedWorkspace, } from "@/lib/persistence/indexeddb"; import { getConnection } from 
"@/lib/duckdb/instance"; +import { + discoverRelationships as runDiscovery, + relationshipKey, +} from "@/lib/discovery/relationships"; +import { createBrowserQueryRunner } from "@/lib/duckdb/browser-runner"; interface FileEntry { name: string; @@ -15,6 +25,23 @@ interface FileEntry { data: Uint8Array; } +const IDLE_DISCOVERY: RelationshipDiscoveryState = { + status: "idle", + relationships: [], + error: null, +}; + +/** Merge user overrides onto a base relationship list (override wins, by key). */ +function mergeOverrides( + base: Relationship[], + overrides: Relationship[] +): Relationship[] { + const byKey = new Map(); + for (const rel of base) byKey.set(relationshipKey(rel), rel); + for (const rel of overrides) byKey.set(relationshipKey(rel), rel); + return [...byKey.values()]; +} + const MAX_TABS = 20; function createTab(index: number): EditorTab { @@ -49,6 +76,14 @@ interface WorkspaceState { loadTableProfile: (name: string) => Promise; clearTableProfile: (name: string) => void; + // Relationship discovery + verification + discovery: RelationshipDiscoveryState; + relationshipVerdicts: Record; + relationshipOverrides: Relationship[]; + discoverRelationships: () => Promise; + setRelationshipVerdict: (key: string, verdict: RelationshipVerdict | null) => void; + editRelationship: (oldKey: string, next: Relationship) => void; + // Clear clearWorkspace: () => Promise; @@ -144,6 +179,12 @@ export const useWorkspaceStore = create((set, get) => ({ if (persisted.viewMode) { patch.viewMode = persisted.viewMode; } + if (persisted.relationshipVerdicts) { + patch.relationshipVerdicts = persisted.relationshipVerdicts; + } + if (persisted.relationshipOverrides) { + patch.relationshipOverrides = persisted.relationshipOverrides; + } set(patch); @@ -183,6 +224,8 @@ export const useWorkspaceStore = create((set, get) => ({ ...state.tableProfiles, [table.name]: { status: "idle", profile: null, error: null }, }, + // Tables changed — the relationship graph must be re-derived. 
+ discovery: IDLE_DISCOVERY, })), removeTable: async (name) => { set((state) => ({ @@ -191,6 +234,10 @@ export const useWorkspaceStore = create((set, get) => ({ tableProfiles: Object.fromEntries( Object.entries(state.tableProfiles).filter(([tableName]) => tableName !== name) ), + discovery: IDLE_DISCOVERY, + relationshipOverrides: state.relationshipOverrides.filter( + (rel) => rel.from.table !== name && rel.to.table !== name + ), })); try { const conn = await getConnection(); @@ -217,6 +264,9 @@ export const useWorkspaceStore = create((set, get) => ({ tables: [], fileEntries: [], tableProfiles: {}, + discovery: IDLE_DISCOVERY, + relationshipVerdicts: {}, + relationshipOverrides: [], tabs: [tab], activeTabId: tab.id, shareUrl: null, @@ -269,6 +319,78 @@ export const useWorkspaceStore = create((set, get) => ({ ), })), + discovery: IDLE_DISCOVERY, + relationshipVerdicts: {}, + relationshipOverrides: [], + + discoverRelationships: async () => { + const tables = get().tables; + const names = new Set(tables.map((t) => t.name)); + const validOverrides = () => + get().relationshipOverrides.filter( + (rel) => names.has(rel.from.table) && names.has(rel.to.table) + ); + + if (tables.length < 2) { + set({ + discovery: { status: "ready", relationships: validOverrides(), error: null }, + }); + return; + } + + set((state) => ({ discovery: { ...state.discovery, status: "loading", error: null } })); + try { + const { profileTable } = await import("@/lib/duckdb/profile"); + const profiles = []; + for (const table of tables) profiles.push(await profileTable(table)); + const base = await runDiscovery(profiles, createBrowserQueryRunner()); + set({ + discovery: { + status: "ready", + relationships: mergeOverrides(base, validOverrides()), + error: null, + }, + }); + } catch (err) { + set({ + discovery: { + status: "error", + relationships: [], + error: err instanceof Error ? 
err.message : String(err), + }, + }); + } + }, + + setRelationshipVerdict: (key, verdict) => + set((state) => { + const next = { ...state.relationshipVerdicts }; + if (verdict === null) delete next[key]; + else next[key] = verdict; + return { relationshipVerdicts: next }; + }), + + editRelationship: (oldKey, next) => + set((state) => { + const nextKey = relationshipKey(next); + const verdicts = { ...state.relationshipVerdicts }; + delete verdicts[oldKey]; + verdicts[nextKey] = "accepted"; + const overrides = state.relationshipOverrides.filter( + (rel) => relationshipKey(rel) !== oldKey && relationshipKey(rel) !== nextKey + ); + overrides.push(next); + const relationships = mergeOverrides( + state.discovery.relationships.filter((rel) => relationshipKey(rel) !== oldKey), + [next] + ); + return { + relationshipVerdicts: verdicts, + relationshipOverrides: overrides, + discovery: { ...state.discovery, relationships }, + }; + }), + tabs: [initialTab], activeTabId: initialTab.id, @@ -415,7 +537,9 @@ useWorkspaceStore.subscribe((state, prevState) => { state.pipelines !== prevState.pipelines || state.activePipelineId !== prevState.activePipelineId || state.viewMode !== prevState.viewMode || - state.plugins !== prevState.plugins + state.plugins !== prevState.plugins || + state.relationshipVerdicts !== prevState.relationshipVerdicts || + state.relationshipOverrides !== prevState.relationshipOverrides ) { if (saveTimer) clearTimeout(saveTimer); saveTimer = setTimeout(() => { @@ -435,6 +559,8 @@ useWorkspaceStore.subscribe((state, prevState) => { pluginUrls: state.plugins .filter((p) => p.url) .map((p) => p.url), + relationshipVerdicts: state.relationshipVerdicts, + relationshipOverrides: state.relationshipOverrides, }).catch(console.error); }, 500); } diff --git a/src/types/discovery.ts b/src/types/discovery.ts index 577db6e..c20a066 100644 --- a/src/types/discovery.ts +++ b/src/types/discovery.ts @@ -54,6 +54,16 @@ export interface SemanticModel { entities: SemanticEntity[]; 
} +/** A user's verification verdict on an inferred relationship. */ +export type RelationshipVerdict = "accepted" | "rejected"; + +/** Browser-side discovery state for the relationship verification UI. */ +export interface RelationshipDiscoveryState { + status: "idle" | "loading" | "ready" | "error"; + relationships: Relationship[]; + error: string | null; +} + /** Full output of a folder inspection, serialized to .querypad/ artifacts. */ export interface DiscoveryReport { generatedAt: number; diff --git a/test/discovery.test.ts b/test/discovery.test.ts index 6ffade2..fe39471 100644 --- a/test/discovery.test.ts +++ b/test/discovery.test.ts @@ -10,13 +10,32 @@ import { splitTokens, typeMatchScore, } from "../src/lib/discovery/signals"; -import { discoverRelationships } from "../src/lib/discovery/relationships"; +import { discoverRelationships, relationshipKey } from "../src/lib/discovery/relationships"; import { createNodeDb } from "../src/lib/duckdb-node/connection"; import { loadFolder } from "../src/lib/duckdb-node/load"; import { profileTable } from "../src/lib/duckdb-node/profile"; // ---- Pure signal unit tests --------------------------------------------------- +test("relationshipKey is directional and stable", () => { + const forward = relationshipKey({ + from: { table: "payments", column: "user_id" }, + to: { table: "users", column: "id" }, + confidence: 100, + cardinality: "many-to-one", + signals: { valueOverlap: 1, nameSimilarity: 1, typeMatch: 1, cardinalityShape: 1 }, + }); + const reversed = relationshipKey({ + from: { table: "users", column: "id" }, + to: { table: "payments", column: "user_id" }, + confidence: 100, + cardinality: "many-to-one", + signals: { valueOverlap: 1, nameSimilarity: 1, typeMatch: 1, cardinalityShape: 1 }, + }); + assert.equal(forward, "payments.user_id->users.id"); + assert.notEqual(forward, reversed); +}); + test("splitTokens handles snake_case and camelCase", () => { assert.deepEqual(splitTokens("user_id"), ["user", "id"]); 
assert.deepEqual(splitTokens("customerId"), ["customer", "id"]); From 27584f5216414cee7e1453bdda9c9412d81483ec Mon Sep 17 00:00:00 2001 From: Kiyeon Jeon Date: Thu, 18 Jun 2026 01:39:55 +0900 Subject: [PATCH 6/6] docs: web verification UI + Cursor-for-Data repo metadata - document the Relationships panel in CHANGELOG/README/ROADMAP - package.json description + keywords reframed to the pivot (GitHub About already updated) Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 8 ++++++++ README.md | 1 + ROADMAP.md | 33 ++++++++++++++------------------- package.json | 19 +++++++++++++------ 4 files changed, 36 insertions(+), 25 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1343f49..94e806a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,14 @@ milestones and public product updates. ## Unreleased +### Web: Relationship Verification + +- New Relationships panel in the sidebar: runs the same discovery engine in the browser + (DuckDB-Wasm) and lists inferred joins with confidence and a per-signal "why" +- Accept / Reject / Edit each relationship to curate the AI's assumptions; verdicts and + edits persist across refresh (IndexedDB) +- Reuses the engine-agnostic `src/lib/discovery` core (no logic duplicated between CLI and web) + ### CLI: Dataset Understanding - New `querypad inspect ` command that profiles a folder of data files and diff --git a/README.md b/README.md index 4eab05d..9a6dce4 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ client-side. 
Your data stays on your machine unless you explicitly share or coll - **Drag & drop anything** — CSV, Parquet, JSON, Excel — drop multiple formats at once and JOIN them - **DuckDB-Wasm SQL** — Full analytical SQL in the browser (JOIN, GROUP BY, window functions, …) - **Data profiles** — Column-level nulls, distinct counts, ranges, averages, and top values +- **Relationship verification** — Discover inferred joins in-browser; Accept / Reject / Edit each with a per-signal "why" (verdicts persist) - **Agent context** — Copy schema, profiles, active SQL, and latest results for Claude Code or Codex - **AI SQL assistant** — Cmd+K for natural language to SQL with Claude or OpenAI BYOK - **Inline charts** — One-click Bar, Line, Scatter, Pie from query results diff --git a/ROADMAP.md b/ROADMAP.md index df667bb..7e14ad8 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -30,8 +30,8 @@ Layer 4 AI Analyst → question → semantic model → SQL → execut | 3 — Semantic Model | Entity rollup → `.querypad/semantic-model.yaml` (`buildSemanticModel`) | ✅ Built | | 4 — AI Analyst | `querypad ask`: NL → SQL (relationship-aware) → execution → insight | ✅ Built | | `querypad explain` | Justify each relationship from stored `RelationshipSignals` + caveats | ✅ Built | -| UI — AI Verification | Lightweight web UI to accept/reject/edit inferred relationships | 🚧 Next | -| MCP server | Expose `inspect`/`ask`/`explain` as typed agent tools | 🚧 Planned | +| UI — AI Verification | Sidebar Relationships panel: accept/reject/edit inferred joins | ✅ Built | +| MCP server | Expose `inspect`/`ask`/`explain` as typed agent tools | 🚧 Next | ## Built today @@ -140,29 +140,24 @@ value overlap, name match, type match, and cardinality. It also surfaces caveats low-confidence edges, high-overlap/weak-name matches that may be coincidental, and tables with no inferred relationships. Pure consumer of artifacts (no DuckDB / AI); run `inspect` first. 
-## UI — AI Verification (planned) +## UI — AI Verification (built) -After the CLI proves the understanding engine, build a **lightweight local web UI** -— not Tableau, not Metabase. Its purpose is **AI verification**, not dashboard building. - -```text -Left Center Right -───── ────── ────── -Tables Chat Generated SQL -Relationships Results -``` - -The defining interaction is validating AI assumptions: +The browser app has a **Relationships panel** in the sidebar — its purpose is +**AI verification**, not dashboard building. It runs the same discovery engine in the +browser (DuckDB-Wasm via `createBrowserQueryRunner`) and lets the user validate the +AI's assumptions: ```text Detected relationship - users.id ↳ payments.user_id Confidence 97% - [Accept] [Reject] [Edit] + payments.user_id ↳ users.id Confidence 100% + [Accept] [Reject] [Edit] (Why? → per-signal justification) ``` -This reuses the shared `src/lib/discovery` core, surfacing the same edges the CLI -emits. The existing browser app (Monaco, charts, pipelines, sharing) remains the -interactive-analysis surface; the verification view is additive. +`RelationshipsPanel.tsx` reuses the shared `src/lib/discovery` core (`discoverRelationships`, +`buildExplanation`) — the same edges the CLI emits — so no logic is duplicated. Verdicts and +edits are keyed by `relationshipKey` and persisted to IndexedDB. The existing browser app +(Monaco, charts, pipelines, sharing) remains the interactive-analysis surface; the +verification view is additive. 
## Claude Code integration diff --git a/package.json b/package.json index 72a103e..22e3acc 100644 --- a/package.json +++ b/package.json @@ -2,7 +2,7 @@ "name": "querypad", "version": "0.6.0", "private": true, - "description": "Local-first SQL scratchpad powered by DuckDB-Wasm", + "description": "Cursor for Data — a local-first AI workspace that understands datasets (discovers relationships, builds semantic models) and analyzes them with DuckDB.", "author": "QueryPad Contributors", "license": "MIT", "repository": { @@ -14,14 +14,21 @@ "querypad": "src/cli/index.ts" }, "keywords": [ - "sql", + "cursor-for-data", "duckdb", + "sql", + "ai", + "llm", + "data-understanding", + "relationship-discovery", + "semantic-model", + "schema-inference", + "data-analysis", "csv", "parquet", - "json", - "analytics", - "data-analysis", - "browser" + "cli", + "claude", + "local-first" ], "scripts": { "dev": "next dev",