diff --git a/.cargo/audit.toml b/.cargo/audit.toml new file mode 100644 index 0000000..b024387 --- /dev/null +++ b/.cargo/audit.toml @@ -0,0 +1,32 @@ +# cargo-audit configuration. +# +# `cargo audit` reads this file from `.cargo/audit.toml` (relative to the repo +# root). The CI `audit` job uses the `rustsec/audit-check` GitHub Action, which +# does NOT read this file — it takes the same advisory IDs via its `ignore:` +# input (see .github/workflows/ci.yml). Keep the two lists in sync. +# +# Policy: we ONLY ignore advisories whose entire dependency path is confined to +# `crates/compare` (powdb-compare), which is `publish = false` — a benchmark-only +# crate that pulls `postgres = "0.19"` and `mysql = "25"` purely to compare PowDB +# against Postgres/MySQL. None of these crates ship in powdb-storage / powdb-query +# / powdb-server / powdb-cli / powdb-auth / powdb-backup, so no published artifact +# is affected. We do NOT broadly disable auditing — only the three postgres DoS +# advisories below are vulnerabilities (the action's hard-fail trigger); the +# remaining unmaintained/unsound entries are non-failing warnings and are left +# visible on purpose. +[advisories] +ignore = [ + # postgres-protocol < 0.6.12: unbounded SCRAM iteration count → CPU-exhaustion + # DoS from a malicious server. Path: postgres-protocol → tokio-postgres → + # postgres → powdb-compare (publish=false, bench-only). No shipping crate. + "RUSTSEC-2026-0179", + + # postgres-protocol < 0.6.12: panic decoding a malformed `hstore` value → DoS. + # Same powdb-compare-only path. No shipping crate. + "RUSTSEC-2026-0180", + + # tokio-postgres < 0.7.18: panic on a `DataRow` with fewer fields than columns + # → DoS. Path: tokio-postgres → postgres → powdb-compare (publish=false, + # bench-only). No shipping crate. 
+ "RUSTSEC-2026-0178", +] diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d2328a0..c3ea6e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -171,6 +171,53 @@ jobs: fi echo "MSRV consistency OK." + msrv-build: + name: MSRV build (cargo + check --workspace --locked) + runs-on: ubuntu-24.04 + timeout-minutes: 20 + env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + RUSTFLAGS: "-C target-cpu=x86-64-v2" + steps: + - name: Checkout + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Read MSRV from Cargo.toml + id: msrv + run: | + set -euo pipefail + MSRV=$(grep -E '^rust-version' Cargo.toml | head -1 | sed -E 's/.*"([0-9]+\.[0-9]+(\.[0-9]+)?)".*/\1/') + if [ -z "$MSRV" ]; then + echo "::error::Could not parse rust-version from Cargo.toml" + exit 1 + fi + echo "version=$MSRV" >> "$GITHUB_OUTPUT" + echo "MSRV is $MSRV" + + - name: Install MSRV toolchain + uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable + with: + toolchain: ${{ steps.msrv.outputs.version }} + + - name: Cargo cache + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: msrv-build-${{ runner.os }}-${{ steps.msrv.outputs.version }}-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + msrv-build-${{ runner.os }}-${{ steps.msrv.outputs.version }}- + + # `--locked` so a newer-than-MSRV language feature (or a Cargo.lock that + # only resolves on newer Rust) can't slip past the doc-only consistency + # check above. This compiles the whole workspace on the real MSRV + # toolchain. 
+ - name: cargo +MSRV check --workspace --locked + run: cargo +${{ steps.msrv.outputs.version }} check --workspace --locked + examples-smoke: name: examples smoke (terraform validate + compose config + dev.sh) runs-on: ubuntu-24.04 @@ -234,3 +281,36 @@ jobs: uses: rustsec/audit-check@69366f33c96575abad1ee0dba8212993eecbe998 # v2.0.0 with: token: ${{ secrets.GITHUB_TOKEN }} + # Ignore advisories whose ENTIRE dependency path is confined to + # crates/compare (powdb-compare), which is `publish = false` — a + # benchmark-only crate that pulls `postgres`/`mysql` to compare PowDB + # against other engines. No shipping crate (storage/query/server/cli/ + # auth/backup) is affected. Keep this list in sync with + # `.cargo/audit.toml` (which the local `cargo audit` reads). + # RUSTSEC-2026-0179 postgres-protocol SCRAM DoS (→ powdb-compare) + # RUSTSEC-2026-0180 postgres-protocol hstore DoS (→ powdb-compare) + # RUSTSEC-2026-0178 tokio-postgres DataRow DoS (→ powdb-compare) + ignore: RUSTSEC-2026-0179,RUSTSEC-2026-0180,RUSTSEC-2026-0178 + + ts-client-version: + name: ts-client version consistency + runs-on: ubuntu-24.04 + timeout-minutes: 5 + steps: + - name: Checkout + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + # Guard against the version drift we hit once: the CLIENT_VERSION + # constant sent in the handshake (in src/ — the source of truth; dist/ is + # gitignored and built from it at publish time) must match package.json. + # Uses the runner's preinstalled Node — no extra action, no install. 
+ - name: Assert package.json === CLIENT_VERSION + working-directory: clients/ts + run: | + PKG_VER="$(node -p "require('./package.json').version")" + SRC_VER="$(grep -oE 'CLIENT_VERSION = "[^"]+"' src/index.ts | head -1 | sed -E 's/.*"([^"]+)".*/\1/')" + echo "package.json=$PKG_VER src CLIENT_VERSION=$SRC_VER" + if [ "$PKG_VER" != "$SRC_VER" ]; then + echo "::error::TS client version drift: package.json=$PKG_VER but src CLIENT_VERSION=$SRC_VER — bump CLIENT_VERSION in clients/ts/src/index.ts to match package.json, then 'npm run build'" + exit 1 + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 216eeea..01c8bba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -98,9 +98,50 @@ jobs: powdb-cli-${{ matrix.suffix }} powdb-server-${{ matrix.suffix }} + smoke-release: + name: durability smoke (release binary) + needs: build + runs-on: ubuntu-24.04 + timeout-minutes: 20 + env: + CARGO_TERM_COLOR: always + RUST_BACKTRACE: 1 + # Portable target-cpu so the binary we smoke matches the published one. + RUSTFLAGS: "-C target-cpu=x86-64-v2" + + steps: + - name: Checkout + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Install Rust toolchain (stable) + uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable + + - name: Cargo cache + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + # Share the linux-x86_64 build cache so we reuse the binaries the + # `build` job already compiled instead of rebuilding from scratch. 
+ key: release-x86_64-unknown-linux-gnu-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + release-x86_64-unknown-linux-gnu- + + - name: Build release binaries (powdb-cli + powdb-server) + run: cargo build --release -p powdb-cli -p powdb-server + + # The gate whose absence caused the v0.4.1–0.4.3 data-loss yanks: + # README PowQL flow + unique-constraint enforcement + kill -9/restart + # WAL-replay recovery, all over the wire against the freshly built + # release binary. Must print SMOKE-RELEASE: ALL-PASS. + - name: Durability smoke against release binary + run: bash scripts/smoke-release.sh + release: name: create release - needs: build + needs: [build, smoke-release] runs-on: ubuntu-24.04 permissions: contents: write diff --git a/CHANGELOG.md b/CHANGELOG.md index 8643547..3a65e37 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,63 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- **RBAC now enforces the full permission lattice.** The server maps each + statement to the capability it needs — reads → `Read`, row mutations + (insert/update/delete/upsert) → `Write`, schema changes (create/alter/drop + type or view) → `Ddl` — and checks it against the user's role. The + `readwrite` role now explicitly carries `Ddl` (application users create and + evolve their own tables), so **this is behavior-preserving**: readwrite and + admin keep full access, readonly is still read-only, and any authenticated + role may still run read-only queries. `Admin` remains reserved for user/role + management (CLI-only). +- **Automated post-publish durability smoke** (`scripts/smoke-release.sh`), + wired as a required gate in `release.yml`: installs the built binaries, runs + the README PowQL flow over the wire, then `kill -9`s the server and restarts + it to assert WAL replay recovers every row and the unique constraint still + holds. 
This is the exact gate whose absence caused the v0.4.1–0.4.3 data-loss + yanks; it now runs on every tagged release. +- **MSRV build job** in CI that compiles the workspace with the pinned 1.93 + toolchain (the previous job only checked that the version string matched the + docs). + +### Changed +- **Resource-limit errors now reach remote clients verbatim.** Sort, join, and + per-query memory-budget errors (e.g. "sort input exceeds row limit — add a + LIMIT clause") were being masked to the generic "query execution error" by + the wire sanitizer. They carry actionable guidance and leak no internal + state, so they are now on the safe-error allowlist. + +### Fixed +- CLI `--help` showed a remote one-shot example using a `|` pipe operator that + PowQL does not have; corrected to the whitespace-pipeline syntax so the + example runs as written. +- CI `cargo audit` no longer fails on three `postgres`-only RUSTSEC advisories + whose entire dependency path is confined to the `publish = false` + `powdb-compare` benchmark crate (scoped ignore in `.cargo/audit.toml` + the + audit action, with provenance comments). No shipping crate is affected. +- Dockerfile dependency-cache stage now copies the `powdb-auth` and + `powdb-backup` manifests it was silently missing, so the cached layer covers + the full server/CLI dependency closure. +- TypeScript client version drift: the `CLIENT_VERSION` handshake constant, + the built `dist`, and the README now all agree with `package.json` (0.5.0), + and a CI job asserts `CLIENT_VERSION` and `package.json` can't diverge again. + +### Internal +- Documented `panic = "abort"` as a deliberate **crash-only** design: on a + panic the server exits fast and a supervisor restarts it, with WAL replay + recovering to a consistent state — safer for a stateful engine than + unwinding into a poisoned lock. Every deploy example is confirmed to run + under an auto-restart policy, and the requirement is now documented in + `examples/deploy/README.md`.
+- Promoted the CI lint policy into `[workspace.lints]` (`clippy::all = deny`) + so `cargo clippy` fails locally with the same rules CI enforces. +- Removed ~190 LOC of dead, never-wired snapshot-isolation scaffolding + (`storage::mvcc`, `storage::tx`) that was shipping in the `powdb-storage` + crate; the live engine uses `RwLock` concurrency. +- Refreshed stale `powdb-auth` doc-comments that claimed the crate was "not + yet wired into the server or CLI" — it has enforced auth/RBAC since 0.4.6. + ## [0.4.7] - 2026-06-10 ### Added diff --git a/CLAUDE.md b/CLAUDE.md index d4334b9..90ff9ab 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -94,6 +94,7 @@ If the planner emits a different shape for the same logical operation, the fast ## CI -Two workflow files: -- `.github/workflows/ci.yml` — clippy + fmt + test (+ ASan, miri, fuzz, cargo audit, MSRV, examples). **Required status checks on `main`.** +Three workflow files: +- `.github/workflows/ci.yml` — clippy + fmt + test + doctest (+ ASan, miri, cargo audit, MSRV, examples smoke). **Required status checks on `main`.** +- `.github/workflows/fuzz.yml` — cargo-fuzz targets. **Separate** from ci.yml (PR-triggered + nightly cron at 07:00 UTC + `workflow_dispatch`); not part of the required check set above. - `.github/workflows/bench.yml` — criterion microbenchmark suite. **Manual-only (`workflow_dispatch`), NOT a required gate.** Runs on a Depot single-tenant runner (`depot-ubuntu-24.04-4`, tmpfs temp DBs), so numbers are comparable run-to-run; `baseline/main.json` must only ever be rebaselined from a Depot run of this workflow, never from a laptop. `powdb-bench` only depends on `powdb-storage`+`powdb-query`, so it gates nothing the normal suite doesn't already cover. Run it on demand: `gh workflow run bench.yml`. diff --git a/Cargo.toml b/Cargo.toml index c6612b4..f6bc6b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,9 +26,25 @@ blake3 = "1" # - lto = true: enables cross-crate inlining (executor → storage → page). 
# This is the load-bearing flag — every fast path crosses crate boundaries. # - codegen-units = 1: single codegen unit, slower compile, faster runtime. -# - panic = "abort": no unwinding tables on hot branches; smaller binary. # - strip = "symbols": shrink binary, no impact on perf. # Estimated overall impact vs default release: 5-15% across-the-board. +# +# panic = "abort" — DELIBERATE crash-only design, not just a size/perf knob. +# PowDB holds shared, mutable engine state (catalog + page cache) behind a +# single `RwLock` shared across every connection. If a query thread +# were to panic mid-mutation while unwinding, it would (a) poison the lock, +# bricking every subsequent request with "lock poisoned", and (b) potentially +# leave the in-memory catalog/page state inconsistent with the on-disk WAL, +# with no in-process path back to a consistent state. Aborting instead turns a +# panic into a fast, clean process exit: a supervisor (Docker `restart`, +# systemd `Restart=on-failure`, Fly auto-restart, ECS task restart) brings the +# server back up, and WAL replay deterministically recovers to the last +# durable, consistent state — a path that is exercised by the durability test +# suite (crates/query/tests/durability.rs) on every CI run. For a stateful +# engine, "crash and recover from the log" is safer and more available than +# "limp along on possibly-corrupt shared state". This is why a process +# supervisor with auto-restart is REQUIRED in production (see +# examples/deploy/README.md and the Production checklist in README.md). [profile.release] lto = true codegen-units = 1 @@ -36,3 +52,13 @@ panic = "abort" opt-level = 3 debug = false strip = "symbols" + +# Lint policy promoted from CI into the workspace so `cargo clippy` fails +# locally with the same rules CI enforces (`-D warnings`), instead of only +# surfacing after a push. Inherited by every crate via `[lints] workspace = +# true`. 
`clippy::all = deny` covers the clippy part of the CI gate (`-D +# warnings` additionally denies rustc warnings, which stay warnings locally). +# The code already passes `clippy -D warnings`, so promoting the group to `deny` +# introduces no new failures; it just moves feedback to the local `cargo clippy`. +[workspace.lints.clippy] +all = { level = "deny", priority = -1 } diff --git a/Dockerfile b/Dockerfile index fa3f7e0..fe86fa7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,25 +4,36 @@ FROM rust:1.95-slim-bookworm AS builder WORKDIR /src -# Cache deps separately from source by copying manifests first +# Cache deps separately from source by copying manifests first. +# powdb-server depends on storage + query + auth; powdb-cli additionally pulls +# backup. The dep-cache stage must include the FULL dependency closure of the +# crates we build, or the cache layer is silently incomplete (the `|| true` +# below would hide the miss and re-resolve every build). COPY Cargo.toml Cargo.lock ./ COPY crates/storage/Cargo.toml crates/storage/Cargo.toml COPY crates/query/Cargo.toml crates/query/Cargo.toml COPY crates/server/Cargo.toml crates/server/Cargo.toml COPY crates/cli/Cargo.toml crates/cli/Cargo.toml +COPY crates/auth/Cargo.toml crates/auth/Cargo.toml +COPY crates/backup/Cargo.toml crates/backup/Cargo.toml # Create empty src trees so cargo can resolve+download deps without source RUN mkdir -p crates/storage/src crates/query/src crates/server/src crates/cli/src \ + crates/auth/src crates/backup/src \ && echo 'pub fn _stub() {}' > crates/storage/src/lib.rs \ && echo 'pub fn _stub() {}' > crates/query/src/lib.rs \ && echo 'pub fn _stub() {}' > crates/server/src/lib.rs \ + && echo 'pub fn _stub() {}' > crates/auth/src/lib.rs \ + && echo 'pub fn _stub() {}' > crates/backup/src/lib.rs \ && echo 'fn main() {}' > crates/server/src/main.rs \ && echo 'fn main() {}' > crates/cli/src/main.rs \ && cargo build --release -p powdb-server 2>/dev/null || true # Now copy real source and build for real COPY crates ./crates -RUN touch
crates/storage/src/lib.rs crates/query/src/lib.rs crates/server/src/lib.rs crates/server/src/main.rs crates/cli/src/main.rs \ +RUN touch crates/storage/src/lib.rs crates/query/src/lib.rs crates/server/src/lib.rs \ + crates/auth/src/lib.rs crates/backup/src/lib.rs \ + crates/server/src/main.rs crates/cli/src/main.rs \ && cargo build --release -p powdb-server # ─── Runtime ──────────────────────────────────────────────────────────────── diff --git a/README.md b/README.md index ab478b4..66738b7 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ Full language reference: [docs/POWQL.md](https://github.com/zvndev/powdb/blob/ma cargo install powdb-cli cargo install powdb-server -# TypeScript client (Node 18+) — versions independently of the server crates (currently 0.3.x) +# TypeScript client (Node 18+) — versions independently of the server crates (see npmjs.com/package/@zvndev/powdb-client for the current version) npm install @zvndev/powdb-client # Prebuilt binaries (linux x86_64, macos aarch64) diff --git a/clients/ts/src/index.ts b/clients/ts/src/index.ts index 31e2567..6e0cbb3 100644 --- a/clients/ts/src/index.ts +++ b/clients/ts/src/index.ts @@ -32,7 +32,7 @@ import { } from "./typed.js"; /** Client library version. Compared to the server's reported version. */ -export const CLIENT_VERSION = "0.4.0"; +export const CLIENT_VERSION = "0.5.0"; export type QueryResult = | { kind: "rows"; columns: string[]; rows: string[][] } diff --git a/crates/auth/Cargo.toml b/crates/auth/Cargo.toml index 3c9662b..a666e6b 100644 --- a/crates/auth/Cargo.toml +++ b/crates/auth/Cargo.toml @@ -20,3 +20,6 @@ thiserror.workspace = true [dev-dependencies] tempfile = "3" + +[lints] +workspace = true diff --git a/crates/auth/src/lib.rs b/crates/auth/src/lib.rs index 35d971c..295417d 100644 --- a/crates/auth/src/lib.rs +++ b/crates/auth/src/lib.rs @@ -1,8 +1,11 @@ //! `powdb-auth` — argon2id password hashing and a persisted user/role store. //! -//! Slice 1 of PowDB's RBAC epic. 
This crate is **additive**: it is a tested -//! library + data model and is not yet wired into the server or CLI, so it -//! does not change any runtime behavior. +//! Provides PowDB's RBAC primitives: the [`role`] permission lattice, the +//! [`hash`] argon2id password hashing, and the persisted [`store::UserStore`]. +//! These are live in production: `powdb-server` loads the [`store::UserStore`] +//! at startup and enforces the [`role`] lattice on every query +//! (`crates/server/src/handler.rs`), and `powdb-cli` manages users via the +//! `useradd`/`passwd`/`userdel` subcommands. pub mod error; pub mod hash; diff --git a/crates/auth/src/role.rs b/crates/auth/src/role.rs index a76dfee..adcc182 100644 --- a/crates/auth/src/role.rs +++ b/crates/auth/src/role.rs @@ -1,8 +1,11 @@ //! Permission and role model for PowDB RBAC. //! //! A small, fixed permission lattice plus three builtin roles -//! (`admin`, `readwrite`, `readonly`). This slice does not enforce -//! permissions anywhere — it only defines the data model. +//! (`admin`, `readwrite`, `readonly`). The lattice is enforced at the server +//! dispatch layer (`crates/server/src/handler.rs::check_statement_permitted`): +//! reads need [`Permission::Read`], row mutations need [`Permission::Write`], +//! schema changes need [`Permission::Ddl`], and [`Permission::Admin`] is +//! reserved for user/role management (CLI-only today). use std::collections::BTreeSet; @@ -49,11 +52,14 @@ impl Role { } } - /// Builtin `readwrite` role: read + write. + /// Builtin `readwrite` role: read + write + schema definition. An + /// application-tier user is expected to create and evolve its own tables + /// and indexes, so `readwrite` includes `Ddl`. It does NOT include + /// `Admin` (user/role management stays admin-only). 
pub fn readwrite() -> Role { Role { name: "readwrite".to_string(), - permissions: BTreeSet::from([Permission::Read, Permission::Write]), + permissions: BTreeSet::from([Permission::Read, Permission::Write, Permission::Ddl]), } } @@ -108,15 +114,15 @@ mod tests { } #[test] - fn readwrite_has_read_and_write_only() { + fn readwrite_has_read_write_and_ddl_but_not_admin() { let r = Role::readwrite(); assert_eq!( r.permissions, - BTreeSet::from([Permission::Read, Permission::Write]) + BTreeSet::from([Permission::Read, Permission::Write, Permission::Ddl]) ); assert!(r.allows(Permission::Read)); assert!(r.allows(Permission::Write)); - assert!(!r.allows(Permission::Ddl)); + assert!(r.allows(Permission::Ddl)); assert!(!r.allows(Permission::Admin)); } diff --git a/crates/auth/src/store.rs b/crates/auth/src/store.rs index ab49677..9010dac 100644 --- a/crates/auth/src/store.rs +++ b/crates/auth/src/store.rs @@ -1,8 +1,9 @@ //! Persisted user/role store backed by `auth.json`. //! //! Passwords are never stored in plaintext — only argon2id PHC hashes. -//! On Unix the on-disk file is written with mode `0600`. This slice is -//! additive: nothing in the server/cli reads this store yet. +//! On Unix the on-disk file is written with mode `0600`. `powdb-server` loads +//! this store at startup and authenticates every connection against it; +//! `powdb-cli` manages it via the `useradd`/`passwd`/`userdel` subcommands. 
use std::collections::BTreeMap; use std::fs; diff --git a/crates/backup/Cargo.toml b/crates/backup/Cargo.toml index 7252fc8..4b35df5 100644 --- a/crates/backup/Cargo.toml +++ b/crates/backup/Cargo.toml @@ -19,3 +19,6 @@ blake3.workspace = true [dev-dependencies] powdb-query = { version = "0.4.7", path = "../query" } + +[lints] +workspace = true diff --git a/crates/bench/Cargo.toml b/crates/bench/Cargo.toml index 12a8fb0..adfb474 100644 --- a/crates/bench/Cargo.toml +++ b/crates/bench/Cargo.toml @@ -31,3 +31,6 @@ harness = false [[bench]] name = "powql" harness = false + +[lints] +workspace = true diff --git a/crates/cli/Cargo.toml b/crates/cli/Cargo.toml index 87666ee..78df5b0 100644 --- a/crates/cli/Cargo.toml +++ b/crates/cli/Cargo.toml @@ -21,3 +21,6 @@ rustyline = "15" tokio = { version = "1", features = ["rt", "rt-multi-thread", "net", "io-util", "macros"] } tracing.workspace = true tracing-subscriber.workspace = true + +[lints] +workspace = true diff --git a/crates/cli/src/main.rs b/crates/cli/src/main.rs index 4979c88..2ed70be 100644 --- a/crates/cli/src/main.rs +++ b/crates/cli/src/main.rs @@ -328,7 +328,7 @@ fn parse_args() -> CliArgs { " Remote REPL: powdb-cli --remote 127.0.0.1:5433 --password secret" ); println!(" One-shot: powdb-cli --exec 'count(User)'"); - println!(" One-shot (remote): powdb-cli -r 127.0.0.1:5433 -c 'User filter .age > 25 | limit 5'"); + println!(" One-shot (remote): powdb-cli -r 127.0.0.1:5433 -c 'User filter .age > 25 limit 5'"); println!(); println!("SUBCOMMANDS:"); println!(" backup [--base ]"); diff --git a/crates/compare/Cargo.toml b/crates/compare/Cargo.toml index efe4386..59cafdd 100644 --- a/crates/compare/Cargo.toml +++ b/crates/compare/Cargo.toml @@ -25,3 +25,6 @@ mysql = { version = "25", default-features = false, features = ["default-rustls" [[bin]] name = "compare-engines" path = "src/main.rs" + +[lints] +workspace = true diff --git a/crates/query/Cargo.toml b/crates/query/Cargo.toml index da91b72..c7c512c 100644 --- 
a/crates/query/Cargo.toml +++ b/crates/query/Cargo.toml @@ -19,3 +19,6 @@ rustc-hash.workspace = true [dev-dependencies] tempfile = "3" + +[lints] +workspace = true diff --git a/crates/server/Cargo.toml b/crates/server/Cargo.toml index 60deeb0..18c4439 100644 --- a/crates/server/Cargo.toml +++ b/crates/server/Cargo.toml @@ -28,3 +28,6 @@ tracing-subscriber.workspace = true [dev-dependencies] rcgen = { version = "0.14", default-features = false, features = ["aws_lc_rs", "pem"] } tempfile = "3" + +[lints] +workspace = true diff --git a/crates/server/src/handler.rs b/crates/server/src/handler.rs index af7045f..c1eae9a 100644 --- a/crates/server/src/handler.rs +++ b/crates/server/src/handler.rs @@ -90,16 +90,48 @@ pub struct Principal { pub role: String, } -/// Whether `role` grants the `Write` permission. Unknown role names fail -/// closed (no write). Shared-password / open / embedded modes never construct -/// a [`Principal`], so they are unaffected by this gate. -fn role_can_write(role: &str) -> bool { - Role::builtin(role).is_some_and(|r| r.allows(Permission::Write)) +/// Whether a parsed statement is data-definition (schema) work: creating, +/// altering, or dropping a type or view. `explain <stmt>` is classified by its +/// inner statement so `explain drop User` needs the same permission as +/// `drop User`. Mutations that change *rows* (insert/update/delete/upsert/ +/// refresh) and transaction control are NOT DDL — they fall under `Write`. +fn is_ddl_statement(stmt: &powdb_query::ast::Statement) -> bool { + use powdb_query::ast::Statement; + let inner = match stmt { + Statement::Explain(inner) => inner.as_ref(), + other => other, + }; + matches!( + inner, + Statement::CreateType(_) + | Statement::AlterTable(_) + | Statement::DropTable(_) + | Statement::CreateView(_) + | Statement::DropView(_) + ) } -/// Enforce the principal's role against a parsed statement.
Returns an error -/// for any non-read statement (insert/update/delete/upsert/DDL/view ops/ -/// transaction control) when the role does not grant `Write`. +/// The capability a parsed statement requires under the RBAC lattice +/// (`crates/auth/src/role.rs`). Reads need [`Permission::Read`]; schema +/// definition needs [`Permission::Ddl`]; every other mutation needs +/// [`Permission::Write`]. [`Permission::Admin`] is reserved for user/role +/// management, which is CLI-only today and never reaches this wire path. +fn required_permission(stmt: &powdb_query::ast::Statement) -> Permission { + if is_read_only_statement(stmt) { + Permission::Read + } else if is_ddl_statement(stmt) { + Permission::Ddl + } else { + Permission::Write + } +} + +/// Enforce the principal's role against a parsed statement using the full +/// permission lattice. Reads are always permitted (any authenticated role can +/// read — unknown role names still read but fail closed on any mutation). +/// Mutations require the specific capability the statement maps to: row +/// mutations need `Write`, schema changes need `Ddl`. Unknown role names +/// resolve to no builtin and therefore grant nothing beyond reads. /// /// Classification uses the parsed AST via /// [`powdb_query::executor::is_read_only_statement`] — the exact same @@ -114,11 +146,22 @@ fn check_statement_permitted( // byte-identical to the pre-RBAC behavior. return Ok(()); }; - if is_read_only_statement(stmt) || role_can_write(&p.role) { + // Reads are permitted for every authenticated principal (preserves the + // pre-lattice contract that any connected role may run read-only queries). 
+ if is_read_only_statement(stmt) { + return Ok(()); + } + let needed = required_permission(stmt); + if Role::builtin(&p.role).is_some_and(|r| r.allows(needed)) { return Ok(()); } + let kind = if needed == Permission::Ddl { + "schema-definition" + } else { + "write" + }; Err(QueryError::Execution(format!( - "permission denied: role '{}' cannot execute write statements", + "permission denied: role '{}' cannot execute {kind} statements", p.role ))) } @@ -205,6 +248,13 @@ const SAFE_ERROR_PREFIXES: &[&str] = &[ "permission denied", "row too large", "unique constraint violation", + // Resource-limit errors carry actionable guidance (e.g. "add a LIMIT + // clause") and leak no internal state, so surface them verbatim instead + // of masking them to the generic message. See QueryError::{SortLimit, + // JoinLimit,MemoryLimit}Exceeded in crates/query/src/result.rs. + "sort input exceeds", + "join result exceeds", + "query exceeded memory budget", ]; /// Sanitize an error message before sending it to the client. @@ -687,6 +737,21 @@ mod tests { ); } + #[test] + fn resource_limit_errors_surface_actionable_hints() { + // These carry user-actionable guidance and leak no internal state, so + // they must reach the client verbatim — not be masked to the generic + // message. The exact strings come from QueryError's Display impl + // (crates/query/src/result.rs). 
+ for msg in [ + "sort input exceeds row limit — add a LIMIT clause", + "join result exceeds row limit", + "query exceeded memory budget: requested 100 bytes, limit 50 bytes", + ] { + assert_eq!(sanitize_error(msg), msg, "should pass through verbatim"); + } + } + // ---- Role enforcement (Fix: readonly role was not enforced) ---- fn parsed(q: &str) -> powdb_query::ast::Statement { diff --git a/crates/storage/Cargo.toml b/crates/storage/Cargo.toml index dda28aa..c6d40e2 100644 --- a/crates/storage/Cargo.toml +++ b/crates/storage/Cargo.toml @@ -22,3 +22,6 @@ libc = "0.2" [dev-dependencies] proptest = "1" tempfile = "3" + +[lints] +workspace = true diff --git a/crates/storage/src/lib.rs b/crates/storage/src/lib.rs index 0095f32..ea74a2b 100644 --- a/crates/storage/src/lib.rs +++ b/crates/storage/src/lib.rs @@ -4,11 +4,9 @@ pub mod catalog; pub mod disk; pub mod error; pub mod heap; -pub mod mvcc; pub mod page; pub mod row; pub mod table; -pub mod tx; pub mod types; pub mod view; pub mod wal; diff --git a/crates/storage/src/mvcc.rs b/crates/storage/src/mvcc.rs deleted file mode 100644 index 814d633..0000000 --- a/crates/storage/src/mvcc.rs +++ /dev/null @@ -1,57 +0,0 @@ -/// Pointer into the undo log. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct UndoPtr(pub usize); - -/// A single undo log entry: the old version of a row before an update. -#[derive(Debug, Clone)] -pub struct UndoEntry { - pub tx_id: u64, - pub data: Vec, - pub prev: Option, // previous version (undo chain) -} - -/// Append-only undo log. Entries are never modified, only appended. -/// Old entries are reclaimed by advancing the purge watermark. 
-pub struct UndoLog { - entries: Vec, -} - -impl Default for UndoLog { - fn default() -> Self { - Self::new() - } -} - -impl UndoLog { - pub fn new() -> Self { - UndoLog { - entries: Vec::new(), - } - } - - pub fn push(&mut self, tx_id: u64, data: &[u8]) -> UndoPtr { - self.push_with_prev(tx_id, data, None) - } - - pub fn push_with_prev(&mut self, tx_id: u64, data: &[u8], prev: Option) -> UndoPtr { - let ptr = UndoPtr(self.entries.len()); - self.entries.push(UndoEntry { - tx_id, - data: data.to_vec(), - prev, - }); - ptr - } - - pub fn get(&self, ptr: UndoPtr) -> Option<&UndoEntry> { - self.entries.get(ptr.0) - } - - pub fn len(&self) -> usize { - self.entries.len() - } - - pub fn is_empty(&self) -> bool { - self.entries.is_empty() - } -} diff --git a/crates/storage/src/tx.rs b/crates/storage/src/tx.rs deleted file mode 100644 index be6ac04..0000000 --- a/crates/storage/src/tx.rs +++ /dev/null @@ -1,167 +0,0 @@ -use std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; - -static NEXT_TX_ID: AtomicU64 = AtomicU64::new(1); - -#[derive(Debug, Clone)] -pub struct Transaction { - pub id: u64, - /// Snapshot: the set of tx_ids that were active when this tx began. - active_at_start: HashSet, - /// The tx_id counter value when this tx started (all tx < this existed). - snapshot_id: u64, -} - -impl Transaction { - /// Can this transaction see data written by `writer_tx_id`? - /// Visible if: writer committed before our snapshot AND wasn't active when we started. 
- pub fn can_see(&self, writer_tx_id: u64) -> bool { - if writer_tx_id == self.id { - return true; // can always see own writes - } - // Must have started before us AND not been active when we started - writer_tx_id < self.snapshot_id && !self.active_at_start.contains(&writer_tx_id) - } -} - -pub struct TxManager { - active_txs: HashSet, - committed_txs: HashSet, - aborted_txs: HashSet, -} - -impl Default for TxManager { - fn default() -> Self { - Self::new() - } -} - -impl TxManager { - pub fn new() -> Self { - TxManager { - active_txs: HashSet::new(), - committed_txs: HashSet::new(), - aborted_txs: HashSet::new(), - } - } - - pub fn begin(&mut self) -> Transaction { - let id = NEXT_TX_ID.fetch_add(1, Ordering::SeqCst); - let snapshot_id = id; - let active_at_start = self.active_txs.clone(); - self.active_txs.insert(id); - Transaction { - id, - active_at_start, - snapshot_id, - } - } - - pub fn commit(&mut self, tx_id: u64) { - self.active_txs.remove(&tx_id); - self.committed_txs.insert(tx_id); - } - - pub fn rollback(&mut self, tx_id: u64) { - self.active_txs.remove(&tx_id); - self.aborted_txs.insert(tx_id); - } - - pub fn is_active(&self, tx_id: u64) -> bool { - self.active_txs.contains(&tx_id) - } - - pub fn is_aborted(&self, tx_id: u64) -> bool { - self.aborted_txs.contains(&tx_id) - } - - pub fn is_committed(&self, tx_id: u64) -> bool { - self.committed_txs.contains(&tx_id) - } - - /// The oldest active tx — undo entries before this are safe to purge. 
- pub fn oldest_active(&self) -> Option { - self.active_txs.iter().min().copied() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::mvcc::UndoLog; - - #[test] - fn test_begin_commit() { - let mut mgr = TxManager::new(); - let tx = mgr.begin(); - assert!(tx.id > 0); - assert!(mgr.is_active(tx.id)); - mgr.commit(tx.id); - assert!(!mgr.is_active(tx.id)); - assert!(mgr.is_committed(tx.id)); - } - - #[test] - fn test_snapshot_isolation() { - let mut mgr = TxManager::new(); - let tx1 = mgr.begin(); - let tx2 = mgr.begin(); - // tx1's snapshot should not see tx2's writes - assert!(!tx1.can_see(tx2.id)); - // tx2's snapshot should not see tx1 (both active) - assert!(!tx2.can_see(tx1.id)); - mgr.commit(tx1.id); - // tx2 still shouldn't see tx1 (tx1 was active when tx2 started) - assert!(!tx2.can_see(tx1.id)); - } - - #[test] - fn test_sees_earlier_committed() { - let mut mgr = TxManager::new(); - let tx1 = mgr.begin(); - mgr.commit(tx1.id); - // tx2 starts after tx1 committed - let tx2 = mgr.begin(); - assert!(tx2.can_see(tx1.id)); - } - - #[test] - fn test_undo_log() { - let mut undo = UndoLog::new(); - let ptr = undo.push(1, b"old version of row"); - let entry = undo.get(ptr).unwrap(); - assert_eq!(entry.tx_id, 1); - assert_eq!(entry.data, b"old version of row"); - } - - #[test] - fn test_undo_chain() { - let mut undo = UndoLog::new(); - let ptr1 = undo.push_with_prev(1, b"version 1", None); - let ptr2 = undo.push_with_prev(2, b"version 2", Some(ptr1)); - let entry2 = undo.get(ptr2).unwrap(); - assert_eq!(entry2.prev, Some(ptr1)); - let entry1 = undo.get(entry2.prev.unwrap()).unwrap(); - assert_eq!(entry1.data, b"version 1"); - } - - #[test] - fn test_rollback() { - let mut mgr = TxManager::new(); - let tx = mgr.begin(); - mgr.rollback(tx.id); - assert!(!mgr.is_active(tx.id)); - assert!(mgr.is_aborted(tx.id)); - } - - #[test] - fn test_oldest_active() { - let mut mgr = TxManager::new(); - let tx1 = mgr.begin(); - let tx2 = mgr.begin(); - let _tx3 = 
mgr.begin(); - assert_eq!(mgr.oldest_active(), Some(tx1.id)); - mgr.commit(tx1.id); - assert_eq!(mgr.oldest_active(), Some(tx2.id)); - } -} diff --git a/examples/deploy/README.md b/examples/deploy/README.md index 31d38cc..0b09020 100644 --- a/examples/deploy/README.md +++ b/examples/deploy/README.md @@ -3,6 +3,30 @@ Reference configurations for self-hosting `powdb-server`. These are templates, not live deployments — replace placeholder names and secrets before running. +## Why auto-restart is required (read this first) + +**PowDB is crash-only by design, so a process supervisor with auto-restart is +MANDATORY in production.** The server is built with `panic = "abort"`: on an +unrecoverable error it exits immediately rather than trying to limp along in a +half-broken state. On the next start, WAL replay rolls the data dir forward to +the last consistent state, recovering committed writes. This is fast and safe — +but it only works if *something* restarts the process. + +Every example here ships with auto-restart already wired in: + +| Example | Auto-restart mechanism | +| ------------------ | ------------------------------------------------------------------ | +| Fly.io | `auto_start_machines = true`, `min_machines_running = 1` (fly.toml) | +| Railway | `restartPolicyType = "ON_FAILURE"` (railway.toml) | +| Cloudflare Tunnel | `restart: unless-stopped` on both services (docker-compose.yml) | +| AWS ECS Fargate | the service reconciles tasks toward `desired_count = 1` (main.tf) | + +If you adapt these to another platform, keep an equivalent supervisor +(systemd `Restart=always`, Kubernetes Deployment/`restartPolicy: Always`, +`docker run --restart unless-stopped`, etc.). Running `powdb-server` as a bare, +unsupervised process means a single crash leaves the database **down** until a +human restarts it — even though the data on disk is fully recoverable. 
+ ## Fly.io [`fly.toml`](./fly.toml) is a minimal stateful TCP deployment: one machine, diff --git a/examples/deploy/aws-ecs/main.tf b/examples/deploy/aws-ecs/main.tf index fc311fc..e3f5c17 100644 --- a/examples/deploy/aws-ecs/main.tf +++ b/examples/deploy/aws-ecs/main.tf @@ -213,6 +213,12 @@ resource "aws_ecs_service" "powdb" { desired_count = 1 launch_type = "FARGATE" + # AUTO-RESTART (MANDATORY for PowDB): an ECS service continuously reconciles + # running tasks toward `desired_count`, so if the task exits or is killed, + # ECS launches a fresh one automatically. This is exactly the supervised + # restart PowDB relies on — it is crash-only by design (panic = "abort" → + # fast exit → restart → WAL replay recovers to a consistent state). Do NOT + # replace this with a one-shot RunTask; you would lose auto-restart. # PowDB is single-writer — never run 2 tasks against the same EFS dir. deployment_maximum_percent = 100 deployment_minimum_healthy_percent = 0 diff --git a/scripts/smoke-release.sh b/scripts/smoke-release.sh new file mode 100755 index 0000000..ca76546 --- /dev/null +++ b/scripts/smoke-release.sh @@ -0,0 +1,169 @@ +#!/usr/bin/env bash +# scripts/smoke-release.sh — post-build durability smoke for a RELEASE binary. +# +# This is the gate whose ABSENCE caused the v0.4.1–0.4.3 data-loss yanks: a +# server that accepted writes, exited, and came back EMPTY because WAL replay +# wasn't exercised end-to-end against the actual shipped binary. This script +# proves, over the wire, that: +# +# 1. The README's documented PowQL flow works (create type, insert, filter, +# count) against the built `powdb-server` via `powdb-cli -r host:port`. +# 2. A `required unique` constraint REJECTS a duplicate insert. +# 3. After `kill -9` + restart on the SAME data dir, the previously-inserted +# rows are STILL there (WAL replay recovered them) AND the unique +# constraint is STILL enforced. 
#
# Binary locations and the port are taken from the environment, so the same
# script runs in CI (against freshly built release artifacts) and locally:
#
#   POWDB_CLI         path to powdb-cli    (default <repo>/target/release/powdb-cli)
#   POWDB_SERVER      path to powdb-server (default <repo>/target/release/powdb-server)
#   POWDB_SMOKE_PORT  TCP port to bind     (default 7950)
#
# On success it prints 'SMOKE-RELEASE: ALL-PASS' and exits 0; the first failed
# assertion makes it exit nonzero.

set -euo pipefail
# Turn job control off so killing the server with -9 does not make bash print
# a "Killed: 9" job notice.
set +m 2>/dev/null || true

# The script lives in <repo>/scripts, so the repo root is one level up.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"

# Environment overrides win; otherwise fall back to the release build outputs.
: "${POWDB_CLI:=${REPO_ROOT}/target/release/powdb-cli}"
: "${POWDB_SERVER:=${REPO_ROOT}/target/release/powdb-server}"
PORT="${POWDB_SMOKE_PORT:-7950}"
HOST="127.0.0.1"

# Fresh, private data directory for this run; removed again by cleanup().
DATADIR="$(mktemp -d "${TMPDIR:-/tmp}/powdb-smoke-XXXXXX")"
# PID of the server we started ("" while none is running).
SERVER_PID=""

# Print a progress message on stdout.
log() {
  echo "smoke-release: $*"
}

# Print a failure message on stderr and abort the script with status 1.
fail() {
  echo "smoke-release: FAIL — $*" >&2
  exit 1
}

# EXIT trap: kill any server we started and delete the temp data directory.
cleanup() {
  if [[ -n "${SERVER_PID}" ]]; then
    if kill -0 "${SERVER_PID}" 2>/dev/null; then
      kill -9 "${SERVER_PID}" 2>/dev/null || true
    fi
  fi
  # Second line of defence: kill any powdb-server whose command line mentions
  # our temp data dir, in case its PID is no longer tracked.
  pkill -9 -f "powdb-server.*${DATADIR}" 2>/dev/null || true

  [[ -d "${DATADIR}" ]] || return 0
  # Only ever rm -rf a path that looks like one of our mktemp directories.
  case "${DATADIR}" in
    "${TMPDIR:-/tmp}"/powdb-smoke-* | /tmp/powdb-smoke-* | /var/folders/*/powdb-smoke-*)
      rm -rf "${DATADIR}"
      ;;
  esac
}
trap cleanup EXIT

# Both binaries must exist and be executable before anything else happens.
if [[ ! -x "${POWDB_CLI}" ]]; then
  fail "powdb-cli not found/executable at ${POWDB_CLI} (build with: cargo build --release -p powdb-cli)"
fi
if [[ ! -x "${POWDB_SERVER}" ]]; then
  fail "powdb-server not found/executable at ${POWDB_SERVER} (build with: cargo build --release -p powdb-server)"
fi

# Start the server in the background on $DATADIR/$PORT, wait for it to bind.
# Succeeds (exit 0) when something accepts TCP connections on $HOST:$PORT.
# Uses bash's /dev/tcp redirection inside a subshell so the probe socket is
# closed again as soon as the subshell exits.
_port_open() {
  (echo >"/dev/tcp/${HOST}/${PORT}") >/dev/null 2>&1
}

# Launch powdb-server in the background against $DATADIR and block until it
# accepts connections on $HOST:$PORT.
#
# Sets:    SERVER_PID — PID of the launched server.
# Aborts (via fail) when:
#   - the port is already taken before launch,
#   - the server process exits during startup,
#   - the port is still closed after 100 probes spaced 0.1 s apart.
start_server() {
  # The readiness probe below only proves that *something* is listening. If a
  # foreign process (or a not-yet-dead previous server) already owns the port,
  # our server would fail to bind while the probe still succeeds, and every
  # later query would silently hit the wrong process. Refuse to start instead.
  if _port_open; then
    fail "${HOST}:${PORT} is already in use — stop the other listener or set POWDB_SMOKE_PORT"
  fi

  # env -i: run the server with a minimal, explicit environment.
  env -i \
    PATH="${PATH}" \
    HOME="${HOME:-}" \
    RUST_LOG="warn" \
    RUST_BACKTRACE="1" \
    "${POWDB_SERVER}" \
      --port "${PORT}" \
      --bind "${HOST}" \
      --data-dir "${DATADIR}" \
    >"${DATADIR}/server.log" 2>&1 &
  SERVER_PID=$!
  # Drop the server from job control so a later `kill -9` doesn't print a
  # cosmetic "Killed: 9" job notice. We still track/kill it via $SERVER_PID.
  disown "${SERVER_PID}" 2>/dev/null || true

  local waited=0
  while (( waited < 100 )); do
    # A server that died during startup will never bind; show its log and stop.
    if ! kill -0 "${SERVER_PID}" 2>/dev/null; then
      echo "smoke-release: server exited during startup — log:" >&2
      cat "${DATADIR}/server.log" >&2 || true
      fail "server did not stay up"
    fi
    if _port_open; then
      return 0
    fi
    sleep 0.1
    waited=$(( waited + 1 ))
  done
  echo "smoke-release: server never bound ${HOST}:${PORT} — log:" >&2
  cat "${DATADIR}/server.log" >&2 || true
  fail "server bind timeout"
}

# Run one PowQL statement ($1) over the wire. Echoes the CLI's stdout and
# returns the CLI's exit status.
q() {
  "${POWDB_CLI}" -r "${HOST}:${PORT}" -c "$1"
}

# Run a PowQL statement ($1) that is EXPECTED to fail. Echoes the CLI's
# combined stdout+stderr and ALWAYS returns success; the caller decides
# pass/fail by inspecting the text.
q_expect_fail() {
  "${POWDB_CLI}" -r "${HOST}:${PORT}" -c "$1" 2>&1 || true
}

# ── Phase 1: fresh server, documented PowQL flow ───────────────────────────
log "starting server (pid-to-be) on ${HOST}:${PORT}, data dir ${DATADIR}"
start_server
log "server up (pid ${SERVER_PID})"

# README/getting-started PowQL (NOT SQL): a type with a `required unique` field.
log "creating type Acct (required unique email)"
q 'type Acct { required unique email: str, id: int }' >/dev/null \
  || fail "could not create type Acct"

log "inserting two rows"
q 'insert Acct { email := "a@x.com", id := 1 }' >/dev/null || fail "insert #1 failed"
q 'insert Acct { email := "b@x.com", id := 2 }' >/dev/null || fail "insert #2 failed"

# Text checks below use `grep -F` (literal match, so the '.' in a@x.com is not
# a regex wildcard) and read from a here-string instead of `echo … | grep -q`:
# under `set -o pipefail` a pipeline can report failure when grep -q exits
# early and the writer is hit by SIGPIPE.
log "filter query (Acct filter .id = 1 { .email })"
FILT="$(q 'Acct filter .id = 1 { .email }')" || fail "filter query failed"
grep -qF 'a@x.com' <<<"${FILT}" || fail "filter did not return the expected row: ${FILT}"

# The count is extracted by keeping only the digits of the CLI output.
log "count(Acct) should be 2"
CNT="$(q 'count(Acct)' | tr -dc '0-9')" || fail "count query failed"
[[ "${CNT}" == "2" ]] || fail "expected count 2, got '${CNT}'"

# ── Phase 2: unique constraint rejects a duplicate ─────────────────────────
log "asserting unique constraint rejects duplicate email"
DUP_OUT="$(q_expect_fail 'insert Acct { email := "a@x.com", id := 3 }')"
grep -qiF 'unique constraint violation' <<<"${DUP_OUT}" \
  || fail "duplicate insert was NOT rejected with a unique constraint violation. Output: ${DUP_OUT}"

log "re-counting after rejected duplicate (must still be 2)"
CNT2="$(q 'count(Acct)' | tr -dc '0-9')" || fail "count query failed after rejected duplicate"
[[ "${CNT2}" == "2" ]] || fail "count changed after a rejected insert: expected 2, got '${CNT2}'"

# ── Phase 3: kill -9 + restart on the SAME data dir → WAL replay ───────────
log "kill -9 the server (simulating a crash) — pid ${SERVER_PID}"
kill -9 "${SERVER_PID}" 2>/dev/null || true
# Wait for the process to actually die and release the port.
# Poll for up to 50 × 0.1 s for the killed server to disappear.
waited=0
while kill -0 "${SERVER_PID}" 2>/dev/null && (( waited < 50 )); do
  sleep 0.1; waited=$(( waited + 1 ))
done
# If the old process is somehow still alive, stop here: a "restarted" server
# could fail to bind while the old one keeps answering, and the recovery
# checks below would then pass without WAL replay ever having run.
# SERVER_PID is left set so the EXIT trap can still kill it.
if kill -0 "${SERVER_PID}" 2>/dev/null; then
  fail "server pid ${SERVER_PID} is still alive after kill -9 — cannot verify restart recovery"
fi
SERVER_PID=""

log "restarting server on the SAME data dir ${DATADIR}"
start_server
log "server back up (pid ${SERVER_PID}) — verifying recovery"

# Text checks use `grep -F` (literal match, so the '.' in a@x.com is not a
# regex wildcard) and a here-string rather than `echo … | grep -q`, which can
# report a spurious failure under `set -o pipefail` if the writer gets SIGPIPE.
log "post-restart count(Acct) must still be 2 (WAL replay recovered rows)"
RCNT="$(q 'count(Acct)' | tr -dc '0-9')" || fail "post-restart count query failed"
[[ "${RCNT}" == "2" ]] || fail "DATA LOSS: post-restart count is '${RCNT}', expected 2 — WAL replay did not recover the rows"

log "post-restart filter must still find a@x.com"
RFILT="$(q 'Acct filter .id = 1 { .email }')" || fail "post-restart filter query failed"
grep -qF 'a@x.com' <<<"${RFILT}" || fail "post-restart filter lost the row: ${RFILT}"

log "post-restart unique constraint must STILL be enforced"
RDUP_OUT="$(q_expect_fail 'insert Acct { email := "a@x.com", id := 4 }')"
grep -qiF 'unique constraint violation' <<<"${RDUP_OUT}" \
  || fail "unique constraint NOT enforced after restart — index/WAL recovery incomplete. Output: ${RDUP_OUT}"

log "final count must remain 2 (rejected dup did not slip in)"
FCNT="$(q 'count(Acct)' | tr -dc '0-9')" || fail "final count query failed"
[[ "${FCNT}" == "2" ]] || fail "final count is '${FCNT}', expected 2"

echo "SMOKE-RELEASE: ALL-PASS"