From 177a81d9a445017899ecec3e2886d8777e858a5d Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 01:42:22 +0100 Subject: [PATCH 1/9] docs(ingress): add column-major sender plan and FFI ABI spec Plan and FFI ABI for the new column-major writer that will ingest Pandas/Polars DataFrames over QWP/WebSocket. Locks the QuestDb pool shape, BulkChunk encoder strategy, validity bitmap semantics, and the C ABI the separate Python wrapper repo will consume. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/COLUMN_SENDER_FFI_ABI.md | 833 +++++++++++++++++++++++++++++++++++ doc/COLUMN_SENDER_PLAN.md | 578 ++++++++++++++++++++++++ 2 files changed, 1411 insertions(+) create mode 100644 doc/COLUMN_SENDER_FFI_ABI.md create mode 100644 doc/COLUMN_SENDER_PLAN.md diff --git a/doc/COLUMN_SENDER_FFI_ABI.md b/doc/COLUMN_SENDER_FFI_ABI.md new file mode 100644 index 00000000..f8b6ecc3 --- /dev/null +++ b/doc/COLUMN_SENDER_FFI_ABI.md @@ -0,0 +1,833 @@ +# Column-Major Sender — C ABI Specification + +**Status:** draft, pending approval +**Header:** `include/questdb/ingress/column_sender.h` (to be added) +**Sibling header:** `include/questdb/ingress/line_sender.h` (existing, +shares error types) +**Audience:** the Python wrapper repo, and anyone writing a C/C++ +client against this API. + +This document is self-contained. It is the contract between +`c-questdb-client` (Rust core) and the Python wrapper repo. The Python +repo can be implemented from this spec without reading any Rust code. + +--- + +## 1. Scope + +This ABI exposes a column-major writer that ingests **per-column typed +buffers** into QuestDB via QWP/WebSocket. Optimised for sending +Pandas/Polars DataFrames at maximum throughput. One submission = +one QWP frame = one logical batch of rows for one table. + +**This is a client for the existing QuestDB server implementing the QWP +ingress (WebSocket) v1 wire specification.** The spec is at +`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md` +in the documentation repo. The protocol is fixed and the wire types, +null encoding, schema model, framing, and limits are not up for +negotiation in this API. The FFI's job is to expose that wire as +ergonomic, zero-overhead-where-possible calls for the Python wrapper. + +Out of scope: the existing row-major `line_sender_*` ABI is unaffected; +this is an additional, orthogonal API. The two coexist on different +opaque types. + +### 1.1 Spec-derived constraints (non-negotiable) + +These come from the QWP/WS v1 wire spec and are enforced or surfaced +by this ABI. They are not API design choices. + +| Limit | Value | Enforcement | +|--------------------------------|----------------------------------------|----------------------------------------------------------| +| Max batch (frame) size | 16 MiB protocol ceiling; effectively `min(server recv buf − 14, 16 MiB)` advertised on upgrade via `X-QWP-Max-Batch-Size` | `column_sender_submit` returns an error if the encoded frame exceeds the negotiated cap. | +| Max tables per connection | 10,000 | Server-enforced; client surfaces server rejections. | +| Max rows per table block | 1,000,000 | `column_sender_chunk_*` calls fail if `row_count` exceeds. | +| Max columns per table | 2,048 | `column_sender_chunk_column_*` fails after the 2048th column. | +| Max table / column name length | 127 bytes UTF-8 | Rejected at name validation. | +| Max in-flight batches | 128 | `column_sender_submit` blocks (or returns back-pressure) until an ack frees a slot. | +| Max symbol dictionary entries | 1,000,000 per connection | Server returns `PARSE_ERROR`; surfaced as `line_sender_error_server_rejection`. | + +The wire pins protocol version 1; clients advertise +`X-QWP-Max-Version: 1`. + +--- + +## 2. Universal conventions + +### 2.1 Errors + +Errors use the existing `line_sender_error*` type from +`line_sender.h` — same codes, same accessors (`line_sender_error_msg`, +`line_sender_error_get_code`, `line_sender_error_free`). + +Every fallible function takes a trailing `line_sender_error** err_out`: + +- On success, returns `true` and does not touch `*err_out`. +- On failure, returns `false` and, if `err_out != NULL`, sets + `*err_out` to a heap-allocated error the caller must free with + `line_sender_error_free`. + +Pass `err_out = NULL` to discard the error. + +### 2.2 Pointer conventions + +Same as `line_sender.h`: opaque handles must be non-NULL. `err_out` may +be NULL. Lifecycle "free" functions accept NULL and no-op. + +### 2.3 Buffer conventions + +For every column-append function: + +- `data` is a pointer to a **contiguous, full-length** typed array + with one slot per row, **including null rows**. The slot value for + a null row is ignored — it can hold anything. This matches the + Arrow / Pandas / Polars layout, where data buffers are full-length + and null status lives in a separate bitmap. +- Strided buffers are **not** supported in v1. The Python wrapper must + materialise contiguous data before calling. (Pandas + `Series.to_numpy(copy=False)` and Polars Arrow buffers are + contiguous in the common case.) +- All column buffers passed in one chunk must have the same `row_count` + — the chunk's row count, set by the first column-append call. +- Buffer ownership stays with the caller; the FFI copies into internal + storage during the call. The buffer can be freed or reused + immediately on return. + +### 2.4 Validity bitmaps + +The FFI accepts validity bitmaps in **Arrow semantics** (bit = 1 means +**valid**, bit = 0 means NULL). This is directly compatible with PyArrow +buffers, Polars Arrow buffers, and bitmaps produced by +`numpy.packbits(..., bitorder='little')`. + +- Layout: one bit per row. Byte `i` holds rows `8*i .. 8*i+7`. +- Bit ordering is **LSB-first** within each byte (bit 0 of byte 0 is row 0). +- **Bit = 1 means VALID. Bit = 0 means NULL.** +- Buffer length in bytes must be at least `ceil(row_count / 8)`. Bits + past `row_count` are ignored. +- Pass `validity = NULL` when the column has no nulls. + +```c +typedef struct column_sender_validity { + const uint8_t* bits; // NULL = no nulls + size_t bit_len; // must equal chunk row_count +} column_sender_validity; +``` + +If `validity != NULL`, `validity->bit_len` must equal the chunk's row +count. Mismatches return `line_sender_error_invalid_api_call`. + +**Wire-format note (informative).** The QWP wire format uses the +*inverted* semantics — bit = 1 means NULL — and column data after the +bitmap is **densely packed** (only non-null values, count = +`row_count − null_count`). See spec §Null handling. The FFI accepts +the Arrow shape so PyArrow / Pandas / Polars buffers hand off +zero-copy; the library inverts the bitmap and gathers non-null values +when encoding the QWP frame. Callers never construct QWP-shaped +inputs. + +### 2.5 Threading + +- A `questdb_db` (the pool) is **thread-safe**. Share it across + threads. `questdb_db_borrow_sender` and `questdb_db_return_sender` + are safe to call concurrently. +- A `column_sender` (a borrow) is **not thread-safe**. It belongs to + the borrowing thread until returned. Do not pass it across threads. +- A `column_sender_chunk` is owned by one thread at a time. It is + *not* tied to a particular sender; chunks can be built without a + borrow and submitted on any sender borrowed from the same `db`. +- `line_sender_error` is thread-safe to read but not to share writes. + +### 2.6 String / UTF-8 + +String and symbol-dict bytes must be valid UTF-8. The library trusts the +caller by default (no per-row validation). Invalid UTF-8 will be +detected by the server and rejected. The Python wrapper is responsible +for ensuring valid UTF-8 from Pandas/Polars. + +--- + +## 3. Opaque types + +```c +typedef struct questdb_db questdb_db; /* connection pool */ +typedef struct column_sender column_sender; /* borrowed handle */ +typedef struct column_sender_chunk column_sender_chunk; +``` + +Errors reuse `line_sender_error*` (from `line_sender.h`). + +--- + +## 4. Connection pool and sender borrow + +### 4.1 Conceptual shape + +The user thinks `DataFrame → Table`: a script holds one connection to +the database and pushes DataFrames at it. Under the hood, sending is +not thread-safe per connection, so multi-threaded ingest needs +multiple connections. The pool absorbs both cases: + +``` + ┌──────────────────────────┐ + questdb_db_connect ───► │ questdb_db (pool) │ + │ ├─ connection #1 │ + │ ├─ connection #2 (lazy) │ + │ └─ ... │ + └──────────┬────────────────┘ + │ borrow_sender / return_sender + ▼ + ┌──────────────────────────┐ + │ column_sender (borrowed)│ + │ ├─ new_chunk │ + │ ├─ submit / await │ + │ └─ ... │ + └──────────────────────────┘ +``` + +Single-threaded scripts get pool size 1 by default — one borrow held +for the lifetime of the script. Multi-threaded callers borrow and +return per work unit (or per thread). + +### 4.2 Connect-string keys (pool) + +| Key | Default | Description | +|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------| +| `pool_size` | 1 | Warm / minimum connections, opened eagerly at `questdb_db_connect`. All N go through the full WS upgrade before `connect` returns. The pool never shrinks below this. | +| `pool_max` | 64 | Hard cap on auto-grow. When all current senders are checked out and pool size < `pool_max`, a new connection is opened on demand. When at `pool_max`, `borrow_sender` fails fast (see §4.3). | +| `pool_idle_timeout_ms` | 60000 | Connections *above* `pool_size` are closed after this much idle time in the pool's free list. Set to 0 to disable shrink (the pool only grows). | +| `pool_reap` | `auto` | `auto` — pool spawns a background thread that periodically reaps idle connections per `pool_idle_timeout_ms`. `manual` — no background thread; caller invokes `questdb_db_reap_idle` on its own cadence. | + +All other connect-string keys are inherited from the existing +`qwpws::` configuration (auth, TLS, `auth_timeout_ms`, retry, store- +and-forward, durable-ack opt-in, etc.). See `doc/CONSIDERATIONS.md` +and the row-API connect-string reference. + +Validity: `pool_size <= pool_max` must hold; otherwise +`questdb_db_connect` returns `line_sender_error_config_error`. + +### 4.3 Pool functions + +```c +/** + * Open a connection pool. Eagerly opens `pool_size` connections; any + * server/auth/TLS error during those opens fails the call. + * + * `conf` is a standard `qwpws::` connect string. Non-WS schemes return + * line_sender_error_config_error — the column-sender path is QWP/WS + * only. + */ +QUESTDB_CLIENT_API +questdb_db* questdb_db_connect( + const char* conf, + line_sender_error** err_out); + +/** + * Close the pool and all its connections. Accepts NULL and no-ops. + * Senders still checked out are invalidated; calls on them return + * line_sender_error_invalid_api_call. Callers must not call close() + * while any thread is mid-submit on a borrowed sender. + */ +QUESTDB_CLIENT_API +void questdb_db_close(questdb_db* db); + +/** + * Borrow a sender from the pool. + * + * Selection rules: + * 1. If a previously-returned sender is in the free list, hand it out. + * 2. Otherwise, if pool size < `pool_max`, open a new connection on + * demand (auto-grow) and hand out a sender bound to it. + * 3. Otherwise (at `pool_max` cap, all checked out), return + * line_sender_error_invalid_api_call. This is fail-fast: hitting + * the cap signals either a leaked borrow or a `pool_max` set too + * low — both want an error rather than silent blocking. Caller may + * retry after returning senders. + * + * The returned sender is bound to the calling thread until returned. + * Do not share across threads. + */ +QUESTDB_CLIENT_API +column_sender* questdb_db_borrow_sender( + questdb_db* db, + line_sender_error** err_out); + +/** + * Manually reap idle connections. Closes connections in the pool's + * free list whose idle time exceeds `pool_idle_timeout_ms`, never + * shrinking pool size below `pool_size`. + * + * When `pool_reap=auto` (the default), the pool runs an internal + * background thread that calls this logic periodically; calling this + * function manually is harmless. When `pool_reap=manual`, callers that + * want shrinking must invoke this function on their own cadence (e.g. + * from a daemon thread in the host language). + * + * Returns the number of connections closed by this invocation. + */ +QUESTDB_CLIENT_API +size_t questdb_db_reap_idle(questdb_db* db); + +/** + * Return a sender to the pool. The sender pointer is invalidated and + * must not be used again after this call. Any chunks created from the + * sender remain valid (chunks are caller-owned, not sender-owned) but + * cannot be submitted until borrowed again from a new sender. + * + * If the sender is in a latched-error state (must_close() == true), + * its underlying connection is closed and dropped from the pool + * instead of returned. + */ +QUESTDB_CLIENT_API +void questdb_db_return_sender( + questdb_db* db, + column_sender* sender); +``` + +### 4.4 Sender state inspection + +```c +/** + * True if the sender's underlying connection is in a permanently- + * unusable state (a QWP halt rejection, terminal WS protocol + * violation, etc.). On return to the pool, such senders are dropped, + * not recycled. + */ +QUESTDB_CLIENT_API +bool column_sender_must_close(const column_sender* sender); +``` + +--- + +## 5. Chunk lifecycle + +A chunk represents one DataFrame's worth of column buffers destined +for one table. It is the "one chunk = one table = one frame = one +FSN" unit. Chunks are caller-owned and **not bound to a particular +sender** — build a chunk on any thread, submit it on any sender +borrowed from the same `db`. + +```c +/** + * Create an empty chunk for the given table. The table name must be + * valid (same rules as line_sender_table_name; max 127 bytes UTF-8). + * + * Does not require a sender — the chunk is pure data until submitted. + * + * The chunk is owned by the caller and must be either submitted with + * column_sender_submit (which clears it for reuse) or freed with + * column_sender_chunk_free. + */ +QUESTDB_CLIENT_API +column_sender_chunk* column_sender_chunk_new( + const char* table_name, + size_t table_name_len, + line_sender_error** err_out); + +/** + * Discard the chunk and all retained capacity. Accepts NULL and no-ops. + */ +QUESTDB_CLIENT_API +void column_sender_chunk_free(column_sender_chunk* chunk); + +/** + * Clear the chunk's content, keeping retained capacity for reuse. + */ +QUESTDB_CLIENT_API +void column_sender_chunk_clear(column_sender_chunk* chunk); + +/** + * Current row count of the chunk, as locked in by the first column + * append. Zero if no columns have been added yet. + */ +QUESTDB_CLIENT_API +size_t column_sender_chunk_row_count(const column_sender_chunk* chunk); +``` + +--- + +## 6. Numeric and fixed-width column appends + +All have the shape: + +```c +bool column_sender_chunk_column_( + column_sender_chunk* chunk, + const char* name, + size_t name_len, + const * data, + size_t row_count, + const column_sender_validity* validity, // NULL if no nulls + line_sender_error** err_out); +``` + +The first column-append call locks the chunk's `row_count`. Subsequent +calls must pass the same `row_count` value or return +`line_sender_error_invalid_api_call`. + +```c +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const float* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const double* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * Boolean column. `data` is an Arrow-style packed bitmap (LSB-first, + * 1=true). Length is row_count bits, so `data` must be at least + * ceil(row_count/8) bytes long. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_bool( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * UUID column. `data` points to row_count * 16 bytes. Each 16-byte + * group is one UUID; bytes 0..8 are the lo half (little-endian), + * bytes 8..16 are the hi half (little-endian). Matches the + * existing line_sender_buffer_column_uuid layout. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_uuid( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * LONG256 column. `data` points to row_count * 32 bytes. Each + * 32-byte group is one LONG256: four 64-bit limbs little-endian, + * least-significant limb first. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_long256( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * IPv4 column. `data` is a packed uint32 per row, encoded as + * u32::from(Ipv4Addr).to_le_bytes() (octet 0 in the high byte). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ipv4( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 7. Timestamp columns + +```c +/** + * TIMESTAMP column, nanoseconds since the Unix epoch. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_nanos( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * TIMESTAMP column, microseconds since the Unix epoch. Equivalent to + * passing nanoseconds = micros * 1000 through ts_nanos, but the FFI + * does the scale-up so the caller does not have to materialise a + * second buffer. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_micros( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * DATE column, milliseconds since the Unix epoch. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_date_millis( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 8. Variable-width text column (VARCHAR) + +QWP has exactly one variable-width text type: VARCHAR (wire code +`0x0F`). The wire format is `uint32` offsets + concatenated bytes. The +older STRING wire type (`0x08`) has been removed from the spec and is +not exposed here. + +Input is in Arrow Utf8 shape: a full-length offsets array of +`row_count + 1` entries where `offsets[i]..offsets[i+1]` slices `bytes` +for row `i`. Null rows are signalled via the validity bitmap; their +offset slice is ignored (typically a zero-length slice, but the FFI +makes no assumption). + +```c +/** + * VARCHAR column (QWP wire type 0x0F). + * + * Input layout matches Arrow Utf8: + * - offsets has row_count + 1 entries. Monotonically non-decreasing. + * The first entry is typically 0 and the last is typically + * bytes_len; the FFI does not require those exactly, but every + * offset must be ≤ bytes_len. + * - bytes is a single contiguous UTF-8 buffer. + * - validity is Arrow-shape (1 = valid, see §2.4). NULL rows' + * offset slices are ignored. + * + * Wire output: the library compresses to QWP's dense layout + * (only non-null values, uint32 offsets matching the wire spec). + * + * UTF-8 validity is the caller's responsibility; invalid UTF-8 is + * detected by the server and surfaced as line_sender_error_server_rejection. + * + * Input offsets are int32_t because that is the Arrow Utf8 layout + * (signed 32-bit). Negative values are rejected. Polars LargeUtf8 + * (int64 offsets, >2 GiB) is the Python wrapper's concern: split the + * column or copy down to int32 offsets before calling. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_varchar( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* offsets, // length = row_count + 1 + const uint8_t* bytes, + size_t bytes_len, + size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 9. Symbol columns (dictionary fast path) + +Symbol columns take dictionary-encoded input: a `codes` array of +per-row indices and a dict (`dict_offsets` + `dict_bytes` in Arrow +Utf8 layout). + +This is **the canonical symbol input** because it matches: +- Pandas `Categorical` (`.codes` + `.categories`), +- Polars `Categorical` / Arrow `Dictionary`. + +The implementation interns the dict against the connection-scoped +symbol table once (cost ∝ dict cardinality, not row count) and then +remaps codes in bulk. + +For each `symbol_dict_` variant, `codes[i]` is the index into the +dict for row `i`. Codes must be in range `0..dict_len` for valid rows; +behaviour is undefined for out-of-range codes when validity is NULL. +When a row's validity bit is 0, its code is ignored. + +`dict_offsets` has `dict_len + 1` entries; `dict_offsets[d]..dict_offsets[d+1]` +slices `dict_bytes` for dict entry `d`. `dict_len` is implicit: +`dict_len == (dict_offsets length) - 1`. The FFI takes +`dict_offsets_len` explicitly to compute `dict_len = dict_offsets_len - 1`. + +```c +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 10. Designated timestamp + +Required exactly once per chunk before `submit`. Two variants picking +the on-wire type: + +- `..._micros` encodes the column on the wire as TIMESTAMP (`0x0A`, + microseconds since Unix epoch). +- `..._nanos` encodes the column on the wire as TIMESTAMP_NANOS + (`0x10`, nanoseconds since Unix epoch). + +Exactly one of the two may be called per chunk. The designated +timestamp is emitted on the wire as a schema column with an empty +name (per spec §Full schema mode). + +```c +/** + * Designated-timestamp column, microseconds since the Unix epoch. + * Encoded on the wire as TIMESTAMP (0x0A). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_micros( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/** + * Designated-timestamp column, nanoseconds since the Unix epoch. + * Encoded on the wire as TIMESTAMP_NANOS (0x10). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_nanos( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); +``` + +(No `validity` parameter — the designated timestamp must be non-null +per row.) + +--- + +## 11. Submit + +```c +/** + * Encode the chunk into a QWP/WebSocket frame and publish it. On + * success the chunk is cleared (row count → 0, allocations retained) + * and can be reused. + * + * If fsn_out != NULL, the frame's assigned sequence number is written + * to *fsn_out on success. This value is the QWP wire `sequence` field + * (spec §Sequence numbering): a per-connection counter starting at 0, + * server-assigned by counting inbound frames. The existing Rust API + * calls it "FSN" (frame sequence number) — the two terms are + * interchangeable. + * + * Use column_sender_await_acked_fsn to block until the server acks it. + * + * On failure, the chunk is left untouched so the caller can recover + * its contents (e.g. write to local fallback storage) before freeing. + * + * Back-pressure: the wire allows at most 128 in-flight (unacked) + * batches. When the in-flight queue is full, submit blocks until an + * ack frees a slot, or returns an error if the deadline configured on + * the sender elapses first. + */ +QUESTDB_CLIENT_API +bool column_sender_submit( + column_sender* sender, + column_sender_chunk* chunk, + uint64_t* fsn_out, + line_sender_error** err_out); + +/** + * Block until the server has durably acknowledged the given FSN, or + * until the timeout elapses. + * + * timeout_millis = 0 means non-blocking poll. + * + * Returns true if acked within the deadline, false otherwise. On + * unrecoverable error sets *err_out. + */ +QUESTDB_CLIENT_API +bool column_sender_await_acked_fsn( + column_sender* sender, + uint64_t fsn, + uint64_t timeout_millis, + line_sender_error** err_out); + +/** + * Non-blocking poll of progress counters. + */ +QUESTDB_CLIENT_API +uint64_t column_sender_published_fsn(const column_sender* sender); +QUESTDB_CLIENT_API +uint64_t column_sender_acked_fsn(const column_sender* sender); +``` + +--- + +## 12. Versioning + +This API is **draft / unstable** until first ship. Once shipped: + +- The C ABI is versioned alongside the rest of `c-questdb-client`. +- Breaking changes follow the same SemVer policy as the existing + `line_sender_*` ABI. +- The wire format is the existing QWP v1 spec (no new wire types + introduced). + +--- + +## 13. Minimal C example + +Pool/borrow shape: one `questdb_db` per process, borrow a sender per +unit of work, return it when done. + +```c +#include "questdb/ingress/line_sender.h" +#include "questdb/ingress/column_sender.h" + +int send_one_chunk(questdb_db* db) { + line_sender_error* err = NULL; + column_sender* sender = NULL; + column_sender_chunk* chunk = NULL; + + sender = questdb_db_borrow_sender(db, &err); + if (!sender) goto fail; + + chunk = column_sender_chunk_new("trades", 6, &err); + if (!chunk) goto fail; + + const double prices[] = { 2615.54, 2615.60, 2615.50 }; + const double amounts[] = { 0.00044, 0.00021, 0.00073 }; + const int64_t timestamps_ns[] = { 1700000000000000000LL, + 1700000000000001000LL, + 1700000000000002000LL }; + + if (!column_sender_chunk_column_f64( + chunk, "price", 5, prices, 3, NULL, &err)) goto fail; + if (!column_sender_chunk_column_f64( + chunk, "amount", 6, amounts, 3, NULL, &err)) goto fail; + if (!column_sender_chunk_designated_timestamp_nanos( + chunk, timestamps_ns, 3, &err)) goto fail; + + uint64_t fsn = 0; + if (!column_sender_submit(sender, chunk, &fsn, &err)) goto fail; + if (!column_sender_await_acked_fsn(sender, fsn, 5000, &err)) goto fail; + + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + return 0; + +fail: + if (err) { + fprintf(stderr, "%s\n", line_sender_error_msg(err, NULL)); + line_sender_error_free(err); + } + column_sender_chunk_free(chunk); + if (sender) questdb_db_return_sender(db, sender); + return 1; +} + +int main(void) { + line_sender_error* err = NULL; + questdb_db* db = questdb_db_connect( + "qwpws::addr=localhost:9000;pool_size=1;", &err); + if (!db) { + if (err) line_sender_error_free(err); + return 1; + } + int rc = send_one_chunk(db); + questdb_db_close(db); + return rc; +} +``` + +--- + +## 14. Notes for the Python wrapper + +These are not part of the C ABI; they are guidance for the Python repo +agent. + +- **Pandas numeric columns** → `Series.to_numpy(copy=False)` gives a + contiguous `np.ndarray` whose `.ctypes.data` pointer goes straight + to FFI. No copy. +- **Pandas nulls** → `Series.isna().values` is a `np.ndarray[bool]`; + pack it LSB-first into a `uint8_t*` bitmap (provide a vectorised + helper using `numpy.packbits(... bitorder='little')`). +- **Pandas datetime64** → already an int64 view via + `series.view('int64')`. For `[ns]` use `column_ts_nanos`; for + `[us]` use `column_ts_micros`; for `[ms]` use `column_date_millis` + (or scale up to ns). +- **Pandas `Categorical`** → `cat.codes.to_numpy()` for `codes`; + `cat.categories.to_numpy()` then encode to Arrow Utf8 layout + (build `offsets` + `bytes`) for the dict. Or roundtrip via PyArrow + for less manual work. +- **Polars** → `series.to_arrow()` yields a `pyarrow.Array` whose + buffers (`array.buffers()`) include the validity bitmap (already + LSB-first 1=valid) and the data buffer. Direct pointer handoff. +- **Pandas object-dtype strings** are the slow path: materialise into + Arrow Utf8 via `pyarrow.array(series)` then forward. The FFI + does not have a fast path for object dtype — that's a deliberate + choice. Document this. +- **Object lifetimes** — keep the source `np.ndarray` / `pa.Array` + alive for the duration of the FFI call. Buffers are copied into the + chunk during the call, so they can be dropped after the call + returns. diff --git a/doc/COLUMN_SENDER_PLAN.md b/doc/COLUMN_SENDER_PLAN.md new file mode 100644 index 00000000..10bf1155 --- /dev/null +++ b/doc/COLUMN_SENDER_PLAN.md @@ -0,0 +1,578 @@ +# Column-Major Sender — Implementation Plan + +**Status:** draft, pending approval +**Owner:** TBD +**Audience:** engineers implementing the Rust core, the C FFI, and the +separate Python wrapper repo. + +--- + +## 1. Goal + +Ship a column-major writer that ingests **Pandas and Polars DataFrames into +QuestDB at the maximum throughput the QWP/WebSocket wire allows.** + +That is the whole goal. Every design choice in this plan is justified by +"does it make `df → QuestDB` faster?" Anything else is out of scope. + +**This is a client for an existing server implementing the QWP ingress +(WebSocket) v1 wire specification.** The spec lives at +`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md` +in the documentation repo. Wire framing, column types, null encoding +(bit = 1 NULL, dense values), schema model, symbol delta dictionary, +ack/sequence semantics, and protocol limits are all fixed by the spec. +We invent nothing the spec covers; the design freedom is purely in how +the FFI exposes the wire to Pandas/Polars callers efficiently. + +### Non-goals + +- A generic columnar ingestion library. No Arrow C Data Interface, no + generic column-source traits, no support for "hypothetical other column + formats." If/when those are needed they live above the FFI, in a + language-specific wrapper. +- Replacing the row-major `Sender`/`Buffer` path. The row API stays as-is + for users who think in rows. +- QWP/UDP support. UDP's internal buffer is row-major and unreliable; the + column-major path targets QWP/WS only. +- A Python binding inside this repo. Python lives in its own repo and + consumes the C ABI defined in `COLUMN_SENDER_FFI_ABI.md`. +- New wire-protocol work. The wire format already is column-major. + +--- + +## 2. Why this is a small change to the wire and a big change to the API + +The QWP/WS wire format is **already column-major.** The row-API path +(`Buffer` / `QwpWsColumnarBuffer`) pays per-cell name-lookup and +op-state validation: for 50M rows × 6 columns that's 300M name lookups ++ 300M op-state checks before any actual encoding happens. The +column-major API replaces all of that with **6 bulk appends per chunk ++ 1 encode pass**. + +### 2.1 Decoupled from the existing row encoder + +Performance is the goal; **code reuse is a non-goal**. The column +sender does **not** reuse `QwpWsColumnarBuffer` or the row API's +encoder. It writes a fresh QWP/WS frame directly from pandas/polars- +shaped buffers, via a new `BulkChunk` type and a sibling encoder in a +new module. + +What is shared with the row API is only what *must* stay coherent at +connection scope: + +- `SymbolGlobalDict` (`questdb-rs/src/ingress/buffer/qwp.rs:5041`) — + the connection-scoped symbol intern table the wire requires. +- `SchemaRegistry` (`qwp.rs:5148`) — connection-scoped schema IDs. +- The QWP/WS publisher / driver / WS framing in + `questdb-rs/src/ingress/sender/qwp_ws*.rs` — connection lifecycle, + ack pump, reconnect, FSN tracking. + +What is *not* shared, and may be duplicated verbatim if that's +simplest, is the wire-formatting helper surface: varint writers, type- +byte tables, schema-signature construction. These are stable per the +QWP v1 spec; duplicating costs ~100 lines and removes one layer of +indirection from the hot path. + +### 2.2 Two code paths per type + +For every numeric/fixed-width column, the bulk-append function +branches on validity at the top: + +- **`validity == NULL`** (no nulls): single `extend_from_slice` / + `memcpy` from the caller's buffer into the column's wire-shape + storage. Emit `null_flag = 0x00`. +- **`validity != NULL`**: one pass that (a) inverts the Arrow bitmap + to QWP wire semantics (bit=1 means NULL) and (b) gathers non-null + values densely into the wire buffer. Emit `null_flag != 0x00` and + the bitmap. + +The first path is the common case for pandas/polars numeric columns +and should bottleneck on `memcpy` bandwidth. The second is a tight +loop with a branch on the validity bit, suitable for SIMD where the +types allow. + +--- + +## 3. Architecture + +``` +Python repo (separate) c-questdb-client (this repo) +───────────────────── ───────────────────────────── + Rust core + pandas / polars DataFrame ──┐ + ▼ │ ┌─────────────────────────────┐ + Python wrapper │ C ABI │ QuestDb (pool, shareable) │ + - extract typed buffers ├────────►│ ├─ conn #1 ┐ │ + - extract validity bitmap │ │ ├─ conn #2 ├─ each owns: │ + - extract category codes & │ │ └─ ... │ publisher, │ + dict for symbols │ │ │ SchemaReg, │ + │ │ │ SymbolDict │ + │ │ borrow_sender / return │ + │ │ │ │ + │ │ ▼ │ + │ │ ColumnSender (borrowed) │ + │ │ ├─ new_chunk │ + │ │ ├─ submit (FSN-returning) │ + │ │ └─ await_acked_fsn │ + │ └─────────┬───────────────────┘ + │ + ▼ (BulkChunk encoder, + a new module) + QWP/WS frame → server +``` + +Layering rules: + +- **The C ABI must be expressible as a thin wrapper around typed Rust + slices.** Per-column-append functions take `ptr + len + optional + validity bitmap`. Nothing else. +- **The user thinks `DataFrame → Table`.** One chunk = one table = one + DataFrame = one QWP frame = one FSN. +- **A `QuestDb` is shareable across threads; a borrowed `ColumnSender` + is not.** The pool absorbs the per-connection thread-safety + constraint. + +--- + +## 4. Rust API (public surface) + +New module: `questdb-rs/src/ingress/column_sender/` with submodules +`db.rs`, `sender.rs`, `chunk.rs`, `validity.rs`, `encoder.rs`, +`error.rs`. Re-exported under +`questdb::ingress::column_sender::{QuestDb, ColumnSender, Chunk, Validity}`. + +```rust +/// Connection pool. Shareable across threads. One `QuestDb` per +/// connect string per process (typical usage). +pub struct QuestDb { /* pool of Connection (private) */ } + +impl QuestDb { + /// Open a pool. Eagerly opens `pool_size` connections (default 1). + /// Pool knobs: `pool_size=N` (default 1), `pool_max=M` (default 64), + /// `pool_idle_timeout_ms=T` (default 60000), `pool_reap=auto|manual` + /// (default auto). Plus all standard `qwpws::` keys. + pub fn connect(conf: &str) -> Result; + + /// Borrow a sender. If a previously-returned sender is free, hand + /// it out; else, if pool size < `pool_max`, open a new connection + /// and hand out a sender bound to it; else return InvalidApiCall + /// (fail-fast at cap). + pub fn borrow_sender(&self) -> Result>; + + /// Manually reap idle connections (closes those above `pool_size` + /// idle longer than `pool_idle_timeout_ms`). Returns the count + /// closed. Background reaper does this for you under `pool_reap=auto`. + pub fn reap_idle(&self) -> usize; + + pub fn close(self); +} + +/// Borrowed sender. Returns to the pool on `Drop`. Not `Send`/`Sync` — +/// belongs to the borrowing thread. +pub struct BorrowedSender<'a> { /* borrow handle into QuestDb */ } + +impl<'a> std::ops::Deref for BorrowedSender<'a> { type Target = ColumnSender; … } +impl<'a> std::ops::DerefMut for BorrowedSender<'a> { … } +impl<'a> Drop for BorrowedSender<'a> { … } // returns to pool + +/// Thin handle over a borrowed connection. +pub struct ColumnSender { /* &mut Connection (lifetime-bound) */ } + +impl ColumnSender { + /// Create a chunk for a given table. Doesn't touch the connection + /// — chunks are pure data until submitted. + pub fn new_chunk(&self, table: TableName) -> Chunk; + + /// Submit a chunk: encode → publish → return FSN (= wire `sequence`). + /// Clears the chunk for reuse on success. + pub fn submit(&mut self, chunk: &mut Chunk) -> Result; + + pub fn await_acked_fsn(&mut self, fsn: Fsn, timeout: Duration) -> Result<()>; + pub fn must_close(&self) -> bool; +} + +pub struct Chunk { /* table name + Vec + row_count */ } + +impl Chunk { + /// First call locks `row_count`. All subsequent column appends + /// MUST have the same length (counted in logical rows, not bytes). + + // Numeric columns — zero-copy from contiguous typed slice. + pub fn column_i8 (&mut self, name: ColumnName, data: &[i8 ], v: Option<&Validity>) -> Result<()>; + pub fn column_i16(&mut self, name: ColumnName, data: &[i16], v: Option<&Validity>) -> Result<()>; + pub fn column_i32(&mut self, name: ColumnName, data: &[i32], v: Option<&Validity>) -> Result<()>; + pub fn column_i64(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_f32(&mut self, name: ColumnName, data: &[f32], v: Option<&Validity>) -> Result<()>; + pub fn column_f64(&mut self, name: ColumnName, data: &[f64], v: Option<&Validity>) -> Result<()>; + pub fn column_bool(&mut self, name: ColumnName, data: &[u8] /* arrow bitmap */, v: Option<&Validity>) -> Result<()>; + + // Fixed-width binary columns. + pub fn column_uuid (&mut self, name: ColumnName, data: &[[u8;16]], v: Option<&Validity>) -> Result<()>; + pub fn column_long256(&mut self, name: ColumnName, data: &[[u8;32]], v: Option<&Validity>) -> Result<()>; + pub fn column_ipv4 (&mut self, name: ColumnName, data: &[u32], v: Option<&Validity>) -> Result<()>; + + // Time columns. + pub fn column_ts_nanos (&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_ts_micros(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_date_millis(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + + // Variable-width text — QWP has exactly one text type, VARCHAR + // (wire 0x0F, uint32 offsets). The older STRING (0x08) was + // removed from the spec. + // Input is Arrow Utf8 shape: i32 offsets + bytes; library + // compresses to dense uint32-offset layout on the wire. + pub fn column_varchar(&mut self, name: ColumnName, offsets: &[i32], data: &[u8], v: Option<&Validity>) -> Result<()>; + + // Symbol fast path: dictionary-encoded. + // `codes` are per-row indices into `dict_offsets`/`dict_data` (Arrow Utf8). + // The implementation interns the dict against SymbolGlobalDict once + // and remaps codes in bulk — no per-row HashMap probe. + pub fn symbol_dict_i8 (&mut self, name: ColumnName, codes: &[i8 ], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + pub fn symbol_dict_i16(&mut self, name: ColumnName, codes: &[i16], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + pub fn symbol_dict_i32(&mut self, name: ColumnName, codes: &[i32], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + + // Designated timestamp (required, exactly once per chunk; pick one). + // Emitted on the wire as an empty-name column of type + // TIMESTAMP (0x0A) for micros, TIMESTAMP_NANOS (0x10) for nanos. + pub fn designated_timestamp_micros(&mut self, data: &[i64]) -> Result<()>; + pub fn designated_timestamp_nanos (&mut self, data: &[i64]) -> Result<()>; + + // Lifecycle. + pub fn row_count(&self) -> usize; + pub fn clear(&mut self); // retains capacity for reuse +} + +/// Validity bitmap. Public API accepts **Arrow semantics** +/// (bit = 1 means valid, LSB-first within each byte) to enable +/// zero-copy from PyArrow / Polars / Pandas buffers. Length in bits +/// must equal the chunk's row_count. +/// +/// The QWP wire uses the inverted semantics (bit = 1 means NULL) and +/// dense data (only non-null values). The library inverts the bitmap +/// and gathers when encoding; callers never construct QWP-shaped +/// input. +pub struct Validity<'a> { bits: &'a [u8] } +impl<'a> Validity<'a> { + pub fn from_bitmap(bits: &'a [u8], bit_len: usize) -> Result; +} +``` + +### What `column_*` does internally + +1. Validate name (or skip when `ColumnName` already validated). +2. Look up or create the column slot in the chunk's `Vec`. + **Once per column per chunk, not per row.** +3. Append data to the column's storage: + - For numeric/fixed-width columns where the chunk's internal storage + is `Vec` of the same `T`, this is a single `Vec::extend_from_slice`. + - For columns with null-bitmap representation, also OR the validity + bitmap into the column's null bitmap (bulk, byte-aligned where + possible). +4. Bump the per-column row counter; assert it matches `chunk.row_count`. + +### Symbol bulk-intern + +The expensive part of symbol handling today is per-row +`SymbolGlobalDict::intern` (qwp.rs:5041). The fast path: + +1. Walk `dict_offsets`/`dict_data` once: build a small + `Vec` of length `dict_len` mapping each dict entry's local + index → global id (one `intern()` per *unique* symbol value, not per + row). +2. Walk `codes` once, writing the mapped global ids into the column's + storage — a tight loop, branch-predictable, ~1ns/row. + +For a 10M-row symbol column with cardinality 1000, this drops from 10M +HashMap probes to 1000. + +--- + +## 5. Workstreams + +Designed so multiple engineers can work in parallel after WS-0 + WS-1 +land. + +### WS-0 — QuestDb pool, sender borrow, idle reaper (blocking dependency) + +- Create `questdb-rs/src/ingress/column_sender/db.rs` with the pool + type, eagerly opening `pool_size` connections at `connect()`. +- Connect-string parsing: lift the existing `qwpws::` parser; add + `pool_size` (default 1), `pool_max` (default 64), + `pool_idle_timeout_ms` (default 60000), `pool_reap` + (`auto`|`manual`, default `auto`). Reject configs with + `pool_size > pool_max`. +- `borrow_sender()` semantics: pull from free list if any; else if + pool size < `pool_max`, open a new connection; else return + `InvalidApiCall` (fail-fast). +- `BorrowedSender<'_>` returns the connection to the pool on `Drop` + with a `last_idle_at = Instant::now()` stamp. If + `must_close()` is true on return, drop the connection. +- **Idle reaper.** Under `pool_reap=auto`, the pool spawns one + background `std::thread` on `connect`. The thread wakes on a ticker + (~5s or `pool_idle_timeout_ms / 12`, whichever is larger), scans the + free list, closes connections idle longer than + `pool_idle_timeout_ms`, **never shrinking below `pool_size`**. The + thread is joined on `close()`. Manual mode skips the thread entirely; + `db.reap_idle()` runs the same scan on demand and is exposed on + the FFI. +- Thread-safety: the pool's internal state (free list, total count, + per-connection idle stamp) is guarded by a `Mutex`. Borrow/return/ + reap/close are all safe concurrent. +- Owner: 1 engineer. +- Done when: + - multi-thread test borrows and returns N senders concurrently + without deadlock or leak, + - pool fails-fast at `pool_max`, + - idle reaper (auto and manual) closes excess connections after the + timeout while keeping `pool_size` warm, + - `close()` joins the reaper cleanly. + +### WS-1 — `ColumnSender` thin handle & wire-side submit plumbing + +- Define `ColumnSender` as a `&mut Connection` lifetime-bound borrow + handle. Implement `submit(chunk)` that calls the new encoder + (WS-2/3/4) and hands the encoded frame to the existing publisher + (`questdb-rs/src/ingress/sender/qwp_ws_publisher.rs`). +- Hook up FSN return, `await_acked_fsn`, `must_close`. +- Stub `submit()` for an empty chunk that produces a header-only QWP + frame end-to-end (no columns; pure framing) and the server accepts. +- Owner: 1 engineer. +- Depends on: WS-0. +- Done when: empty-chunk submit round-trips against a real server and + the FSN is acked. + +### WS-2 — `Chunk`, `BulkChunk` encoder, numeric/fixed-width columns + +- Define `Chunk` (caller-owned, table-bound) and the internal + `BulkChunk` wire-shape storage: per-column `Vec` already in QWP + wire layout (dense values + optional null bitmap with QWP + semantics) so encode is a header + `extend_from_slice` per column. +- Implement the **two code paths per type** (see §2.2): no-null + fast-memcpy; nullable invert+gather. Both produce identical + on-wire shape modulo the null_flag byte. +- Implement `column_i8`/`i16`/`i32`/`i64`/`f32`/`f64`/`bool`/`uuid`/ + `long256`/`ipv4`/`ts_nanos`/`ts_micros`/`date_millis` + + `designated_timestamp_micros` + `designated_timestamp_nanos`. +- Implement `Validity` (Arrow-shape in: 1=valid, LSB-first). Library + masks trailing bits beyond row_count. +- Implement the table-header + schema-section emit. Schema interning + goes through the existing connection-shared `SchemaRegistry`. +- Owner: 1 engineer. +- Depends on: WS-1. +- Done when: round-trip test for each type passes against a real + server and a benchmark shows the per-row cost is dominated by + memcpy bandwidth, not API overhead. + +### WS-3 — VARCHAR column + +- Implement `column_varchar`. Input is Arrow Utf8 shape (i32 offsets + + bytes). Wire output is dense (only non-null) with uint32 offsets per + QWP spec §VARCHAR. +- Two code paths per §2.2: + - No-null: copy all `row_count + 1` offsets unchanged (caller's i32 + fits trivially in wire u32) + copy the full byte buffer. + - Nullable: walk validity bitmap; for each non-null row, compute + `slice_len = offsets[i+1] − offsets[i]`, append dense offsets and + bytes for that slice. **Skip slicing for null rows** — do not + trust caller's offset values for null rows. +- UTF-8 is trusted; server rejects invalid UTF-8 with PARSE_ERROR. +- Owner: 1 engineer. +- Depends on: WS-1, +reads WS-2's `Chunk` shape. +- Done when: round-trip + null handling test passes; benchmark within + ~2× of f64 column throughput for short strings (varchar is + fundamentally variable-width so equal-throughput is unrealistic). + +### WS-4 — Symbol bulk-intern fast path + +- Implement `symbol_dict_{i8,i16,i32}`. +- Share the connection-scoped `SymbolGlobalDict` (qwp.rs:5041). New + code interns through it; emits the new symbols in the delta-dict + prefix of the QWP frame. +- **Intern only referenced dict entries.** Pandas/polars `Categorical` + carries every category ever observed (often 100k+) but a typical + chunk references a small subset. The implementation: + 1. One pass over `codes` to mark referenced dict indices in a + bitset (sized `dict_len`). + 2. One pass over the bitset: intern each referenced dict entry, + build a `Vec` of length `dict_len` mapping local → global + (unreferenced slots get `u64::MAX` sentinel). + 3. One pass over `codes` writing global IDs into the wire buffer. + This protects the 1M-per-connection wire limit and avoids + polluting `SymbolGlobalDict` with never-sent values. +- Validate codes are in `0..dict_len` for non-null rows; out-of-range + is `InvalidApiCall`. Codes for null rows are not inspected. +- Owner: 1 engineer. +- Depends on: WS-1; can develop in parallel with WS-2/3. +- Done when: 10M-row × 1000-card benchmark shows symbol throughput + within 2× of f64 throughput (today, symbol throughput is much worse). + +### WS-5 — C FFI surface + +- Implement the ABI defined in `COLUMN_SENDER_FFI_ABI.md`. Two FFI + namespaces: + - `questdb_db_*` — pool/borrow (`connect`, `close`, `borrow_sender`, + `return_sender`). Lands once WS-0 lands. + - `column_sender_chunk_*` + `column_sender_submit` / + `_await_acked_fsn` — chunk fill and submit. Each column function + ships the moment its Rust counterpart lands. +- Code lives in `questdb-rs-ffi/src/column_sender.rs`, re-exported from + `lib.rs`. +- Header lives at `include/questdb/ingress/column_sender.h`. Defer the + `.hpp` until someone needs a C++ wrapper — the Python wrapper does + not. +- `cbindgen.toml` updates if the column sender is exposed by cbindgen. +- Owner: 1 engineer. +- Depends on: WS-0/2/3/4 land in parallel. +- Done when: a C test program (in `cpp_test/` or `system_test/`) opens + a pool, borrows a sender, submits a chunk, returns the sender, and + the server stores the rows. + +### WS-6 — Benchmarks & soak tests + +- Microbench (Criterion in `questdb-rs/benches/`): + - per-column bulk append, vs the row-API equivalent, vs raw memcpy + baseline, for each type; + - symbol intern (dict path) vs per-row symbol intern (row API); + - end-to-end "10M rows × N columns" chunk submit (in-memory, no + network), to measure pure encoder + populate cost. +- End-to-end throughput test against a local QuestDB: Pandas DataFrame + → submit → ack, varying row counts, column counts, dtypes. Report + GB/s in and rows/s. +- Soak: 1-hour run sending random chunks; assert no leaks, no + reconnects, latched-error handling works. +- Owner: 1 engineer. +- Depends on: WS-2 minimum. +- Done when: benchmark numbers documented in `doc/DEV_NOTES.md` or a + new `doc/COLUMN_SENDER_PERF.md`. + +### WS-7 — Python repo coordination (out-of-tree, tracked here) + +- The Python repo wraps `column_sender.h`. The Python repo's agent + works from `COLUMN_SENDER_FFI_ABI.md` alone. +- Python repo TODOs (tracked there, listed here for visibility): + - Build a thin ctypes/cffi/pyo3 wrapper around the C ABI. + - For Pandas: extract numpy buffers per column via `Series.to_numpy()` + (zero-copy for native dtypes), build validity bitmaps from + `Series.isna()` (LSB-first packing — provide a vectorised helper). + - For Polars: extract Arrow buffers via `Series.to_arrow()`; the + Arrow buffer pointers and validity bitmaps go straight to FFI. + - For Pandas `Categorical` / Polars `Categorical`: use + `symbol_dict_*`. + - Document the slow paths (object-dtype strings, mixed dtypes, + extension types) and the fallbacks (materialise to a contiguous + typed array). + +--- + +## 6. Type mapping reference + +| QWP wire type | Rust API | Pandas dtype | Polars / Arrow dtype | FFI shape | +|---------------|--------------------|------------------------------|----------------------------|--------------------------| +| BOOL | `column_bool` | `bool` (numpy) | `Boolean` (Arrow bitmap) | `uint8_t*` (bitmap) | +| BYTE | `column_i8` | `int8` | `Int8` | `int8_t*` | +| SHORT | `column_i16` | `int16` | `Int16` | `int16_t*` | +| INT | `column_i32` | `int32` | `Int32` | `int32_t*` | +| LONG | `column_i64` | `int64` | `Int64` | `int64_t*` | +| FLOAT | `column_f32` | `float32` | `Float32` | `float*` | +| DOUBLE | `column_f64` | `float64` | `Float64` | `double*` | +| VARCHAR | `column_varchar` | `string` / object (fallback) | `Utf8` (Polars `LargeUtf8` → wrapper splits) | `int32_t*` + `uint8_t*` | +| SYMBOL | `symbol_dict_iN` | `Categorical` | `Categorical` / Dict | codes + dict offsets+bytes | +| TIMESTAMP | `column_ts_nanos`/`_micros` | `datetime64[ns]`/`[us]` | `Datetime(ns/us)` | `int64_t*` | +| DATE | `column_date_millis` | `datetime64[ms]` | `Date` (after cast) | `int64_t*` | +| UUID | `column_uuid` | bytes (no native) | Arrow `FixedSizeBinary(16)`| `uint8_t*` (16N) | +| IPV4 | `column_ipv4` | uint32 (no native) | `UInt32` | `uint32_t*` | +| LONG256 | `column_long256` | bytes (no native) | Arrow `FixedSizeBinary(32)`| `uint8_t*` (32N) | + +**Out of v1 scope:** `DECIMAL64/128/256`, `LONG_ARRAY`, `DOUBLE_ARRAY`, +`GEOHASH`, `CHAR`, `BINARY`. Add in a follow-up milestone driven by +actual user demand from the Python wrapper. + +--- + +## 7. Threading & error model (inherited) + +- One `ColumnSender` is bound to one connection. Not `Sync`. Use + multiple senders for parallel ingestion. +- `Chunk` is owned by one thread. After `submit`, the chunk can be + cleared and reused. +- Error model is identical to the existing QWP/WS sender (see + `questdb-rs/src/ingress/mod.md` §"QWP/WebSocket"): drop-and-continue + vs halt; `must_close()`; FSN ack semantics. +- The Java client (`../java-questdb-client`, see memory + [[reference-java-questdb-client]]) is the posture reference for + parser-vs-writer trust split. The column-major API is the *writer* + side — it trusts its caller and panics nowhere + (memory [[feedback-client-no-panic]]). + +--- + +## 8. Decisions log + +All architectural decisions are locked. Anyone implementing should +flag a deviation rather than re-litigate silently. + +### Settled by the QWP/WS v1 spec (non-negotiable) + +- Wire framing, column type codes, schema model, sequence numbering, + symbol delta-dictionary, durable-ack opt-in, version negotiation, + protocol limits. +- Null encoding on the wire: bit = 1 means NULL, LSB-first; data after + the bitmap is dense. Internal encoder matches; FFI exposes the + inverted (Arrow-style) semantics for zero-copy from Pandas/Polars + and does the invert+gather internally. +- Wire is contiguous-per-column; strided input is the wrapper's + problem. +- UTF-8 validation: server enforces; we trust by default. +- Text type: VARCHAR only (`0x0F`, uint32 offsets). STRING is gone. +- Designated timestamp: empty-name column of type TIMESTAMP (`0x0A`, + µs) or TIMESTAMP_NANOS (`0x10`, ns). +- DATE on ingress is plain int64. +- FSN = wire `sequence` / `wireSeq`. + +### Settled by user direction + +- **API shape:** new top-level types, separate from `Buffer`/`Sender`. + Naming: `QuestDb`, `ColumnSender`, `Chunk`, `Validity`. +- **Mental model:** `DataFrame → Table`. One chunk = one table = one + DataFrame = one QWP frame = one FSN. +- **Connection layer:** pool (`QuestDb::connect`), borrow/return + (`db.borrow_sender()` → drop returns to pool). Defaults: + `pool_size=1`, `pool_max=64`, `pool_idle_timeout_ms=60000`. Eager + open at connect, auto-grow on exhaustion, fail-fast at cap. +- **Idle shrinking:** Rust-side background reaper per pool + (`pool_reap=auto`, default) closes excess-over-`pool_size` + connections after `pool_idle_timeout_ms` idle. Manual mode + (`pool_reap=manual`) disables the thread; `db.reap_idle()` / + `questdb_db_reap_idle()` exposed for caller-driven reaping. Reaper + lives in Rust so every binding (C/C++/Python) inherits the + behaviour without re-implementing. +- **Encoder:** fresh `BulkChunk` encoder, no reuse of + `QwpWsColumnarBuffer` or row-API encoder. Shares only connection- + scoped state (`SymbolGlobalDict`, `SchemaRegistry`, publisher). + Code reuse is a non-goal; perf is the goal. +- **Two code paths per type:** no-null = `memcpy`; nullable = invert + + gather in one pass. +- **Symbol intern:** scan codes first, intern only referenced dict + entries. +- **Validity trailing bits:** library masks; caller need not zero. +- **VARCHAR null offsets:** library skips slicing; caller's value for + null rows is ignored. +- **FFI:** raw pointers per column. No Arrow C Data Interface, no + strides, no generic column-source traits. +- **Python:** lives in a separate repo; this repo provides the C ABI. + +### Out of v1 scope (deferred) + +- Multi-table-per-frame batching at the API. Wire supports it; v1 API + is one chunk = one table. Revisit if the Python wrapper has a + multi-table use case. +- DECIMAL64/128/256. Wire is defined (1-byte column-wide scale + + dense unscaled ints). Defer until Polars-decimal demand surfaces. +- `LONG_ARRAY` / `DOUBLE_ARRAY` per-row, `GEOHASH`, `CHAR`, `BINARY`. +- C++ header wrapper (`column_sender.hpp`). Python wrapper does not + need it. +- Durable-ack callback API. Connect-string opt-in + (`X-QWP-Request-Durable-Ack: true` via `qwp_durable_ack=on`) is + surfaced; the OK fast path is what the throughput target cares + about. + From 15f4c02b46c5e8bcdea565836d659e8681560287 Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 02:10:07 +0100 Subject: [PATCH 2/9] docs(ingress): sync flush with ack_level, refuse sf_dir in v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Locks the column-sender API around synchronous flush: sender.flush(&mut chunk, ack_level) blocks until the requested ACK level (Ok = WAL commit, Durable = object-store via Enterprise opt-in). Drops the FSN/submit/await split from the FFI; at most one frame in flight per sender, parallelism via the pool. Refuses sf_dir and other sf_* keys at QuestDb::connect with ConfigError — store-and-forward is single-writer-per-slot and interacts awkwardly with pool auto-grow; row-major Sender remains the SF path. Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/COLUMN_SENDER_FFI_ABI.md | 109 +++++++++++++++++++---------------- doc/COLUMN_SENDER_PLAN.md | 83 +++++++++++++++++++------- 2 files changed, 123 insertions(+), 69 deletions(-) diff --git a/doc/COLUMN_SENDER_FFI_ABI.md b/doc/COLUMN_SENDER_FFI_ABI.md index f8b6ecc3..5d2b81ce 100644 --- a/doc/COLUMN_SENDER_FFI_ABI.md +++ b/doc/COLUMN_SENDER_FFI_ABI.md @@ -199,9 +199,18 @@ return per work unit (or per thread). | `pool_reap` | `auto` | `auto` — pool spawns a background thread that periodically reaps idle connections per `pool_idle_timeout_ms`. `manual` — no background thread; caller invokes `questdb_db_reap_idle` on its own cadence. | All other connect-string keys are inherited from the existing -`qwpws::` configuration (auth, TLS, `auth_timeout_ms`, retry, store- -and-forward, durable-ack opt-in, etc.). See `doc/CONSIDERATIONS.md` -and the row-API connect-string reference. +`qwpws::` configuration (auth, TLS, `auth_timeout_ms`, retry, +durable-ack opt-in, etc.). See `doc/CONSIDERATIONS.md` and the +row-API connect-string reference. + +**Not accepted in v1:** `sf_dir` and the other `sf_*` store-and- +forward keys (`sender_id`, `sf_max_bytes`, `sf_max_total_bytes`, +`sf_durability`, `sf_append_deadline_millis`). Passing any of them to +`questdb_db_connect` returns `line_sender_error_config_error` with a +message pointing to the row-major `line_sender` API for users who +need SF semantics. SF is fundamentally single-writer per slot and +interacts awkwardly with the pool's auto-grow; revisit only if a +real user needs both throughput and on-disk durability. Validity: `pool_size <= pool_max` must hold; otherwise `questdb_db_connect` returns `line_sender_error_config_error`. @@ -663,61 +672,63 @@ per row.) --- -## 11. Submit +## 11. Flush (synchronous) ```c /** - * Encode the chunk into a QWP/WebSocket frame and publish it. On - * success the chunk is cleared (row count → 0, allocations retained) - * and can be reused. - * - * If fsn_out != NULL, the frame's assigned sequence number is written - * to *fsn_out on success. This value is the QWP wire `sequence` field - * (spec §Sequence numbering): a per-connection counter starting at 0, - * server-assigned by counting inbound frames. The existing Rust API - * calls it "FSN" (frame sequence number) — the two terms are - * interchangeable. - * - * Use column_sender_await_acked_fsn to block until the server acks it. - * - * On failure, the chunk is left untouched so the caller can recover - * its contents (e.g. write to local fallback storage) before freeing. - * - * Back-pressure: the wire allows at most 128 in-flight (unacked) - * batches. When the in-flight queue is full, submit blocks until an - * ack frees a slot, or returns an error if the deadline configured on - * the sender elapses first. + * Acknowledgement level the flush waits for. */ -QUESTDB_CLIENT_API -bool column_sender_submit( - column_sender* sender, - column_sender_chunk* chunk, - uint64_t* fsn_out, - line_sender_error** err_out); +typedef enum column_sender_ack_level +{ + /** Wait for the server's WAL-commit ACK (spec status 0x00). + Always available. */ + column_sender_ack_level_ok = 0, + + /** Wait for the server's object-store durability ACK + (spec status 0x02). Enterprise only. Requires the pool to be + opened with `request_durable_ack=on` in the connect string + (and the server's 101 response confirming + `X-QWP-Durable-Ack: enabled`). If the connection did not opt + in, flush returns line_sender_error_invalid_api_call. */ + column_sender_ack_level_durable = 1, +} column_sender_ack_level; /** - * Block until the server has durably acknowledged the given FSN, or - * until the timeout elapses. + * Encode the chunk into a QWP/WebSocket frame, publish it, and block + * until the server acknowledges at the requested `ack_level`. Returns + * true once the ACK is received; the chunk is then cleared (row count + * → 0, allocations retained) and can be reused for the next DataFrame. + * + * Synchronous semantics: at most one frame in flight per sender. For + * parallel ingest, borrow multiple senders from the pool — one per + * thread — and flush concurrently. The 128-in-flight wire cap is + * never reached. * - * timeout_millis = 0 means non-blocking poll. + * Ack level semantics: + * - `ok` — returns when the server has written the batch to its WAL. + * - `durable` — returns when the WAL segment is durably uploaded to + * the configured object store. Strictly later than the OK + * watermark; can be significantly later under upload pressure. * - * Returns true if acked within the deadline, false otherwise. On - * unrecoverable error sets *err_out. + * On any failure (server rejection, transport error, latched-error + * sender, or `durable` requested without opt-in), returns false and + * sets *err_out. The chunk is left untouched so the caller can + * inspect or recover its contents before freeing. + * + * Flush blocks until ack or until the underlying connection enters a + * terminal failure state (must_close() becomes true). Transient + * disconnects are absorbed by the existing reconnect machinery. No + * separate per-call timeout in v1; if you need one, file a request. + * + * The QWP wire `sequence` (FSN) is tracked internally and is not + * exposed at the FFI — synchronous flush makes it unnecessary. */ QUESTDB_CLIENT_API -bool column_sender_await_acked_fsn( +bool column_sender_flush( column_sender* sender, - uint64_t fsn, - uint64_t timeout_millis, + column_sender_chunk* chunk, + column_sender_ack_level ack_level, line_sender_error** err_out); - -/** - * Non-blocking poll of progress counters. - */ -QUESTDB_CLIENT_API -uint64_t column_sender_published_fsn(const column_sender* sender); -QUESTDB_CLIENT_API -uint64_t column_sender_acked_fsn(const column_sender* sender); ``` --- @@ -767,9 +778,9 @@ int send_one_chunk(questdb_db* db) { if (!column_sender_chunk_designated_timestamp_nanos( chunk, timestamps_ns, 3, &err)) goto fail; - uint64_t fsn = 0; - if (!column_sender_submit(sender, chunk, &fsn, &err)) goto fail; - if (!column_sender_await_acked_fsn(sender, fsn, 5000, &err)) goto fail; + if (!column_sender_flush( + sender, chunk, column_sender_ack_level_ok, &err)) goto fail; + /* flush returned: server has WAL-committed; chunk cleared & reusable */ column_sender_chunk_free(chunk); questdb_db_return_sender(db, sender); diff --git a/doc/COLUMN_SENDER_PLAN.md b/doc/COLUMN_SENDER_PLAN.md index 10bf1155..5b425238 100644 --- a/doc/COLUMN_SENDER_PLAN.md +++ b/doc/COLUMN_SENDER_PLAN.md @@ -112,8 +112,8 @@ Python repo (separate) c-questdb-client (this repo) │ │ ▼ │ │ │ ColumnSender (borrowed) │ │ │ ├─ new_chunk │ - │ │ ├─ submit (FSN-returning) │ - │ │ └─ await_acked_fsn │ + │ │ └─ flush (sync, blocks │ + │ │ until server ACK) │ │ └─────────┬───────────────────┘ │ ▼ (BulkChunk encoder, @@ -180,17 +180,40 @@ pub struct ColumnSender { /* &mut Connection (lifetime-bound) */ } impl ColumnSender { /// Create a chunk for a given table. Doesn't touch the connection - /// — chunks are pure data until submitted. + /// — chunks are pure data until flushed. pub fn new_chunk(&self, table: TableName) -> Chunk; - /// Submit a chunk: encode → publish → return FSN (= wire `sequence`). - /// Clears the chunk for reuse on success. - pub fn submit(&mut self, chunk: &mut Chunk) -> Result; + /// Synchronously flush a chunk: encode → publish → block until the + /// server ACK at the requested level arrives. On success the chunk + /// is cleared (allocations retained) ready for the next DataFrame. + /// On failure the chunk is left untouched. + /// + /// `ack_level`: + /// - `AckLevel::Ok` — wait for WAL-commit ACK (spec status `0x00`). + /// Always available. + /// - `AckLevel::Durable` — wait for object-store durability ACK + /// (spec status `0x02`). Enterprise feature; requires the pool + /// to be opened with `request_durable_ack=on` in the connect + /// string. If the connection did not opt in, returns + /// `InvalidApiCall`. + /// + /// At most one frame in flight per sender; for parallel ingest, + /// borrow multiple senders from the `QuestDb` pool. + pub fn flush(&mut self, chunk: &mut Chunk, ack_level: AckLevel) -> Result<()>; - pub fn await_acked_fsn(&mut self, fsn: Fsn, timeout: Duration) -> Result<()>; pub fn must_close(&self) -> bool; } +#[derive(Clone, Copy, Debug, Default)] +pub enum AckLevel { + /// Server's WAL commit (spec status `0x00`). Always available. + #[default] + Ok, + /// Server's object-store durability (spec status `0x02`). + /// Enterprise + requires durable-ack opt-in at connect. + Durable, +} + pub struct Chunk { /* table name + Vec + row_count */ } impl Chunk { @@ -327,19 +350,26 @@ land. timeout while keeping `pool_size` warm, - `close()` joins the reaper cleanly. -### WS-1 — `ColumnSender` thin handle & wire-side submit plumbing +### WS-1 — `ColumnSender` thin handle & synchronous flush plumbing - Define `ColumnSender` as a `&mut Connection` lifetime-bound borrow - handle. Implement `submit(chunk)` that calls the new encoder - (WS-2/3/4) and hands the encoded frame to the existing publisher - (`questdb-rs/src/ingress/sender/qwp_ws_publisher.rs`). -- Hook up FSN return, `await_acked_fsn`, `must_close`. -- Stub `submit()` for an empty chunk that produces a header-only QWP - frame end-to-end (no columns; pure framing) and the server accepts. + handle. Implement `flush(chunk)` that calls the new encoder + (WS-2/3/4), hands the encoded frame to the existing publisher + (`questdb-rs/src/ingress/sender/qwp_ws_publisher.rs`), and blocks + until the server ACK arrives. +- Internally the publisher still tracks the wire `sequence` (FSN); + `flush` waits on that FSN. FSN is not exposed at the public API. +- Hook up `must_close`. +- Refuse `sf_dir` (and other `sf_*` keys) at `QuestDb::connect`-time + with `ConfigError`. Update WS-0's connect-string parser + accordingly. +- Stub `flush()` on an empty chunk: produces a header-only QWP frame + end-to-end (no columns; pure framing), server accepts and ACKs. - Owner: 1 engineer. - Depends on: WS-0. -- Done when: empty-chunk submit round-trips against a real server and - the FSN is acked. +- Done when: empty-chunk `flush` round-trips against a real server and + returns on ACK; `sf_dir` in the connect string is rejected with a + clear error. ### WS-2 — `Chunk`, `BulkChunk` encoder, numeric/fixed-width columns @@ -535,6 +565,21 @@ flag a deviation rather than re-litigate silently. Naming: `QuestDb`, `ColumnSender`, `Chunk`, `Validity`. - **Mental model:** `DataFrame → Table`. One chunk = one table = one DataFrame = one QWP frame = one FSN. +- **Send is synchronous.** `sender.flush(&mut chunk, ack_level)` + blocks until the server ACK at the requested level arrives. Two + levels: `Ok` (WAL commit, always available) and `Durable` + (object-store durability — Enterprise; requires durable-ack opt-in + at connect). At most one frame in flight per sender. Parallelism is + expressed by borrowing multiple senders from the pool, one per + thread. The wire's 128-in-flight cap is never reached. The QWP + `sequence` / FSN is tracked internally and not exposed at the API + or FFI surface. +- **Store-and-forward (`sf_dir`) is refused in v1.** Passing `sf_dir` + or any other `sf_*` key to `QuestDb::connect` returns `ConfigError`. + SF is single-writer per slot and interacts awkwardly with pool + auto-grow. Users who need on-disk durability across crashes can use + the existing row-major `Sender` API. Revisit if a real user needs + both throughput and SF. - **Connection layer:** pool (`QuestDb::connect`), borrow/return (`db.borrow_sender()` → drop returns to pool). Defaults: `pool_size=1`, `pool_max=64`, `pool_idle_timeout_ms=60000`. Eager @@ -571,8 +616,6 @@ flag a deviation rather than re-litigate silently. - `LONG_ARRAY` / `DOUBLE_ARRAY` per-row, `GEOHASH`, `CHAR`, `BINARY`. - C++ header wrapper (`column_sender.hpp`). Python wrapper does not need it. -- Durable-ack callback API. Connect-string opt-in - (`X-QWP-Request-Durable-Ack: true` via `qwp_durable_ack=on`) is - surfaced; the OK fast path is what the throughput target cares - about. +- (Removed in this revision: durable-ack as deferred. See settled + decisions for ack-level handling.) From c7407b0a58b88cf5887a30c05bc38b60e456d341 Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 17:30:47 +0100 Subject: [PATCH 3/9] feat(ingress): column-major sender for QWP/WebSocket (WS-0..WS-6) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lands the Rust core, C ABI, and benchmarks for a column-major sender targeting Pandas/Polars → QuestDB throughput over QWP/WebSocket. See `doc/COLUMN_SENDER_PLAN.md` for the design and `doc/COLUMN_SENDER_FFI_ABI.md` for the C ABI spec; both shipped in earlier commits on this branch. # What's in the box * **WS-0 — `QuestDb` pool** (`ingress/column_sender/db.rs`, `ingress/column_sender/conf.rs`). Thread-safe pool with eager-open, fail-fast at `pool_max`, `BorrowedSender<'a>` that returns on `Drop`, and a background reaper (`pool_reap=auto`, tick = max(5 s, idle_timeout / 12)) that closes excess-over-`pool_size` connections. New conf keys: `pool_size`, `pool_max`, `pool_idle_timeout_ms`, `pool_reap`. `sf_*` / `sender_id` / `qwp_ws_progress=manual` refused at `connect`-time. * **WS-1 — synchronous `flush` plumbing** (`ingress/column_sender/sender.rs`, `ingress/column_sender/encoder.rs`). `ColumnSender::flush(chunk, AckLevel)` encodes the chunk, publishes via the existing QWP/WS replay queue (`Sender::qwp_ws_publish_raw` — pub(crate) escape hatch in the row-API sender), and blocks until the ACK watermark crosses the published FSN. Polls in 50 ms slices so a `must_close` mid-wait surfaces promptly. `AckLevel::Durable` requires `request_durable_ack=on` at connect or returns `InvalidApiCall`. * **WS-2 — `Chunk` + numeric / fixed-width columns** (`ingress/column_sender/chunk.rs`, `validity.rs`, `wire.rs`). Per-column wire-shape `Vec` storage so encode is a header + `extend_from_slice` per column. Two code paths per type per the plan §2.2: - Bool, i8, i16, i32, i64, f32, f64: `null_flag = 0` always; nullable rows sentinel-encoded (0 / i32::MIN / i64::MIN / NaN), matching the row-API convention. - Sparse-null types (uuid, long256, ipv4, ts_nanos, ts_micros, date_millis): no-null = `extend_from_slice`; nullable = QWP-shape bitmap + dense gather. - Designated timestamp (micros or nanos) — exactly one per chunk. Connection-scoped `SchemaRegistry`: first emit → FULL; repeat → REFERENCE. * **WS-3 — VARCHAR** (`Chunk::column_varchar`). Arrow Utf8 in (`offsets: &[i32]` length `row_count + 1`, `bytes: &[u8]`); wire out is dense `non_null_count + 1` LE-u32 offsets + concatenated bytes. No-null path memcpys offsets when `offsets[0] == 0`; nullable path walks validity and skips slicing for null rows. Offset validation (negative / non-monotonic / past `bytes_len`) caught client-side. * **WS-4 — symbol bulk-intern** (`Chunk::symbol_dict_{i8,i16,i32}`, `encoder::resolve_symbols`). Three append-time passes: referenced-bitset + range check; compact referenced dict bytes; translate codes to internal indices and build the QWP-shape bitmap. Connection-scoped `SymbolGlobalDict` shared with the row API's type (`buffer/qwp.rs:next_id/intern/entry` promoted to `pub(crate)`). At flush time, only entries the chunk actually references reach the wire — protects the 1M-per-connection cap on huge Pandas `Categorical` dicts. Roll-back on encode error keeps client + server dict views coherent. * **WS-5 — C ABI** (`questdb-rs-ffi/src/column_sender.rs`, `include/questdb/ingress/column_sender.h`). Full implementation of `doc/COLUMN_SENDER_FFI_ABI.md`: - Opaque handles `questdb_db`, `column_sender`, `column_sender_chunk`. - `column_sender_validity` repr-C struct; `column_sender_ack_level` repr-C enum. - `questdb_db_connect/close/borrow_sender/return_sender/reap_idle`. - Every chunk column-append, the VARCHAR + symbol_dict family, the two designated-timestamp variants, and `column_sender_flush`. - Errors reuse `line_sender_error*`. Rust side gains `OwnedSender` — Arc-backed borrow handle the FFI hands out as `column_sender*` so the C caller can free `questdb_db*` before all borrows return without dangling. Hand-runnable smoke test at `cpp_test/smoke_column_sender.c` (compiles with `-Wall -Wextra -Werror`; not wired into CMake yet — matches the `smoke_line_reader` pattern). * **WS-6 — bench** (`questdb-rs/benches/column_sender.rs`, `doc/COLUMN_SENDER_PERF.md`). Three families: per-column append vs raw memcpy baseline; symbol bulk-intern vs naïve per-row HashMap; encode_chunk end-to-end (no network). First-baseline numbers (Apple Silicon laptop, 100k rows): - `column_f64/column_sender_no_null` ≈ 55 GiB/s — matches memcpy. - `column_i64/column_sender_no_null` ≈ 54 GiB/s — matches memcpy. - `column_varchar/column_sender_no_null` within ~5 % of memcpy. - Symbol bulk-intern ~16× faster than naïve per-row HashMap. - `encode_chunk/populate_plus_encode` ≈ 139 M rows/s end-to-end. # Verification - 57 column-sender tests (Rust core); 8 FFI tests; full 834-test lib suite passes. - `cargo fmt` + `cargo clippy --tests --benches` clean on both crates. - `cargo doc` introduces no new warnings. - `cc -std=c11 -Wall -Wextra -Werror -I include` compiles the C header and the smoke program. # What's not in here - WS-7 (Python wrapper) lives in `py-questdb-client`. With the C ABI in `include/questdb/ingress/column_sender.h` and the FFI symbols in `libquestdb_client`, that repo can now start consuming. - A live Pandas→QuestDB end-to-end bench and 1-hour soak — both belong in the Python repo / nightly CI rather than the in-tree Criterion suite. Co-Authored-By: Claude Opus 4.7 (1M context) --- cpp_test/smoke_column_sender.c | 166 +++ doc/COLUMN_SENDER_PERF.md | 99 ++ include/questdb/ingress/column_sender.h | 466 +++++++ questdb-rs-ffi/src/column_sender.rs | 976 ++++++++++++++ questdb-rs-ffi/src/lib.rs | 3 + questdb-rs/Cargo.toml | 11 + questdb-rs/benches/column_sender.rs | 432 ++++++ questdb-rs/src/ingress.rs | 3 + questdb-rs/src/ingress/buffer/qwp.rs | 11 +- questdb-rs/src/ingress/column_sender/chunk.rs | 1160 +++++++++++++++++ questdb-rs/src/ingress/column_sender/conf.rs | 413 ++++++ questdb-rs/src/ingress/column_sender/db.rs | 513 ++++++++ .../src/ingress/column_sender/encoder.rs | 498 +++++++ questdb-rs/src/ingress/column_sender/mod.rs | 99 ++ .../src/ingress/column_sender/sender.rs | 153 +++ .../src/ingress/column_sender/validity.rs | 171 +++ questdb-rs/src/ingress/column_sender/wire.rs | 116 ++ questdb-rs/src/ingress/sender.rs | 46 + questdb-rs/src/ingress/sender/qwp_ws.rs | 11 + questdb-rs/src/tests.rs | 3 + questdb-rs/src/tests/column_sender_pool.rs | 589 +++++++++ questdb-rs/src/tests/qwp_ws.rs | 21 +- 22 files changed, 5949 insertions(+), 11 deletions(-) create mode 100644 cpp_test/smoke_column_sender.c create mode 100644 doc/COLUMN_SENDER_PERF.md create mode 100644 include/questdb/ingress/column_sender.h create mode 100644 questdb-rs-ffi/src/column_sender.rs create mode 100644 questdb-rs/benches/column_sender.rs create mode 100644 questdb-rs/src/ingress/column_sender/chunk.rs create mode 100644 questdb-rs/src/ingress/column_sender/conf.rs create mode 100644 questdb-rs/src/ingress/column_sender/db.rs create mode 100644 questdb-rs/src/ingress/column_sender/encoder.rs create mode 100644 questdb-rs/src/ingress/column_sender/mod.rs create mode 100644 questdb-rs/src/ingress/column_sender/sender.rs create mode 100644 questdb-rs/src/ingress/column_sender/validity.rs create mode 100644 questdb-rs/src/ingress/column_sender/wire.rs create mode 100644 questdb-rs/src/tests/column_sender_pool.rs diff --git a/cpp_test/smoke_column_sender.c b/cpp_test/smoke_column_sender.c new file mode 100644 index 00000000..7f2f19c3 --- /dev/null +++ b/cpp_test/smoke_column_sender.c @@ -0,0 +1,166 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + ******************************************************************************/ + +/* + * Hand-runnable smoke test for the column-major sender C ABI. + * + * Not wired into CMake — the in-tree CMake build does not yet build the + * column-sender ABI surface as a C test (the existing `smoke_line_reader` + * pattern wires through ctest; we'll follow it once the C test matrix + * for the column sender is fleshed out). + * + * Build manually against a real QuestDB instance, e.g.: + * + * gcc -std=c11 cpp_test/smoke_column_sender.c \ + * -I include -L target/debug -lquestdb_client \ + * -o smoke_column_sender + * + * ./smoke_column_sender "qwpws::addr=localhost:9000;" + * + * Round-trips a single 3-row chunk with mixed i64, f64, varchar, and a + * designated timestamp. Prints any client-side error to stderr and + * exits non-zero; on success exits 0 after flushing and returning the + * sender to the pool. + */ + +#include +#include +#include +#include + +#include "questdb/ingress/column_sender.h" + +static int die(line_sender_error* err, const char* what) +{ + if (err) { + size_t msg_len = 0; + const char* msg = line_sender_error_msg(err, &msg_len); + fprintf(stderr, "%s: %.*s\n", what, (int)msg_len, msg); + line_sender_error_free(err); + } else { + fprintf(stderr, "%s\n", what); + } + return 1; +} + +int main(int argc, char** argv) +{ + if (argc < 2) { + fprintf(stderr, + "usage: %s 'qwpws::addr=host:port;[options]'\n", + argv[0]); + return 2; + } + const char* conf = argv[1]; + + line_sender_error* err = NULL; + questdb_db* db = questdb_db_connect(conf, strlen(conf), &err); + if (!db) + return die(err, "questdb_db_connect failed"); + + column_sender* sender = questdb_db_borrow_sender(db, &err); + if (!sender) { + questdb_db_close(db); + return die(err, "questdb_db_borrow_sender failed"); + } + + const char* table = "smoke_column_sender"; + column_sender_chunk* chunk = + column_sender_chunk_new(table, strlen(table), &err); + if (!chunk) { + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_chunk_new failed"); + } + + const char* qty_name = "qty"; + const int64_t qty[3] = { 10, 20, 30 }; + if (!column_sender_chunk_column_i64( + chunk, qty_name, strlen(qty_name), + qty, 3, NULL, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_i64(qty) failed"); + } + + const char* price_name = "price"; + const double price[3] = { 1.1, 2.2, 3.3 }; + if (!column_sender_chunk_column_f64( + chunk, price_name, strlen(price_name), + price, 3, NULL, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_f64(price) failed"); + } + + /* Arrow Utf8: 3 rows of varchar with one null in the middle. + offsets length = row_count + 1; null row's slice is ignored by + the encoder (we set it to zero length here to keep offsets + monotonic). */ + const char* msg_name = "msg"; + const int32_t msg_offsets[4] = { 0, 5, 5, 10 }; + const uint8_t msg_bytes[] = { 'a','l','p','h','a', + 'g','a','m','m','a' }; + const uint8_t msg_validity_bits = 0x05u; /* rows 0 + 2 valid, row 1 null */ + const column_sender_validity msg_validity = { + &msg_validity_bits, 3 + }; + if (!column_sender_chunk_column_varchar( + chunk, msg_name, strlen(msg_name), + msg_offsets, msg_bytes, sizeof(msg_bytes), + 3, &msg_validity, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_varchar(msg) failed"); + } + + const int64_t ts_nanos[3] = { + (int64_t)1700000000000000000LL, + (int64_t)1700000000000001000LL, + (int64_t)1700000000000002000LL + }; + if (!column_sender_chunk_designated_timestamp_nanos( + chunk, ts_nanos, 3, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "designated_timestamp_nanos failed"); + } + + if (!column_sender_flush( + sender, chunk, column_sender_ack_level_ok, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_flush failed"); + } + + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + fprintf(stdout, "ok\n"); + return 0; +} diff --git a/doc/COLUMN_SENDER_PERF.md b/doc/COLUMN_SENDER_PERF.md new file mode 100644 index 00000000..cfc7d8d9 --- /dev/null +++ b/doc/COLUMN_SENDER_PERF.md @@ -0,0 +1,99 @@ +# Column-Major Sender — Performance Notes + +Tracks the bench results that anchor `doc/COLUMN_SENDER_PLAN.md` §2.1 +("encode is a header + extend_from_slice per column") and §2.2 ("no-null += memcpy; nullable = invert+gather"). + +The Criterion bench lives at `questdb-rs/benches/column_sender.rs`. It +covers three families: + +1. **Per-column bulk append** — each column-type's hot path vs a raw + `extend_from_slice` baseline. +2. **Symbol bulk-intern** — `Chunk::symbol_dict_i32` vs a naïve per-row + HashMap probe that mirrors what a row-API symbol cell pays. +3. **End-to-end encode** — populate a 100k-row chunk with a + representative column mix and time the encoder body. + +Pure encoder cost — no network, no real server. + +## Running + +```sh +cargo bench --features sync-sender-qwp-ws --bench column_sender + +# Larger workload (anchors the headline 10M-rows-per-batch number from +# the WS-2/WS-4 plan): +QUESTDB_COLUMN_BENCH_ROWS=10000000 \ + cargo bench --features sync-sender-qwp-ws --bench column_sender + +# Knobs: +# QUESTDB_COLUMN_BENCH_ROWS default 100_000 +# QUESTDB_COLUMN_BENCH_VARCHAR_LEN default 16 +# QUESTDB_COLUMN_BENCH_SYM_CARD default 1_000 +``` + +## First-baseline numbers + +Captured on an Apple Silicon laptop, default workload +(`rows = 100_000`, `varchar_len = 16`, `sym_card = 1_000`), +`cargo bench ... -- --quick --noplot`. Replace with refreshed numbers as +the encoder evolves. + +| Bench | Median time | Median throughput | Notes | +|-------------------------------------|------------:|--------------------:|-------| +| `column_i64/memcpy_baseline` | ~143 µs | ~5.2 GiB/s | High variance — bare `Vec` alloc + push + extend on a 800 KB allocation dominates. | +| `column_i64/column_sender_no_null` | ~13.7 µs | ~54 GiB/s | Memcpy-bound; matches the plan's "no-null = `extend_from_slice`" goal. | +| `column_i64/column_sender_nullable` | ~79.1 µs | ~9.4 GiB/s | Sentinel-encode per row (`i64::MIN` for nulls). | +| `column_f64/memcpy_baseline` | ~13.6 µs | ~54.7 GiB/s | | +| `column_f64/column_sender_no_null` | ~13.5 µs | ~55 GiB/s | Indistinguishable from memcpy. | +| `column_varchar/memcpy_baseline` | ~63.6 µs | ~29.3 GiB/s | Offset table + bytes copy. | +| `column_varchar/column_sender_no_null` | ~67.0 µs | ~27.8 GiB/s | Within ~5 % of memcpy; rebase-to-zero path is the same as memcpy when `offsets[0] == 0`. | +| `symbol_dict/column_sender` | ~135 µs | ~740 M rows/s | 100k rows × 1 000-card dict; three-pass bulk-intern. | +| `symbol_dict/naive_per_row_hashmap` | ~2.16 ms | ~46 M rows/s | Per-row HashMap probe; mirrors what the row API pays. **~16× slower than the column path** — confirms the WS-4 plan claim (drops 100k probes to 1 000 interns). | +| `encode_chunk/populate_only` | ~294 µs | ~341 M rows/s | 5 columns (i64, f64, varchar, symbol, designated_ts); all bulk-append calls. | +| `encode_chunk/encode_only` | ~437 µs | ~229 M rows/s | Header + dict-delta + table block + per-column splices. | +| `encode_chunk/populate_plus_encode` | ~718 µs | ~139 M rows/s | End-to-end, no network. | + +A second-pass `encode_chunk/encode_only` on the same workload should +land in **REFERENCE mode** for the schema (because the registry caches +the signature from the first encode), shaving off the FULL-mode +signature bytes — see `doc/COLUMN_SENDER_PLAN.md` §2.1. + +## Interpreting the baseline + +- The **`column_f64/column_sender_no_null` ≈ memcpy** result is the + load-bearing perf claim of the column sender: a contiguous typed + buffer pays the cost of a `memcpy` and nothing more. The chunk's + per-column `Vec` storage absorbs the null-flag byte + payload in + one extend; encode time then turns each column into a single + `extend_from_slice`. +- The **`column_i64/memcpy_baseline` variance** is bench noise from the + large per-iteration allocation in the baseline (a fresh + ~800 KB `Vec` per sample). The column-sender path reuses its + `Vec::with_capacity(16)` seed and grows in place, which the + allocator handles more uniformly. Both medians are well above + network bandwidth, so this is not the bottleneck. +- The **nullable I64 path** at ~9.4 GiB/s is the sentinel-encode loop + (`if v.is_valid(i) { value } else { I64_NULL }`), bounded by branch + prediction. It still moves the same 800 KB; a SIMD lowering would + close the gap with the no-null path but isn't necessary to hit the + "memcpy-bound when the user has no nulls" bar. +- The **symbol bulk-intern speedup (~16×)** comes from the WS-4 + three-pass design — referenced bitset, compact dict copy, code + translation. At 100k rows × 1 000-card dict the column path runs + 1 000 interns plus 100 000 `Vec` writes; the naïve path runs + 100 000 HashMap probes. + +## Out of scope here + +- **End-to-end Pandas → QuestDB throughput** lives in the Python + wrapper repo (WS-7); add the `pandas_to_questdb_throughput` bench + there once a real server is wired into its CI. +- **1-hour soak** belongs in nightly CI rather than the in-tree + Criterion suite; track that as a follow-up alongside WS-7. +- **Microbench against the row-API encoder** is intentionally absent. + The row API's `Buffer::column_i64` is a per-cell call (it appends a + single value per invocation); comparing it cell-by-cell against the + column sender's bulk append would be apples vs oranges and is + already qualitatively captured by the `symbol_dict/naive_per_row_*` + comparison. diff --git a/include/questdb/ingress/column_sender.h b/include/questdb/ingress/column_sender.h new file mode 100644 index 00000000..cad41df8 --- /dev/null +++ b/include/questdb/ingress/column_sender.h @@ -0,0 +1,466 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +/* + * Column-major sender for QuestDB QWP/WebSocket. + * + * Mirrors doc/COLUMN_SENDER_FFI_ABI.md. Reuses `line_sender_error*` from + * `line_sender.h` for fallible-call error reporting; all opaque handles + * are heap-allocated and freed through their dedicated entry points. + * + * Conventions: + * - Opaque handles must be non-NULL unless the function documentation + * states otherwise. + * - `err_out` is optional on every fallible call: pass NULL to discard + * error information. + * - `column_sender_chunk` is owned by the caller and not bound to a + * particular sender; chunks can be built on any thread and flushed + * through any sender borrowed from the same `questdb_db`. + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "line_sender.h" + +/* ------------------------------------------------------------------------- + * Opaque handles + * ------------------------------------------------------------------------- */ + +/** Connection pool. Thread-safe; share across threads. */ +typedef struct questdb_db questdb_db; + +/** Borrowed sender. Not thread-safe; belongs to the borrowing thread + * until returned via `questdb_db_return_sender`. */ +typedef struct column_sender column_sender; + +/** One DataFrame's worth of column buffers destined for one QuestDB table. + * Owned by the caller. */ +typedef struct column_sender_chunk column_sender_chunk; + +/* ------------------------------------------------------------------------- + * Validity bitmap + * + * Arrow shape: bit = 1 means VALID, bit = 0 means NULL. LSB-first within + * each byte. `bit_len` must equal the chunk's row count; `bits` must + * point to at least `ceil(bit_len / 8)` bytes. Pass `bits=NULL, + * bit_len=0` to signal "no nulls" (or pass a `NULL` pointer to the + * column function's `validity` parameter). + * ------------------------------------------------------------------------- */ + +typedef struct column_sender_validity +{ + const uint8_t* bits; + size_t bit_len; +} column_sender_validity; + +/* ------------------------------------------------------------------------- + * Acknowledgement level for `column_sender_flush`. + * ------------------------------------------------------------------------- */ + +typedef enum column_sender_ack_level +{ + /** Wait for the server's WAL-commit ACK (spec status 0x00). Always + * available. */ + column_sender_ack_level_ok = 0, + + /** Wait for the server's object-store durability ACK (spec status + * 0x02). Enterprise only; requires the pool to be opened with + * `request_durable_ack=on` in the connect string. Flush returns + * `line_sender_error_invalid_api_call` otherwise. */ + column_sender_ack_level_durable = 1 +} column_sender_ack_level; + +/* ------------------------------------------------------------------------- + * Pool and sender borrow + * ------------------------------------------------------------------------- */ + +/** + * Open a connection pool. Eagerly opens `pool_size` connections (default + * 1); any auth / TLS / connect error during those opens fails the call. + * + * `conf` is a `qwpws::` / `qwpwss::` connect string. Pool-specific keys: + * `pool_size` (default 1) warm/min connections; + * `pool_max` (default 64) hard cap on auto-grow; + * `pool_idle_timeout_ms` (default 60000) + * reap above-pool_size idle conns; + * `pool_reap` (`auto`|`manual`, default `auto`) + * background reaper opt-in. + * + * Store-and-forward keys (`sf_*`, `sender_id`) are refused — use the + * row-major `line_sender_*` API for on-disk durability. + */ +QUESTDB_CLIENT_API +questdb_db* questdb_db_connect( + const char* conf, + size_t conf_len, + line_sender_error** err_out); + +/** + * Close the pool and all its connections. Accepts NULL and no-ops. + * Outstanding `column_sender` handles remain valid and return their + * connections on `questdb_db_return_sender` — the pool's state is + * reference-counted internally. + */ +QUESTDB_CLIENT_API +void questdb_db_close(questdb_db* db); + +/** + * Borrow a sender. Selection rules: + * 1. If a previously-returned sender is in the free list, hand it out. + * 2. Otherwise, if pool size < `pool_max`, open a new connection. + * 3. Otherwise (at cap), return NULL + `line_sender_error_invalid_api_call`. + * + * The returned sender is bound to the calling thread until returned. + */ +QUESTDB_CLIENT_API +column_sender* questdb_db_borrow_sender( + questdb_db* db, + line_sender_error** err_out); + +/** + * Return a sender to the pool. Accepts NULL `sender` and no-ops. + * Invalidates the `sender` pointer; do not use it after this call. + * + * `db` is currently ignored — the sender carries its own reference to + * the pool — but accepted for symmetry with the borrow call. + */ +QUESTDB_CLIENT_API +void questdb_db_return_sender( + questdb_db* db, + column_sender* sender); + +/** + * Manually reap idle connections (closes free-list entries idle longer + * than `pool_idle_timeout_ms`, never shrinking below `pool_size`). + * Returns the number of connections closed. + */ +QUESTDB_CLIENT_API +size_t questdb_db_reap_idle(questdb_db* db); + +/* ------------------------------------------------------------------------- + * Sender state inspection + * ------------------------------------------------------------------------- */ + +/** + * `true` if the sender's underlying connection is in a permanently- + * unusable state. On return to the pool such senders are dropped, not + * recycled. + */ +QUESTDB_CLIENT_API +bool column_sender_must_close(const column_sender* sender); + +/* ------------------------------------------------------------------------- + * Chunk lifecycle + * ------------------------------------------------------------------------- */ + +/** + * Create an empty chunk for the given table. The chunk is caller-owned + * and must be freed with `column_sender_chunk_free` or flushed via + * `column_sender_flush` (which clears but does not free it). + */ +QUESTDB_CLIENT_API +column_sender_chunk* column_sender_chunk_new( + const char* table_name, + size_t table_name_len, + line_sender_error** err_out); + +/** Discard the chunk and release its allocations. Accepts NULL. */ +QUESTDB_CLIENT_API +void column_sender_chunk_free(column_sender_chunk* chunk); + +/** Clear the chunk's content, keeping retained capacity for reuse. */ +QUESTDB_CLIENT_API +void column_sender_chunk_clear(column_sender_chunk* chunk); + +/** Current row count of the chunk; 0 if no column has been appended. */ +QUESTDB_CLIENT_API +size_t column_sender_chunk_row_count(const column_sender_chunk* chunk); + +/* ------------------------------------------------------------------------- + * Numeric / fixed-width column appends + * + * Every column-append function locks the chunk's row count on the first + * call. Subsequent columns must agree on row count. `data` is a + * contiguous, full-length typed array with one slot per row (including + * null rows — their slot value is ignored). `validity` is optional; + * pass NULL when the column has no nulls. + * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const float* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const double* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap + * (1 = true). `data` must point to at least `ceil(row_count / 8)` bytes. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_bool( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `UUID` column. `data` points to `row_count * 16` bytes; each 16-byte + * group is one UUID (bytes 0..8 lo half LE, 8..16 hi half LE). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_uuid( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `LONG256` column. `data` points to `row_count * 32` bytes — four + * little-endian 64-bit limbs per row, least-significant limb first. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_long256( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `IPV4` column. Each `data[i]` is `u32::from(Ipv4Addr)` (octet 0 in + * the high byte), encoded little-endian on the wire. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ipv4( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Timestamp columns (non-designated) + * ------------------------------------------------------------------------- */ + +/** `TIMESTAMP_NANOS` column, nanoseconds since the Unix epoch. */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_nanos( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** `TIMESTAMP` column, microseconds since the Unix epoch. */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_micros( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** `DATE` column, milliseconds since the Unix epoch. */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_date_millis( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Variable-width text (VARCHAR) + * ------------------------------------------------------------------------- */ + +/** + * `VARCHAR` column (QWP wire type 0x0F). + * + * Input layout matches Arrow Utf8: + * - `offsets` has `row_count + 1` entries, monotonically non-decreasing. + * - `bytes` is a single contiguous UTF-8 buffer; offsets are absolute + * byte offsets into it (the column encoder rebases to 0 on the wire + * when the first offset is non-zero). + * - `validity` is Arrow-shape; NULL-row offset slices are not + * inspected. + * + * Wire output: dense (only non-null values), `non_null_count + 1` + * little-endian uint32 offsets followed by the concatenated bytes. + * + * UTF-8 validity is the caller's responsibility; invalid UTF-8 is + * detected by the server and surfaced as + * `line_sender_error_server_rejection`. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_varchar( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* offsets, + const uint8_t* bytes, + size_t bytes_len, + size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Symbol columns (dictionary fast path) + * + * `codes` is per-row dictionary indices. `dict_offsets` (length + * `dict_offsets_len`) and `dict_bytes` (length `dict_bytes_len`) + * describe the dictionary in Arrow Utf8 layout. The library interns + * only referenced dict entries against the connection-scoped global + * symbol table — `dict_offsets_len - 1` may be huge (Pandas + * `Categorical`) without paying the cost for unused entries. + * + * `codes[i]` must be in `0 .. dict_len` for non-null rows; null-row + * codes are not inspected. + * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Designated timestamp + * + * Required exactly once per chunk before flush. Always non-null per the + * QWP wire spec — no `validity` parameter. + * ------------------------------------------------------------------------- */ + +/** Designated timestamp in microseconds (wire type TIMESTAMP, 0x0A). */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_micros( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/** Designated timestamp in nanoseconds (wire type TIMESTAMP_NANOS, 0x10). */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_nanos( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Flush (synchronous) + * + * Encode `chunk` into a QWP/WebSocket frame, publish it, and block + * until the server acknowledges at the requested `ack_level`. On + * success, `chunk` is cleared (allocations retained) and `true` is + * returned. On failure, `chunk` is left untouched. + * + * At most one frame in flight per sender. For parallel ingest, borrow + * multiple senders from the same `questdb_db` — one per worker thread. + * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_flush( + column_sender* sender, + column_sender_chunk* chunk, + column_sender_ack_level ack_level, + line_sender_error** err_out); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/questdb-rs-ffi/src/column_sender.rs b/questdb-rs-ffi/src/column_sender.rs new file mode 100644 index 00000000..b6a6713b --- /dev/null +++ b/questdb-rs-ffi/src/column_sender.rs @@ -0,0 +1,976 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! C ABI for the column-major sender. +//! +//! Mirrors `doc/COLUMN_SENDER_FFI_ABI.md`. The ABI re-uses +//! `line_sender_error*` for fallible-call error reporting; opaque types +//! (`questdb_db`, `column_sender`, `column_sender_chunk`) are heap-allocated +//! and freed through their dedicated `_close` / `_free` / `_return_sender` +//! entry points. + +use libc::{c_char, size_t}; +use std::slice; +use std::str; + +use questdb::ingress::column_sender::{AckLevel, Chunk, OwnedSender, QuestDb, Validity}; +use questdb::{Error, ErrorCode}; + +use crate::{line_sender_error, set_err_out_from_error}; + +// =========================================================================== +// Opaque handles +// =========================================================================== + +/// Connection pool. Thread-safe; share across threads. +pub struct questdb_db(QuestDb); + +/// Borrowed sender. Owns a pool slot until `questdb_db_return_sender` is +/// called. Not thread-safe. +pub struct column_sender(OwnedSender); + +/// One DataFrame's worth of column buffers destined for one QuestDB table. +/// Owned by the caller; not bound to a sender. +pub struct column_sender_chunk(Chunk); + +// =========================================================================== +// Validity bitmap (Arrow shape: bit = 1 means valid, LSB-first). +// =========================================================================== + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct column_sender_validity { + pub bits: *const u8, + pub bit_len: size_t, +} + +unsafe fn as_validity<'a>( + v: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> Option>> { + if v.is_null() { + return Some(None); + } + let v = unsafe { &*v }; + let required = v.bit_len.div_ceil(8); + if v.bits.is_null() && v.bit_len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_validity has null bits but bit_len != 0".to_string(), + ), + ); + } + return None; + } + let bytes: &[u8] = if v.bit_len == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(v.bits, required) } + }; + match Validity::from_bitmap(bytes, v.bit_len) { + Ok(parsed) => Some(Some(parsed)), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + None + } + } +} + +// =========================================================================== +// Ack level +// =========================================================================== + +#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum column_sender_ack_level { + column_sender_ack_level_ok = 0, + column_sender_ack_level_durable = 1, +} + +impl From for AckLevel { + fn from(value: column_sender_ack_level) -> Self { + match value { + column_sender_ack_level::column_sender_ack_level_ok => AckLevel::Ok, + column_sender_ack_level::column_sender_ack_level_durable => AckLevel::Durable, + } + } +} + +// =========================================================================== +// Conversion helpers +// =========================================================================== + +unsafe fn name_str<'a>( + name: *const c_char, + name_len: size_t, + err_out: *mut *mut line_sender_error, +) -> Option<&'a str> { + if name.is_null() && name_len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "name pointer is NULL with non-zero length".to_string(), + ), + ); + } + return None; + } + let slice = if name_len == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(name as *const u8, name_len) } + }; + match str::from_utf8(slice) { + Ok(s) => Some(s), + Err(_) => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidUtf8, + "name is not valid UTF-8".to_string(), + ), + ); + } + None + } + } +} + +unsafe fn typed_slice<'a, T>( + data: *const T, + len: size_t, + err_out: *mut *mut line_sender_error, + what: &'static str, +) -> Option<&'a [T]> { + if data.is_null() && len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + format!("{what} pointer is NULL with non-zero length"), + ), + ); + } + return None; + } + if len == 0 { + return Some(&[]); + } + Some(unsafe { slice::from_raw_parts(data, len) }) +} + +macro_rules! bubble { + ($err_out:expr, $expr:expr) => { + match $expr { + Ok(value) => value, + Err(err) => { + unsafe { set_err_out_from_error($err_out, err) }; + return false; + } + } + }; +} + +// =========================================================================== +// Pool +// =========================================================================== + +/// Open a connection pool. Eagerly opens `pool_size` connections; any +/// server/auth/TLS error during those opens fails the call. `conf` is a +/// NUL-terminated UTF-8 string. +/// +/// Returns NULL on failure. When `err_out != NULL`, the error is placed +/// in `*err_out` and ownership transfers to the caller (release with +/// `line_sender_error_free`). +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_connect( + conf: *const c_char, + conf_len: size_t, + err_out: *mut *mut line_sender_error, +) -> *mut questdb_db { + let conf = match unsafe { name_str(conf, conf_len, err_out) } { + Some(s) => s, + None => return std::ptr::null_mut(), + }; + match QuestDb::connect(conf) { + Ok(db) => Box::into_raw(Box::new(questdb_db(db))), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + std::ptr::null_mut() + } + } +} + +/// Close the pool and all its connections. Accepts NULL and no-ops. +/// +/// Outstanding `column_sender` handles remain valid (they hold an +/// internal reference to the pool's state) and return themselves on +/// `questdb_db_return_sender`. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_close(db: *mut questdb_db) { + if !db.is_null() { + unsafe { drop(Box::from_raw(db)) }; + } +} + +/// Borrow a sender from the pool. See +/// `doc/COLUMN_SENDER_FFI_ABI.md` §4.3 for the selection rules. Returns +/// NULL on failure; sets `*err_out` if provided. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_borrow_sender( + db: *mut questdb_db, + err_out: *mut *mut line_sender_error, +) -> *mut column_sender { + if db.is_null() { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "questdb_db_borrow_sender: db pointer is NULL".to_string(), + ), + ); + } + return std::ptr::null_mut(); + } + let db_ref = unsafe { &*db }; + match db_ref.0.borrow_sender_owned() { + Ok(owned) => Box::into_raw(Box::new(column_sender(owned))), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + std::ptr::null_mut() + } + } +} + +/// Return a borrowed sender to the pool. Invalidates `sender`. Accepts +/// NULL `sender` and no-ops. `db` is ignored — the sender carries its +/// own reference to the pool — but kept in the ABI for symmetry with the +/// borrow call and to allow future runtime checks. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_return_sender( + _db: *mut questdb_db, + sender: *mut column_sender, +) { + if !sender.is_null() { + unsafe { drop(Box::from_raw(sender)) }; + } +} + +/// Manually reap idle connections. Returns the number of connections +/// closed by this invocation. `db` must be non-NULL. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_reap_idle(db: *mut questdb_db) -> size_t { + if db.is_null() { + return 0; + } + let db_ref = unsafe { &*db }; + db_ref.0.reap_idle() +} + +// =========================================================================== +// Sender state +// =========================================================================== + +/// `true` if the sender's underlying connection is in a permanently- +/// unusable state. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_must_close(sender: *const column_sender) -> bool { + if sender.is_null() { + return true; + } + unsafe { (*sender).0.get().must_close() } +} + +// =========================================================================== +// Chunk lifecycle +// =========================================================================== + +/// Create an empty chunk for `table_name` (validated UTF-8, ≤ 127 bytes). +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_new( + table_name: *const c_char, + table_name_len: size_t, + err_out: *mut *mut line_sender_error, +) -> *mut column_sender_chunk { + let table = match unsafe { name_str(table_name, table_name_len, err_out) } { + Some(s) => s, + None => return std::ptr::null_mut(), + }; + Box::into_raw(Box::new(column_sender_chunk(Chunk::new(table)))) +} + +/// Free a chunk. Accepts NULL and no-ops. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_free(chunk: *mut column_sender_chunk) { + if !chunk.is_null() { + unsafe { drop(Box::from_raw(chunk)) }; + } +} + +/// Clear a chunk's content, keeping its retained capacity for reuse. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_clear(chunk: *mut column_sender_chunk) { + if !chunk.is_null() { + unsafe { (*chunk).0.clear() }; + } +} + +/// Current row count of the chunk; 0 if no column has been appended. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_row_count( + chunk: *const column_sender_chunk, +) -> size_t { + if chunk.is_null() { + return 0; + } + unsafe { (*chunk).0.row_count() } +} + +// =========================================================================== +// Numeric / fixed-width column appends +// =========================================================================== + +macro_rules! column_fn { + ($fn_name:ident, $c_ty:ty, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const $c_ty, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let data = match unsafe { typed_slice(data, row_count, err_out, $what) } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!(err_out, chunk.$rust_method(name, data, validity.as_ref())); + true + } + }; +} + +column_fn!( + column_sender_chunk_column_i8, + i8, + column_i8, + "i8 column data" +); +column_fn!( + column_sender_chunk_column_i16, + i16, + column_i16, + "i16 column data" +); +column_fn!( + column_sender_chunk_column_i32, + i32, + column_i32, + "i32 column data" +); +column_fn!( + column_sender_chunk_column_i64, + i64, + column_i64, + "i64 column data" +); +column_fn!( + column_sender_chunk_column_f32, + f32, + column_f32, + "f32 column data" +); +column_fn!( + column_sender_chunk_column_f64, + f64, + column_f64, + "f64 column data" +); +column_fn!( + column_sender_chunk_column_ipv4, + u32, + column_ipv4, + "ipv4 column data" +); +column_fn!( + column_sender_chunk_column_ts_nanos, + i64, + column_ts_nanos, + "ts_nanos column data" +); +column_fn!( + column_sender_chunk_column_ts_micros, + i64, + column_ts_micros, + "ts_micros column data" +); +column_fn!( + column_sender_chunk_column_date_millis, + i64, + column_date_millis, + "date_millis column data" +); + +/// `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap; +/// must be at least `ceil(row_count / 8)` bytes long. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_column_bool( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const u8, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let bytes_required = row_count.div_ceil(8); + let data_slice = match unsafe { typed_slice(data, bytes_required, err_out, "bool column data") } + { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.column_bool(name, data_slice, row_count, validity.as_ref()) + ); + true +} + +macro_rules! fixed_width_byte_column_fn { + ($fn_name:ident, $n:literal, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const u8, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + if data.is_null() && row_count != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + format!( + "{} column data pointer is NULL with non-zero row_count", + $what + ), + ), + ); + } + return false; + } + // SAFETY: the caller promises `data` points to `row_count * + // N` bytes (FFI-ABI §6) and that the buffer outlives the call. + let data_slice: &[[u8; $n]] = if row_count == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(data as *const [u8; $n], row_count) } + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.$rust_method(name, data_slice, validity.as_ref()) + ); + true + } + }; +} + +// `UUID` column. `data` is `row_count * 16` bytes; the FFI takes a +// `uint8_t*` and slices it into 16-byte rows. +fixed_width_byte_column_fn!(column_sender_chunk_column_uuid, 16, column_uuid, "uuid"); + +// `LONG256` column. `data` is `row_count * 32` bytes. +fixed_width_byte_column_fn!( + column_sender_chunk_column_long256, + 32, + column_long256, + "long256" +); + +// =========================================================================== +// VARCHAR (variable-width text) +// =========================================================================== + +/// `VARCHAR` column. Inputs are Arrow Utf8 shape: `offsets` length +/// `row_count + 1`, monotonically non-decreasing; `bytes` is the +/// concatenated UTF-8 buffer. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_column_varchar( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + offsets: *const i32, + bytes: *const u8, + bytes_len: size_t, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let offsets_len = match row_count.checked_add(1) { + Some(n) => n, + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "row_count overflow when computing offsets length".to_string(), + ), + ); + } + return false; + } + }; + let offsets = match unsafe { typed_slice(offsets, offsets_len, err_out, "varchar offsets") } { + Some(s) => s, + None => return false, + }; + let bytes = match unsafe { typed_slice(bytes, bytes_len, err_out, "varchar bytes") } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.column_varchar(name, offsets, bytes, validity.as_ref()) + ); + true +} + +// =========================================================================== +// Symbol dictionary columns +// =========================================================================== + +macro_rules! symbol_fn { + ($fn_name:ident, $code_ty:ty, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + codes: *const $code_ty, + row_count: size_t, + dict_offsets: *const i32, + dict_offsets_len: size_t, + dict_bytes: *const u8, + dict_bytes_len: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let codes = match unsafe { typed_slice(codes, row_count, err_out, $what) } { + Some(s) => s, + None => return false, + }; + let dict_offsets = match unsafe { + typed_slice( + dict_offsets, + dict_offsets_len, + err_out, + "symbol dict offsets", + ) + } { + Some(s) => s, + None => return false, + }; + let dict_bytes = match unsafe { + typed_slice(dict_bytes, dict_bytes_len, err_out, "symbol dict bytes") + } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.$rust_method(name, codes, dict_offsets, dict_bytes, validity.as_ref()) + ); + true + } + }; +} + +symbol_fn!( + column_sender_chunk_symbol_dict_i8, + i8, + symbol_dict_i8, + "symbol codes (i8)" +); +symbol_fn!( + column_sender_chunk_symbol_dict_i16, + i16, + symbol_dict_i16, + "symbol codes (i16)" +); +symbol_fn!( + column_sender_chunk_symbol_dict_i32, + i32, + symbol_dict_i32, + "symbol codes (i32)" +); + +// =========================================================================== +// Designated timestamp +// =========================================================================== + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_micros( + chunk: *mut column_sender_chunk, + data: *const i64, + row_count: size_t, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let data = match unsafe { typed_slice(data, row_count, err_out, "designated_ts micros") } { + Some(s) => s, + None => return false, + }; + bubble!(err_out, chunk.designated_timestamp_micros(data)); + true +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_nanos( + chunk: *mut column_sender_chunk, + data: *const i64, + row_count: size_t, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let data = match unsafe { typed_slice(data, row_count, err_out, "designated_ts nanos") } { + Some(s) => s, + None => return false, + }; + bubble!(err_out, chunk.designated_timestamp_nanos(data)); + true +} + +// =========================================================================== +// Flush +// =========================================================================== + +/// Encode `chunk` into a QWP/WebSocket frame, publish it, and block +/// until the server acknowledges at the requested `ack_level`. +/// +/// On success, `chunk` is cleared and the call returns `true`. On +/// failure, `chunk` is left untouched and `false` is returned (with +/// `*err_out` set if provided). +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_flush( + sender: *mut column_sender, + chunk: *mut column_sender_chunk, + ack_level: column_sender_ack_level, + err_out: *mut *mut line_sender_error, +) -> bool { + let sender = match unsafe { sender.as_mut() } { + Some(s) => s.0.get_mut(), + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_flush: sender pointer is NULL".to_string(), + ), + ); + } + return false; + } + }; + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + bubble!(err_out, sender.flush(chunk, ack_level.into())); + true +} + +// =========================================================================== +// Helpers +// =========================================================================== + +fn reject_null_chunk(err_out: *mut *mut line_sender_error) -> bool { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_chunk pointer is NULL".to_string(), + ), + ); + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::line_sender_error_free; + + // Most behaviour is already covered by the questdb-rs lib tests; this + // module's tests focus on the FFI surface — pointer handling, NULL + // guards, lifetime of error objects, etc. + + #[test] + fn connect_rejects_non_qwp_ws_schema() { + let conf = b"http::addr=localhost:9000;"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let db = + unsafe { questdb_db_connect(conf.as_ptr() as *const c_char, conf.len(), &mut err) }; + assert!(db.is_null()); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn chunk_new_validates_table_name() { + let mut err: *mut line_sender_error = std::ptr::null_mut(); + // 128-byte name: exceeds the 127-byte QWP cap, but the public + // `Chunk::new` does not validate eagerly — validation happens at + // flush time. So this constructor succeeds. + let table = "x".repeat(128); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + assert!(!chunk.is_null()); + assert!(err.is_null()); + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn chunk_new_rejects_invalid_utf8() { + let bad: [u8; 3] = [0xFF, 0xFE, 0xFD]; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = + unsafe { column_sender_chunk_new(bad.as_ptr() as *const c_char, bad.len(), &mut err) }; + assert!(chunk.is_null()); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn column_i64_round_trip_on_pure_data_path() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + assert!(!chunk.is_null()); + + let name = b"price"; + let data: [i64; 3] = [1, 2, 3]; + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(ok, "column_i64 should succeed"); + assert_eq!(unsafe { column_sender_chunk_row_count(chunk) }, 3); + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn column_i64_rejects_row_count_mismatch() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + let name_a = b"a"; + let name_b = b"b"; + let data_a: [i64; 3] = [1, 2, 3]; + let data_b: [i64; 2] = [4, 5]; + assert!(unsafe { + column_sender_chunk_column_i64( + chunk, + name_a.as_ptr() as *const c_char, + name_a.len(), + data_a.as_ptr(), + data_a.len(), + std::ptr::null(), + &mut err, + ) + }); + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name_b.as_ptr() as *const c_char, + name_b.len(), + data_b.as_ptr(), + data_b.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn validity_null_bits_with_nonzero_len_errors() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + let name = b"a"; + let data: [i64; 2] = [1, 2]; + let v = column_sender_validity { + bits: std::ptr::null(), + bit_len: 2, + }; + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + &v, + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn null_chunk_pointer_is_handled() { + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let name = b"a"; + let data: [i64; 1] = [1]; + let ok = unsafe { + column_sender_chunk_column_i64( + std::ptr::null_mut(), + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn ack_level_enum_maps_correctly() { + assert_eq!( + AckLevel::from(column_sender_ack_level::column_sender_ack_level_ok), + AckLevel::Ok + ); + assert_eq!( + AckLevel::from(column_sender_ack_level::column_sender_ack_level_durable), + AckLevel::Durable + ); + } +} diff --git a/questdb-rs-ffi/src/lib.rs b/questdb-rs-ffi/src/lib.rs index 4cf0f6f0..c107b4a4 100644 --- a/questdb-rs-ffi/src/lib.rs +++ b/questdb-rs-ffi/src/lib.rs @@ -76,6 +76,9 @@ use ndarr::StrideArrayView; #[cfg(feature = "sync-reader-ws")] mod egress; +pub mod column_sender; +pub use column_sender::*; + macro_rules! bubble_err_to_c { ($err_out:expr, $expression:expr) => { bubble_err_to_c!($err_out, $expression, false) diff --git a/questdb-rs/Cargo.toml b/questdb-rs/Cargo.toml index 70aac7a2..811bcd7a 100644 --- a/questdb-rs/Cargo.toml +++ b/questdb-rs/Cargo.toml @@ -269,3 +269,14 @@ required-features = ["sync-sender-qwp-ws"] name = "decoder" harness = false required-features = ["sync-reader-ws"] + +# Column-major sender hot-path bench. Anchors the perf claims from +# `doc/COLUMN_SENDER_PLAN.md` §2 (memcpy-bound no-null path, +# referenced-only symbol intern). Run with: +# +# cargo bench --features sync-sender-qwp-ws --bench column_sender +# QUESTDB_COLUMN_BENCH_ROWS=10000000 cargo bench --features sync-sender-qwp-ws --bench column_sender +[[bench]] +name = "column_sender" +harness = false +required-features = ["sync-sender-qwp-ws"] diff --git a/questdb-rs/benches/column_sender.rs b/questdb-rs/benches/column_sender.rs new file mode 100644 index 00000000..75c4cf64 --- /dev/null +++ b/questdb-rs/benches/column_sender.rs @@ -0,0 +1,432 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-major sender hot-path bench (`questdb-rs/benches/column_sender.rs`). +//! +//! Anchors the perf claims in `doc/COLUMN_SENDER_PLAN.md` §2.1 +//! ("encode is a header + extend_from_slice per column") and §2.2 +//! ("no-null = memcpy; nullable = invert+gather"). Each bench reports +//! throughput in rows/s and bytes/s so a regression shows up as either +//! a row-rate or bandwidth drop. +//! +//! Three families: +//! +//! 1. **Per-column bulk append** — exercises [`Chunk::column_i64`], +//! [`Chunk::column_f64`], [`Chunk::column_varchar`], and +//! [`Chunk::symbol_dict_i32`] in both no-null and nullable shapes. +//! Baseline: a raw `extend_from_slice` from the caller's typed +//! buffer into a fresh `Vec`, the absolute floor any +//! column-sender hot path is competing with. +//! +//! 2. **Symbol bulk-intern** — compares the column path +//! ([`Chunk::symbol_dict_i32`] + flush-time interning) with a +//! naive per-row HashMap lookup that mirrors what the row API pays +//! on the same cardinality, to anchor the WS-4 plan claim ("10M +//! rows × 1000-card drops from 10M probes to 1000"). +//! +//! 3. **Encode-only end-to-end** — populate a 10M-row chunk with a +//! representative column mix, then time +//! [`bench_encode_chunk`](_bench_internals::bench_encode_chunk). +//! Pure encoder cost (no network) so a regression in +//! `encode_chunk` or in any per-column append shows up here. +//! +//! Run: +//! +//! ```text +//! cargo bench --features sync-sender-qwp-ws --bench column_sender +//! QUESTDB_COLUMN_BENCH_ROWS=10000000 cargo bench --features sync-sender-qwp-ws --bench column_sender +//! ``` + +use std::collections::HashMap; +use std::time::Duration; + +use criterion::{BatchSize, Criterion, Throughput, black_box, criterion_group, criterion_main}; + +use questdb::ingress::column_sender::_bench_internals::{BenchEncoderState, bench_encode_chunk}; +use questdb::ingress::column_sender::{Chunk, Validity}; + +// --------------------------------------------------------------------------- +// Workload sizes. Defaults are tuned for sub-second criterion samples so the +// bench runs in CI; bump via `QUESTDB_COLUMN_BENCH_ROWS` for headline numbers. +// --------------------------------------------------------------------------- + +fn row_count() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_ROWS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(100_000) +} + +fn varchar_len() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_VARCHAR_LEN") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(16) +} + +fn symbol_cardinality() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_SYM_CARD") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(1_000) +} + +// --------------------------------------------------------------------------- +// Workload generators +// --------------------------------------------------------------------------- + +fn make_i64_data(rows: usize) -> Vec { + (0..rows as i64).collect() +} + +fn make_f64_data(rows: usize) -> Vec { + (0..rows).map(|i| i as f64 * 1.5).collect() +} + +/// Arrow-shape validity: every 16th row is null, all others valid. +fn make_validity_bits(rows: usize) -> Vec { + let bytes = rows.div_ceil(8); + let mut out = vec![0xFFu8; bytes]; + for (row_idx, byte) in (0..rows).zip(0..) { + let _ = byte; // pacify clippy if unused + if row_idx % 16 == 0 { + out[row_idx / 8] &= !(1u8 << (row_idx % 8)); + } + } + out +} + +fn make_varchar(rows: usize, len: usize) -> (Vec, Vec) { + let mut offsets = Vec::with_capacity(rows + 1); + let mut bytes = Vec::with_capacity(rows * len); + let alphabet = b"abcdefghijklmnopqrstuvwxyz"; + offsets.push(0); + for row in 0..rows { + for i in 0..len { + bytes.push(alphabet[(row + i) % alphabet.len()]); + } + offsets.push(bytes.len() as i32); + } + (offsets, bytes) +} + +fn make_symbol_workload(rows: usize, cardinality: usize) -> (Vec, Vec, Vec) { + let mut dict_offsets = Vec::with_capacity(cardinality + 1); + let mut dict_bytes = Vec::new(); + dict_offsets.push(0); + for i in 0..cardinality { + // Short distinct strings: "sym-12345". + let entry = format!("sym-{i:08}"); + dict_bytes.extend_from_slice(entry.as_bytes()); + dict_offsets.push(dict_bytes.len() as i32); + } + // Splitmix-style spread of codes across the dict so the encoder's + // intern + gather path sees a realistic distribution. + let mut codes = Vec::with_capacity(rows); + let mut state = 0x9E37_79B9_7F4A_7C15u64; + for _ in 0..rows { + state = state.wrapping_mul(0x9E37_79B9_7F4A_7C15); + state ^= state >> 27; + codes.push((state as usize % cardinality) as i32); + } + (codes, dict_offsets, dict_bytes) +} + +// --------------------------------------------------------------------------- +// Bench helpers +// --------------------------------------------------------------------------- + +fn fresh_chunk(table: &str) -> Chunk { + Chunk::new(table) +} + +// --------------------------------------------------------------------------- +// Per-column bulk-append benchmarks +// --------------------------------------------------------------------------- + +fn bench_column_i64(c: &mut Criterion) { + let rows = row_count(); + let data = make_i64_data(rows); + let mut group = c.benchmark_group("column_i64"); + group.throughput(Throughput::Bytes((rows * 8) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(rows * 8 + 1), + |mut out| { + out.push(0); + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + data.as_ptr().cast::(), + std::mem::size_of_val(data.as_slice()), + ) + }; + out.extend_from_slice(bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("trades"), + |mut chunk| { + chunk.column_i64("v", &data, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + let bits = make_validity_bits(rows); + let validity = Validity::from_bitmap(&bits, rows).unwrap(); + group.bench_function("column_sender_nullable", |b| { + b.iter_batched( + || fresh_chunk("trades"), + |mut chunk| { + chunk.column_i64("v", &data, Some(&validity)).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_column_f64(c: &mut Criterion) { + let rows = row_count(); + let data = make_f64_data(rows); + let mut group = c.benchmark_group("column_f64"); + group.throughput(Throughput::Bytes((rows * 8) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(rows * 8 + 1), + |mut out| { + out.push(0); + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + data.as_ptr().cast::(), + std::mem::size_of_val(data.as_slice()), + ) + }; + out.extend_from_slice(bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("trades"), + |mut chunk| { + chunk.column_f64("v", &data, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_column_varchar(c: &mut Criterion) { + let rows = row_count(); + let len = varchar_len(); + let (offsets, bytes) = make_varchar(rows, len); + let mut group = c.benchmark_group("column_varchar"); + group.throughput(Throughput::Bytes((4 * (rows + 1) + bytes.len()) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(4 * (rows + 1) + bytes.len() + 1), + |mut out| { + out.push(0); + let offset_bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + offsets.as_ptr().cast::(), + std::mem::size_of_val(offsets.as_slice()), + ) + }; + out.extend_from_slice(offset_bytes); + out.extend_from_slice(&bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("logs"), + |mut chunk| { + chunk.column_varchar("msg", &offsets, &bytes, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// Symbol bulk-intern: column path vs naïve per-row HashMap +// --------------------------------------------------------------------------- + +fn bench_symbol_dict(c: &mut Criterion) { + let rows = row_count(); + let card = symbol_cardinality(); + let (codes, dict_offsets, dict_bytes) = make_symbol_workload(rows, card); + let mut group = c.benchmark_group("symbol_dict"); + group.throughput(Throughput::Elements(rows as u64)); + + // Column-sender path: bulk three-pass intern at append time. + group.bench_function("column_sender", |b| { + b.iter_batched( + || fresh_chunk("ticks"), + |mut chunk| { + chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, &dict_bytes, None) + .unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + // Row-API analogue: per-row HashMap probe. Mimics what the legacy + // path pays for each symbol cell. We don't use the actual row + // encoder because it owns much more state than this measurement + // is trying to isolate — the point here is the per-row HashMap + // hit, which dominates symbol-column cost on the row path. + group.bench_function("naive_per_row_hashmap", |b| { + b.iter_batched( + || { + let map: HashMap<&[u8], u64> = HashMap::new(); + (map, Vec::::with_capacity(rows)) + }, + |(mut map, mut gids)| { + let mut next_id: u64 = 0; + for &code in &codes { + let start = dict_offsets[code as usize] as usize; + let end = dict_offsets[code as usize + 1] as usize; + let entry: &[u8] = &dict_bytes[start..end]; + let gid = *map.entry(entry).or_insert_with(|| { + let id = next_id; + next_id += 1; + id + }); + gids.push(gid); + } + black_box(&gids); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +// --------------------------------------------------------------------------- +// End-to-end encode (no network) +// --------------------------------------------------------------------------- + +fn encode_chunk_group(c: &mut Criterion) { + let rows = row_count(); + let i64_data = make_i64_data(rows); + let f64_data = make_f64_data(rows); + let (offsets, varchar_bytes) = make_varchar(rows, varchar_len()); + let (codes, dict_offsets, dict_bytes) = make_symbol_workload(rows, symbol_cardinality()); + let ts_data = make_i64_data(rows); + + let mut group = c.benchmark_group("encode_chunk"); + group.sample_size(20); // larger workload — fewer samples + group.measurement_time(Duration::from_secs(5)); + group.throughput(Throughput::Elements(rows as u64)); + + let build_chunk = || { + let mut chunk = Chunk::new("ticks"); + chunk.column_i64("qty", &i64_data, None).unwrap(); + chunk.column_f64("price", &f64_data, None).unwrap(); + chunk + .column_varchar("msg", &offsets, &varchar_bytes, None) + .unwrap(); + chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, &dict_bytes, None) + .unwrap(); + chunk.designated_timestamp_nanos(&ts_data).unwrap(); + chunk + }; + + group.bench_function("populate_only", |b| { + b.iter_batched( + || (), + |_| { + let chunk = build_chunk(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + let prebuilt = build_chunk(); + group.bench_function("encode_only", |b| { + b.iter_batched( + BenchEncoderState::new, + |mut state| { + let frame = bench_encode_chunk(&prebuilt, &mut state).unwrap(); + black_box(frame); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("populate_plus_encode", |b| { + b.iter_batched( + BenchEncoderState::new, + |mut state| { + let chunk = build_chunk(); + let frame = bench_encode_chunk(&chunk, &mut state).unwrap(); + black_box(frame); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +criterion_group!( + benches, + bench_column_i64, + bench_column_f64, + bench_column_varchar, + bench_symbol_dict, + encode_chunk_group, +); +criterion_main!(benches); diff --git a/questdb-rs/src/ingress.rs b/questdb-rs/src/ingress.rs index b1569abf..990dda08 100644 --- a/questdb-rs/src/ingress.rs +++ b/questdb-rs/src/ingress.rs @@ -68,6 +68,9 @@ pub use sender::*; mod decimal; pub use decimal::DecimalView; +#[cfg(feature = "sync-sender-qwp-ws")] +pub mod column_sender; + const MAX_NAME_LEN_DEFAULT: usize = 127; /// The maximum allowed dimensions for arrays. diff --git a/questdb-rs/src/ingress/buffer/qwp.rs b/questdb-rs/src/ingress/buffer/qwp.rs index 7446fa25..afcce210 100644 --- a/questdb-rs/src/ingress/buffer/qwp.rs +++ b/questdb-rs/src/ingress/buffer/qwp.rs @@ -5066,6 +5066,13 @@ impl SymbolGlobalDict { self.next_id } + /// Number of global ids assigned so far. The column-sender encoder + /// uses this as the `delta_start` field of the delta-symbol-dict + /// prefix. + pub(crate) fn next_id(&self) -> u64 { + self.next_id + } + pub(crate) fn mark(&self) -> SymbolGlobalDictMark { SymbolGlobalDictMark { entries_len: self.entries.len(), @@ -5082,13 +5089,13 @@ impl SymbolGlobalDict { self.next_id = mark.next_id; } - fn entry(&self, id: u64) -> Option<&[u8]> { + pub(crate) fn entry(&self, id: u64) -> Option<&[u8]> { let index = usize::try_from(id).ok()?; self.entries.get(index).map(Vec::as_slice) } /// Returns `(global_id, is_new)`. - fn intern(&mut self, bytes: &[u8]) -> (u64, bool) { + pub(crate) fn intern(&mut self, bytes: &[u8]) -> (u64, bool) { if let Some(&id) = self.map.get(bytes) { return (id, false); } diff --git a/questdb-rs/src/ingress/column_sender/chunk.rs b/questdb-rs/src/ingress/column_sender/chunk.rs new file mode 100644 index 00000000..ef7c38f1 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/chunk.rs @@ -0,0 +1,1160 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-major chunk: one DataFrame's worth of column buffers destined for +//! a single QuestDB table. +//! +//! The user calls [`Chunk::new`] with a table name, fills it with one +//! `column_*` call per column, optionally pins a designated timestamp, and +//! hands it to [`super::ColumnSender::flush`]. Each `column_*` writes the +//! column straight into wire-shape `Vec` storage so the flush-time +//! encoder only does a header + per-column `extend_from_slice`. + +use std::fmt::{self, Debug, Formatter}; + +use crate::{Result, error}; + +use super::validity::{Validity, check_row_count}; +use super::wire::{ + F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, QWP_TYPE_BOOLEAN, QWP_TYPE_BYTE, + QWP_TYPE_DATE, QWP_TYPE_DOUBLE, QWP_TYPE_FLOAT, QWP_TYPE_INT, QWP_TYPE_IPV4, QWP_TYPE_LONG, + QWP_TYPE_LONG256, QWP_TYPE_SHORT, QWP_TYPE_SYMBOL, QWP_TYPE_TIMESTAMP, + QWP_TYPE_TIMESTAMP_NANOS, QWP_TYPE_UUID, QWP_TYPE_VARCHAR, validate_name, write_qwp_bytes, +}; + +/// One column in a chunk. +/// +/// Numeric and fixed-width columns are pre-encoded to wire shape at +/// append time and stored as [`ChunkColumn::Resolved`]. Symbol columns +/// stage their codes + referenced dict bytes and resolve to wire shape +/// at flush time ([`ChunkColumn::Symbol`]) because the global symbol id +/// is connection-scoped and chunks are sender-agnostic until flushed. +pub(crate) enum ChunkColumn { + Resolved { + #[allow(dead_code)] + name: String, + /// `name_len_varint || name_bytes || wire_type_byte`. + signature_chunk: Vec, + /// `payload[0]` is the null-flag byte; `payload[1..]` is the + /// per-type body (optional bitmap then dense values, or + /// row-count dense values for the no-bitmap shape). + payload: Vec, + }, + Symbol { + #[allow(dead_code)] + name: String, + signature_chunk: Vec, + row_count: usize, + /// Per-row index into `referenced_symbols`. For null rows the + /// value is unspecified — the encoder consults the bitmap before + /// touching the code. + codes: Vec, + /// QWP-shape null bitmap (bit = 1 means NULL). `None` when the + /// column has no nulls — encoder emits `null_flag = 0`. + bitmap: Option>, + non_null_count: usize, + /// Compact list of dict entries this column actually references, + /// indexed by the values in `codes`. Bounded by the chunk's + /// per-column cardinality rather than the (potentially huge) + /// caller dict. + referenced_symbols: Vec>, + }, +} + +impl ChunkColumn { + pub(crate) fn signature(&self) -> &[u8] { + match self { + Self::Resolved { + signature_chunk, .. + } + | Self::Symbol { + signature_chunk, .. + } => signature_chunk, + } + } + + fn name(&self) -> &str { + match self { + Self::Resolved { name, .. } | Self::Symbol { name, .. } => name, + } + } + + #[cfg(test)] + pub(crate) fn resolved_payload(&self) -> &[u8] { + match self { + Self::Resolved { payload, .. } => payload, + Self::Symbol { .. } => panic!("not a Resolved column"), + } + } +} + +/// Designated timestamp slot. Required exactly once per chunk before flush. +pub(crate) struct DesignatedTimestamp { + /// `QWP_TYPE_TIMESTAMP` (0x0A) for micros, `QWP_TYPE_TIMESTAMP_NANOS` + /// (0x10) for nanos. + pub(crate) wire_type: u8, + /// Already wire-shape: `null_flag=0` then `row_count * 8` bytes of LE + /// i64. Designated timestamps are non-null per the wire spec, so no + /// bitmap path. + pub(crate) payload: Vec, +} + +/// One DataFrame's worth of column buffers destined for one QuestDB table. +/// +/// Builders mutate the chunk in-place; on a successful +/// [`super::ColumnSender::flush`] it is cleared (its per-column `Vec` +/// allocations are retained for the next DataFrame). +pub struct Chunk { + pub(crate) table: String, + /// Locked by the first `column_*` call. `None` means the chunk has no + /// columns yet and the next append will set it. + pub(crate) row_count: Option, + pub(crate) columns: Vec, + pub(crate) designated_ts: Option, +} + +impl Chunk { + /// Create a chunk for `table`. The table name is validated at flush + /// time against the QWP/Java client length cap (127 bytes UTF-8). + pub fn new(table: impl Into) -> Self { + Self { + table: table.into(), + row_count: None, + columns: Vec::new(), + designated_ts: None, + } + } + + /// Table name the chunk's rows will land in. + pub fn table(&self) -> &str { + &self.table + } + + /// Number of rows in the chunk. Locked by the first column append; + /// returns `0` before any column has been appended. + pub fn row_count(&self) -> usize { + self.row_count.unwrap_or(0) + } + + /// `true` iff the chunk has no columns and no designated timestamp. + pub fn is_empty(&self) -> bool { + self.row_count.is_none() && self.designated_ts.is_none() + } + + /// Reset the chunk for reuse: clears all rows but keeps each column's + /// allocated capacity. Called automatically after a successful flush. + pub fn clear(&mut self) { + self.row_count = None; + // Drop the column slots; we keep the outer Vec's capacity so the + // next chunk's `push_column` reuses the slot count without + // reallocating the Vec itself. + self.columns.clear(); + self.designated_ts = None; + } + + // ------------------------------------------------------------------ + // Numeric & fixed-width columns + // ------------------------------------------------------------------ + + /// `BYTE` column. Nullable rows are sentinel-encoded as 0 on the wire. + pub fn column_i8( + &mut self, + name: &str, + data: &[i8], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + let mut payload = new_payload(); + payload.push(0); // null_flag + match validity { + None => { + // Safety: `i8` and `u8` have identical layout; the cast + // gives a byte slice without copying. + let bytes: &[u8] = + unsafe { std::slice::from_raw_parts(data.as_ptr().cast::(), data.len()) }; + payload.extend_from_slice(bytes); + } + Some(v) => { + for (i, &value) in data.iter().enumerate() { + let out = if v.is_valid(i) { value } else { I8_NULL }; + payload.push(out as u8); + } + } + } + self.push_column(name, QWP_TYPE_BYTE, payload, row_count) + } + + /// `SHORT` column. Nullable rows are sentinel-encoded as 0. + pub fn column_i16( + &mut self, + name: &str, + data: &[i16], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_numeric( + self, + name, + QWP_TYPE_SHORT, + data, + validity, + I16_NULL, + i16::to_le_bytes, + ) + } + + /// `INT` column. Nullable rows are sentinel-encoded as `i32::MIN`. + pub fn column_i32( + &mut self, + name: &str, + data: &[i32], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_numeric( + self, + name, + QWP_TYPE_INT, + data, + validity, + I32_NULL, + i32::to_le_bytes, + ) + } + + /// `LONG` column. Nullable rows are sentinel-encoded as `i64::MIN`. + pub fn column_i64( + &mut self, + name: &str, + data: &[i64], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_numeric( + self, + name, + QWP_TYPE_LONG, + data, + validity, + I64_NULL, + i64::to_le_bytes, + ) + } + + /// `FLOAT` column. Nullable rows are sentinel-encoded as `NaN`. + pub fn column_f32( + &mut self, + name: &str, + data: &[f32], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_numeric( + self, + name, + QWP_TYPE_FLOAT, + data, + validity, + F32_NULL, + f32::to_le_bytes, + ) + } + + /// `DOUBLE` column. Nullable rows are sentinel-encoded as `NaN`. + pub fn column_f64( + &mut self, + name: &str, + data: &[f64], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_numeric( + self, + name, + QWP_TYPE_DOUBLE, + data, + validity, + F64_NULL, + f64::to_le_bytes, + ) + } + + /// `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap + /// (1 = true). Nullable rows are encoded as `false` on the wire — the + /// row-API + QuestDB convention. + pub fn column_bool( + &mut self, + name: &str, + data: &[u8], + row_count: usize, + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + let bytes_required = row_count.div_ceil(8); + if data.len() < bytes_required { + return Err(error::fmt!( + InvalidApiCall, + "Boolean column data too short: {} bytes for {} rows (need at least {})", + data.len(), + row_count, + bytes_required + )); + } + let row_count = check_row_count(self.row_count, row_count, validity)?; + let mut payload = new_payload(); + payload.push(0); // null_flag — bool always uses sentinel encoding + + let mut packed = 0u8; + let mut bit_idx = 0u8; + for i in 0..row_count { + let bit = (data[i / 8] >> (i % 8)) & 1; + let valid = validity.is_none_or(|v| v.is_valid(i)); + if bit == 1 && valid { + packed |= 1u8 << bit_idx; + } + bit_idx += 1; + if bit_idx == 8 { + payload.push(packed); + packed = 0; + bit_idx = 0; + } + } + if bit_idx != 0 { + payload.push(packed); + } + self.push_column(name, QWP_TYPE_BOOLEAN, payload, row_count) + } + + // ------------------------------------------------------------------ + // Bitmap-style fixed-width columns (sparse-null types) + // ------------------------------------------------------------------ + + /// `UUID` column. `data[i]` is a 16-byte UUID per row (bytes 0..8 lo + /// half LE, 8..16 hi half LE — same layout as the row-API path). + pub fn column_uuid( + &mut self, + name: &str, + data: &[[u8; 16]], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_fixed_width_bitmap(self, name, QWP_TYPE_UUID, data, validity, 16) + } + + /// `LONG256` column. `data[i]` is a 32-byte LONG256 per row (4 LE + /// 64-bit limbs, least-significant first). + pub fn column_long256( + &mut self, + name: &str, + data: &[[u8; 32]], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_fixed_width_bitmap(self, name, QWP_TYPE_LONG256, data, validity, 32) + } + + /// `IPV4` column. Each `data[i]` is a `u32::from(Ipv4Addr)` (octet 0 + /// in the high byte) encoded little-endian on the wire. + pub fn column_ipv4( + &mut self, + name: &str, + data: &[u32], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_bitmap(self, name, QWP_TYPE_IPV4, data, validity, u32::to_le_bytes) + } + + /// `TIMESTAMP_NANOS` column (wire type `0x10`). + pub fn column_ts_nanos( + &mut self, + name: &str, + data: &[i64], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_bitmap( + self, + name, + QWP_TYPE_TIMESTAMP_NANOS, + data, + validity, + i64::to_le_bytes, + ) + } + + /// `TIMESTAMP` (microseconds) column (wire type `0x0A`). + pub fn column_ts_micros( + &mut self, + name: &str, + data: &[i64], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_bitmap( + self, + name, + QWP_TYPE_TIMESTAMP, + data, + validity, + i64::to_le_bytes, + ) + } + + /// `DATE` column. Milliseconds since the Unix epoch on the wire. + pub fn column_date_millis( + &mut self, + name: &str, + data: &[i64], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + encode_le_bitmap(self, name, QWP_TYPE_DATE, data, validity, i64::to_le_bytes) + } + + // ------------------------------------------------------------------ + // Variable-width text (VARCHAR) + // ------------------------------------------------------------------ + + /// `VARCHAR` column (QWP wire type `0x0F`). + /// + /// Input is Arrow Utf8 shape: `offsets` has `row_count + 1` entries, + /// monotonically non-decreasing, where `bytes[offsets[i]..offsets[i+1]]` + /// is the value for row `i`. `offsets[0]` may be non-zero (the column + /// encoder rebases to 0 on the wire). + /// + /// Wire output: dense (only non-null values), `non_null_count + 1` + /// little-endian u32 offsets starting at 0, followed by the + /// concatenated bytes of the non-null rows. + /// + /// UTF-8 validity is the caller's responsibility; invalid UTF-8 is + /// detected by the server and surfaced as a server rejection. + pub fn column_varchar( + &mut self, + name: &str, + offsets: &[i32], + bytes: &[u8], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + // Arrow Utf8 layout: offsets length is row_count + 1. We can't + // call `check_row_count(.. offsets.len() ..)` because the data is + // really `offsets.len() - 1` rows. + if offsets.is_empty() { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must have at least one entry (row_count + 1)" + )); + } + let row_count = offsets.len() - 1; + let row_count = check_row_count(self.row_count, row_count, validity)?; + + validate_varchar_offsets(offsets, bytes.len())?; + + let mut payload = new_payload(); + match validity { + None => { + payload.push(0); // null_flag + // Rebase offsets to start at 0 and write them as LE u32. + payload.reserve(4 * (row_count + 1) + bytes.len()); + let base = offsets[0]; + if base == 0 { + // Common case: contiguous arrow buffer, base == 0 — the + // i32 LE bytes are bit-identical to u32 LE bytes for + // non-negative values, so memcpy the offset table. + let offset_bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + offsets.as_ptr().cast::(), + std::mem::size_of_val(offsets), + ) + }; + payload.extend_from_slice(offset_bytes); + // Bytes: copy the in-use slice (caller's buffer may be + // longer than the last offset). + let used = offsets[row_count] as usize; + payload.extend_from_slice(&bytes[..used]); + } else { + for &offset in offsets { + let normalized = (offset - base) as u32; + payload.extend_from_slice(&normalized.to_le_bytes()); + } + let start = base as usize; + let end = offsets[row_count] as usize; + payload.extend_from_slice(&bytes[start..end]); + } + } + Some(v) => { + payload.push(1); // null_flag — bitmap follows + v.write_qwp_bitmap(&mut payload); + + // Dense offsets: walk non-null rows once, then append the + // matching bytes. We size the offset table conservatively + // and patch it as we go to avoid a separate pass. + let non_null = v.non_null_count(); + let offsets_start = payload.len(); + payload.resize(offsets_start + 4 * (non_null + 1), 0); + // First dense offset is always 0. + payload[offsets_start..offsets_start + 4].copy_from_slice(&0u32.to_le_bytes()); + + let mut cumulative: u32 = 0; + let mut next_offset_idx = 1usize; + let bytes_anchor = payload.len(); + for i in 0..row_count { + if !v.is_valid(i) { + continue; + } + // Skip slicing for null rows — caller's offsets there + // are not trusted (Arrow allows arbitrary values). + let start = offsets[i] as usize; + let end = offsets[i + 1] as usize; + let len = end - start; + payload.extend_from_slice(&bytes[start..end]); + let new_cumulative = cumulative.checked_add(len as u32).ok_or_else(|| { + error::fmt!(InvalidApiCall, "VARCHAR column bytes exceed u32::MAX") + })?; + cumulative = new_cumulative; + let off = offsets_start + 4 * next_offset_idx; + payload[off..off + 4].copy_from_slice(&cumulative.to_le_bytes()); + next_offset_idx += 1; + } + debug_assert_eq!(next_offset_idx - 1, non_null); + debug_assert_eq!(payload.len() - bytes_anchor, cumulative as usize); + } + } + self.push_column(name, QWP_TYPE_VARCHAR, payload, row_count) + } + + // ------------------------------------------------------------------ + // Symbol columns (dictionary-encoded fast path) + // ------------------------------------------------------------------ + + /// `SYMBOL` column with `i8` dictionary codes (max dict cardinality + /// 128 — caller should promote to `i16`/`i32` for larger dicts). + pub fn symbol_dict_i8( + &mut self, + name: &str, + codes: &[i8], + dict_offsets: &[i32], + dict_bytes: &[u8], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + push_symbol_column( + self, + name, + codes, + |c| *c as i32, + dict_offsets, + dict_bytes, + validity, + ) + } + + /// `SYMBOL` column with `i16` dictionary codes. + pub fn symbol_dict_i16( + &mut self, + name: &str, + codes: &[i16], + dict_offsets: &[i32], + dict_bytes: &[u8], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + push_symbol_column( + self, + name, + codes, + |c| *c as i32, + dict_offsets, + dict_bytes, + validity, + ) + } + + /// `SYMBOL` column with `i32` dictionary codes — the Pandas + /// `Categorical` / Polars `Categorical` shape. + pub fn symbol_dict_i32( + &mut self, + name: &str, + codes: &[i32], + dict_offsets: &[i32], + dict_bytes: &[u8], + validity: Option<&Validity<'_>>, + ) -> Result<&mut Self> { + push_symbol_column( + self, + name, + codes, + |c| *c, + dict_offsets, + dict_bytes, + validity, + ) + } + + // ------------------------------------------------------------------ + // Designated timestamp + // ------------------------------------------------------------------ + + /// Designated timestamp in microseconds since the Unix epoch (wire + /// type `TIMESTAMP` 0x0A). Required exactly once per chunk before + /// flush. Designated timestamps must be non-null per the wire spec — + /// there is no validity bitmap. + pub fn designated_timestamp_micros(&mut self, data: &[i64]) -> Result<&mut Self> { + self.set_designated_ts(QWP_TYPE_TIMESTAMP, data) + } + + /// Designated timestamp in nanoseconds since the Unix epoch (wire + /// type `TIMESTAMP_NANOS` 0x10). + pub fn designated_timestamp_nanos(&mut self, data: &[i64]) -> Result<&mut Self> { + self.set_designated_ts(QWP_TYPE_TIMESTAMP_NANOS, data) + } + + fn set_designated_ts(&mut self, wire_type: u8, data: &[i64]) -> Result<&mut Self> { + if self.designated_ts.is_some() { + return Err(error::fmt!( + InvalidApiCall, + "designated timestamp already set on this chunk" + )); + } + let row_count = check_row_count(self.row_count, data.len(), None)?; + let mut payload = new_payload(); + payload.push(0); // null_flag — designated_ts is always non-null + payload.reserve(8 * data.len()); + for &v in data { + payload.extend_from_slice(&v.to_le_bytes()); + } + self.row_count = Some(row_count); + self.designated_ts = Some(DesignatedTimestamp { wire_type, payload }); + Ok(self) + } + + // ------------------------------------------------------------------ + // Internal helpers + // ------------------------------------------------------------------ + + fn push_column( + &mut self, + name: &str, + wire_type: u8, + payload: Vec, + row_count: usize, + ) -> Result<&mut Self> { + validate_name("column", name)?; + self.guard_unique_name(name)?; + let signature_chunk = build_signature_chunk(name, wire_type); + self.columns.push(ChunkColumn::Resolved { + name: name.to_owned(), + signature_chunk, + payload, + }); + self.row_count = Some(row_count); + Ok(self) + } + + fn guard_unique_name(&self, name: &str) -> Result<()> { + if self.columns.iter().any(|c| c.name() == name) { + return Err(error::fmt!( + InvalidApiCall, + "duplicate column name in chunk: {:?}", + name + )); + } + Ok(()) + } +} + +fn build_signature_chunk(name: &str, wire_type: u8) -> Vec { + let mut sig = Vec::with_capacity(1 + name.len() + 1); + write_qwp_bytes(&mut sig, name.as_bytes()); + sig.push(wire_type); + sig +} + +fn new_payload() -> Vec { + // 1 byte null_flag, room for a small bitmap, and most callers extend + // immediately. 16 bytes is enough to avoid the first realloc for any + // short column. + Vec::with_capacity(16) +} + +/// Bulk-intern a symbol column at append time. +/// +/// Three passes (each O(row_count) or O(dict_len) but never the +/// product): +/// 1. Walk `codes` once to mark which dict entries the chunk actually +/// references in a bitset. Validate range; reject out-of-range. +/// 2. Walk the bitset to copy referenced dict entries into compact +/// `referenced_symbols` storage and build a `local → internal` map +/// keyed by dict index. +/// 3. Walk `codes` again to translate to the compact internal indices +/// and build the QWP-shape bitmap from validity. +/// +/// Defers the connection-scoped global-id assignment to flush time +/// because chunks are sender-agnostic — see `doc/COLUMN_SENDER_PLAN.md`. +fn push_symbol_column<'a, T, F>( + chunk: &'a mut Chunk, + name: &str, + codes: &[T], + to_i32: F, + dict_offsets: &[i32], + dict_bytes: &[u8], + validity: Option<&Validity<'_>>, +) -> Result<&'a mut Chunk> +where + F: Fn(&T) -> i32, +{ + let row_count = check_row_count(chunk.row_count, codes.len(), validity)?; + validate_name("column", name)?; + chunk.guard_unique_name(name)?; + + if dict_offsets.is_empty() { + return Err(error::fmt!( + InvalidApiCall, + "symbol dict offsets must have at least one entry (dict_len + 1)" + )); + } + validate_varchar_offsets(dict_offsets, dict_bytes.len())?; + let dict_len = dict_offsets.len() - 1; + + // Pass 1: referenced bitset + range check. + let mut referenced = vec![false; dict_len]; + let mut non_null_count = 0usize; + for (i, code) in codes.iter().enumerate() { + if !validity.is_none_or(|v| v.is_valid(i)) { + continue; + } + let idx = to_i32(code); + if idx < 0 || (idx as usize) >= dict_len { + return Err(error::fmt!( + InvalidApiCall, + "symbol code out of range: row {} -> {} (dict_len = {})", + i, + idx, + dict_len + )); + } + referenced[idx as usize] = true; + non_null_count += 1; + } + + // Pass 2: compact referenced dict + build local-to-internal map. + // `local_to_internal[d] == u32::MAX` for unreferenced entries; we + // never index it with an unreferenced code (pass 1 marked them so + // pass 3 only follows referenced entries). `dict_offsets` are + // absolute byte offsets into `dict_bytes` per the Arrow Utf8 layout + // (`validate_varchar_offsets` has already proven the slices are in + // bounds and monotonic). + let mut local_to_internal = vec![u32::MAX; dict_len]; + let mut referenced_symbols: Vec> = Vec::new(); + for (d, mark) in referenced.iter().enumerate() { + if !*mark { + continue; + } + let start = dict_offsets[d] as usize; + let end = dict_offsets[d + 1] as usize; + let internal = referenced_symbols.len() as u32; + referenced_symbols.push(dict_bytes[start..end].to_vec()); + local_to_internal[d] = internal; + } + + // Pass 3: translate codes to internal indices; build QWP bitmap. + let mut compact_codes = Vec::with_capacity(codes.len()); + for (i, code) in codes.iter().enumerate() { + if !validity.is_none_or(|v| v.is_valid(i)) { + compact_codes.push(u32::MAX); + continue; + } + let idx = to_i32(code) as usize; + compact_codes.push(local_to_internal[idx]); + } + let bitmap = validity.map(|v| { + let mut bm = Vec::with_capacity(row_count.div_ceil(8)); + v.write_qwp_bitmap(&mut bm); + bm + }); + + let signature_chunk = build_signature_chunk(name, QWP_TYPE_SYMBOL); + chunk.columns.push(ChunkColumn::Symbol { + name: name.to_owned(), + signature_chunk, + row_count, + codes: compact_codes, + bitmap, + non_null_count, + referenced_symbols, + }); + chunk.row_count = Some(row_count); + Ok(chunk) +} + +fn validate_varchar_offsets(offsets: &[i32], bytes_len: usize) -> Result<()> { + // Arrow Utf8 promises monotonic non-decreasing offsets and that every + // offset is ≤ bytes_len. We trust UTF-8 (server enforces) but cheap + // bounds checking here saves the server an obvious parse error and + // gives us a meaningful Rust-side error. + let mut prev = offsets[0]; + if prev < 0 { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must be non-negative (offsets[0] = {})", + prev + )); + } + for (i, &off) in offsets.iter().enumerate().skip(1) { + if off < prev { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must be non-decreasing (offsets[{}] = {} < offsets[{}] = {})", + i, + off, + i - 1, + prev + )); + } + prev = off; + } + if (prev as usize) > bytes_len { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets exceed bytes buffer: last offset = {}, bytes_len = {}", + prev, + bytes_len + )); + } + Ok(()) +} + +#[inline] +fn encode_le_numeric<'a, T, const N: usize, F>( + chunk: &'a mut Chunk, + name: &str, + wire_type: u8, + data: &[T], + validity: Option<&Validity<'_>>, + null_value: T, + to_le: F, +) -> Result<&'a mut Chunk> +where + T: Copy, + F: Fn(T) -> [u8; N], +{ + let row_count = check_row_count(chunk.row_count, data.len(), validity)?; + let mut payload = new_payload(); + payload.push(0); // null_flag — non-sparse-null types always use sentinels + payload.reserve(N * row_count); + match validity { + None => { + // Safety: `[T]` and the resulting `[u8]` view share the same + // backing memory; `T` is a plain numeric POD so any byte + // pattern is sound. This is the column-sender hot path — pure + // memcpy. + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts(data.as_ptr().cast::(), std::mem::size_of_val(data)) + }; + payload.extend_from_slice(bytes); + } + Some(v) => { + for (i, &value) in data.iter().enumerate() { + let out = if v.is_valid(i) { value } else { null_value }; + payload.extend_from_slice(&to_le(out)); + } + } + } + chunk.push_column(name, wire_type, payload, row_count) +} + +#[inline] +fn encode_le_bitmap<'a, T, const N: usize, F>( + chunk: &'a mut Chunk, + name: &str, + wire_type: u8, + data: &[T], + validity: Option<&Validity<'_>>, + to_le: F, +) -> Result<&'a mut Chunk> +where + T: Copy, + F: Fn(T) -> [u8; N], +{ + let row_count = check_row_count(chunk.row_count, data.len(), validity)?; + let mut payload = new_payload(); + match validity { + None => { + payload.push(0); // null_flag + payload.reserve(N * row_count); + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts(data.as_ptr().cast::(), std::mem::size_of_val(data)) + }; + payload.extend_from_slice(bytes); + } + Some(v) => { + payload.push(1); // null_flag — bitmap follows + v.write_qwp_bitmap(&mut payload); + payload.reserve(N * v.non_null_count()); + for (i, &value) in data.iter().enumerate() { + if v.is_valid(i) { + payload.extend_from_slice(&to_le(value)); + } + } + } + } + chunk.push_column(name, wire_type, payload, row_count) +} + +#[inline] +fn encode_fixed_width_bitmap<'a, const N: usize>( + chunk: &'a mut Chunk, + name: &str, + wire_type: u8, + data: &[[u8; N]], + validity: Option<&Validity<'_>>, + elem_size: usize, +) -> Result<&'a mut Chunk> { + debug_assert_eq!(elem_size, N); + let row_count = check_row_count(chunk.row_count, data.len(), validity)?; + let mut payload = new_payload(); + match validity { + None => { + payload.push(0); // null_flag + payload.reserve(N * row_count); + // Bulk memcpy: `[[u8; N]]` is laid out as `N * row_count` bytes + // contiguously, no per-row work. + let bytes: &[u8] = + unsafe { std::slice::from_raw_parts(data.as_ptr().cast::(), N * data.len()) }; + payload.extend_from_slice(bytes); + } + Some(v) => { + payload.push(1); // null_flag — bitmap follows + v.write_qwp_bitmap(&mut payload); + payload.reserve(N * v.non_null_count()); + for (i, value) in data.iter().enumerate() { + if v.is_valid(i) { + payload.extend_from_slice(&value[..]); + } + } + } + } + chunk.push_column(name, wire_type, payload, row_count) +} + +impl Debug for Chunk { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Chunk") + .field("table", &self.table) + .field("row_count", &self.row_count()) + .field("columns", &self.columns.len()) + .field("has_designated_ts", &self.designated_ts.is_some()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn locks_row_count_on_first_column() { + let mut chunk = Chunk::new("t"); + chunk.column_i64("a", &[1, 2, 3], None).unwrap(); + assert_eq!(chunk.row_count(), 3); + let err = chunk.column_i64("b", &[1, 2], None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("row_count")); + } + + #[test] + fn rejects_duplicate_column_name() { + let mut chunk = Chunk::new("t"); + chunk.column_i64("a", &[1], None).unwrap(); + let err = chunk.column_i64("a", &[2], None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("duplicate")); + } + + #[test] + fn rejects_invalid_validity_length() { + let mut chunk = Chunk::new("t"); + let bits = [0xFFu8]; + let v = Validity::from_bitmap(&bits, 8).unwrap(); + let err = chunk.column_i64("a", &[1, 2, 3], Some(&v)).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("Validity bitmap")); + } + + #[test] + fn nullable_i64_sentinel_encodes() { + let mut chunk = Chunk::new("t"); + let bits = [0b0000_0101]; // bits 0,2 valid; bit 1 null + let v = Validity::from_bitmap(&bits, 3).unwrap(); + chunk.column_i64("a", &[10, 99, 20], Some(&v)).unwrap(); + let payload = chunk.columns[0].resolved_payload(); + assert_eq!(payload[0], 0, "null_flag must be 0 for I64"); + let raw: Vec = payload[1..] + .chunks_exact(8) + .map(|b| i64::from_le_bytes(b.try_into().unwrap())) + .collect(); + assert_eq!(raw, vec![10, I64_NULL, 20]); + } + + #[test] + fn nullable_uuid_uses_bitmap() { + let mut chunk = Chunk::new("t"); + let uuids: [[u8; 16]; 3] = [[0x10; 16], [0x99; 16], [0x20; 16]]; + let bits = [0b0000_0101]; // 0 valid, 1 null, 2 valid + let v = Validity::from_bitmap(&bits, 3).unwrap(); + chunk.column_uuid("u", &uuids, Some(&v)).unwrap(); + let payload = chunk.columns[0].resolved_payload(); + assert_eq!(payload[0], 1, "null_flag must be 1 (bitmap follows)"); + // QWP bitmap: bit=1 means NULL. Arrow bits = 0b101 → invert = + // 0b010 masked to 3 bits. + let qwp_bitmap = payload[1]; + assert_eq!(qwp_bitmap & 0b111, 0b010); + // Dense values: rows 0 and 2 only. + let dense = &payload[2..]; + assert_eq!(dense.len(), 32); + assert_eq!(&dense[..16], &[0x10u8; 16]); + assert_eq!(&dense[16..], &[0x20u8; 16]); + } + + #[test] + fn designated_ts_sets_row_count() { + let mut chunk = Chunk::new("t"); + chunk.designated_timestamp_micros(&[1, 2, 3]).unwrap(); + assert_eq!(chunk.row_count(), 3); + let err = chunk.designated_timestamp_nanos(&[4, 5, 6]).unwrap_err(); + assert!(err.msg().contains("designated")); + } + + #[test] + fn clear_resets_columns_but_keeps_table() { + let mut chunk = Chunk::new("t"); + chunk.column_i64("a", &[1], None).unwrap(); + chunk.designated_timestamp_nanos(&[10]).unwrap(); + chunk.clear(); + assert_eq!(chunk.row_count(), 0); + assert!(chunk.is_empty()); + assert_eq!(chunk.table(), "t"); + } + + #[test] + fn name_validation_rejects_overlong_names() { + let mut chunk = Chunk::new("t"); + let too_long = "x".repeat(super::super::wire::MAX_NAME_LEN + 1); + let err = chunk.column_i64(&too_long, &[1], None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidName); + } + + #[test] + fn varchar_no_null_memcpy_path() { + let mut chunk = Chunk::new("t"); + let offsets: [i32; 4] = [0, 3, 7, 11]; + let bytes = b"abcdefghijk"; + chunk.column_varchar("v", &offsets, bytes, None).unwrap(); + let payload = chunk.columns[0].resolved_payload(); + assert_eq!(payload[0], 0, "null_flag"); + // Offset table: 4 u32 little-endian values matching `offsets`. + let table = &payload[1..1 + 16]; + let parsed: Vec = table + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + assert_eq!(parsed, vec![0u32, 3, 7, 11]); + // Byte buffer follows. + assert_eq!(&payload[1 + 16..], bytes); + } + + #[test] + fn varchar_no_null_rebases_non_zero_first_offset() { + let mut chunk = Chunk::new("t"); + // Caller's Arrow slice starts at offset 5. + let offsets: [i32; 3] = [5, 8, 12]; + let bytes = b"_____abcdefg____"; + chunk.column_varchar("v", &offsets, bytes, None).unwrap(); + let payload = chunk.columns[0].resolved_payload(); + assert_eq!(payload[0], 0); + let table = &payload[1..1 + 12]; + let parsed: Vec = table + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + assert_eq!(parsed, vec![0u32, 3, 7]); + assert_eq!(&payload[1 + 12..], b"abcdefg"); + } + + #[test] + fn varchar_nullable_gather_skips_null_rows() { + let mut chunk = Chunk::new("t"); + // 3 rows; row 1 is null. Per the plan we MUST not slice + // bytes[offsets[1]..offsets[2]] for null rows. We assert the + // skip implicitly by reusing the same offset on both sides of + // the null row (so dense bytes still match what's expected) and + // by checking the output's bytes equal the union of non-null + // slices only. + let offsets: [i32; 4] = [0, 3, 3, 6]; + let bytes = b"abcxyz"; + let bits = [0b0000_0101]; // 0 valid, 1 null, 2 valid + let v = Validity::from_bitmap(&bits, 3).unwrap(); + chunk + .column_varchar("v", &offsets, bytes, Some(&v)) + .unwrap(); + let payload = chunk.columns[0].resolved_payload(); + assert_eq!(payload[0], 1, "null_flag = 1 (bitmap follows)"); + // QWP bitmap byte: invert Arrow bits 0b101 → 0b010 (mask to 3 bits). + assert_eq!(payload[1] & 0b111, 0b010); + // 2 non-null rows → 3 offsets (u32 each) = 12 bytes, then bytes. + let offsets_section = &payload[2..2 + 12]; + let parsed: Vec = offsets_section + .chunks_exact(4) + .map(|b| u32::from_le_bytes(b.try_into().unwrap())) + .collect(); + assert_eq!(parsed, vec![0u32, 3, 6]); + assert_eq!(&payload[2 + 12..], b"abcxyz"); + } + + #[test] + fn varchar_rejects_negative_offset() { + let mut chunk = Chunk::new("t"); + let offsets: [i32; 3] = [-1, 1, 2]; + let err = chunk + .column_varchar("v", &offsets, b"ab", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("non-negative"), "msg: {}", err.msg()); + } + + #[test] + fn varchar_rejects_non_monotonic_offsets() { + let mut chunk = Chunk::new("t"); + let offsets: [i32; 3] = [0, 5, 3]; + let err = chunk + .column_varchar("v", &offsets, b"abcde", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("non-decreasing"), "msg: {}", err.msg()); + } + + #[test] + fn varchar_rejects_offsets_past_bytes_end() { + let mut chunk = Chunk::new("t"); + let offsets: [i32; 3] = [0, 2, 7]; + let err = chunk + .column_varchar("v", &offsets, b"abcde", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("bytes buffer"), "msg: {}", err.msg()); + } + + #[test] + fn varchar_rejects_empty_offsets() { + let mut chunk = Chunk::new("t"); + let err = chunk.column_varchar("v", &[], b"", None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + } +} diff --git a/questdb-rs/src/ingress/column_sender/conf.rs b/questdb-rs/src/ingress/column_sender/conf.rs new file mode 100644 index 00000000..f024670c --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/conf.rs @@ -0,0 +1,413 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender connect-string parsing. +//! +//! Extracts pool-specific keys (`pool_size`, `pool_max`, +//! `pool_idle_timeout_ms`, `pool_reap`), refuses store-and-forward keys +//! (`sf_*`, `sender_id`), enforces a QWP/WebSocket schema, and produces a +//! sanitized conf string that the underlying [`crate::ingress::SenderBuilder`] +//! can consume to build per-pool-slot connections. + +use std::time::Duration; + +use crate::{Result, error}; + +/// Default number of warm connections opened eagerly at +/// [`super::QuestDb::connect`]. +pub(crate) const DEFAULT_POOL_SIZE: usize = 1; +/// Default hard cap on auto-grow. +pub(crate) const DEFAULT_POOL_MAX: usize = 64; +/// Default idle timeout before the reaper closes an above-`pool_size` +/// connection. +pub(crate) const DEFAULT_POOL_IDLE_TIMEOUT: Duration = Duration::from_secs(60); + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum PoolReap { + Auto, + Manual, +} + +#[derive(Debug, Clone)] +pub(crate) struct PoolConfig { + pub(crate) pool_size: usize, + pub(crate) pool_max: usize, + pub(crate) pool_idle_timeout: Duration, + pub(crate) pool_reap: PoolReap, +} + +impl Default for PoolConfig { + fn default() -> Self { + Self { + pool_size: DEFAULT_POOL_SIZE, + pool_max: DEFAULT_POOL_MAX, + pool_idle_timeout: DEFAULT_POOL_IDLE_TIMEOUT, + pool_reap: PoolReap::Auto, + } + } +} + +#[derive(Debug, Clone)] +pub(crate) struct ParsedConf { + pub(crate) pool: PoolConfig, + /// `true` iff the connect string opted in to durable acks via + /// `request_durable_ack=on`. Required for `AckLevel::Durable` flushes. + pub(crate) durable_ack_opt_in: bool, +} + +/// Validate and extract pool-specific knobs from a column-sender connect +/// string. +/// +/// The conf string itself is **not** rewritten — the underlying +/// `SenderBuilder` silently ignores the pool keys, so a single parse over the +/// original conf is enough. This function only sanity-checks the schema, +/// refuses store-and-forward keys, and returns the [`PoolConfig`] the pool +/// machinery needs. +pub(crate) fn parse(conf: &str) -> Result { + let Some((service, params)) = conf.split_once("::") else { + return Err(error::fmt!( + ConfigError, + "Invalid column-sender config: missing '::' service separator" + )); + }; + + if !is_qwp_ws_schema(service) { + return Err(error::fmt!( + ConfigError, + "Column-sender requires a QWP/WebSocket connect string \ + (schema must be one of 'qwpws', 'qwpwss', 'ws', or 'wss', \ + got {:?})", + service + )); + } + + let mut pool = PoolConfig::default(); + let mut pool_size_specified = false; + let mut durable_ack_opt_in = false; + + walk_params(params, |key, value| { + if is_refused_key(key) { + return Err(refused_key_error(key)); + } + match key { + "request_durable_ack" => { + durable_ack_opt_in = parse_on_off("request_durable_ack", value)?; + } + "qwp_ws_progress" if value != "background" => { + return Err(error::fmt!( + ConfigError, + "Column-sender requires \"qwp_ws_progress=background\" (got {:?})", + value + )); + } + "pool_size" => { + pool.pool_size = parse_pool_usize(key, value)?; + pool_size_specified = true; + } + "pool_max" => { + let value = parse_pool_usize(key, value)?; + if value == 0 { + return Err(error::fmt!( + ConfigError, + "\"pool_max\" must be greater than 0" + )); + } + pool.pool_max = value; + } + "pool_idle_timeout_ms" => { + let millis: u64 = value.parse().map_err(|_| { + error::fmt!( + ConfigError, + "Invalid value for \"pool_idle_timeout_ms\" (expected non-negative integer): {:?}", + value + ) + })?; + pool.pool_idle_timeout = Duration::from_millis(millis); + } + "pool_reap" => { + pool.pool_reap = match value { + "auto" => PoolReap::Auto, + "manual" => PoolReap::Manual, + other => { + return Err(error::fmt!( + ConfigError, + "Invalid value for \"pool_reap\" (expected 'auto' or 'manual'): {:?}", + other + )); + } + }; + } + _ => { + // Unknown / passthrough — leave the SenderBuilder to handle it. + } + } + Ok(()) + })?; + + if pool_size_specified && pool.pool_size == 0 { + return Err(error::fmt!( + ConfigError, + "\"pool_size\" must be greater than 0" + )); + } + + if pool.pool_size > pool.pool_max { + return Err(error::fmt!( + ConfigError, + "\"pool_size\" ({}) must not exceed \"pool_max\" ({})", + pool.pool_size, + pool.pool_max + )); + } + + Ok(ParsedConf { + pool, + durable_ack_opt_in, + }) +} + +fn parse_on_off(key: &str, value: &str) -> Result { + match value { + "on" => Ok(true), + "off" => Ok(false), + _ => Err(error::fmt!( + ConfigError, + "Invalid value for {:?} (expected 'on' or 'off'): {:?}", + key, + value + )), + } +} + +fn is_qwp_ws_schema(service: &str) -> bool { + service.eq_ignore_ascii_case("qwpws") + || service.eq_ignore_ascii_case("qwpwss") + || service.eq_ignore_ascii_case("ws") + || service.eq_ignore_ascii_case("wss") +} + +fn is_refused_key(key: &str) -> bool { + // Store-and-forward (`sf_*`) is unsupported by the column-sender API in v1 + // — see `doc/COLUMN_SENDER_PLAN.md` §8. The legacy `sender_id` key is part + // of the same SF family and is refused alongside the `sf_*` keys. + key == "sender_id" || key.starts_with("sf_") +} + +fn refused_key_error(key: &str) -> crate::Error { + error::fmt!( + ConfigError, + "Column-sender does not support store-and-forward configuration \ + (key {:?} is refused; use the row-major `Sender` API if you need \ + on-disk durability)", + key + ) +} + +fn parse_pool_usize(key: &str, value: &str) -> Result { + value.parse::().map_err(|_| { + error::fmt!( + ConfigError, + "Invalid value for {:?} (expected non-negative integer): {:?}", + key, + value + ) + }) +} + +/// Walk a parsed conf-string `params` section, invoking `visit(key, value)` +/// for each `key=value;` pair. +/// +/// Mirrors the value-parsing rules of [`crate::ingress::scan_qwp_ws_addr_params`]: +/// a doubled `;;` is treated as a literal semicolon inside a value. +fn walk_params(params: &str, mut visit: F) -> Result<()> +where + F: FnMut(&str, &str) -> Result<()>, +{ + let mut pos = 0usize; + while pos < params.len() { + let Some(eq_rel) = params[pos..].find('=') else { + return Err(error::fmt!( + ConfigError, + "Invalid column-sender config: parameter without '=' at position {}", + pos + )); + }; + let key = ¶ms[pos..pos + eq_rel]; + pos = pos + eq_rel + 1; + + let mut value = String::new(); + while pos < params.len() { + let rest = ¶ms[pos..]; + let mut chars = rest.char_indices(); + let (_, ch) = chars.next().expect("pos is within params"); + if ch == ';' { + let next_pos = pos + ch.len_utf8(); + if params[next_pos..].starts_with(';') { + value.push(';'); + pos = next_pos + 1; + continue; + } + pos = next_pos; + break; + } + value.push(ch); + pos += ch.len_utf8(); + } + + visit(key, value.as_str())?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ErrorCode; + + fn parse_ok(conf: &str) -> ParsedConf { + parse(conf).unwrap_or_else(|e| panic!("expected ok, got {e}")) + } + + fn parse_err(conf: &str) -> crate::Error { + match parse(conf) { + Ok(_) => panic!("expected error for {conf:?}"), + Err(e) => e, + } + } + + #[test] + fn defaults() { + let p = parse_ok("qwpws::addr=localhost:9000;"); + assert_eq!(p.pool.pool_size, DEFAULT_POOL_SIZE); + assert_eq!(p.pool.pool_max, DEFAULT_POOL_MAX); + assert_eq!(p.pool.pool_idle_timeout, DEFAULT_POOL_IDLE_TIMEOUT); + assert_eq!(p.pool.pool_reap, PoolReap::Auto); + } + + #[test] + fn parses_pool_knobs() { + let p = parse_ok( + "qwpws::addr=localhost:9000;pool_size=4;pool_max=8;pool_idle_timeout_ms=10000;pool_reap=manual;", + ); + assert_eq!(p.pool.pool_size, 4); + assert_eq!(p.pool.pool_max, 8); + assert_eq!(p.pool.pool_idle_timeout, Duration::from_secs(10)); + assert_eq!(p.pool.pool_reap, PoolReap::Manual); + } + + #[test] + fn refuses_non_qwp_ws_schema() { + let err = parse_err("http::addr=localhost:9000;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("QWP/WebSocket")); + } + + #[test] + fn refuses_sf_keys() { + for key in [ + "sf_dir", + "sender_id", + "sf_max_bytes", + "sf_max_total_bytes", + "sf_durability", + "sf_append_deadline_millis", + ] { + let conf = format!("qwpws::addr=localhost:9000;{key}=whatever;"); + let err = parse_err(&conf); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!( + err.msg().contains("store-and-forward") && err.msg().contains(key), + "{} -> {}", + key, + err.msg() + ); + } + } + + #[test] + fn refuses_pool_size_zero() { + let err = parse_err("qwpws::addr=localhost:9000;pool_size=0;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_size")); + } + + #[test] + fn refuses_pool_size_above_pool_max() { + let err = parse_err("qwpws::addr=localhost:9000;pool_size=10;pool_max=5;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_size") && err.msg().contains("pool_max")); + } + + #[test] + fn invalid_pool_reap_value() { + let err = parse_err("qwpws::addr=localhost:9000;pool_reap=sometimes;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_reap")); + } + + #[test] + fn ignores_unknown_keys() { + // Unknown keys are passed through to the underlying SenderBuilder, + // which silently ignores its own unknowns. The column-sender layer + // must not error on them either. + let _ = parse_ok("qwpws::addr=localhost:9000;auth_timeout=5000;some_future_key=value;"); + } + + #[test] + fn parses_request_durable_ack() { + let off = parse_ok("qwpws::addr=localhost:9000;"); + assert!(!off.durable_ack_opt_in); + let on = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=on;"); + assert!(on.durable_ack_opt_in); + let explicit_off = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=off;"); + assert!(!explicit_off.durable_ack_opt_in); + } + + #[test] + fn refuses_invalid_request_durable_ack_value() { + let err = parse_err("qwpws::addr=localhost:9000;request_durable_ack=true;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("request_durable_ack")); + } + + #[test] + fn refuses_manual_progress_mode() { + let err = parse_err("qwpws::addr=localhost:9000;qwp_ws_progress=manual;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("qwp_ws_progress")); + } + + #[test] + fn accepts_explicit_background_progress_mode() { + let _ = parse_ok("qwpws::addr=localhost:9000;qwp_ws_progress=background;"); + } + + #[test] + fn doubled_semicolon_in_value() { + // `;;` inside a value should be parsed as a literal `;`, not as a + // record separator. Our walker mirrors `scan_qwp_ws_addr_params` so a + // value containing `;;` does not bleed into the next key. + let _ = parse_ok("qwpws::addr=localhost:9000;password=a;;b;pool_size=2;"); + } +} diff --git a/questdb-rs/src/ingress/column_sender/db.rs b/questdb-rs/src/ingress/column_sender/db.rs new file mode 100644 index 00000000..9ac34280 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/db.rs @@ -0,0 +1,513 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender connection pool. +//! +//! `QuestDb` is a thread-safe pool of [`crate::ingress::Sender`] handles to +//! a single QuestDB QWP/WebSocket endpoint. The pool eagerly opens +//! `pool_size` connections at `connect`, auto-grows up to `pool_max` on +//! demand, and (under `pool_reap=auto`) runs a background thread that closes +//! above-`pool_size` connections after they have been idle for +//! `pool_idle_timeout_ms`. +//! +//! Each pool slot is handed out as a [`BorrowedSender<'_>`] which returns +//! itself to the pool on `Drop`. Slots whose underlying connection has +//! latched into `must_close=true` are dropped on return instead of being +//! recycled. + +use std::fmt::{self, Debug, Formatter}; +use std::marker::PhantomData; +use std::ops::{Deref, DerefMut}; +use std::rc::Rc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Condvar, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; + +use crate::ingress::{Sender, SenderBuilder}; +use crate::{Result, error}; + +use super::conf::{self, PoolReap}; +use super::sender::ColumnSender; + +/// Lower bound on the reaper's wake interval. +const REAPER_MIN_TICK: Duration = Duration::from_secs(5); + +/// Connection pool for the column-major sender API. +/// +/// Construct with [`QuestDb::connect`]. Share the pool across threads — its +/// internal state is `Mutex`-guarded so [`QuestDb::borrow_sender`] / +/// [`QuestDb::reap_idle`] / Drop-driven returns are safe to interleave. +/// +/// Each borrow ([`BorrowedSender`]) is **not** `Send` — it belongs to the +/// thread that borrowed it. To ingest in parallel, borrow one sender per +/// worker thread from the same `QuestDb`. +pub struct QuestDb { + inner: Arc, + reaper: Option>, +} + +struct DbInner { + /// Original connect string. Kept verbatim so auto-grow can spin up a new + /// connection with the same settings. + conf: String, + pool_size: usize, + pool_max: usize, + pool_idle_timeout: Duration, + /// Latched from the connect string. Required for `AckLevel::Durable` + /// flushes; without it, a `Durable` flush returns `InvalidApiCall`. + durable_ack_opt_in: bool, + state: Mutex, + /// Wakes the reaper thread on `shutdown` and lets a future blocking + /// borrow wait for a free slot once we grow `borrow_sender` past + /// fail-fast (not in v1). + cv: Condvar, + shutdown: AtomicBool, +} + +#[derive(Default)] +struct PoolState { + /// Idle connections, oldest-first (FIFO push/pop from the back). + free: Vec, + /// Sum of currently-borrowed senders + in-flight grow operations. + in_use: usize, +} + +impl PoolState { + fn total(&self) -> usize { + self.free.len() + self.in_use + } +} + +struct PoolEntry { + sender: Sender, + /// Connection-scoped schema interner. Travels with the slot so its + /// `(signature → id)` map stays coherent across borrow/return cycles; + /// both client and server build the same map by first-emit order, so + /// dropping it would resync the next FULL emit at id 0 and corrupt + /// the server's schema table. + schema_registry: super::encoder::SchemaRegistry, + /// Connection-scoped global symbol dictionary — same coherence + /// argument: the server tracks ids by first-emit order over the life + /// of the WS connection, so the dict must travel with the slot. + symbol_dict: crate::ingress::buffer::SymbolGlobalDict, + last_idle_at: Instant, +} + +impl QuestDb { + /// Open a pool against `conf`. + /// + /// The connect string must use a QWP/WebSocket schema (`qwpws::` / + /// `qwpwss::` / `ws::` / `wss::`). Pool-specific keys are recognised: + /// + /// | Key | Default | Meaning | + /// |------------------------|---------|----------------------------------------------------------------| + /// | `pool_size` | 1 | Warm / minimum connections, opened eagerly here. | + /// | `pool_max` | 64 | Hard cap on auto-grow. Borrow at the cap returns `InvalidApiCall`. | + /// | `pool_idle_timeout_ms` | 60000 | Above-`pool_size` idle connections are closed after this long. | + /// | `pool_reap` | `auto` | `auto` runs a background reaper; `manual` requires `reap_idle`. | + /// + /// Store-and-forward keys (`sf_*`, `sender_id`) are **refused** here — + /// see `doc/COLUMN_SENDER_PLAN.md` §8. Use the row-major + /// [`crate::ingress::Sender`] API if you need on-disk durability. + pub fn connect(conf: &str) -> Result { + let parsed = conf::parse(conf)?; + let pool_cfg = parsed.pool; + + let mut free = Vec::with_capacity(pool_cfg.pool_size); + let now = Instant::now(); + for slot in 0..pool_cfg.pool_size { + let sender = build_sender(conf).map_err(|err| { + crate::Error::new( + err.code(), + format!( + "Failed to open pool slot {} of {}: {}", + slot + 1, + pool_cfg.pool_size, + err.msg() + ), + ) + })?; + free.push(PoolEntry { + sender, + schema_registry: super::encoder::SchemaRegistry::new(), + symbol_dict: crate::ingress::buffer::SymbolGlobalDict::new(), + last_idle_at: now, + }); + } + + let inner = Arc::new(DbInner { + conf: conf.to_owned(), + pool_size: pool_cfg.pool_size, + pool_max: pool_cfg.pool_max, + pool_idle_timeout: pool_cfg.pool_idle_timeout, + durable_ack_opt_in: parsed.durable_ack_opt_in, + state: Mutex::new(PoolState { free, in_use: 0 }), + cv: Condvar::new(), + shutdown: AtomicBool::new(false), + }); + + let reaper = match pool_cfg.pool_reap { + PoolReap::Auto => Some(spawn_reaper(Arc::clone(&inner))), + PoolReap::Manual => None, + }; + + Ok(Self { inner, reaper }) + } + + /// Borrow a sender. + /// + /// Selection: pop the most-recently-returned slot from the free list; + /// failing that, open a new connection if we are below `pool_max`; + /// failing that, return `InvalidApiCall` (fail-fast at cap). + pub fn borrow_sender(&self) -> Result> { + let cs = self.pick_sender()?; + Ok(BorrowedSender::new(self, cs)) + } + + /// FFI escape hatch: like [`Self::borrow_sender`] but the returned + /// handle is not lifetime-bound to `&self`. Carries an `Arc` + /// internally so it can outlive the user-facing `QuestDb` pointer + /// (the pool's free list and reaper stay alive as long as any + /// borrow is outstanding). + /// + /// Hidden from the Rust API because Rust callers should prefer the + /// lifetime-bound `borrow_sender`, which catches use-after-close at + /// compile time. C callers reach this through `questdb_db_borrow_sender`. + #[doc(hidden)] + pub fn borrow_sender_owned(&self) -> Result { + let cs = self.pick_sender()?; + Ok(OwnedSender { + inner: Arc::clone(&self.inner), + sender: Some(cs), + }) + } + + fn pick_sender(&self) -> Result { + let mut state = self.inner.state.lock().expect("pool mutex poisoned"); + if let Some(entry) = state.free.pop() { + state.in_use += 1; + drop(state); + return Ok(ColumnSender::new( + entry.sender, + entry.schema_registry, + entry.symbol_dict, + self.inner.durable_ack_opt_in, + )); + } + + if state.total() >= self.inner.pool_max { + return Err(error::fmt!( + InvalidApiCall, + "Connection pool exhausted: {} connections are currently borrowed and \ + the pool is at its `pool_max` cap of {}. Return a sender or raise `pool_max`.", + state.in_use, + self.inner.pool_max + )); + } + + // Reserve the slot before releasing the lock so a concurrent + // `borrow_sender` cannot over-grow past `pool_max`. + state.in_use += 1; + drop(state); + + let sender = match build_sender(&self.inner.conf) { + Ok(sender) => sender, + Err(err) => { + let mut state = self.inner.state.lock().expect("pool mutex poisoned"); + state.in_use -= 1; + return Err(err); + } + }; + + Ok(ColumnSender::new( + sender, + super::encoder::SchemaRegistry::new(), + crate::ingress::buffer::SymbolGlobalDict::new(), + self.inner.durable_ack_opt_in, + )) + } + + /// Manually reap idle connections. + /// + /// Closes free-list entries that have been idle longer than + /// `pool_idle_timeout_ms`, never shrinking total connection count below + /// `pool_size`. Returns the number of connections closed. + /// + /// Under the default `pool_reap=auto`, a background thread invokes this + /// logic periodically and this call is harmless. Under + /// `pool_reap=manual`, callers that want shrinking must invoke this on + /// their own cadence. + pub fn reap_idle(&self) -> usize { + reap_idle_inner(&self.inner) + } + + /// Close the pool: stop the reaper (if any), drop all idle connections, + /// and consume `self`. + /// + /// Drop has the same effect; `close` exists for parity with the C ABI + /// (where `Drop` is not available) and to give callers a place to handle + /// any reaper-join errors explicitly in the future. + pub fn close(self) { + drop(self); + } + + /// Snapshot the number of idle (free) connections currently in the pool. + #[doc(hidden)] + pub fn free_count(&self) -> usize { + self.inner + .state + .lock() + .expect("pool mutex poisoned") + .free + .len() + } + + /// Snapshot the number of currently-borrowed (or in-flight-being-built) + /// connections. + #[doc(hidden)] + pub fn in_use_count(&self) -> usize { + self.inner.state.lock().expect("pool mutex poisoned").in_use + } +} + +impl Debug for QuestDb { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let state = self.inner.state.lock(); + let (free, in_use) = match state { + Ok(s) => (s.free.len(), s.in_use), + Err(_) => (0, 0), + }; + f.debug_struct("QuestDb") + .field("pool_size", &self.inner.pool_size) + .field("pool_max", &self.inner.pool_max) + .field("free", &free) + .field("in_use", &in_use) + .finish() + } +} + +impl Drop for QuestDb { + fn drop(&mut self) { + // Wake the reaper and let it observe shutdown. + self.inner.shutdown.store(true, Ordering::SeqCst); + // Notifying under the mutex avoids the lost-wakeup race where the + // reaper has just released the lock and is about to wait. + { + let _g = self.inner.state.lock().expect("pool mutex poisoned"); + self.inner.cv.notify_all(); + } + if let Some(handle) = self.reaper.take() { + let _ = handle.join(); + } + // Remaining free senders are dropped when `inner` (Arc) hits 0. + } +} + +/// A sender borrowed from a [`QuestDb`] pool. +/// +/// On `Drop` the underlying connection is returned to the pool unless it +/// has latched into `must_close=true`, in which case it is dropped (and +/// auto-grow will open a fresh one for the next borrow). +/// +/// `BorrowedSender` is **not** `Send` or `Sync`. The borrowed connection +/// belongs to the borrowing thread for the duration of the borrow. +pub struct BorrowedSender<'a> { + db: &'a QuestDb, + sender: Option, + /// !Send / !Sync marker — `Rc<()>` poisons both auto traits without any + /// runtime cost. + _not_send: PhantomData>, +} + +impl<'a> BorrowedSender<'a> { + fn new(db: &'a QuestDb, sender: ColumnSender) -> Self { + Self { + db, + sender: Some(sender), + _not_send: PhantomData, + } + } +} + +impl Debug for BorrowedSender<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("BorrowedSender") + .field("sender", &self.sender) + .finish() + } +} + +impl Deref for BorrowedSender<'_> { + type Target = ColumnSender; + + fn deref(&self) -> &Self::Target { + self.sender + .as_ref() + .expect("borrowed sender already returned") + } +} + +impl DerefMut for BorrowedSender<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.sender + .as_mut() + .expect("borrowed sender already returned") + } +} + +impl Drop for BorrowedSender<'_> { + fn drop(&mut self) { + let Some(sender) = self.sender.take() else { + return; + }; + return_to_pool(&self.db.inner, sender); + } +} + +/// Owned (lifetime-free) variant of [`BorrowedSender`] used by the C FFI. +/// +/// Holds an `Arc` so the pool's state outlives the user-facing +/// `QuestDb` pointer — the C ABI can free its `questdb_db*` before +/// dropping outstanding `column_sender*` handles without invalidating the +/// free list / mutex. +#[doc(hidden)] +pub struct OwnedSender { + inner: Arc, + sender: Option, +} + +impl OwnedSender { + /// Borrow the underlying [`ColumnSender`] mutably. Always returns a + /// live reference until `Drop` runs. + pub fn get_mut(&mut self) -> &mut ColumnSender { + self.sender + .as_mut() + .expect("OwnedSender already returned to the pool") + } + + /// Inspect the wrapped sender without taking ownership. + pub fn get(&self) -> &ColumnSender { + self.sender + .as_ref() + .expect("OwnedSender already returned to the pool") + } +} + +impl Drop for OwnedSender { + fn drop(&mut self) { + if let Some(sender) = self.sender.take() { + return_to_pool(&self.inner, sender); + } + } +} + +fn return_to_pool(inner: &Arc, sender: ColumnSender) { + let must_close = sender.must_close(); + let mut state = inner.state.lock().expect("pool mutex poisoned"); + state.in_use -= 1; + if !must_close { + state.free.push(PoolEntry { + sender: sender.sender, + schema_registry: sender.schema_registry, + symbol_dict: sender.symbol_dict, + last_idle_at: Instant::now(), + }); + } + // Dropped `sender` (when `must_close`) falls out of scope here, after + // the count was decremented but with the mutex still held — safe + // since `Sender::drop` does not re-enter the pool. + drop(state); +} + +fn build_sender(conf: &str) -> Result { + SenderBuilder::from_conf(conf)?.build() +} + +fn spawn_reaper(inner: Arc) -> JoinHandle<()> { + let tick = reaper_tick(inner.pool_idle_timeout); + thread::Builder::new() + .name("questdb-column-sender-pool-reaper".to_string()) + .spawn(move || reaper_loop(inner, tick)) + .expect("failed to spawn pool reaper thread") +} + +fn reaper_tick(idle_timeout: Duration) -> Duration { + let twelfth = idle_timeout / 12; + if twelfth > REAPER_MIN_TICK { + twelfth + } else { + REAPER_MIN_TICK + } +} + +fn reaper_loop(inner: Arc, tick: Duration) { + loop { + if inner.shutdown.load(Ordering::SeqCst) { + break; + } + let state = inner.state.lock().expect("pool mutex poisoned"); + let (state, _) = inner + .cv + .wait_timeout(state, tick) + .expect("pool mutex poisoned"); + drop(state); + if inner.shutdown.load(Ordering::SeqCst) { + break; + } + reap_idle_inner(&inner); + } +} + +fn reap_idle_inner(inner: &DbInner) -> usize { + // Drop the to-be-closed senders OUTSIDE the lock so closing a connection + // (which may take an unbounded amount of time) does not stall concurrent + // borrows. + let to_drop: Vec = { + let mut state = inner.state.lock().expect("pool mutex poisoned"); + let mut to_drop = Vec::new(); + let now = Instant::now(); + // Free-list is oldest at front, newest at back (push on return / + // pop on borrow). We must protect `total() >= pool_size` after the + // drop, so we count current total once and only drop if total stays + // above the floor. + let mut i = 0; + while i < state.free.len() { + if state.total() <= inner.pool_size { + break; + } + let idle_for = now.saturating_duration_since(state.free[i].last_idle_at); + if idle_for > inner.pool_idle_timeout { + let entry = state.free.remove(i); + to_drop.push(entry.sender); + } else { + i += 1; + } + } + to_drop + }; + let dropped = to_drop.len(); + drop(to_drop); + dropped +} diff --git a/questdb-rs/src/ingress/column_sender/encoder.rs b/questdb-rs/src/ingress/column_sender/encoder.rs new file mode 100644 index 00000000..290404a0 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/encoder.rs @@ -0,0 +1,498 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender QWP/WebSocket frame encoder. +//! +//! Emits a single-table frame (one chunk = one table = one frame). Most +//! column payloads are already in wire shape inside the chunk (see +//! `chunk.rs`); symbol columns resolve to wire bytes here because their +//! global-id assignment is connection-scoped and chunks are +//! sender-agnostic until flushed. + +use std::collections::HashMap; + +use crate::ingress::buffer::SymbolGlobalDict; +use crate::{Result, error}; + +use super::chunk::{Chunk, ChunkColumn}; +use super::wire::{ + MAX_NAME_LEN, QWP_FLAG_DELTA_SYMBOL_DICT, QWP_HEADER_LEN, QWP_MAGIC, QWP_SCHEMA_MODE_FULL, + QWP_SCHEMA_MODE_REFERENCE, QWP_VERSION_1, validate_name, write_qwp_bytes, write_qwp_varint, +}; + +/// Connection-scoped table-schema interner. +/// +/// Each unique signature gets a sequentially-assigned `u64` id. The first +/// emit for a signature uses `QWP_SCHEMA_MODE_FULL`; subsequent emits +/// reuse the id under `QWP_SCHEMA_MODE_REFERENCE`. Both sides of the wire +/// build the same id-by-first-emit mapping; on reconnect both sides reset. +#[derive(Debug, Default)] +pub(crate) struct SchemaRegistry { + by_signature: HashMap, u64>, + next_id: u64, +} + +impl SchemaRegistry { + pub(crate) fn new() -> Self { + Self::default() + } + + fn intern(&mut self, signature: &[u8]) -> (u64, bool) { + if let Some(&id) = self.by_signature.get(signature) { + return (id, false); + } + let id = self.next_id; + self.next_id += 1; + self.by_signature.insert(signature.to_vec(), id); + (id, true) + } + + #[cfg(test)] + pub(crate) fn len(&self) -> usize { + self.by_signature.len() + } +} + +/// Encode `chunk` into a QWP/WebSocket frame. +/// +/// Returns the frame bytes ready to hand to +/// [`crate::ingress::Sender::qwp_ws_publish_raw`]. +pub(crate) fn encode_chunk( + chunk: &Chunk, + schema_registry: &mut SchemaRegistry, + global_dict: &mut SymbolGlobalDict, +) -> Result> { + if chunk.is_empty() { + return Ok(encode_header_only_frame()); + } + if chunk.designated_ts.is_none() { + return Err(error::fmt!( + InvalidApiCall, + "Chunk has no designated timestamp; \ + call designated_timestamp_micros or designated_timestamp_nanos before flush." + )); + } + let row_count = chunk.row_count(); + if row_count == 0 { + return Err(error::fmt!( + InvalidApiCall, + "Chunk row_count is 0; flush at least one row or hand back an empty chunk." + )); + } + validate_name("table", &chunk.table)?; + + let table_bytes = chunk.table.as_bytes(); + if table_bytes.len() > MAX_NAME_LEN { + return Err(error::fmt!( + InvalidName, + "table name is too long: {} bytes (max {})", + table_bytes.len(), + MAX_NAME_LEN + )); + } + + let designated = chunk + .designated_ts + .as_ref() + .expect("guarded by is_none() check above"); + + // Pass 1: resolve symbol columns against the connection-scoped global + // dict so we know the delta-dict prefix BEFORE writing the table + // block. We snapshot the dict's pre-encode size for the rollback + // path below — if anything fails after we touched the dict, the + // server has not yet seen those entries, so dropping them locally + // keeps both sides in sync. + let dict_mark = global_dict.mark(); + let resolution = match resolve_symbols(chunk, global_dict) { + Ok(r) => r, + Err(e) => { + global_dict.rollback(dict_mark); + return Err(e); + } + }; + + // Build the schema signature (registry key + FULL-emit payload). + let column_count = chunk.columns.len() + 1; // +1 for designated timestamp + let mut signature = Vec::with_capacity(column_count * 8); + for col in &chunk.columns { + signature.extend_from_slice(col.signature()); + } + write_qwp_bytes(&mut signature, &[]); + signature.push(designated.wire_type); + + let (schema_id, is_new_schema) = schema_registry.intern(&signature); + + // Pre-allocate the full frame. + let symbol_payload_estimate = resolution + .per_column_payload + .iter() + .filter_map(|p| p.as_ref().map(|v| v.len())) + .sum::(); + let resolved_payload_estimate = chunk + .columns + .iter() + .filter_map(|c| match c { + ChunkColumn::Resolved { payload, .. } => Some(payload.len()), + ChunkColumn::Symbol { .. } => None, + }) + .sum::(); + let payload_estimate = 1 + 10 // dict prefix base (delta_start + count varints) + + resolution.delta_symbol_bytes_estimate + + 1 + table_bytes.len() + + 10 + + 1 + 10 + signature.len() + + resolved_payload_estimate + + symbol_payload_estimate + + designated.payload.len(); + let mut frame = Vec::with_capacity(QWP_HEADER_LEN + payload_estimate); + + write_header_placeholder(&mut frame, /* table_count = */ 1); + let payload_start = frame.len(); + + // Delta-symbol-dict prefix. + write_qwp_varint(&mut frame, resolution.delta_start); + write_qwp_varint(&mut frame, resolution.new_symbols.len() as u64); + for bytes in &resolution.new_symbols { + write_qwp_bytes(&mut frame, bytes); + } + + // Table block header. + write_qwp_bytes(&mut frame, table_bytes); + write_qwp_varint(&mut frame, row_count as u64); + write_qwp_varint(&mut frame, column_count as u64); + + // Schema section. + if is_new_schema { + frame.push(QWP_SCHEMA_MODE_FULL); + write_qwp_varint(&mut frame, schema_id); + frame.extend_from_slice(&signature); + } else { + frame.push(QWP_SCHEMA_MODE_REFERENCE); + write_qwp_varint(&mut frame, schema_id); + } + + // Column payloads. + for (col_idx, col) in chunk.columns.iter().enumerate() { + match col { + ChunkColumn::Resolved { payload, .. } => { + frame.extend_from_slice(payload); + } + ChunkColumn::Symbol { .. } => { + let payload = resolution.per_column_payload[col_idx] + .as_ref() + .expect("symbol payload must have been resolved"); + frame.extend_from_slice(payload); + } + } + } + frame.extend_from_slice(&designated.payload); + + let payload_len = (frame.len() - payload_start) as u32; + frame[8..12].copy_from_slice(&payload_len.to_le_bytes()); + Ok(frame) +} + +struct SymbolResolution { + /// Pre-existing global dict size at encode start; the delta-dict + /// prefix tells the server "ids `delta_start..delta_start + + /// new_symbols.len()` are these new entries". + delta_start: u64, + /// New entries, in the order their ids were assigned. + new_symbols: Vec>, + /// Conservative byte estimate for the delta-dict prefix. + delta_symbol_bytes_estimate: usize, + /// One per column slot; `Some` for symbol columns (wire-shape bytes + /// for that column), `None` for resolved columns. + per_column_payload: Vec>>, +} + +fn resolve_symbols(chunk: &Chunk, global_dict: &mut SymbolGlobalDict) -> Result { + let delta_start = global_dict_len(global_dict); + let mut new_symbols: Vec> = Vec::new(); + let mut delta_symbol_bytes_estimate: usize = 0; + let mut per_column_payload: Vec>> = Vec::with_capacity(chunk.columns.len()); + + for col in &chunk.columns { + match col { + ChunkColumn::Resolved { .. } => per_column_payload.push(None), + ChunkColumn::Symbol { + row_count, + codes, + bitmap, + non_null_count, + referenced_symbols, + .. + } => { + // Map each referenced symbol's internal index → global id, + // remembering new ids so we can append them to the + // delta-dict prefix. + let mut internal_to_global = Vec::with_capacity(referenced_symbols.len()); + for bytes in referenced_symbols { + let (gid, is_new) = global_dict.intern(bytes); + if is_new { + delta_symbol_bytes_estimate += 5 + bytes.len(); + new_symbols.push(bytes.clone()); + } + internal_to_global.push(gid); + } + + // Build the column's wire payload: null_flag + optional + // bitmap + dense varint global ids for non-null rows. + let mut payload = Vec::with_capacity( + 1 + bitmap.as_ref().map_or(0, |b| b.len()) + non_null_count * 4, + ); + match bitmap { + None => payload.push(0), + Some(bm) => { + payload.push(1); + payload.extend_from_slice(bm); + } + } + for (i, &internal) in codes.iter().enumerate() { + let valid = bitmap.as_ref().is_none_or(|bm| qwp_bit_is_valid(bm, i)); + if !valid { + continue; + } + debug_assert!( + internal != u32::MAX, + "valid symbol row at index {i} had sentinel code" + ); + let gid = internal_to_global[internal as usize]; + write_qwp_varint(&mut payload, gid); + } + // Sanity-check: we wrote exactly `non_null_count` ids. + debug_assert_eq!( + *non_null_count, + count_non_null(*row_count, bitmap.as_deref()) + ); + per_column_payload.push(Some(payload)); + } + } + } + + Ok(SymbolResolution { + delta_start, + new_symbols, + delta_symbol_bytes_estimate, + per_column_payload, + }) +} + +fn write_header_placeholder(frame: &mut Vec, table_count: u16) { + frame.extend_from_slice(&QWP_MAGIC); + frame.push(QWP_VERSION_1); + frame.push(QWP_FLAG_DELTA_SYMBOL_DICT); + frame.extend_from_slice(&table_count.to_le_bytes()); + frame.extend_from_slice(&0u32.to_le_bytes()); // payload_len, patched after +} + +fn encode_header_only_frame() -> Vec { + let mut frame = Vec::with_capacity(QWP_HEADER_LEN + 2); + write_header_placeholder(&mut frame, 0); + let payload_start = frame.len(); + write_qwp_varint(&mut frame, 0); // delta_start + write_qwp_varint(&mut frame, 0); // new_symbols_count + let payload_len = (frame.len() - payload_start) as u32; + frame[8..12].copy_from_slice(&payload_len.to_le_bytes()); + frame +} + +/// Inspect the QWP-shape bitmap (bit = 1 means NULL): return `true` iff +/// row `i` is valid. +#[inline] +fn qwp_bit_is_valid(bitmap: &[u8], i: usize) -> bool { + (bitmap[i / 8] >> (i % 8)) & 1 == 0 +} + +#[inline] +fn count_non_null(row_count: usize, bitmap: Option<&[u8]>) -> usize { + match bitmap { + None => row_count, + Some(bm) => (0..row_count).filter(|&i| qwp_bit_is_valid(bm, i)).count(), + } +} + +/// Pre-encode size of the connection-scoped global dict — the +/// `delta_start` field of the QWP delta-symbol-dict prefix. +fn global_dict_len(global_dict: &SymbolGlobalDict) -> u64 { + global_dict.next_id() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ingress::column_sender::Validity; + + fn empty_chunk(table: &str) -> Chunk { + Chunk::new(table) + } + + #[test] + fn empty_chunk_encodes_to_14_bytes() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let frame = encode_chunk(&empty_chunk("trades"), &mut reg, &mut dict).unwrap(); + assert_eq!(frame.len(), 14); + assert_eq!(&frame[0..4], b"QWP1"); + assert_eq!(frame[5], QWP_FLAG_DELTA_SYMBOL_DICT); + assert_eq!(u16::from_le_bytes([frame[6], frame[7]]), 0); + } + + #[test] + fn non_empty_chunk_without_designated_ts_errors() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let mut chunk = Chunk::new("trades"); + chunk.column_i64("a", &[1, 2, 3], None).unwrap(); + let err = encode_chunk(&chunk, &mut reg, &mut dict).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("designated")); + } + + #[test] + fn second_encode_with_same_schema_uses_reference() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let mut c1 = Chunk::new("trades"); + c1.column_i64("price", &[1, 2], None).unwrap(); + c1.designated_timestamp_nanos(&[10, 20]).unwrap(); + let frame1 = encode_chunk(&c1, &mut reg, &mut dict).unwrap(); + + let mut c2 = Chunk::new("trades"); + c2.column_i64("price", &[3, 4], None).unwrap(); + c2.designated_timestamp_nanos(&[30, 40]).unwrap(); + let frame2 = encode_chunk(&c2, &mut reg, &mut dict).unwrap(); + + assert!(frame2.len() < frame1.len()); + assert_eq!(reg.len(), 1, "schema signature interned once"); + + let schema_mode_offset = 12 + 1 + 1 + 1 + "trades".len() + 1 + 1; + assert_eq!(frame1[schema_mode_offset], QWP_SCHEMA_MODE_FULL); + assert_eq!(frame2[schema_mode_offset], QWP_SCHEMA_MODE_REFERENCE); + } + + #[test] + fn distinct_schemas_get_distinct_ids() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let mut a = Chunk::new("a"); + a.column_i64("x", &[1], None).unwrap(); + a.designated_timestamp_nanos(&[1]).unwrap(); + encode_chunk(&a, &mut reg, &mut dict).unwrap(); + + let mut b = Chunk::new("b"); + b.column_f64("y", &[1.0], None).unwrap(); + b.designated_timestamp_nanos(&[1]).unwrap(); + encode_chunk(&b, &mut reg, &mut dict).unwrap(); + + assert_eq!(reg.len(), 2); + } + + #[test] + fn frame_size_grows_with_column_payloads() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let mut chunk = Chunk::new("trades"); + let bits = [0xFFu8]; + let v = Validity::from_bitmap(&bits, 4).unwrap(); + chunk.column_i64("price", &[1, 2, 3, 4], Some(&v)).unwrap(); + chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); + let frame = encode_chunk(&chunk, &mut reg, &mut dict).unwrap(); + assert!(frame.len() > 32); + } + + #[test] + fn symbol_dict_emits_only_referenced_entries() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + + let mut chunk = Chunk::new("trades"); + // Caller dict has 3 entries; rows only reference "alpha" and "gamma". + let dict_bytes = b"alphabetagamma"; + let dict_offsets: [i32; 4] = [0, 5, 9, 14]; + chunk + .symbol_dict_i32( + "sym", + &[0, 2, 0, 2], // alpha, gamma, alpha, gamma + &dict_offsets, + dict_bytes, + None, + ) + .unwrap(); + chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); + let _frame = encode_chunk(&chunk, &mut reg, &mut dict).unwrap(); + // Global dict should have grown by exactly 2 (alpha, gamma) — beta + // is never sent because no row references it. + assert_eq!(global_dict_len(&dict), 2); + } + + #[test] + fn symbol_dict_second_frame_resends_only_new_entries() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let dict_bytes = b"alphabetagamma"; + let dict_offsets: [i32; 4] = [0, 5, 9, 14]; + + let mut c1 = Chunk::new("trades"); + c1.symbol_dict_i32("sym", &[0, 1], &dict_offsets, dict_bytes, None) + .unwrap(); + c1.designated_timestamp_nanos(&[1, 2]).unwrap(); + encode_chunk(&c1, &mut reg, &mut dict).unwrap(); + assert_eq!(global_dict_len(&dict), 2); // alpha, beta + + let mut c2 = Chunk::new("trades"); + // alpha (cached) + gamma (new). + c2.symbol_dict_i32("sym", &[0, 2], &dict_offsets, dict_bytes, None) + .unwrap(); + c2.designated_timestamp_nanos(&[3, 4]).unwrap(); + encode_chunk(&c2, &mut reg, &mut dict).unwrap(); + assert_eq!(global_dict_len(&dict), 3, "gamma added on second frame"); + } + + #[test] + fn symbol_dict_rejects_out_of_range_code() { + let mut chunk = Chunk::new("trades"); + let dict_bytes = b"alpha"; + let dict_offsets: [i32; 2] = [0, 5]; + let err = chunk + .symbol_dict_i32("sym", &[0, 99], &dict_offsets, dict_bytes, None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("out of range")); + } + + #[test] + fn symbol_dict_skips_null_codes() { + let mut chunk = Chunk::new("trades"); + let dict_bytes = b"alpha"; + let dict_offsets: [i32; 2] = [0, 5]; + // Code 99 is out of range, but row 1 is null so its code is not + // validated. + let bits = [0b0000_0001]; + let v = Validity::from_bitmap(&bits, 2).unwrap(); + chunk + .symbol_dict_i32("sym", &[0, 99], &dict_offsets, dict_bytes, Some(&v)) + .expect("null row's bogus code is ignored"); + } +} diff --git a/questdb-rs/src/ingress/column_sender/mod.rs b/questdb-rs/src/ingress/column_sender/mod.rs new file mode 100644 index 00000000..b2e159fc --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/mod.rs @@ -0,0 +1,99 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-major sender for QWP/WebSocket. +//! +//! This is a separate API surface from [`crate::ingress::Sender`] / [`crate::ingress::Buffer`]. +//! It exists to ingest **Pandas/Polars DataFrames into QuestDB at the maximum +//! throughput the QWP/WebSocket wire allows**. See `doc/COLUMN_SENDER_PLAN.md` +//! for the design rationale. +//! +//! The user model is `DataFrame → Table`: +//! +//! - Open a connection pool with [`QuestDb::connect`]. +//! - Borrow a sender with [`QuestDb::borrow_sender`]. +//! - Build a [`Chunk`] of column buffers for one table, then pin a +//! designated timestamp on it. +//! - Flush the chunk synchronously; the call blocks until the server +//! acknowledges at the requested [`AckLevel`]. +//! - Drop the [`BorrowedSender`] to return its connection to the pool. + +mod chunk; +mod conf; +mod db; +mod encoder; +mod sender; +mod validity; +mod wire; + +pub use chunk::Chunk; +pub use db::{BorrowedSender, QuestDb}; +pub use sender::{AckLevel, ColumnSender}; +pub use validity::Validity; + +#[doc(hidden)] +pub use db::OwnedSender; + +/// Internals exposed for criterion benchmarks under +/// `questdb-rs/benches/`. Not part of the public API; bumped freely +/// without semver concerns. +#[doc(hidden)] +pub mod _bench_internals { + use crate::Result; + use crate::ingress::buffer::SymbolGlobalDict; + + use super::chunk::Chunk; + use super::encoder::{SchemaRegistry, encode_chunk}; + + /// Opaque holder for the connection-scoped state the encoder needs. + /// Lets benches reuse the encoder across iterations without + /// promoting [`SchemaRegistry`] / [`SymbolGlobalDict`] to the + /// public API. + pub struct BenchEncoderState { + schema_registry: SchemaRegistry, + symbol_dict: SymbolGlobalDict, + } + + impl Default for BenchEncoderState { + fn default() -> Self { + Self::new() + } + } + + impl BenchEncoderState { + pub fn new() -> Self { + Self { + schema_registry: SchemaRegistry::new(), + symbol_dict: SymbolGlobalDict::new(), + } + } + } + + /// Encode `chunk` against `state`. Mirrors [`encode_chunk`] but + /// hides the internal-state types so the bench module never has to + /// touch them. + pub fn bench_encode_chunk(chunk: &Chunk, state: &mut BenchEncoderState) -> Result> { + encode_chunk(chunk, &mut state.schema_registry, &mut state.symbol_dict) + } +} diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs new file mode 100644 index 00000000..96010bb9 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -0,0 +1,153 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Borrowed-handle types for the column-major sender. +//! +//! A [`ColumnSender`] is one borrowed pool slot. It owns the underlying +//! [`crate::ingress::Sender`], the connection-scoped [`SchemaRegistry`], +//! and the connection-scoped [`SymbolGlobalDict`]: all three travel back +//! into the pool together when the [`super::BorrowedSender`] is dropped. + +use std::fmt::{self, Debug, Formatter}; +use std::time::Duration; + +use crate::ingress::Sender; +use crate::ingress::buffer::SymbolGlobalDict; +use crate::{Result, error}; + +use super::chunk::Chunk; +use super::encoder::{self, SchemaRegistry}; + +/// Acknowledgement level a [`ColumnSender::flush`] call waits for. +/// +/// See `doc/COLUMN_SENDER_PLAN.md` §4 for the rationale and the QWP/WS spec +/// for the status-byte values. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub enum AckLevel { + /// Wait for the server's WAL-commit ACK (spec status `0x00`). Always + /// available. + #[default] + Ok, + /// Wait for the server's object-store durability ACK (spec status + /// `0x02`). Enterprise feature; requires `request_durable_ack=on` in the + /// connect string. Flush returns `InvalidApiCall` otherwise. + Durable, +} + +/// One [`crate::ingress::Sender`] in the pool, wrapped in the column-sender +/// type system. +/// +/// The user reaches this via [`super::BorrowedSender`]. +pub struct ColumnSender { + pub(crate) sender: Sender, + pub(crate) schema_registry: SchemaRegistry, + pub(crate) symbol_dict: SymbolGlobalDict, + /// Latched from the connect string at [`super::QuestDb::connect`]; a + /// [`AckLevel::Durable`] flush is only honoured when this is `true`. + durable_ack_opt_in: bool, +} + +impl Debug for ColumnSender { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("ColumnSender") + .field("sender", &self.sender) + .field("durable_ack_opt_in", &self.durable_ack_opt_in) + .finish() + } +} + +impl ColumnSender { + pub(crate) fn new( + sender: Sender, + schema_registry: SchemaRegistry, + symbol_dict: SymbolGlobalDict, + durable_ack_opt_in: bool, + ) -> Self { + Self { + sender, + schema_registry, + symbol_dict, + durable_ack_opt_in, + } + } + + /// `true` once the underlying QWP/WS connection has latched into a + /// permanently-unusable state. On return to the pool such senders + /// are dropped rather than recycled. + #[must_use] + pub fn must_close(&self) -> bool { + self.sender.must_close() + } + + /// Encode `chunk` into a QWP/WebSocket frame, publish it, and block + /// until the server acknowledges at the requested [`AckLevel`]. + /// + /// On success, `chunk` is cleared (its retained capacity is preserved). + /// On failure, `chunk` is left untouched so the caller can inspect or + /// recover its contents before dropping it. + /// + /// At most one frame is in flight per sender at a time — that is what + /// makes this call synchronous. For parallel ingest, borrow multiple + /// senders from the [`super::QuestDb`] pool, one per worker thread. + /// + /// `AckLevel::Durable` requires the pool to have been opened with + /// `request_durable_ack=on`; otherwise this returns `InvalidApiCall`. + pub fn flush(&mut self, chunk: &mut Chunk, ack_level: AckLevel) -> Result<()> { + if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { + return Err(error::fmt!( + InvalidApiCall, + "AckLevel::Durable requires the pool to be opened with \ + `request_durable_ack=on` in the connect string." + )); + } + + let payload = + encoder::encode_chunk(chunk, &mut self.schema_registry, &mut self.symbol_dict)?; + let fsn = self.sender.qwp_ws_publish_raw(&payload)?; + self.await_ack(fsn)?; + chunk.clear(); + Ok(()) + } + + /// Wait until the underlying connection's cumulative ack watermark + /// reaches `fsn`, or until the connection latches into `must_close`. + fn await_ack(&mut self, fsn: u64) -> Result<()> { + // Poll in 50 ms slices so a connection that latches into + // `must_close` mid-wait is surfaced promptly rather than blocking + // forever on the underlying ack watermark. + const POLL: Duration = Duration::from_millis(50); + loop { + if self.sender.await_acked_fsn(fsn, POLL)? { + return Ok(()); + } + if self.sender.must_close() { + return Err(error::fmt!( + SocketError, + "QWP/WebSocket connection entered a terminal state before \ + the published frame was acknowledged." + )); + } + } + } +} diff --git a/questdb-rs/src/ingress/column_sender/validity.rs b/questdb-rs/src/ingress/column_sender/validity.rs new file mode 100644 index 00000000..66036330 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/validity.rs @@ -0,0 +1,171 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Validity bitmap helpers for the column-major sender. +//! +//! Users pass validity in **Arrow shape**: bit = 1 means valid, LSB-first +//! inside each byte. The QWP wire shape is the inverse: bit = 1 means +//! NULL. The conversion happens here; helpers below also count non-null +//! rows and stream Arrow-bit-set positions for the gather path. + +use crate::{Result, error}; + +/// Public validity bitmap. See `doc/COLUMN_SENDER_FFI_ABI.md` §2.4 for the +/// Arrow semantics the API accepts. +#[derive(Debug)] +pub struct Validity<'a> { + pub(crate) bits: &'a [u8], + pub(crate) bit_len: usize, +} + +impl<'a> Validity<'a> { + /// Borrow `bits` as a validity bitmap of length `bit_len` rows. + /// + /// `bits.len()` must be at least `ceil(bit_len / 8)`. Bits past + /// `bit_len` are ignored by the encoder, so callers do not need to + /// zero them. + pub fn from_bitmap(bits: &'a [u8], bit_len: usize) -> Result { + let required_bytes = bit_len.div_ceil(8); + if bits.len() < required_bytes { + return Err(error::fmt!( + InvalidApiCall, + "validity bitmap too short: {} bytes for {} bits (need at least {})", + bits.len(), + bit_len, + required_bytes + )); + } + Ok(Self { bits, bit_len }) + } + + /// Logical length in bits / rows. + pub fn bit_len(&self) -> usize { + self.bit_len + } + + /// `true` iff bit `idx` is set (row `idx` is **valid**, Arrow shape). + #[inline] + pub(crate) fn is_valid(&self, idx: usize) -> bool { + debug_assert!(idx < self.bit_len); + let byte = self.bits[idx / 8]; + (byte >> (idx % 8)) & 1 == 1 + } + + /// Count non-null (i.e. valid) rows. + pub(crate) fn non_null_count(&self) -> usize { + let full_bytes = self.bit_len / 8; + let trailing_bits = self.bit_len % 8; + let mut count: usize = 0; + for &byte in &self.bits[..full_bytes] { + count += byte.count_ones() as usize; + } + if trailing_bits != 0 { + let mask = (1u8 << trailing_bits) - 1; + count += (self.bits[full_bytes] & mask).count_ones() as usize; + } + count + } + + /// Write the QWP-shape null bitmap (bit = 1 means NULL) for this + /// validity into `out`. Always writes `ceil(bit_len / 8)` bytes; the + /// last byte's high bits past `bit_len` are masked to zero. + pub(crate) fn write_qwp_bitmap(&self, out: &mut Vec) { + let full_bytes = self.bit_len / 8; + let trailing_bits = self.bit_len % 8; + for &byte in &self.bits[..full_bytes] { + out.push(!byte); + } + if trailing_bits != 0 { + let mask = (1u8 << trailing_bits) - 1; + let inverted = !self.bits[full_bytes] & mask; + out.push(inverted); + } + } +} + +/// Validate that a caller-supplied `data` length matches a chunk's locked +/// row count and any validity bitmap. Returns the row count to use. +pub(crate) fn check_row_count( + locked: Option, + data_len: usize, + validity: Option<&Validity<'_>>, +) -> Result { + let row_count = data_len; + if let Some(existing) = locked + && existing != row_count + { + return Err(error::fmt!( + InvalidApiCall, + "Column length mismatch: chunk row_count is {} but this column has {} rows", + existing, + row_count + )); + } + if let Some(v) = validity + && v.bit_len != row_count + { + return Err(error::fmt!( + InvalidApiCall, + "Validity bitmap length ({} bits) does not match column data length ({} rows)", + v.bit_len, + row_count + )); + } + Ok(row_count) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn non_null_count_handles_trailing_bits() { + // 9 bits: 0b1010_1010, 0b0000_0001 — bits 1,3,5,7 valid in byte 0; + // bit 8 (== row 8) valid in byte 1. Trailing bits past row 8 must + // be masked. + let bits = [0b1010_1010, 0xFFu8]; // second byte has every bit set + let v = Validity::from_bitmap(&bits, 9).unwrap(); + assert_eq!(v.non_null_count(), 4 + 1); + } + + #[test] + fn write_qwp_bitmap_inverts_arrow_semantics() { + // Arrow: bit=1 valid. QWP wire: bit=1 NULL. Trailing high bits of + // the last byte are masked to 0. + let bits = [0b1100_1100, 0b0000_0011]; + let v = Validity::from_bitmap(&bits, 12).unwrap(); + let mut out = Vec::new(); + v.write_qwp_bitmap(&mut out); + assert_eq!(out.len(), 2); + assert_eq!(out[0], !0b1100_1100); + // Last byte: invert and mask to 4 valid bits (rows 8..12). + assert_eq!(out[1], (!0b0000_0011) & 0b0000_1111); + } + + #[test] + fn from_bitmap_rejects_short_buffer() { + let err = Validity::from_bitmap(&[0u8], 9).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + } +} diff --git a/questdb-rs/src/ingress/column_sender/wire.rs b/questdb-rs/src/ingress/column_sender/wire.rs new file mode 100644 index 00000000..548d0376 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/wire.rs @@ -0,0 +1,116 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Wire-format helpers for the column-major sender encoder. +//! +//! These are intentionally duplicated from the row-API encoder +//! (`buffer/qwp.rs`): the row helpers are private to that module and the +//! plan calls out the wire surface as a place where we accept the ~100 +//! lines of duplication to keep the column-sender hot path free of +//! cross-module hops. See `doc/COLUMN_SENDER_PLAN.md` §2.1. + +/// QWP message header magic. +pub(crate) const QWP_MAGIC: [u8; 4] = *b"QWP1"; +pub(crate) const QWP_VERSION_1: u8 = 1; +/// Wire-spec flag set on every column-sender frame (matches the row-API +/// `QwpBuffer::encode_ws_message`). +pub(crate) const QWP_FLAG_DELTA_SYMBOL_DICT: u8 = 0x08; +pub(crate) const QWP_HEADER_LEN: usize = 12; + +/// Full schema mode emits the column-definition signature inline. +pub(crate) const QWP_SCHEMA_MODE_FULL: u8 = 0x00; +/// Reference schema mode reuses a previously-FULL signature by id. +pub(crate) const QWP_SCHEMA_MODE_REFERENCE: u8 = 0x01; + +// Wire type codes — duplicated from `buffer/qwp.rs`. See the QWP v1 spec +// (`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md`) +// §Type byte table for the canonical list. +pub(crate) const QWP_TYPE_BOOLEAN: u8 = 0x01; +pub(crate) const QWP_TYPE_BYTE: u8 = 0x02; +pub(crate) const QWP_TYPE_SHORT: u8 = 0x03; +pub(crate) const QWP_TYPE_INT: u8 = 0x04; +pub(crate) const QWP_TYPE_LONG: u8 = 0x05; +pub(crate) const QWP_TYPE_FLOAT: u8 = 0x06; +pub(crate) const QWP_TYPE_DOUBLE: u8 = 0x07; +pub(crate) const QWP_TYPE_TIMESTAMP: u8 = 0x0A; +pub(crate) const QWP_TYPE_DATE: u8 = 0x0B; +pub(crate) const QWP_TYPE_UUID: u8 = 0x0C; +pub(crate) const QWP_TYPE_LONG256: u8 = 0x0D; +pub(crate) const QWP_TYPE_TIMESTAMP_NANOS: u8 = 0x10; +pub(crate) const QWP_TYPE_IPV4: u8 = 0x18; +pub(crate) const QWP_TYPE_VARCHAR: u8 = 0x0F; +pub(crate) const QWP_TYPE_SYMBOL: u8 = 0x09; + +/// Maximum bytes a UTF-8 column or table name is allowed to occupy on the +/// wire. Matches the row-API + Java client cap. +pub(crate) const MAX_NAME_LEN: usize = 127; + +/// Wire-shape sentinels QuestDB treats as NULL for each fixed-width +/// non-bitmap-capable type. The row-API encoder writes these for missing +/// values; the column-sender mirrors them on the nullable path so the +/// wire bytes are byte-compatible with the row encoder. +pub(crate) const I8_NULL: i8 = 0; +pub(crate) const I16_NULL: i16 = 0; +pub(crate) const I32_NULL: i32 = i32::MIN; +pub(crate) const I64_NULL: i64 = i64::MIN; +pub(crate) const F32_NULL: f32 = f32::NAN; +pub(crate) const F64_NULL: f64 = f64::NAN; + +/// Append `value` to `out` as an unsigned QWP varint (LEB128). +#[inline] +pub(crate) fn write_qwp_varint(out: &mut Vec, mut value: u64) { + while value > 0x7F { + out.push(((value & 0x7F) as u8) | 0x80); + value >>= 7; + } + out.push(value as u8); +} + +/// Append a length-prefixed byte string: `varint(len) + bytes`. +#[inline] +pub(crate) fn write_qwp_bytes(out: &mut Vec, bytes: &[u8]) { + write_qwp_varint(out, bytes.len() as u64); + out.extend_from_slice(bytes); +} + +/// Validate a UTF-8 name against the QWP/Java client length cap. +pub(crate) fn validate_name(kind: &'static str, name: &str) -> crate::Result<()> { + if name.is_empty() { + return Err(crate::error::fmt!( + InvalidName, + "{} name must not be empty", + kind + )); + } + if name.len() > MAX_NAME_LEN { + return Err(crate::error::fmt!( + InvalidName, + "{} name is too long: {} bytes (max {})", + kind, + name.len(), + MAX_NAME_LEN + )); + } + Ok(()) +} diff --git a/questdb-rs/src/ingress/sender.rs b/questdb-rs/src/ingress/sender.rs index 257989e2..2bbb0102 100644 --- a/questdb-rs/src/ingress/sender.rs +++ b/questdb-rs/src/ingress/sender.rs @@ -835,6 +835,52 @@ impl Sender { } Ok(()) } + + /// Publish a pre-encoded QWP/WebSocket payload through this sender's + /// replay queue, returning the assigned frame sequence number (FSN). + /// + /// Caller-side escape hatch used by the column-major sender; the row-API + /// path stays on [`Sender::flush_and_get_fsn`]. The payload must already + /// be a valid QWP frame including its 12-byte header. Manual progress + /// mode and non-QWP/WS handlers are rejected with `InvalidApiCall`. + #[cfg(feature = "sync-sender-qwp-ws")] + pub(crate) fn qwp_ws_publish_raw(&mut self, payload: &[u8]) -> Result { + let SyncProtocolHandler::SyncQwpWs(_) = &self.handler else { + return Err(error::fmt!( + InvalidApiCall, + "qwp_ws_publish_raw is only supported for QWP/WebSocket senders \ + in background progress mode." + )); + }; + if let SyncProtocolHandler::SyncQwpWs(state) = &self.handler + && let Err(err) = qwp_ws_check_error_background(state) + { + let _ = self.drain_qwp_ws_error_notifications(); + return Err(err); + } + self.drain_qwp_ws_error_notifications()?; + + if payload.len() > self.max_buf_size { + return Err(qwp_ws_publisher::qwp_ws_encoded_message_size_error( + payload.len(), + self.max_buf_size, + )); + } + + let result = match &mut self.handler { + SyncProtocolHandler::SyncQwpWs(state) => { + qwp_ws_publish_replay_background(state, payload) + } + _ => unreachable!("guarded above"), + }; + if result + .as_ref() + .is_err_and(|err| matches!(err.code(), crate::ErrorCode::SocketError)) + { + self.connected = false; + } + result + } } #[cfg(feature = "sync-sender-qwp-ws")] diff --git a/questdb-rs/src/ingress/sender/qwp_ws.rs b/questdb-rs/src/ingress/sender/qwp_ws.rs index 8f272a68..10082fa1 100644 --- a/questdb-rs/src/ingress/sender/qwp_ws.rs +++ b/questdb-rs/src/ingress/sender/qwp_ws.rs @@ -2778,6 +2778,17 @@ pub(crate) fn flush_qwp_ws( }) } +/// Background-mode escape hatch used by the column-major sender: hand a +/// pre-encoded QWP/WebSocket frame to the replay queue and return its FSN. +/// Bypasses the row-API encoder; the caller is responsible for producing a +/// spec-conformant payload. +pub(crate) fn qwp_ws_publish_replay_background( + state: &mut SyncQwpWsHandlerState, + payload: &[u8], +) -> crate::Result { + state.runner.publish_replay_payload(payload) +} + pub(crate) fn flush_qwp_ws_manual( state: &mut ManualQwpWsHandlerState, buffer: &QwpWsColumnarBuffer, diff --git a/questdb-rs/src/tests.rs b/questdb-rs/src/tests.rs index e5f060a3..8c28c42b 100644 --- a/questdb-rs/src/tests.rs +++ b/questdb-rs/src/tests.rs @@ -54,6 +54,9 @@ mod qwp_ws_publication_probe; #[cfg(feature = "sync-sender-qwp-ws")] mod qwp_ws_java_golden; +#[cfg(feature = "sync-sender-qwp-ws")] +mod column_sender_pool; + mod sender; mod decimal; diff --git a/questdb-rs/src/tests/column_sender_pool.rs b/questdb-rs/src/tests/column_sender_pool.rs new file mode 100644 index 00000000..d1346e54 --- /dev/null +++ b/questdb-rs/src/tests/column_sender_pool.rs @@ -0,0 +1,589 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender pool + flush integration tests (WS-0 through WS-2). +//! +//! - WS-0: eager-open, borrow/return, multi-thread concurrent borrows, +//! fail-fast at `pool_max`, idle reaper. +//! - WS-1: synchronous `flush` round-trip for empty chunks; `AckLevel::Durable` +//! opt-in guard. +//! - WS-2: numeric / fixed-width column round-trip with a designated +//! timestamp; schema reuse across repeated flushes. +//! +//! Pool slots are real [`crate::ingress::Sender`] instances. The mock server +//! defined here accepts the HTTP→WebSocket upgrade so `Sender::build()` +//! succeeds, then either parks on the connection or reads each QWP frame +//! and replies with an OK ack (status 0x00). + +use std::io::Read; +use std::net::TcpListener; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::thread; +use std::time::{Duration, Instant}; + +use crate::ErrorCode; +use crate::ingress::column_sender::{AckLevel, Chunk, QuestDb}; +use crate::tests::qwp_ws::{perform_server_upgrade, read_frame, write_qwp_ok_response}; + +#[derive(Clone, Copy, Debug)] +enum MockMode { + /// Park the connection after upgrade — used by pool-only tests. + Park, + /// Read every QWP frame the client sends and reply with an OK ack. + AckEachFrame, +} + +/// Spawn a mock server that performs the WS upgrade for up to `max_accepts` +/// connections, then parks each accepted connection (drains until EOF). The +/// returned guard's `Drop` signals the accept loop to stop. +struct MockServer { + port: u16, + stop: Arc, + accepted: Arc, + join: Option>, +} + +impl MockServer { + fn spawn(max_accepts: usize) -> Self { + Self::spawn_with_mode(max_accepts, MockMode::Park) + } + + fn spawn_acking(max_accepts: usize) -> Self { + Self::spawn_with_mode(max_accepts, MockMode::AckEachFrame) + } + + fn spawn_with_mode(max_accepts: usize, mode: MockMode) -> Self { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1"); + listener + .set_nonblocking(true) + .expect("set_nonblocking on listener"); + let port = listener.local_addr().expect("local_addr").port(); + + let stop = Arc::new(AtomicBool::new(false)); + let accepted = Arc::new(AtomicUsize::new(0)); + let stop_clone = Arc::clone(&stop); + let accepted_clone = Arc::clone(&accepted); + + let join = thread::Builder::new() + .name("column-sender-pool-mock-server".to_string()) + .spawn(move || { + let mut handles = Vec::new(); + while !stop_clone.load(Ordering::SeqCst) { + match listener.accept() { + Ok((mut stream, _)) => { + if accepted_clone.fetch_add(1, Ordering::SeqCst) >= max_accepts { + // Past the budget — drop without upgrade so + // the client sees a failed connect. + continue; + } + stream + .set_nonblocking(false) + .expect("set_nonblocking(false)"); + let stop = Arc::clone(&stop_clone); + let h = thread::spawn(move || { + if perform_server_upgrade(&mut stream).is_ok() { + match mode { + MockMode::Park => park_connection(&mut stream, &stop), + MockMode::AckEachFrame => { + ack_each_frame(&mut stream, &stop) + } + } + } + }); + handles.push(h); + } + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(_) => break, + } + } + for h in handles { + let _ = h.join(); + } + }) + .expect("spawn mock server"); + + Self { + port, + stop, + accepted, + join: Some(join), + } + } + + fn port(&self) -> u16 { + self.port + } + + fn accepted(&self) -> usize { + self.accepted.load(Ordering::SeqCst) + } +} + +impl Drop for MockServer { + fn drop(&mut self) { + self.stop.store(true, Ordering::SeqCst); + if let Some(h) = self.join.take() { + let _ = h.join(); + } + } +} + +fn park_connection(stream: &mut std::net::TcpStream, stop: &AtomicBool) { + let _ = stream.set_read_timeout(Some(Duration::from_millis(100))); + let mut buf = [0u8; 1024]; + while !stop.load(Ordering::SeqCst) { + match stream.read(&mut buf) { + Ok(0) => break, + Ok(_) => {} + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + continue; + } + Err(_) => break, + } + } +} + +/// Read each WebSocket binary frame the client sends and reply with a QWP +/// OK ack, incrementing the wire sequence per frame. Control frames are +/// ignored. Exits on EOF or `stop`. +fn ack_each_frame(stream: &mut std::net::TcpStream, stop: &AtomicBool) { + let _ = stream.set_read_timeout(Some(Duration::from_millis(50))); + let mut next_wire_seq: u64 = 0; + while !stop.load(Ordering::SeqCst) { + match read_frame(stream) { + Ok((_fin, opcode, _payload)) => { + // Opcode 0x2 = binary; 0x8 = close; everything else is ignored. + if opcode == 0x8 { + break; + } + if opcode != 0x2 { + continue; + } + if write_qwp_ok_response(stream, next_wire_seq).is_err() { + break; + } + next_wire_seq += 1; + } + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + continue; + } + Err(_) => break, + } + } +} + +fn conf_for(port: u16, extras: &str) -> String { + format!( + "qwpws::addr=127.0.0.1:{port};auth_timeout=2000;reconnect_max_duration_millis=1000;{extras}" + ) +} + +#[test] +fn refuses_non_qwp_ws_schema() { + let err = QuestDb::connect("http::addr=localhost:9000;").unwrap_err(); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("QWP/WebSocket")); +} + +#[test] +fn refuses_sf_dir() { + let err = QuestDb::connect("qwpws::addr=localhost:9000;sf_dir=/tmp/sf;").unwrap_err(); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!( + err.msg().contains("store-and-forward") && err.msg().contains("sf_dir"), + "msg: {}", + err.msg() + ); +} + +#[test] +fn eager_open_opens_pool_size_connections() { + let server = MockServer::spawn(8); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=3;pool_max=4;")).unwrap(); + assert_eq!(db.free_count(), 3); + assert_eq!(db.in_use_count(), 0); + // Give the server thread time to register the accepts (the upgrades + // complete before `connect` returns, but the AtomicUsize is incremented + // before `perform_server_upgrade`). + wait_until(Duration::from_secs(2), || server.accepted() == 3); + drop(db); +} + +#[test] +fn borrow_and_return_reuses_connection() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=2;")).unwrap(); + assert_eq!(db.free_count(), 1); + { + let _borrow = db.borrow_sender().expect("borrow"); + assert_eq!(db.free_count(), 0); + assert_eq!(db.in_use_count(), 1); + } + // Drop returns the sender to the pool. + assert_eq!(db.free_count(), 1); + assert_eq!(db.in_use_count(), 0); + // Same physical connection — server only ever accepted one. + assert_eq!(server.accepted(), 1); + drop(db); +} + +#[test] +fn auto_grow_opens_new_connection_until_pool_max() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=3;")).unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2 (auto-grow)"); + let b3 = db.borrow_sender().expect("b3 (auto-grow)"); + assert_eq!(db.free_count(), 0); + assert_eq!(db.in_use_count(), 3); + wait_until(Duration::from_secs(2), || server.accepted() == 3); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + drop(db); +} + +#[test] +fn fail_fast_at_pool_max() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=2;")).unwrap(); + let _b1 = db.borrow_sender().expect("b1"); + let _b2 = db.borrow_sender().expect("b2"); + let err = db.borrow_sender().expect_err("must fail-fast at cap"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!(err.msg().contains("pool_max"), "msg: {}", err.msg()); +} + +#[test] +fn concurrent_borrow_and_return_does_not_deadlock_or_leak() { + let server = MockServer::spawn(16); + let db = + Arc::new(QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=8;")).unwrap()); + let mut handles = Vec::new(); + for _ in 0..8 { + let db = Arc::clone(&db); + handles.push(thread::spawn(move || { + for _ in 0..16 { + let borrow = db.borrow_sender().expect("borrow_sender under contention"); + // Tiny critical section to encourage contention. + std::hint::black_box(&borrow); + thread::yield_now(); + } + })); + } + for h in handles { + h.join().expect("worker thread"); + } + // After all workers finish: every borrow returned. + assert_eq!(db.in_use_count(), 0); + assert!(db.free_count() >= 1); +} + +#[test] +fn manual_reap_closes_excess_idle_connections() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for( + server.port(), + "pool_size=1;pool_max=3;pool_idle_timeout_ms=50;pool_reap=manual;", + )) + .unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2 (grow)"); + let b3 = db.borrow_sender().expect("b3 (grow)"); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + + // Reap before the idle timeout — nothing should be closed. + let immediate = db.reap_idle(); + assert_eq!(immediate, 0); + assert_eq!(db.free_count(), 3); + + // Wait past the idle timeout, then reap. Must keep `pool_size` warm. + thread::sleep(Duration::from_millis(120)); + let closed = db.reap_idle(); + assert_eq!(closed, 2, "should reap the two excess-over-pool_size slots"); + assert_eq!(db.free_count(), 1, "pool_size warm slot must stay"); + drop(db); +} + +#[test] +fn auto_reaper_closes_excess_idle_connections() { + let server = MockServer::spawn(4); + // tick = max(5s, timeout/12); use a long-enough timeout that timeout/12 + // > 5s so the reaper wakes promptly on its own ticker. + let db = QuestDb::connect(&conf_for( + server.port(), + "pool_size=1;pool_max=3;pool_idle_timeout_ms=100;pool_reap=auto;", + )) + .unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2"); + let b3 = db.borrow_sender().expect("b3"); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + + // Auto reaper wakes on a `max(5s, timeout/12)` ticker. With timeout=100ms, + // the floor of 5s applies. Wait > 5s for the first wake-up. + let reaped = wait_until(Duration::from_secs(8), || db.free_count() == 1); + assert!( + reaped, + "auto reaper failed to drain excess; free={}", + db.free_count() + ); + drop(db); +} + +// ---------- WS-1: flush round-trip ---------- + +#[test] +fn refuses_durable_ack_without_opt_in() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let mut chunk = Chunk::new("trades"); + let err = sender + .flush(&mut chunk, AckLevel::Durable) + .expect_err("durable without opt-in must fail"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!( + err.msg().contains("request_durable_ack"), + "msg: {}", + err.msg() + ); +} + +#[test] +fn empty_chunk_flush_round_trips() { + let server = MockServer::spawn_acking(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let mut chunk = Chunk::new("trades"); + assert_eq!(chunk.row_count(), 0); + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("empty-chunk flush must round-trip"); + // Flush clears the chunk. + assert_eq!(chunk.row_count(), 0); +} + +#[test] +fn flush_clears_chunk_for_reuse_and_can_repeat() { + let server = MockServer::spawn_acking(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let mut chunk = Chunk::new("trades"); + for _ in 0..3 { + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("repeated empty flush"); + } +} + +#[test] +fn flush_rejects_chunk_with_no_designated_timestamp() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let mut chunk = Chunk::new("trades"); + chunk + .column_i64("price", &[1, 2, 3], None) + .expect("column_i64"); + let err = sender + .flush(&mut chunk, AckLevel::Ok) + .expect_err("non-empty chunk without designated_ts must error"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!(err.msg().contains("designated"), "msg: {}", err.msg()); + // Chunk is left untouched on failure. + assert_eq!(chunk.row_count(), 3); +} + +#[test] +fn non_empty_chunk_with_numeric_columns_round_trips() { + use crate::ingress::column_sender::Validity; + + let server = MockServer::spawn_acking(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + + let mut chunk = Chunk::new("trades"); + chunk.column_i64("qty", &[10, 20, 30], None).unwrap(); + chunk.column_f64("price", &[1.1, 2.2, 3.3], None).unwrap(); + // Nullable column: bit 1 (row 1) is null. + let bits = [0b0000_0101]; + let v = Validity::from_bitmap(&bits, 3).unwrap(); + chunk + .column_uuid("id", &[[0x10; 16], [0; 16], [0x20; 16]], Some(&v)) + .unwrap(); + chunk + .designated_timestamp_nanos(&[ + 1_700_000_000_000_000_000, + 1_700_000_000_000_001_000, + 1_700_000_000_000_002_000, + ]) + .unwrap(); + assert_eq!(chunk.row_count(), 3); + + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("numeric chunk flush"); + assert!(chunk.is_empty(), "flush must clear the chunk"); + + // Second flush with the SAME schema exercises the SchemaRegistry's + // REFERENCE-mode shortcut: it must still round-trip cleanly. + chunk.column_i64("qty", &[40, 50], None).unwrap(); + chunk.column_f64("price", &[4.4, 5.5], None).unwrap(); + chunk + .column_uuid("id", &[[0x30; 16], [0x40; 16]], None) + .unwrap(); + chunk + .designated_timestamp_nanos(&[1_700_000_000_000_003_000, 1_700_000_000_000_004_000]) + .unwrap(); + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("second flush (schema reuse)"); +} + +#[test] +fn varchar_chunk_round_trips() { + use crate::ingress::column_sender::Validity; + + let server = MockServer::spawn_acking(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + + let mut chunk = Chunk::new("logs"); + // 4 rows: "alpha", null, "gamma", "δ" (multi-byte UTF-8). + let bytes = b"alphagamma\xCE\xB4"; + // Offsets length must be row_count + 1 = 5. The null row reuses the + // same offset on both sides per the plan's "skip slicing for null + // rows" rule. + let offsets: [i32; 5] = [0, 5, 5, 10, 12]; + let bits = [0b0000_1101]; // 0,2,3 valid; 1 null + let v = Validity::from_bitmap(&bits, 4).unwrap(); + chunk + .column_varchar("msg", &offsets, bytes, Some(&v)) + .unwrap(); + chunk + .column_i64("seq", &[100, 101, 102, 103], None) + .unwrap(); + chunk + .designated_timestamp_nanos(&[ + 1_700_000_000_000_000_000, + 1_700_000_000_000_001_000, + 1_700_000_000_000_002_000, + 1_700_000_000_000_003_000, + ]) + .unwrap(); + assert_eq!(chunk.row_count(), 4); + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("varchar flush"); + assert!(chunk.is_empty()); +} + +#[test] +fn symbol_chunk_round_trips_and_reuses_global_dict() { + let server = MockServer::spawn_acking(2); + let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + + // Caller has a 3-entry dict; first chunk only references entries 0 and 2, + // so the wire's delta-symbol-dict prefix carries those two new symbols. + let dict_bytes = b"alphabetagamma"; + let dict_offsets: [i32; 4] = [0, 5, 9, 14]; + + let mut chunk = Chunk::new("trades"); + chunk + .symbol_dict_i32("sym", &[0, 2, 0, 2], &dict_offsets, dict_bytes, None) + .expect("symbol_dict_i32 first flush"); + chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("symbol flush 1"); + + // Second flush re-uses entry 0 ("alpha", already in the global dict) + // and adds entry 1 ("beta"). With the connection-scoped dict the + // wire prefix only resends "beta"; the round-trip must still succeed. + chunk + .symbol_dict_i32("sym", &[1, 0, 1, 0], &dict_offsets, dict_bytes, None) + .expect("symbol_dict_i32 second flush"); + chunk.designated_timestamp_nanos(&[5, 6, 7, 8]).unwrap(); + sender + .flush(&mut chunk, AckLevel::Ok) + .expect("symbol flush 2"); +} + +#[test] +fn close_joins_reaper_cleanly() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for( + server.port(), + // close_flush_timeout_millis bounds the per-Sender close drain, which + // otherwise can wait up to 5s for the mock server's (absent) WS close + // handshake. We only care here that the reaper thread joins. + "pool_size=1;pool_max=2;pool_idle_timeout_ms=500;pool_reap=auto;close_flush_timeout_millis=200;", + )) + .unwrap(); + // Borrow + return so we have something to reap eventually. + let _ = db.borrow_sender().expect("borrow").must_close(); + // close() must return promptly (no hang) — the join is the test. + let start = Instant::now(); + db.close(); + // The bar is "does not hang indefinitely", not strict latency. The + // mock server never replies to a WS close frame, so Sender::drop waits + // out the (200 ms) close-flush timeout; 10 s is plenty of headroom on + // a CI runner under load. + assert!( + start.elapsed() < Duration::from_secs(10), + "close() must not hang on the reaper (took {:?})", + start.elapsed() + ); +} + +fn wait_until bool>(timeout: Duration, mut predicate: F) -> bool { + let deadline = Instant::now() + timeout; + loop { + if predicate() { + return true; + } + if Instant::now() >= deadline { + return false; + } + thread::sleep(Duration::from_millis(50)); + } +} diff --git a/questdb-rs/src/tests/qwp_ws.rs b/questdb-rs/src/tests/qwp_ws.rs index cbd824fa..9e50a040 100644 --- a/questdb-rs/src/tests/qwp_ws.rs +++ b/questdb-rs/src/tests/qwp_ws.rs @@ -41,7 +41,7 @@ use crate::ingress::{ QwpWsProgress, SenderBuilder, SymbolGlobalDict, TableName, TimestampNanos, }; -const WS_GUID: &str = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"; +pub(crate) const WS_GUID: &str = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"; const FIRST_WIRE_SEQUENCE: u64 = 0; const QWP_STATUS_OK: u8 = 0x00; const QWP_STATUS_DURABLE_ACK: u8 = 0x02; @@ -94,7 +94,7 @@ struct MockResult { received_frames: Vec>, } -fn read_request_until_blank(stream: &mut R) -> std::io::Result> { +pub(crate) fn read_request_until_blank(stream: &mut R) -> std::io::Result> { let mut buf = Vec::new(); let mut tmp = [0u8; 256]; loop { @@ -110,7 +110,7 @@ fn read_request_until_blank(stream: &mut R) -> std::io::Result> Ok(buf) } -fn parse_header(req: &str, name: &str) -> Option { +pub(crate) fn parse_header(req: &str, name: &str) -> Option { for line in req.split("\r\n").skip(1) { if let Some((k, v)) = line.split_once(':') && k.trim().eq_ignore_ascii_case(name) @@ -121,7 +121,7 @@ fn parse_header(req: &str, name: &str) -> Option { None } -fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { +pub(crate) fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { let mut hdr = [0u8; 2]; stream.read_exact(&mut hdr)?; let fin = (hdr[0] & 0x80) != 0; @@ -155,7 +155,10 @@ fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { Ok((fin, opcode, payload)) } -fn write_server_binary_frame(stream: &mut TcpStream, payload: &[u8]) -> std::io::Result<()> { +pub(crate) fn write_server_binary_frame( + stream: &mut TcpStream, + payload: &[u8], +) -> std::io::Result<()> { // FIN | binary, no mask (server→client). let mut frame = vec![0x82]; let plen = payload.len(); @@ -172,7 +175,7 @@ fn write_server_binary_frame(stream: &mut TcpStream, payload: &[u8]) -> std::io: stream.write_all(&frame) } -fn perform_server_upgrade(stream: &mut TcpStream) -> std::io::Result> { +pub(crate) fn perform_server_upgrade(stream: &mut TcpStream) -> std::io::Result> { stream.set_read_timeout(Some(Duration::from_secs(5)))?; stream.set_write_timeout(Some(Duration::from_secs(5)))?; @@ -272,7 +275,7 @@ fn write_raw_ws_frame(stream: &mut TcpStream, byte0: u8, payload: &[u8]) -> std: stream.write_all(&frame) } -fn write_qwp_ok_response(stream: &mut TcpStream, wire_seq: u64) -> std::io::Result<()> { +pub(crate) fn write_qwp_ok_response(stream: &mut TcpStream, wire_seq: u64) -> std::io::Result<()> { let mut ok = Vec::new(); ok.push(QWP_STATUS_OK); ok.extend_from_slice(&wire_seq.to_le_bytes()); @@ -325,7 +328,7 @@ fn write_qwp_error_response( write_server_binary_frame(stream, &err) } -fn compute_accept(key_b64: &str) -> String { +pub(crate) fn compute_accept(key_b64: &str) -> String { use base64ct::{Base64, Encoding}; let combined = format!("{key_b64}{WS_GUID}"); let digest = sha1(combined.as_bytes()); @@ -407,7 +410,7 @@ fn upgrade_mock_stream_without_upgrade_header(stream: &mut TcpStream) { // Mirror of the production SHA-1 used by the sender, reproduced here to // validate the upgrade handshake from the server side without poking at // internals. ~50 lines is cheaper than another dependency. -fn sha1(input: &[u8]) -> [u8; 20] { +pub(crate) fn sha1(input: &[u8]) -> [u8; 20] { let (mut h0, mut h1, mut h2, mut h3, mut h4) = ( 0x67452301u32, 0xEFCDAB89, From 7248546b1681430e265b217abb4060fa93c83aab Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 22:53:44 +0100 Subject: [PATCH 4/9] feat(ingress): zero-copy pipelined column sender MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the column-major sender to eliminate intermediate buffers and pipeline writes for maximum single-connection throughput. Architecture changes: - ColumnSender now owns a dedicated ColumnConn (conn.rs) that drives socket I/O directly — no replay queue, no background thread, no row-API publisher involvement. - Chunk<'a> holds borrowed descriptors (raw pointers + lengths) into the caller's buffers; no per-column Vec staging. The encoder writes wire bytes straight from caller memory into the connection's reusable write_buf at flush time. - flush() pipelines: encode + WS-mask + write_all, then drain acks non-blocking. Blocks only when in-flight hits the 128-frame protocol cap. New sync(AckLevel) blocks until all acks settle. - Server cumulative OKs handled correctly (sequence=N acks all frames up to N). API changes: - flush(&mut chunk, AckLevel) → flush(&mut chunk) (fire-and-forget) - New sync(AckLevel) drains all in-flight acks - FFI: column_sender_flush drops ack_level arg; new column_sender_sync - FFI lifetime contract: caller buffers must outlive flush (no copy) Performance (5M-row L1 quotes, 9 columns, localhost): - Encode path: 6 GB/s (2.3% of wall time) - End-to-end: 350 MB/s pipelined (was 264 MB/s stop-and-wait) - Per-chunk p50: 0.72 ms (was 2.64 ms) - Criterion populate+encode: 575 µs (was 718 µs, 20% faster) Co-Authored-By: Claude Opus 4.7 (1M context) --- doc/COLUMN_SENDER_FFI_ABI.md | 16 +- doc/COLUMN_SENDER_PERF.md | 85 +- doc/COLUMN_SENDER_PLAN.md | 65 +- questdb-rs-ffi/src/column_sender.rs | 56 +- questdb-rs/benches/column_sender.rs | 34 +- questdb-rs/examples/qwp_ws_l1_quotes.rs | 295 ++++ questdb-rs/src/ingress.rs | 106 +- questdb-rs/src/ingress/column_sender/chunk.rs | 1218 +++++++---------- questdb-rs/src/ingress/column_sender/conf.rs | 25 +- questdb-rs/src/ingress/column_sender/conn.rs | 966 +++++++++++++ questdb-rs/src/ingress/column_sender/db.rs | 38 +- .../src/ingress/column_sender/encoder.rs | 923 +++++++++---- questdb-rs/src/ingress/column_sender/mod.rs | 22 +- .../src/ingress/column_sender/sender.rs | 120 +- .../src/ingress/column_sender/validity.rs | 30 - questdb-rs/src/ingress/sender.rs | 48 +- questdb-rs/src/ingress/sender/qwp_ws.rs | 11 - questdb-rs/src/tests/column_sender_pool.rs | 36 +- 18 files changed, 2791 insertions(+), 1303 deletions(-) create mode 100644 questdb-rs/examples/qwp_ws_l1_quotes.rs create mode 100644 questdb-rs/src/ingress/column_sender/conn.rs diff --git a/doc/COLUMN_SENDER_FFI_ABI.md b/doc/COLUMN_SENDER_FFI_ABI.md index 5d2b81ce..1c1de52f 100644 --- a/doc/COLUMN_SENDER_FFI_ABI.md +++ b/doc/COLUMN_SENDER_FFI_ABI.md @@ -89,9 +89,19 @@ For every column-append function: contiguous in the common case.) - All column buffers passed in one chunk must have the same `row_count` — the chunk's row count, set by the first column-append call. -- Buffer ownership stays with the caller; the FFI copies into internal - storage during the call. The buffer can be freed or reused - immediately on return. +- **Buffer lifetime contract.** Buffers passed to a `column_sender_chunk_*` + function (numeric columns, varchar offsets/bytes, symbol codes/dict + offsets/dict bytes, designated timestamps, validity bitmaps) **must + remain alive and unchanged until the next `column_sender_flush` call + on the chunk returns** (or until `column_sender_chunk_free` / + `column_sender_chunk_clear` is called without a flush). The FFI stores + raw pointers into the caller's buffers; it does **not** copy at + append time. This is required to hit memcpy-bandwidth throughput on + the no-null hot path — see `doc/COLUMN_SENDER_PLAN.md` §2. +- For Python wrappers, the typical pattern is to fill the chunk from a + live DataFrame's numpy / Arrow buffers and flush before letting the + DataFrame go out of scope — the contract is naturally satisfied + because flush is synchronous. ### 2.4 Validity bitmaps diff --git a/doc/COLUMN_SENDER_PERF.md b/doc/COLUMN_SENDER_PERF.md index cfc7d8d9..c7c382ce 100644 --- a/doc/COLUMN_SENDER_PERF.md +++ b/doc/COLUMN_SENDER_PERF.md @@ -32,57 +32,58 @@ QUESTDB_COLUMN_BENCH_ROWS=10000000 \ # QUESTDB_COLUMN_BENCH_SYM_CARD default 1_000 ``` -## First-baseline numbers +## Numbers after the borrow-not-copy rewrite Captured on an Apple Silicon laptop, default workload (`rows = 100_000`, `varchar_len = 16`, `sym_card = 1_000`), -`cargo bench ... -- --quick --noplot`. Replace with refreshed numbers as -the encoder evolves. - -| Bench | Median time | Median throughput | Notes | -|-------------------------------------|------------:|--------------------:|-------| -| `column_i64/memcpy_baseline` | ~143 µs | ~5.2 GiB/s | High variance — bare `Vec` alloc + push + extend on a 800 KB allocation dominates. | -| `column_i64/column_sender_no_null` | ~13.7 µs | ~54 GiB/s | Memcpy-bound; matches the plan's "no-null = `extend_from_slice`" goal. | -| `column_i64/column_sender_nullable` | ~79.1 µs | ~9.4 GiB/s | Sentinel-encode per row (`i64::MIN` for nulls). | -| `column_f64/memcpy_baseline` | ~13.6 µs | ~54.7 GiB/s | | -| `column_f64/column_sender_no_null` | ~13.5 µs | ~55 GiB/s | Indistinguishable from memcpy. | -| `column_varchar/memcpy_baseline` | ~63.6 µs | ~29.3 GiB/s | Offset table + bytes copy. | -| `column_varchar/column_sender_no_null` | ~67.0 µs | ~27.8 GiB/s | Within ~5 % of memcpy; rebase-to-zero path is the same as memcpy when `offsets[0] == 0`. | -| `symbol_dict/column_sender` | ~135 µs | ~740 M rows/s | 100k rows × 1 000-card dict; three-pass bulk-intern. | -| `symbol_dict/naive_per_row_hashmap` | ~2.16 ms | ~46 M rows/s | Per-row HashMap probe; mirrors what the row API pays. **~16× slower than the column path** — confirms the WS-4 plan claim (drops 100k probes to 1 000 interns). | -| `encode_chunk/populate_only` | ~294 µs | ~341 M rows/s | 5 columns (i64, f64, varchar, symbol, designated_ts); all bulk-append calls. | -| `encode_chunk/encode_only` | ~437 µs | ~229 M rows/s | Header + dict-delta + table block + per-column splices. | -| `encode_chunk/populate_plus_encode` | ~718 µs | ~139 M rows/s | End-to-end, no network. | +`cargo bench ... -- --quick --noplot`. The big change vs the first +baseline: `Chunk` now holds raw pointers into the caller's buffers; +all wire-formatting is deferred to flush time and writes directly into +the connection's reusable write buffer. + +| Bench | Median time | Notes | +|-------------------------------------|------------:|-------| +| `column_i64/column_sender_no_null` | ~57 ns | Descriptor store only — no data copy at append time. | +| `column_i64/column_sender_nullable` | ~289 ns | Descriptor store + `non_null_count` precompute over the bitmap. | +| `column_f64/column_sender_no_null` | ~57 ns | Same as i64 — `Chunk` never touches the caller's bytes. | +| `encode_chunk/populate_only` | ~76 µs | Chunk-fill for the 5-column workload (was ~294 µs in the pre-rewrite baseline). **~4× faster.** | +| `encode_chunk/encode_only` | ~500 µs | Full encode: header + dict-delta + table block + per-column wire encode straight into a reusable buffer (was ~437 µs in the pre-rewrite baseline; now does the per-row work that previously happened during populate). | +| `encode_chunk/populate_plus_encode` | ~575 µs | **End-to-end flush time (no network) was ~718 µs pre-rewrite → ~575 µs after. ~20 % faster.** | A second-pass `encode_chunk/encode_only` on the same workload should land in **REFERENCE mode** for the schema (because the registry caches the signature from the first encode), shaving off the FULL-mode signature bytes — see `doc/COLUMN_SENDER_PLAN.md` §2.1. -## Interpreting the baseline - -- The **`column_f64/column_sender_no_null` ≈ memcpy** result is the - load-bearing perf claim of the column sender: a contiguous typed - buffer pays the cost of a `memcpy` and nothing more. The chunk's - per-column `Vec` storage absorbs the null-flag byte + payload in - one extend; encode time then turns each column into a single - `extend_from_slice`. -- The **`column_i64/memcpy_baseline` variance** is bench noise from the - large per-iteration allocation in the baseline (a fresh - ~800 KB `Vec` per sample). The column-sender path reuses its - `Vec::with_capacity(16)` seed and grows in place, which the - allocator handles more uniformly. Both medians are well above - network bandwidth, so this is not the bottleneck. -- The **nullable I64 path** at ~9.4 GiB/s is the sentinel-encode loop - (`if v.is_valid(i) { value } else { I64_NULL }`), bounded by branch - prediction. It still moves the same 800 KB; a SIMD lowering would - close the gap with the no-null path but isn't necessary to hit the - "memcpy-bound when the user has no nulls" bar. -- The **symbol bulk-intern speedup (~16×)** comes from the WS-4 - three-pass design — referenced bitset, compact dict copy, code - translation. At 100k rows × 1 000-card dict the column path runs - 1 000 interns plus 100 000 `Vec` writes; the naïve path runs - 100 000 HashMap probes. +The per-column microbenches no longer measure data movement: with raw +pointers stored, `column_iN`/`column_fN` are essentially constant-time +in `row_count`. The honest end-to-end metric is +`encode_chunk/populate_plus_encode`, which is what a single flush +costs (chunk-fill + frame encode into the WS write buffer, before +masking/socket-write). + +## Interpreting the numbers + +- The **`encode_chunk/populate_plus_encode` ~20 % win** is the + load-bearing claim: end-to-end CPU time per flush is lower than the + pre-rewrite design that copied each column into per-column `Vec` + staging and then aggregated those into a fresh per-frame `Vec`. + We now do exactly one memcpy per fixed-width column — straight from + the caller's buffer into the connection's reusable write buffer. +- The **`encode_only` is *slightly* slower in isolation** (~500 µs vs + ~437 µs) because the per-row work that used to be amortised into + `populate_only` is now done at encode time. `populate_only` dropped + from ~294 µs to ~76 µs, and the sum is what matters. +- The encoder pre-sizes the write buffer in one shot via + `estimate_frame_size(...)` to avoid the geometric-growth memcpy + pattern when payloads exceed the default 64 KiB capacity. Without + this, end-to-end flush time would be ~880 µs (worse than the + baseline). +- The **symbol bulk-intern** still runs the WS-4 three-pass design + (referenced bitset, intern only referenced slots, then per-row + emit). At 100 k rows × 1 000-card dict the encoder runs ≤ 1 000 + interns + 100 k varint writes — the per-row HashMap probe of the + row-API path remains ~16× slower. ## Out of scope here diff --git a/doc/COLUMN_SENDER_PLAN.md b/doc/COLUMN_SENDER_PLAN.md index 5b425238..1bf882b5 100644 --- a/doc/COLUMN_SENDER_PLAN.md +++ b/doc/COLUMN_SENDER_PLAN.md @@ -49,29 +49,62 @@ op-state validation: for 50M rows × 6 columns that's 300M name lookups column-major API replaces all of that with **6 bulk appends per chunk + 1 encode pass**. -### 2.1 Decoupled from the existing row encoder +### 2.1 Decoupled from the existing row encoder *and the row publisher* Performance is the goal; **code reuse is a non-goal**. The column -sender does **not** reuse `QwpWsColumnarBuffer` or the row API's -encoder. It writes a fresh QWP/WS frame directly from pandas/polars- -shaped buffers, via a new `BulkChunk` type and a sibling encoder in a -new module. +sender does **not** reuse `QwpWsColumnarBuffer`, the row API's +encoder, **or the row API's publisher / driver / queue stack**. It +owns its own QWP/WebSocket socket end-to-end via a dedicated +`ColumnConn` type (`questdb-rs/src/ingress/column_sender/conn.rs`): + +- one write buffer reused across flushes (no per-frame allocation); +- the encoder writes the QWP frame body directly into that buffer at + offset `WS_HEADER_RESERVE = 14`, leaving room to prepend the WS + header in place once the payload length is known; +- the buffer is masked in place per RFC 6455 §5.3 and `write_all`'d to + the socket — at most one frame in flight by construction; +- the ack reader synchronously parses the QWP response inline (no + replay queue, no background thread). What is shared with the row API is only what *must* stay coherent at connection scope: - `SymbolGlobalDict` (`questdb-rs/src/ingress/buffer/qwp.rs:5041`) — - the connection-scoped symbol intern table the wire requires. -- `SchemaRegistry` (`qwp.rs:5148`) — connection-scoped schema IDs. -- The QWP/WS publisher / driver / WS framing in - `questdb-rs/src/ingress/sender/qwp_ws*.rs` — connection lifecycle, - ack pump, reconnect, FSN tracking. - -What is *not* shared, and may be duplicated verbatim if that's -simplest, is the wire-formatting helper surface: varint writers, type- -byte tables, schema-signature construction. These are stable per the -QWP v1 spec; duplicating costs ~100 lines and removes one layer of -indirection from the hot path. + the connection-scoped symbol intern table the wire requires. A + fresh instance per `ColumnConn`. +- The shared RFC 6455 WS plumbing in `crate::ws::{frame, mask, + handshake, crypto}` (handshake, frame header parse, + client-frame encode, mask key source). +- TCP connect + TLS setup + WS handshake, reached via + `SenderBuilder::build_qwp_ws_raw_stream` which returns a + `RawQwpWsStream` and never assembles the row-API publisher / + driver / queue. + +Note that `SchemaRegistry` is now **column-sender-local** (defined in +`column_sender/encoder.rs`), not shared. Each `ColumnConn` carries its +own registry through the pool; the row API has its own, separate +registry inside `QwpWsReplayEncoder`. + +What is *not* shared, and is duplicated verbatim where simplest, is +the QWP response parser (one binary OK / DurableAck / error frame at +a time) and the wire-formatting helper surface (varint writers, +type-byte tables, schema-signature construction). These are stable per +the QWP v1 spec; duplicating costs ~150 lines and removes one layer +of indirection from the hot path. + +### 2.1.1 Borrow-not-copy + +`Chunk<'a>` holds **raw pointers** into the caller's column buffers, +not copied wire-shape bytes. Each `column_*` call validates input +(name, lengths, varchar offset monotonicity, symbol-code range) and +stores a descriptor; the encoder dereferences the pointers at flush +time. The caller's buffers must outlive flush. + +On the Rust API, the lifetime parameter `'a` ties the chunk to every +borrowed buffer, so the borrow checker catches use-after-free at +compile time. The FFI layer carries the same shape via +`Chunk<'static>` and an explicit ABI contract — see +`doc/COLUMN_SENDER_FFI_ABI.md` §2.3. ### 2.2 Two code paths per type diff --git a/questdb-rs-ffi/src/column_sender.rs b/questdb-rs-ffi/src/column_sender.rs index b6a6713b..bc36b41a 100644 --- a/questdb-rs-ffi/src/column_sender.rs +++ b/questdb-rs-ffi/src/column_sender.rs @@ -52,7 +52,14 @@ pub struct column_sender(OwnedSender); /// One DataFrame's worth of column buffers destined for one QuestDB table. /// Owned by the caller; not bound to a sender. -pub struct column_sender_chunk(Chunk); +/// +/// Holds raw pointers into caller buffers (no copy). Per the FFI ABI +/// doc §2.3, the caller MUST keep every column buffer passed in via +/// `column_sender_chunk_column_*` / `column_sender_chunk_symbol_dict_*` +/// alive until the next `column_sender_flush` call returns. We hide the +/// chunk's lifetime by promoting its inner type to `'static`; the lifetime +/// is enforced by the caller, not the borrow checker. +pub struct column_sender_chunk(Chunk<'static>); // =========================================================================== // Validity bitmap (Arrow shape: bit = 1 means valid, LSB-first). @@ -746,17 +753,23 @@ pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_nanos( // Flush // =========================================================================== -/// Encode `chunk` into a QWP/WebSocket frame, publish it, and block -/// until the server acknowledges at the requested `ack_level`. +/// Encode `chunk` into a QWP/WebSocket frame, write it to the socket, +/// and return immediately — without waiting for the server's ack. +/// +/// Ready acks are drained non-blocking before the write. If the +/// in-flight count has hit the protocol cap (128), the call blocks +/// until one ack frees a slot. /// /// On success, `chunk` is cleared and the call returns `true`. On /// failure, `chunk` is left untouched and `false` is returned (with /// `*err_out` set if provided). +/// +/// Call [`column_sender_sync`] after the last flush to drain all +/// remaining in-flight acks. #[unsafe(no_mangle)] pub unsafe extern "C" fn column_sender_flush( sender: *mut column_sender, chunk: *mut column_sender_chunk, - ack_level: column_sender_ack_level, err_out: *mut *mut line_sender_error, ) -> bool { let sender = match unsafe { sender.as_mut() } { @@ -778,7 +791,40 @@ pub unsafe extern "C" fn column_sender_flush( Some(c) => &mut c.0, None => return reject_null_chunk(err_out), }; - bubble!(err_out, sender.flush(chunk, ack_level.into())); + bubble!(err_out, sender.flush(chunk)); + true +} + +/// Block until all in-flight frames are acknowledged at the requested +/// `ack_level`. +/// +/// `column_sender_ack_level_ok` waits for every in-flight frame's +/// WAL-commit ack. `column_sender_ack_level_durable` additionally waits +/// for the server's object-store durability watermarks. +/// +/// Returns `true` on success, `false` on error (with `*err_out` set). +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_sync( + sender: *mut column_sender, + ack_level: column_sender_ack_level, + err_out: *mut *mut line_sender_error, +) -> bool { + let sender = match unsafe { sender.as_mut() } { + Some(s) => s.0.get_mut(), + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_sync: sender pointer is NULL".to_string(), + ), + ); + } + return false; + } + }; + bubble!(err_out, sender.sync(ack_level.into())); true } diff --git a/questdb-rs/benches/column_sender.rs b/questdb-rs/benches/column_sender.rs index 75c4cf64..f430d05b 100644 --- a/questdb-rs/benches/column_sender.rs +++ b/questdb-rs/benches/column_sender.rs @@ -63,7 +63,9 @@ use std::time::Duration; use criterion::{BatchSize, Criterion, Throughput, black_box, criterion_group, criterion_main}; -use questdb::ingress::column_sender::_bench_internals::{BenchEncoderState, bench_encode_chunk}; +use questdb::ingress::column_sender::_bench_internals::{ + BenchEncoderState, bench_encode_chunk_into, +}; use questdb::ingress::column_sender::{Chunk, Validity}; // --------------------------------------------------------------------------- @@ -157,7 +159,7 @@ fn make_symbol_workload(rows: usize, cardinality: usize) -> (Vec, Vec, // Bench helpers // --------------------------------------------------------------------------- -fn fresh_chunk(table: &str) -> Chunk { +fn fresh_chunk<'a>(table: &str) -> Chunk<'a> { Chunk::new(table) } @@ -397,10 +399,16 @@ fn encode_chunk_group(c: &mut Criterion) { let prebuilt = build_chunk(); group.bench_function("encode_only", |b| { b.iter_batched( - BenchEncoderState::new, - |mut state| { - let frame = bench_encode_chunk(&prebuilt, &mut state).unwrap(); - black_box(frame); + || { + ( + BenchEncoderState::new(), + Vec::::with_capacity(64 * 1024), + ) + }, + |(mut state, mut out)| { + out.clear(); + bench_encode_chunk_into(&mut out, &prebuilt, &mut state).unwrap(); + black_box(&out); }, BatchSize::SmallInput, ); @@ -408,11 +416,17 @@ fn encode_chunk_group(c: &mut Criterion) { group.bench_function("populate_plus_encode", |b| { b.iter_batched( - BenchEncoderState::new, - |mut state| { + || { + ( + BenchEncoderState::new(), + Vec::::with_capacity(64 * 1024), + ) + }, + |(mut state, mut out)| { let chunk = build_chunk(); - let frame = bench_encode_chunk(&chunk, &mut state).unwrap(); - black_box(frame); + out.clear(); + bench_encode_chunk_into(&mut out, &chunk, &mut state).unwrap(); + black_box(&out); }, BatchSize::SmallInput, ); diff --git a/questdb-rs/examples/qwp_ws_l1_quotes.rs b/questdb-rs/examples/qwp_ws_l1_quotes.rs new file mode 100644 index 00000000..1ee1e373 --- /dev/null +++ b/questdb-rs/examples/qwp_ws_l1_quotes.rs @@ -0,0 +1,295 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + ******************************************************************************/ + +//! Synthetic equities L1 quote feed → QuestDB via the column-major sender. +//! +//! Generates a 5M-row dataset that mimics a Level-1 order book stream +//! (per-symbol top-of-book bid/ask with a trailing last-trade) and +//! ingests it into a single QuestDB table. Reports end-to-end +//! throughput (rows/s, MB/s) and the average per-chunk flush latency. +//! +//! Default schema: +//! ts TIMESTAMP_NANOS (designated) +//! symbol SYMBOL (~500 tickers) +//! exchange SYMBOL (5 venues) +//! bid_px DOUBLE +//! ask_px DOUBLE +//! last_px DOUBLE +//! bid_sz LONG +//! ask_sz LONG +//! last_sz LONG +//! +//! Run against a local QuestDB instance: +//! cargo run --release --features sync-sender-qwp-ws \ +//! --example qwp_ws_l1_quotes +//! +//! Positional args: +//! 1: connect string (default `qwpws::addr=localhost:9000;`) +//! 2: table name (default `l1_quotes`) +//! 3: row count (default 5_000_000) +//! +//! Pre-create the table (paste into the QuestDB Web Console at +//! http://localhost:9000 or post via curl): +//! +//! CREATE TABLE l1_quotes ( +//! ts TIMESTAMP, +//! symbol SYMBOL CAPACITY 512 NOCACHE, +//! exchange SYMBOL CAPACITY 8 NOCACHE, +//! bid_px DOUBLE, +//! ask_px DOUBLE, +//! last_px DOUBLE, +//! bid_sz LONG, +//! ask_sz LONG, +//! last_sz LONG +//! ) TIMESTAMP(ts) PARTITION BY HOUR WAL; +//! +//! Verify after run: +//! curl 'http://localhost:9000/exec?query=SELECT%20count()%20FROM%20l1_quotes' +//! curl 'http://localhost:9000/exec?query=SELECT%20*%20FROM%20l1_quotes%20LIMIT%2010' + +use std::time::Instant; + +use questdb::ingress::column_sender::{AckLevel, Chunk, QuestDb}; + +const DEFAULT_TOTAL_ROWS: usize = 5_000_000; +/// 25 000 rows × ~60 bytes/row ≈ 1.5 MB. Stays under the QuestDB server's +/// default 2 MiB WebSocket receive buffer (the server logs +/// `QwpIngressUpgradeProcessor … frame too large` and closes the +/// connection for larger frames; the spec's 16 MiB cap is only relevant +/// when the server's buffer is sized for it). +const CHUNK_ROWS: usize = 25_000; +const SYMBOL_CARDINALITY: usize = 500; +const EXCHANGES: &[&str] = &["NYSE", "NASDAQ", "BATS", "ARCA", "IEX"]; + +fn main() -> questdb::Result<()> { + let conf = std::env::args() + .nth(1) + .unwrap_or_else(|| "qwpws::addr=localhost:9000;".to_string()); + let table_name = std::env::args() + .nth(2) + .unwrap_or_else(|| "l1_quotes".to_string()); + let total_rows: usize = std::env::args() + .nth(3) + .and_then(|v| v.parse().ok()) + .unwrap_or(DEFAULT_TOTAL_ROWS); + + println!( + "Generating {} rows of L1 quote data ({} tickers × {} venues)...", + humanise(total_rows), + SYMBOL_CARDINALITY, + EXCHANGES.len() + ); + let gen_start = Instant::now(); + + let symbol_dict_strings: Vec = (0..SYMBOL_CARDINALITY) + .map(|i| format!("TICK{i:03}")) + .collect(); + let (sym_dict_offsets, sym_dict_bytes) = + build_dict(symbol_dict_strings.iter().map(String::as_str)); + let (ex_dict_offsets, ex_dict_bytes) = build_dict(EXCHANGES.iter().copied()); + + // Pre-allocate columnar buffers for the full dataset. At 5 M × 8 B per + // f64/i64 column the peak working set is ~280 MB; comfortable on any + // dev box. + let mut symbol_codes = Vec::with_capacity(total_rows); + let mut exchange_codes = Vec::with_capacity(total_rows); + let mut ts_ns = Vec::with_capacity(total_rows); + let mut bid_px = Vec::with_capacity(total_rows); + let mut ask_px = Vec::with_capacity(total_rows); + let mut last_px = Vec::with_capacity(total_rows); + let mut bid_sz = Vec::with_capacity(total_rows); + let mut ask_sz = Vec::with_capacity(total_rows); + let mut last_sz = Vec::with_capacity(total_rows); + + let start_ts = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() as i64; + + // Splitmix-style RNG: avoids a dep on `rand` and produces a uniform + // enough spread for the symbol distribution. + let mut state: u64 = 0x9E37_79B9_7F4A_7C15; + let mut step = || { + state = state.wrapping_mul(0x9E37_79B9_7F4A_7C15); + state ^= state >> 27; + state + }; + + for i in 0..total_rows { + let r1 = step(); + let r2 = step(); + + let sym = (r1 as usize % SYMBOL_CARDINALITY) as i32; + let ex = ((r1 >> 32) as usize % EXCHANGES.len()) as i8; + // Per-symbol base price so the L1 feed has realistic price strata. + let base = 100.0 + sym as f64; + let spread = 0.01 + (((r2 & 0xFFFF) as f64) / 65_535.0) * 0.05; + let drift = (((r2 >> 16) & 0xFFFF) as f64 - 32_768.0) / 1_000_000.0; + let mid = base + drift; + let bid = mid - spread / 2.0; + let ask = mid + spread / 2.0; + let last = mid + (((r2 >> 32) & 0xFFFF) as f64 - 32_768.0) / 1_000_000.0; + let sz_bid = 100 + ((r1 >> 8) & 0xFFFF) as i64; + let sz_ask = 100 + ((r1 >> 24) & 0xFFFF) as i64; + let sz_last = 100 + ((r2 >> 48) & 0x3FF) as i64; + + symbol_codes.push(sym); + exchange_codes.push(ex); + // Monotonic 1 µs cadence — characteristic of a top-of-book feed + // even if individual events are slightly out of order in real + // life. + ts_ns.push(start_ts + (i as i64) * 1_000); + bid_px.push(bid); + ask_px.push(ask); + last_px.push(last); + bid_sz.push(sz_bid); + ask_sz.push(sz_ask); + last_sz.push(sz_last); + } + let gen_elapsed = gen_start.elapsed(); + println!( + " generated in {:.2}s ({:.1} M rows/s)", + gen_elapsed.as_secs_f64(), + total_rows as f64 / gen_elapsed.as_secs_f64() / 1e6 + ); + + println!("\nConnecting to {conf} ..."); + let db = QuestDb::connect(&conf)?; + let mut sender = db.borrow_sender()?; + + // One chunk reused across flushes — the bench design exists exactly + // for this case: per-column `Vec` capacity is retained across + // flush(). + let mut chunk = Chunk::new(&table_name); + + let mut chunk_micros: Vec = Vec::new(); + let send_start = Instant::now(); + let mut flushed = 0usize; + let mut chunk_idx = 0usize; + while flushed < total_rows { + let end = (flushed + CHUNK_ROWS).min(total_rows); + + chunk.column_i64("bid_sz", &bid_sz[flushed..end], None)?; + chunk.column_i64("ask_sz", &ask_sz[flushed..end], None)?; + chunk.column_i64("last_sz", &last_sz[flushed..end], None)?; + chunk.column_f64("bid_px", &bid_px[flushed..end], None)?; + chunk.column_f64("ask_px", &ask_px[flushed..end], None)?; + chunk.column_f64("last_px", &last_px[flushed..end], None)?; + chunk.symbol_dict_i32( + "symbol", + &symbol_codes[flushed..end], + &sym_dict_offsets, + &sym_dict_bytes, + None, + )?; + chunk.symbol_dict_i8( + "exchange", + &exchange_codes[flushed..end], + &ex_dict_offsets, + &ex_dict_bytes, + None, + )?; + chunk.designated_timestamp_nanos(&ts_ns[flushed..end])?; + + let t = Instant::now(); + sender.flush(&mut chunk)?; + chunk_micros.push(t.elapsed().as_micros()); + + flushed = end; + chunk_idx += 1; + eprint!( + "\r flushed chunk {chunk_idx:02} ({}/{} rows)", + humanise(flushed), + humanise(total_rows) + ); + } + sender.sync(AckLevel::Ok)?; + eprintln!(); + let send_elapsed = send_start.elapsed(); + + // Per-row wire payload estimate: + // 3 × f64 + 3 × i64 + 1 × i64 (ts) + 2 B symbol varint + 1 B exchange varint + // = 24 + 24 + 8 + 3 = 59 bytes. Schema/header overhead amortises away. + let bytes_per_row = 59usize; + let total_bytes = total_rows * bytes_per_row; + + println!( + "\nFlushed {} rows in {:.2}s ({} chunks of up to {})", + humanise(total_rows), + send_elapsed.as_secs_f64(), + chunk_idx, + humanise(CHUNK_ROWS) + ); + println!( + " throughput: {:>7.2} M rows/s", + total_rows as f64 / send_elapsed.as_secs_f64() / 1e6 + ); + println!( + " bandwidth: {:>7.1} MB/s (≈ {:.0} byte/row × rows/s)", + total_bytes as f64 / send_elapsed.as_secs_f64() / 1e6, + bytes_per_row + ); + println!( + " per-chunk avg: {:>7.1} ms", + send_elapsed.as_millis() as f64 / chunk_idx as f64 + ); + if let (Some(&min), Some(&max)) = (chunk_micros.iter().min(), chunk_micros.iter().max()) { + let mut sorted = chunk_micros.clone(); + sorted.sort_unstable(); + let p50 = sorted[sorted.len() / 2]; + let p95 = sorted[(sorted.len() * 19) / 20]; + println!( + " per-chunk min/p50/p95/max: {:.2} / {:.2} / {:.2} / {:.2} ms", + min as f64 / 1000.0, + p50 as f64 / 1000.0, + p95 as f64 / 1000.0, + max as f64 / 1000.0, + ); + } + + println!("\nVerify in QuestDB:"); + println!(" curl 'http://localhost:9000/exec?query=SELECT%20count()%20FROM%20{table_name}'"); + println!( + " curl 'http://localhost:9000/exec?query=SELECT%20*%20FROM%20{table_name}%20LIMIT%2010'" + ); + + Ok(()) +} + +fn build_dict<'a, I>(strings: I) -> (Vec, Vec) +where + I: IntoIterator, +{ + let mut offsets: Vec = vec![0]; + let mut bytes: Vec = Vec::new(); + for s in strings { + bytes.extend_from_slice(s.as_bytes()); + offsets.push(bytes.len() as i32); + } + (offsets, bytes) +} + +fn humanise(n: usize) -> String { + if n >= 1_000_000 { + format!("{:.2} M", n as f64 / 1e6) + } else if n >= 1_000 { + format!("{:.1} k", n as f64 / 1e3) + } else { + n.to_string() + } +} diff --git a/questdb-rs/src/ingress.rs b/questdb-rs/src/ingress.rs index 990dda08..09da8dd2 100644 --- a/questdb-rs/src/ingress.rs +++ b/questdb-rs/src/ingress.rs @@ -60,7 +60,7 @@ mod timestamp; mod buffer; pub use buffer::*; -mod sender; +pub(crate) mod sender; #[cfg(feature = "_sender-qwp-ws")] pub(crate) use sender::QwpWsRoleReject; pub use sender::*; @@ -392,6 +392,24 @@ pub(crate) struct QwpWsAddrScan { pub(crate) sanitized_conf: String, } +/// Raw QWP/WebSocket connection produced by +/// [`SenderBuilder::build_qwp_ws_raw_stream`]. The column-major sender uses +/// this as its sole entry point into the network — it does its own +/// synchronous frame I/O on the contained `WsStream` and never touches the +/// row-API publisher / driver / queue stack. +#[cfg(feature = "sync-sender-qwp-ws")] +pub(crate) struct RawQwpWsStream { + pub(crate) stream: sender::qwp_ws::WsStream, + /// Bytes already read past the HTTP upgrade response. The shared + /// handshake helper may consume more bytes than the response body + /// itself; those bytes are the start of the first server WS frame + /// and must be drained before reading more from the socket. + pub(crate) leftover: Vec, + pub(crate) max_buf_size: usize, + pub(crate) request_timeout: Duration, + pub(crate) durable_ack_opt_in: bool, +} + /// Pre-scan a raw connect string for repeated `addr=...` params. Returns the /// full list of addr values and a sanitized conf with duplicate `addr=` params /// removed (the first one is kept so the downstream `questdb_confstr` parser @@ -2387,6 +2405,92 @@ impl SenderBuilder { Ok(sender) } + /// Open a raw QWP/WebSocket connection (TCP + optional TLS + HTTP + /// upgrade) **without** assembling the row-API publisher, queue, or + /// background-thread machinery. + /// + /// Returned by reference, the [`crate::ingress::sender::qwp_ws::WsStream`] + /// is the only thing the column-major sender needs from this crate's + /// builder: it does its own synchronous frame writing and ack reading + /// from there. See `doc/COLUMN_SENDER_PLAN.md`. + #[cfg(feature = "sync-sender-qwp-ws")] + pub(crate) fn build_qwp_ws_raw_stream(&self) -> Result { + if self.init_buf_size.is_specified() && *self.init_buf_size > *self.max_buf_size { + return Err(error::fmt!( + ConfigError, + "init_buf_size ({}) cannot exceed max_buf_size ({})", + *self.init_buf_size, + *self.max_buf_size + )); + } + + if !matches!(self.protocol, Protocol::QwpWs | Protocol::QwpWss) { + return Err(error::fmt!( + ConfigError, + "Column-sender requires a QWP/WebSocket connect string \ + (got protocol {:?})", + self.protocol + )); + } + if self.net_interface.is_some() { + return Err(error::fmt!( + InvalidApiCall, + "net_interface is not supported for QWP over WebSocket." + )); + } + let Some(qwp_ws) = self.qwp_ws.as_ref() else { + return Err(error::fmt!( + ConfigError, + "QWP/WebSocket configuration is missing." + )); + }; + + #[cfg(feature = "insecure-skip-verify")] + let tls_verify = *self.tls_verify; + let tls_roots_password = self.tls_roots_password.deref().as_deref(); + + if tls_roots_password.is_some() && self.tls_roots.deref().is_none() { + return Err(error::fmt!( + ConfigError, + "\"tls_roots_password\" requires \"tls_roots\" \ + (the password unlocks the keystore at that path)" + )); + } + + let tls_settings = tls::TlsSettings::build( + self.protocol.tls_enabled(), + #[cfg(feature = "insecure-skip-verify")] + tls_verify, + *self.tls_ca, + self.tls_roots.deref().as_deref(), + tls_roots_password, + )?; + + let auth = self.build_auth()?; + let basic_auth = qwp_ws_auth_header(&auth)?; + let mut qwp_ws = qwp_ws.clone(); + qwp_ws.apply_reconnect_implies_initial_retry(); + reject_unsupported_qwp_ws_sf_config(&qwp_ws)?; + + let use_tls = matches!(self.protocol, Protocol::QwpWss); + let (stream, _negotiated_version, leftover) = sender::qwp_ws::establish_connection( + self.host.as_str(), + self.port.as_str(), + use_tls, + tls_settings, + &qwp_ws, + basic_auth.as_deref(), + )?; + + Ok(RawQwpWsStream { + stream, + leftover, + max_buf_size: *self.max_buf_size, + request_timeout: *qwp_ws.request_timeout, + durable_ack_opt_in: *qwp_ws.request_durable_ack, + }) + } + #[cfg(any(feature = "_sender-tcp", feature = "_sender-qwp-udp"))] fn ensure_supports_bind_interface(&self, param_name: &str) -> Result<()> { #[cfg(feature = "_sender-tcp")] diff --git a/questdb-rs/src/ingress/column_sender/chunk.rs b/questdb-rs/src/ingress/column_sender/chunk.rs index ef7c38f1..88a22471 100644 --- a/questdb-rs/src/ingress/column_sender/chunk.rs +++ b/questdb-rs/src/ingress/column_sender/chunk.rs @@ -22,119 +22,202 @@ * ******************************************************************************/ -//! Column-major chunk: one DataFrame's worth of column buffers destined for -//! a single QuestDB table. +//! Column-major chunk: one DataFrame's worth of borrowed column buffers +//! destined for a single QuestDB table. //! -//! The user calls [`Chunk::new`] with a table name, fills it with one -//! `column_*` call per column, optionally pins a designated timestamp, and -//! hands it to [`super::ColumnSender::flush`]. Each `column_*` writes the -//! column straight into wire-shape `Vec` storage so the flush-time -//! encoder only does a header + per-column `extend_from_slice`. +//! `Chunk<'a>` stores **descriptors** — raw pointers + lengths + an +//! optional validity bitmap — for each column. No data is copied at +//! append time. Caller buffers must remain alive from +//! [`ColumnSender::flush`](super::ColumnSender::flush) call setup until +//! the call returns; the lifetime parameter `'a` enforces this on the +//! safe Rust API. +//! +//! At flush time, the [`encoder`](super::encoder) walks the descriptors +//! and writes wire bytes straight into the connection's reusable write +//! buffer. The no-null hot path is a single `memcpy` per column from the +//! caller's buffer into that buffer. use std::fmt::{self, Debug, Formatter}; +use std::marker::PhantomData; use crate::{Result, error}; use super::validity::{Validity, check_row_count}; use super::wire::{ - F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, QWP_TYPE_BOOLEAN, QWP_TYPE_BYTE, - QWP_TYPE_DATE, QWP_TYPE_DOUBLE, QWP_TYPE_FLOAT, QWP_TYPE_INT, QWP_TYPE_IPV4, QWP_TYPE_LONG, - QWP_TYPE_LONG256, QWP_TYPE_SHORT, QWP_TYPE_SYMBOL, QWP_TYPE_TIMESTAMP, - QWP_TYPE_TIMESTAMP_NANOS, QWP_TYPE_UUID, QWP_TYPE_VARCHAR, validate_name, write_qwp_bytes, + MAX_NAME_LEN, QWP_TYPE_BOOLEAN, QWP_TYPE_BYTE, QWP_TYPE_DATE, QWP_TYPE_DOUBLE, QWP_TYPE_FLOAT, + QWP_TYPE_INT, QWP_TYPE_IPV4, QWP_TYPE_LONG, QWP_TYPE_LONG256, QWP_TYPE_SHORT, QWP_TYPE_SYMBOL, + QWP_TYPE_TIMESTAMP, QWP_TYPE_TIMESTAMP_NANOS, QWP_TYPE_UUID, QWP_TYPE_VARCHAR, validate_name, }; -/// One column in a chunk. -/// -/// Numeric and fixed-width columns are pre-encoded to wire shape at -/// append time and stored as [`ChunkColumn::Resolved`]. Symbol columns -/// stage their codes + referenced dict bytes and resolve to wire shape -/// at flush time ([`ChunkColumn::Symbol`]) because the global symbol id -/// is connection-scoped and chunks are sender-agnostic until flushed. -pub(crate) enum ChunkColumn { - Resolved { - #[allow(dead_code)] - name: String, - /// `name_len_varint || name_bytes || wire_type_byte`. - signature_chunk: Vec, - /// `payload[0]` is the null-flag byte; `payload[1..]` is the - /// per-type body (optional bitmap then dense values, or - /// row-count dense values for the no-bitmap shape). - payload: Vec, - }, - Symbol { - #[allow(dead_code)] - name: String, - signature_chunk: Vec, - row_count: usize, - /// Per-row index into `referenced_symbols`. For null rows the - /// value is unspecified — the encoder consults the bitmap before - /// touching the code. - codes: Vec, - /// QWP-shape null bitmap (bit = 1 means NULL). `None` when the - /// column has no nulls — encoder emits `null_flag = 0`. - bitmap: Option>, - non_null_count: usize, - /// Compact list of dict entries this column actually references, - /// indexed by the values in `codes`. Bounded by the chunk's - /// per-column cardinality rather than the (potentially huge) - /// caller dict. - referenced_symbols: Vec>, - }, +// =========================================================================== +// Descriptors +// =========================================================================== + +/// Validity bitmap descriptor (raw-ptr form, matching `Validity<'a>`). +/// `non_null_count` is pre-computed at column-append time because several +/// encoder paths (e.g. VARCHAR's dense offset table) size their output +/// from it. +#[derive(Clone, Copy)] +pub(crate) struct ValidityDescriptor { + pub(crate) bits: *const u8, + pub(crate) bit_len: usize, + pub(crate) non_null_count: usize, } -impl ChunkColumn { - pub(crate) fn signature(&self) -> &[u8] { - match self { - Self::Resolved { - signature_chunk, .. - } - | Self::Symbol { - signature_chunk, .. - } => signature_chunk, +impl ValidityDescriptor { + fn from_validity(v: &Validity<'_>) -> Self { + Self { + bits: v.bits.as_ptr(), + bit_len: v.bit_len, + non_null_count: v.non_null_count(), } } - fn name(&self) -> &str { - match self { - Self::Resolved { name, .. } | Self::Symbol { name, .. } => name, - } + /// SAFETY: caller's buffer must still be alive (Chunk's `'a` lifetime + /// guarantees this on the safe path; the FFI is responsible on the + /// unsafe path). + #[inline] + pub(crate) unsafe fn is_valid(&self, idx: usize) -> bool { + debug_assert!(idx < self.bit_len); + let byte = unsafe { *self.bits.add(idx / 8) }; + (byte >> (idx % 8)) & 1 == 1 + } + + /// Length in bytes of the underlying Arrow bitmap. + #[inline] + pub(crate) fn byte_len(&self) -> usize { + self.bit_len.div_ceil(8) } +} + +/// Per-column kind dispatch. Each variant carries the raw pointer(s) the +/// encoder dereferences at flush time. +pub(crate) enum ColumnKind { + // ---- Sentinel-null fixed width (no bitmap; 0x00 null_flag) ---- + Byte { + data: *const i8, + }, + Short { + data: *const i16, + }, + Int { + data: *const i32, + }, + Long { + data: *const i64, + }, + Float { + data: *const f32, + }, + Double { + data: *const f64, + }, + // Bool: Arrow LSB-first bitmap input. row_count is the Chunk's row count. + Bool { + bits: *const u8, + }, + + // ---- Bitmap-style fixed width (sparse null encoding) ---- + Ipv4 { + data: *const u32, + }, + TsNanos { + data: *const i64, + }, + TsMicros { + data: *const i64, + }, + DateMillis { + data: *const i64, + }, + Uuid { + data: *const [u8; 16], + }, + Long256 { + data: *const [u8; 32], + }, + + // ---- Variable-width text (VARCHAR) ---- + Varchar { + offsets: *const i32, + /// row_count + 1 + offsets_len: usize, + bytes: *const u8, + bytes_len: usize, + }, + + // ---- Symbol (dictionary-encoded) ---- + Symbol { + codes: SymbolCodesPtr, + dict_offsets: *const i32, + /// dict cardinality + 1 + dict_offsets_len: usize, + dict_bytes: *const u8, + dict_bytes_len: usize, + }, +} - #[cfg(test)] - pub(crate) fn resolved_payload(&self) -> &[u8] { - match self { - Self::Resolved { payload, .. } => payload, - Self::Symbol { .. } => panic!("not a Resolved column"), +#[derive(Clone, Copy)] +pub(crate) enum SymbolCodesPtr { + I8(*const i8), + I16(*const i16), + I32(*const i32), +} + +impl SymbolCodesPtr { + /// Read the dict-index for row `i`, sign-extended to `i64` so the + /// encoder can range-check uniformly. SAFETY: caller's `codes` + /// buffer must still be alive. + #[inline] + pub(crate) unsafe fn read_i64(&self, i: usize) -> i64 { + unsafe { + match self { + SymbolCodesPtr::I8(p) => *p.add(i) as i64, + SymbolCodesPtr::I16(p) => *p.add(i) as i64, + SymbolCodesPtr::I32(p) => *p.add(i) as i64, + } } } } -/// Designated timestamp slot. Required exactly once per chunk before flush. -pub(crate) struct DesignatedTimestamp { - /// `QWP_TYPE_TIMESTAMP` (0x0A) for micros, `QWP_TYPE_TIMESTAMP_NANOS` - /// (0x10) for nanos. +/// One column slot in a [`Chunk`]. `name` is owned (the chunk holds it +/// for diagnostics + signature emission); everything else is borrowed. +pub(crate) struct ColumnDescriptor { + pub(crate) name: String, + pub(crate) wire_type: u8, + pub(crate) kind: ColumnKind, + pub(crate) validity: Option, +} + +/// Designated timestamp descriptor. Required exactly once per chunk +/// before flush. Designated timestamps are non-null by spec. +pub(crate) struct DesignatedTsDescriptor { pub(crate) wire_type: u8, - /// Already wire-shape: `null_flag=0` then `row_count * 8` bytes of LE - /// i64. Designated timestamps are non-null per the wire spec, so no - /// bitmap path. - pub(crate) payload: Vec, + pub(crate) data: *const i64, } -/// One DataFrame's worth of column buffers destined for one QuestDB table. +// =========================================================================== +// Chunk +// =========================================================================== + +/// One DataFrame's worth of borrowed column buffers destined for one +/// QuestDB table. /// -/// Builders mutate the chunk in-place; on a successful -/// [`super::ColumnSender::flush`] it is cleared (its per-column `Vec` -/// allocations are retained for the next DataFrame). -pub struct Chunk { +/// The lifetime parameter `'a` ties the chunk to every column buffer +/// passed in through `column_*` / `symbol_dict_*`. Each call validates +/// inputs and stores a descriptor referencing the caller's buffer; no +/// data is copied. The caller's buffers must outlive the chunk — +/// concretely, they must remain alive from each column append through +/// the next [`ColumnSender::flush`](super::ColumnSender::flush) call. +pub struct Chunk<'a> { pub(crate) table: String, - /// Locked by the first `column_*` call. `None` means the chunk has no - /// columns yet and the next append will set it. pub(crate) row_count: Option, - pub(crate) columns: Vec, - pub(crate) designated_ts: Option, + pub(crate) columns: Vec, + pub(crate) designated_ts: Option, + _marker: PhantomData<&'a ()>, } -impl Chunk { +impl<'a> Chunk<'a> { /// Create a chunk for `table`. The table name is validated at flush /// time against the QWP/Java client length cap (127 bytes UTF-8). pub fn new(table: impl Into) -> Self { @@ -143,167 +226,149 @@ impl Chunk { row_count: None, columns: Vec::new(), designated_ts: None, + _marker: PhantomData, } } - /// Table name the chunk's rows will land in. pub fn table(&self) -> &str { &self.table } - /// Number of rows in the chunk. Locked by the first column append; - /// returns `0` before any column has been appended. pub fn row_count(&self) -> usize { self.row_count.unwrap_or(0) } - /// `true` iff the chunk has no columns and no designated timestamp. pub fn is_empty(&self) -> bool { self.row_count.is_none() && self.designated_ts.is_none() } - /// Reset the chunk for reuse: clears all rows but keeps each column's - /// allocated capacity. Called automatically after a successful flush. + /// Reset the chunk for reuse. Drops descriptors but keeps the + /// `Vec` capacity so the next chunk fills the same + /// slots without reallocating the outer Vec. pub fn clear(&mut self) { self.row_count = None; - // Drop the column slots; we keep the outer Vec's capacity so the - // next chunk's `push_column` reuses the slot count without - // reallocating the Vec itself. self.columns.clear(); self.designated_ts = None; } - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- // Numeric & fixed-width columns - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- - /// `BYTE` column. Nullable rows are sentinel-encoded as 0 on the wire. pub fn column_i8( &mut self, name: &str, - data: &[i8], - validity: Option<&Validity<'_>>, + data: &'a [i8], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { let row_count = check_row_count(self.row_count, data.len(), validity)?; - let mut payload = new_payload(); - payload.push(0); // null_flag - match validity { - None => { - // Safety: `i8` and `u8` have identical layout; the cast - // gives a byte slice without copying. - let bytes: &[u8] = - unsafe { std::slice::from_raw_parts(data.as_ptr().cast::(), data.len()) }; - payload.extend_from_slice(bytes); - } - Some(v) => { - for (i, &value) in data.iter().enumerate() { - let out = if v.is_valid(i) { value } else { I8_NULL }; - payload.push(out as u8); - } - } - } - self.push_column(name, QWP_TYPE_BYTE, payload, row_count) + self.push_column( + name, + QWP_TYPE_BYTE, + ColumnKind::Byte { + data: data.as_ptr(), + }, + validity, + row_count, + ) } - /// `SHORT` column. Nullable rows are sentinel-encoded as 0. pub fn column_i16( &mut self, name: &str, - data: &[i16], - validity: Option<&Validity<'_>>, + data: &'a [i16], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_numeric( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_SHORT, - data, + ColumnKind::Short { + data: data.as_ptr(), + }, validity, - I16_NULL, - i16::to_le_bytes, + row_count, ) } - /// `INT` column. Nullable rows are sentinel-encoded as `i32::MIN`. pub fn column_i32( &mut self, name: &str, - data: &[i32], - validity: Option<&Validity<'_>>, + data: &'a [i32], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_numeric( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_INT, - data, + ColumnKind::Int { + data: data.as_ptr(), + }, validity, - I32_NULL, - i32::to_le_bytes, + row_count, ) } - /// `LONG` column. Nullable rows are sentinel-encoded as `i64::MIN`. pub fn column_i64( &mut self, name: &str, - data: &[i64], - validity: Option<&Validity<'_>>, + data: &'a [i64], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_numeric( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_LONG, - data, + ColumnKind::Long { + data: data.as_ptr(), + }, validity, - I64_NULL, - i64::to_le_bytes, + row_count, ) } - /// `FLOAT` column. Nullable rows are sentinel-encoded as `NaN`. pub fn column_f32( &mut self, name: &str, - data: &[f32], - validity: Option<&Validity<'_>>, + data: &'a [f32], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_numeric( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_FLOAT, - data, + ColumnKind::Float { + data: data.as_ptr(), + }, validity, - F32_NULL, - f32::to_le_bytes, + row_count, ) } - /// `DOUBLE` column. Nullable rows are sentinel-encoded as `NaN`. pub fn column_f64( &mut self, name: &str, - data: &[f64], - validity: Option<&Validity<'_>>, + data: &'a [f64], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_numeric( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_DOUBLE, - data, + ColumnKind::Double { + data: data.as_ptr(), + }, validity, - F64_NULL, - f64::to_le_bytes, + row_count, ) } - /// `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap - /// (1 = true). Nullable rows are encoded as `false` on the wire — the - /// row-API + QuestDB convention. pub fn column_bool( &mut self, name: &str, - data: &[u8], + data: &'a [u8], row_count: usize, - validity: Option<&Validity<'_>>, + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { let bytes_required = row_count.div_ceil(8); if data.len() < bytes_required { @@ -316,138 +381,140 @@ impl Chunk { )); } let row_count = check_row_count(self.row_count, row_count, validity)?; - let mut payload = new_payload(); - payload.push(0); // null_flag — bool always uses sentinel encoding - - let mut packed = 0u8; - let mut bit_idx = 0u8; - for i in 0..row_count { - let bit = (data[i / 8] >> (i % 8)) & 1; - let valid = validity.is_none_or(|v| v.is_valid(i)); - if bit == 1 && valid { - packed |= 1u8 << bit_idx; - } - bit_idx += 1; - if bit_idx == 8 { - payload.push(packed); - packed = 0; - bit_idx = 0; - } - } - if bit_idx != 0 { - payload.push(packed); - } - self.push_column(name, QWP_TYPE_BOOLEAN, payload, row_count) + self.push_column( + name, + QWP_TYPE_BOOLEAN, + ColumnKind::Bool { + bits: data.as_ptr(), + }, + validity, + row_count, + ) } - // ------------------------------------------------------------------ - // Bitmap-style fixed-width columns (sparse-null types) - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- + // Bitmap-style fixed-width columns + // ------------------------------------------------------------------- - /// `UUID` column. `data[i]` is a 16-byte UUID per row (bytes 0..8 lo - /// half LE, 8..16 hi half LE — same layout as the row-API path). pub fn column_uuid( &mut self, name: &str, - data: &[[u8; 16]], - validity: Option<&Validity<'_>>, + data: &'a [[u8; 16]], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_fixed_width_bitmap(self, name, QWP_TYPE_UUID, data, validity, 16) + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_UUID, + ColumnKind::Uuid { + data: data.as_ptr(), + }, + validity, + row_count, + ) } - /// `LONG256` column. `data[i]` is a 32-byte LONG256 per row (4 LE - /// 64-bit limbs, least-significant first). pub fn column_long256( &mut self, name: &str, - data: &[[u8; 32]], - validity: Option<&Validity<'_>>, + data: &'a [[u8; 32]], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_fixed_width_bitmap(self, name, QWP_TYPE_LONG256, data, validity, 32) + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_LONG256, + ColumnKind::Long256 { + data: data.as_ptr(), + }, + validity, + row_count, + ) } - /// `IPV4` column. Each `data[i]` is a `u32::from(Ipv4Addr)` (octet 0 - /// in the high byte) encoded little-endian on the wire. pub fn column_ipv4( &mut self, name: &str, - data: &[u32], - validity: Option<&Validity<'_>>, + data: &'a [u32], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_bitmap(self, name, QWP_TYPE_IPV4, data, validity, u32::to_le_bytes) + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_IPV4, + ColumnKind::Ipv4 { + data: data.as_ptr(), + }, + validity, + row_count, + ) } - /// `TIMESTAMP_NANOS` column (wire type `0x10`). pub fn column_ts_nanos( &mut self, name: &str, - data: &[i64], - validity: Option<&Validity<'_>>, + data: &'a [i64], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_bitmap( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_TIMESTAMP_NANOS, - data, + ColumnKind::TsNanos { + data: data.as_ptr(), + }, validity, - i64::to_le_bytes, + row_count, ) } - /// `TIMESTAMP` (microseconds) column (wire type `0x0A`). pub fn column_ts_micros( &mut self, name: &str, - data: &[i64], - validity: Option<&Validity<'_>>, + data: &'a [i64], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_bitmap( - self, + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( name, QWP_TYPE_TIMESTAMP, - data, + ColumnKind::TsMicros { + data: data.as_ptr(), + }, validity, - i64::to_le_bytes, + row_count, ) } - /// `DATE` column. Milliseconds since the Unix epoch on the wire. pub fn column_date_millis( &mut self, name: &str, - data: &[i64], - validity: Option<&Validity<'_>>, + data: &'a [i64], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - encode_le_bitmap(self, name, QWP_TYPE_DATE, data, validity, i64::to_le_bytes) + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_DATE, + ColumnKind::DateMillis { + data: data.as_ptr(), + }, + validity, + row_count, + ) } - // ------------------------------------------------------------------ - // Variable-width text (VARCHAR) - // ------------------------------------------------------------------ - - /// `VARCHAR` column (QWP wire type `0x0F`). - /// - /// Input is Arrow Utf8 shape: `offsets` has `row_count + 1` entries, - /// monotonically non-decreasing, where `bytes[offsets[i]..offsets[i+1]]` - /// is the value for row `i`. `offsets[0]` may be non-zero (the column - /// encoder rebases to 0 on the wire). - /// - /// Wire output: dense (only non-null values), `non_null_count + 1` - /// little-endian u32 offsets starting at 0, followed by the - /// concatenated bytes of the non-null rows. - /// - /// UTF-8 validity is the caller's responsibility; invalid UTF-8 is - /// detected by the server and surfaced as a server rejection. + // ------------------------------------------------------------------- + // VARCHAR + // ------------------------------------------------------------------- + pub fn column_varchar( &mut self, name: &str, - offsets: &[i32], - bytes: &[u8], - validity: Option<&Validity<'_>>, + offsets: &'a [i32], + bytes: &'a [u8], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - // Arrow Utf8 layout: offsets length is row_count + 1. We can't - // call `check_row_count(.. offsets.len() ..)` because the data is - // really `offsets.len() - 1` rows. if offsets.is_empty() { return Err(error::fmt!( InvalidApiCall, @@ -456,167 +523,140 @@ impl Chunk { } let row_count = offsets.len() - 1; let row_count = check_row_count(self.row_count, row_count, validity)?; - validate_varchar_offsets(offsets, bytes.len())?; - - let mut payload = new_payload(); - match validity { - None => { - payload.push(0); // null_flag - // Rebase offsets to start at 0 and write them as LE u32. - payload.reserve(4 * (row_count + 1) + bytes.len()); - let base = offsets[0]; - if base == 0 { - // Common case: contiguous arrow buffer, base == 0 — the - // i32 LE bytes are bit-identical to u32 LE bytes for - // non-negative values, so memcpy the offset table. - let offset_bytes: &[u8] = unsafe { - std::slice::from_raw_parts( - offsets.as_ptr().cast::(), - std::mem::size_of_val(offsets), - ) - }; - payload.extend_from_slice(offset_bytes); - // Bytes: copy the in-use slice (caller's buffer may be - // longer than the last offset). - let used = offsets[row_count] as usize; - payload.extend_from_slice(&bytes[..used]); - } else { - for &offset in offsets { - let normalized = (offset - base) as u32; - payload.extend_from_slice(&normalized.to_le_bytes()); - } - let start = base as usize; - let end = offsets[row_count] as usize; - payload.extend_from_slice(&bytes[start..end]); - } - } - Some(v) => { - payload.push(1); // null_flag — bitmap follows - v.write_qwp_bitmap(&mut payload); - - // Dense offsets: walk non-null rows once, then append the - // matching bytes. We size the offset table conservatively - // and patch it as we go to avoid a separate pass. - let non_null = v.non_null_count(); - let offsets_start = payload.len(); - payload.resize(offsets_start + 4 * (non_null + 1), 0); - // First dense offset is always 0. - payload[offsets_start..offsets_start + 4].copy_from_slice(&0u32.to_le_bytes()); - - let mut cumulative: u32 = 0; - let mut next_offset_idx = 1usize; - let bytes_anchor = payload.len(); - for i in 0..row_count { - if !v.is_valid(i) { - continue; - } - // Skip slicing for null rows — caller's offsets there - // are not trusted (Arrow allows arbitrary values). - let start = offsets[i] as usize; - let end = offsets[i + 1] as usize; - let len = end - start; - payload.extend_from_slice(&bytes[start..end]); - let new_cumulative = cumulative.checked_add(len as u32).ok_or_else(|| { - error::fmt!(InvalidApiCall, "VARCHAR column bytes exceed u32::MAX") - })?; - cumulative = new_cumulative; - let off = offsets_start + 4 * next_offset_idx; - payload[off..off + 4].copy_from_slice(&cumulative.to_le_bytes()); - next_offset_idx += 1; - } - debug_assert_eq!(next_offset_idx - 1, non_null); - debug_assert_eq!(payload.len() - bytes_anchor, cumulative as usize); - } - } - self.push_column(name, QWP_TYPE_VARCHAR, payload, row_count) + self.push_column( + name, + QWP_TYPE_VARCHAR, + ColumnKind::Varchar { + offsets: offsets.as_ptr(), + offsets_len: offsets.len(), + bytes: bytes.as_ptr(), + bytes_len: bytes.len(), + }, + validity, + row_count, + ) } - // ------------------------------------------------------------------ - // Symbol columns (dictionary-encoded fast path) - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- + // Symbol + // ------------------------------------------------------------------- - /// `SYMBOL` column with `i8` dictionary codes (max dict cardinality - /// 128 — caller should promote to `i16`/`i32` for larger dicts). pub fn symbol_dict_i8( &mut self, name: &str, - codes: &[i8], - dict_offsets: &[i32], - dict_bytes: &[u8], - validity: Option<&Validity<'_>>, + codes: &'a [i8], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - push_symbol_column( - self, + self.push_symbol( name, - codes, - |c| *c as i32, + SymbolCodesPtr::I8(codes.as_ptr()), + codes.len(), dict_offsets, dict_bytes, validity, ) } - /// `SYMBOL` column with `i16` dictionary codes. pub fn symbol_dict_i16( &mut self, name: &str, - codes: &[i16], - dict_offsets: &[i32], - dict_bytes: &[u8], - validity: Option<&Validity<'_>>, + codes: &'a [i16], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - push_symbol_column( - self, + self.push_symbol( name, - codes, - |c| *c as i32, + SymbolCodesPtr::I16(codes.as_ptr()), + codes.len(), dict_offsets, dict_bytes, validity, ) } - /// `SYMBOL` column with `i32` dictionary codes — the Pandas - /// `Categorical` / Polars `Categorical` shape. pub fn symbol_dict_i32( &mut self, name: &str, - codes: &[i32], - dict_offsets: &[i32], - dict_bytes: &[u8], - validity: Option<&Validity<'_>>, + codes: &'a [i32], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, ) -> Result<&mut Self> { - push_symbol_column( - self, + self.push_symbol( name, - codes, - |c| *c, + SymbolCodesPtr::I32(codes.as_ptr()), + codes.len(), dict_offsets, dict_bytes, validity, ) } - // ------------------------------------------------------------------ + fn push_symbol( + &mut self, + name: &str, + codes: SymbolCodesPtr, + codes_len: usize, + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, codes_len, validity)?; + if dict_offsets.is_empty() { + return Err(error::fmt!( + InvalidApiCall, + "symbol dict offsets must have at least one entry (dict_len + 1)" + )); + } + validate_varchar_offsets(dict_offsets, dict_bytes.len())?; + let dict_len = dict_offsets.len() - 1; + + // Range-check codes for non-null rows. The encoder relies on + // every non-null code being a valid dict index, so we surface + // the failure here at append time. + let bounds_check = match codes { + SymbolCodesPtr::I8(p) => unsafe { range_check_codes(p, codes_len, dict_len, validity) }, + SymbolCodesPtr::I16(p) => unsafe { + range_check_codes(p, codes_len, dict_len, validity) + }, + SymbolCodesPtr::I32(p) => unsafe { + range_check_codes(p, codes_len, dict_len, validity) + }, + }; + bounds_check?; + + self.push_column( + name, + QWP_TYPE_SYMBOL, + ColumnKind::Symbol { + codes, + dict_offsets: dict_offsets.as_ptr(), + dict_offsets_len: dict_offsets.len(), + dict_bytes: dict_bytes.as_ptr(), + dict_bytes_len: dict_bytes.len(), + }, + validity, + row_count, + ) + } + + // ------------------------------------------------------------------- // Designated timestamp - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- - /// Designated timestamp in microseconds since the Unix epoch (wire - /// type `TIMESTAMP` 0x0A). Required exactly once per chunk before - /// flush. Designated timestamps must be non-null per the wire spec — - /// there is no validity bitmap. - pub fn designated_timestamp_micros(&mut self, data: &[i64]) -> Result<&mut Self> { + pub fn designated_timestamp_micros(&mut self, data: &'a [i64]) -> Result<&mut Self> { self.set_designated_ts(QWP_TYPE_TIMESTAMP, data) } - /// Designated timestamp in nanoseconds since the Unix epoch (wire - /// type `TIMESTAMP_NANOS` 0x10). - pub fn designated_timestamp_nanos(&mut self, data: &[i64]) -> Result<&mut Self> { + pub fn designated_timestamp_nanos(&mut self, data: &'a [i64]) -> Result<&mut Self> { self.set_designated_ts(QWP_TYPE_TIMESTAMP_NANOS, data) } - fn set_designated_ts(&mut self, wire_type: u8, data: &[i64]) -> Result<&mut Self> { + fn set_designated_ts(&mut self, wire_type: u8, data: &'a [i64]) -> Result<&mut Self> { if self.designated_ts.is_some() { return Err(error::fmt!( InvalidApiCall, @@ -624,42 +664,49 @@ impl Chunk { )); } let row_count = check_row_count(self.row_count, data.len(), None)?; - let mut payload = new_payload(); - payload.push(0); // null_flag — designated_ts is always non-null - payload.reserve(8 * data.len()); - for &v in data { - payload.extend_from_slice(&v.to_le_bytes()); - } + self.designated_ts = Some(DesignatedTsDescriptor { + wire_type, + data: data.as_ptr(), + }); self.row_count = Some(row_count); - self.designated_ts = Some(DesignatedTimestamp { wire_type, payload }); Ok(self) } - // ------------------------------------------------------------------ - // Internal helpers - // ------------------------------------------------------------------ + // ------------------------------------------------------------------- + // Internal + // ------------------------------------------------------------------- fn push_column( &mut self, name: &str, wire_type: u8, - payload: Vec, + kind: ColumnKind, + validity: Option<&Validity<'_>>, row_count: usize, ) -> Result<&mut Self> { validate_name("column", name)?; + if name.len() > MAX_NAME_LEN { + return Err(error::fmt!( + InvalidName, + "column name is too long: {} bytes (max {})", + name.len(), + MAX_NAME_LEN + )); + } self.guard_unique_name(name)?; - let signature_chunk = build_signature_chunk(name, wire_type); - self.columns.push(ChunkColumn::Resolved { + let validity = validity.map(ValidityDescriptor::from_validity); + self.columns.push(ColumnDescriptor { name: name.to_owned(), - signature_chunk, - payload, + wire_type, + kind, + validity, }); self.row_count = Some(row_count); Ok(self) } fn guard_unique_name(&self, name: &str) -> Result<()> { - if self.columns.iter().any(|c| c.name() == name) { + if self.columns.iter().any(|c| c.name == name) { return Err(error::fmt!( InvalidApiCall, "duplicate column name in chunk: {:?}", @@ -670,135 +717,7 @@ impl Chunk { } } -fn build_signature_chunk(name: &str, wire_type: u8) -> Vec { - let mut sig = Vec::with_capacity(1 + name.len() + 1); - write_qwp_bytes(&mut sig, name.as_bytes()); - sig.push(wire_type); - sig -} - -fn new_payload() -> Vec { - // 1 byte null_flag, room for a small bitmap, and most callers extend - // immediately. 16 bytes is enough to avoid the first realloc for any - // short column. - Vec::with_capacity(16) -} - -/// Bulk-intern a symbol column at append time. -/// -/// Three passes (each O(row_count) or O(dict_len) but never the -/// product): -/// 1. Walk `codes` once to mark which dict entries the chunk actually -/// references in a bitset. Validate range; reject out-of-range. -/// 2. Walk the bitset to copy referenced dict entries into compact -/// `referenced_symbols` storage and build a `local → internal` map -/// keyed by dict index. -/// 3. Walk `codes` again to translate to the compact internal indices -/// and build the QWP-shape bitmap from validity. -/// -/// Defers the connection-scoped global-id assignment to flush time -/// because chunks are sender-agnostic — see `doc/COLUMN_SENDER_PLAN.md`. -fn push_symbol_column<'a, T, F>( - chunk: &'a mut Chunk, - name: &str, - codes: &[T], - to_i32: F, - dict_offsets: &[i32], - dict_bytes: &[u8], - validity: Option<&Validity<'_>>, -) -> Result<&'a mut Chunk> -where - F: Fn(&T) -> i32, -{ - let row_count = check_row_count(chunk.row_count, codes.len(), validity)?; - validate_name("column", name)?; - chunk.guard_unique_name(name)?; - - if dict_offsets.is_empty() { - return Err(error::fmt!( - InvalidApiCall, - "symbol dict offsets must have at least one entry (dict_len + 1)" - )); - } - validate_varchar_offsets(dict_offsets, dict_bytes.len())?; - let dict_len = dict_offsets.len() - 1; - - // Pass 1: referenced bitset + range check. - let mut referenced = vec![false; dict_len]; - let mut non_null_count = 0usize; - for (i, code) in codes.iter().enumerate() { - if !validity.is_none_or(|v| v.is_valid(i)) { - continue; - } - let idx = to_i32(code); - if idx < 0 || (idx as usize) >= dict_len { - return Err(error::fmt!( - InvalidApiCall, - "symbol code out of range: row {} -> {} (dict_len = {})", - i, - idx, - dict_len - )); - } - referenced[idx as usize] = true; - non_null_count += 1; - } - - // Pass 2: compact referenced dict + build local-to-internal map. - // `local_to_internal[d] == u32::MAX` for unreferenced entries; we - // never index it with an unreferenced code (pass 1 marked them so - // pass 3 only follows referenced entries). `dict_offsets` are - // absolute byte offsets into `dict_bytes` per the Arrow Utf8 layout - // (`validate_varchar_offsets` has already proven the slices are in - // bounds and monotonic). - let mut local_to_internal = vec![u32::MAX; dict_len]; - let mut referenced_symbols: Vec> = Vec::new(); - for (d, mark) in referenced.iter().enumerate() { - if !*mark { - continue; - } - let start = dict_offsets[d] as usize; - let end = dict_offsets[d + 1] as usize; - let internal = referenced_symbols.len() as u32; - referenced_symbols.push(dict_bytes[start..end].to_vec()); - local_to_internal[d] = internal; - } - - // Pass 3: translate codes to internal indices; build QWP bitmap. - let mut compact_codes = Vec::with_capacity(codes.len()); - for (i, code) in codes.iter().enumerate() { - if !validity.is_none_or(|v| v.is_valid(i)) { - compact_codes.push(u32::MAX); - continue; - } - let idx = to_i32(code) as usize; - compact_codes.push(local_to_internal[idx]); - } - let bitmap = validity.map(|v| { - let mut bm = Vec::with_capacity(row_count.div_ceil(8)); - v.write_qwp_bitmap(&mut bm); - bm - }); - - let signature_chunk = build_signature_chunk(name, QWP_TYPE_SYMBOL); - chunk.columns.push(ChunkColumn::Symbol { - name: name.to_owned(), - signature_chunk, - row_count, - codes: compact_codes, - bitmap, - non_null_count, - referenced_symbols, - }); - chunk.row_count = Some(row_count); - Ok(chunk) -} - fn validate_varchar_offsets(offsets: &[i32], bytes_len: usize) -> Result<()> { - // Arrow Utf8 promises monotonic non-decreasing offsets and that every - // offset is ≤ bytes_len. We trust UTF-8 (server enforces) but cheap - // bounds checking here saves the server an obvious parse error and - // gives us a meaningful Rust-side error. let mut prev = offsets[0]; if prev < 0 { return Err(error::fmt!( @@ -831,120 +750,38 @@ fn validate_varchar_offsets(offsets: &[i32], bytes_len: usize) -> Result<()> { Ok(()) } -#[inline] -fn encode_le_numeric<'a, T, const N: usize, F>( - chunk: &'a mut Chunk, - name: &str, - wire_type: u8, - data: &[T], - validity: Option<&Validity<'_>>, - null_value: T, - to_le: F, -) -> Result<&'a mut Chunk> -where - T: Copy, - F: Fn(T) -> [u8; N], -{ - let row_count = check_row_count(chunk.row_count, data.len(), validity)?; - let mut payload = new_payload(); - payload.push(0); // null_flag — non-sparse-null types always use sentinels - payload.reserve(N * row_count); - match validity { - None => { - // Safety: `[T]` and the resulting `[u8]` view share the same - // backing memory; `T` is a plain numeric POD so any byte - // pattern is sound. This is the column-sender hot path — pure - // memcpy. - let bytes: &[u8] = unsafe { - std::slice::from_raw_parts(data.as_ptr().cast::(), std::mem::size_of_val(data)) - }; - payload.extend_from_slice(bytes); - } - Some(v) => { - for (i, &value) in data.iter().enumerate() { - let out = if v.is_valid(i) { value } else { null_value }; - payload.extend_from_slice(&to_le(out)); - } - } - } - chunk.push_column(name, wire_type, payload, row_count) -} - -#[inline] -fn encode_le_bitmap<'a, T, const N: usize, F>( - chunk: &'a mut Chunk, - name: &str, - wire_type: u8, - data: &[T], +/// SAFETY: `p` must point to `codes_len` valid `T`s. `validity` (if any) +/// must have `bit_len == codes_len` and a bitmap of at least +/// `ceil(codes_len / 8)` bytes — both enforced by `check_row_count` and +/// `Validity::from_bitmap` before this is called. +unsafe fn range_check_codes( + p: *const T, + codes_len: usize, + dict_len: usize, validity: Option<&Validity<'_>>, - to_le: F, -) -> Result<&'a mut Chunk> +) -> Result<()> where - T: Copy, - F: Fn(T) -> [u8; N], + T: Copy + Into, { - let row_count = check_row_count(chunk.row_count, data.len(), validity)?; - let mut payload = new_payload(); - match validity { - None => { - payload.push(0); // null_flag - payload.reserve(N * row_count); - let bytes: &[u8] = unsafe { - std::slice::from_raw_parts(data.as_ptr().cast::(), std::mem::size_of_val(data)) - }; - payload.extend_from_slice(bytes); - } - Some(v) => { - payload.push(1); // null_flag — bitmap follows - v.write_qwp_bitmap(&mut payload); - payload.reserve(N * v.non_null_count()); - for (i, &value) in data.iter().enumerate() { - if v.is_valid(i) { - payload.extend_from_slice(&to_le(value)); - } - } - } - } - chunk.push_column(name, wire_type, payload, row_count) -} - -#[inline] -fn encode_fixed_width_bitmap<'a, const N: usize>( - chunk: &'a mut Chunk, - name: &str, - wire_type: u8, - data: &[[u8; N]], - validity: Option<&Validity<'_>>, - elem_size: usize, -) -> Result<&'a mut Chunk> { - debug_assert_eq!(elem_size, N); - let row_count = check_row_count(chunk.row_count, data.len(), validity)?; - let mut payload = new_payload(); - match validity { - None => { - payload.push(0); // null_flag - payload.reserve(N * row_count); - // Bulk memcpy: `[[u8; N]]` is laid out as `N * row_count` bytes - // contiguously, no per-row work. - let bytes: &[u8] = - unsafe { std::slice::from_raw_parts(data.as_ptr().cast::(), N * data.len()) }; - payload.extend_from_slice(bytes); + for i in 0..codes_len { + if validity.is_some_and(|v| !v.is_valid(i)) { + continue; } - Some(v) => { - payload.push(1); // null_flag — bitmap follows - v.write_qwp_bitmap(&mut payload); - payload.reserve(N * v.non_null_count()); - for (i, value) in data.iter().enumerate() { - if v.is_valid(i) { - payload.extend_from_slice(&value[..]); - } - } + let code = unsafe { (*p.add(i)).into() }; + if code < 0 || (code as usize) >= dict_len { + return Err(error::fmt!( + InvalidApiCall, + "symbol code out of range: row {} -> {} (dict_len = {})", + i, + code, + dict_len + )); } } - chunk.push_column(name, wire_type, payload, row_count) + Ok(()) } -impl Debug for Chunk { +impl Debug for Chunk<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("Chunk") .field("table", &self.table) @@ -962,9 +799,11 @@ mod tests { #[test] fn locks_row_count_on_first_column() { let mut chunk = Chunk::new("t"); - chunk.column_i64("a", &[1, 2, 3], None).unwrap(); + let a = [1i64, 2, 3]; + chunk.column_i64("a", &a, None).unwrap(); assert_eq!(chunk.row_count(), 3); - let err = chunk.column_i64("b", &[1, 2], None).unwrap_err(); + let b = [4i64, 5]; + let err = chunk.column_i64("b", &b, None).unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); assert!(err.msg().contains("row_count")); } @@ -972,8 +811,10 @@ mod tests { #[test] fn rejects_duplicate_column_name() { let mut chunk = Chunk::new("t"); - chunk.column_i64("a", &[1], None).unwrap(); - let err = chunk.column_i64("a", &[2], None).unwrap_err(); + let a1 = [1i64]; + chunk.column_i64("a", &a1, None).unwrap(); + let a2 = [2i64]; + let err = chunk.column_i64("a", &a2, None).unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); assert!(err.msg().contains("duplicate")); } @@ -983,178 +824,79 @@ mod tests { let mut chunk = Chunk::new("t"); let bits = [0xFFu8]; let v = Validity::from_bitmap(&bits, 8).unwrap(); - let err = chunk.column_i64("a", &[1, 2, 3], Some(&v)).unwrap_err(); + let data = [1i64, 2, 3]; + let err = chunk.column_i64("a", &data, Some(&v)).unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); assert!(err.msg().contains("Validity bitmap")); } - #[test] - fn nullable_i64_sentinel_encodes() { - let mut chunk = Chunk::new("t"); - let bits = [0b0000_0101]; // bits 0,2 valid; bit 1 null - let v = Validity::from_bitmap(&bits, 3).unwrap(); - chunk.column_i64("a", &[10, 99, 20], Some(&v)).unwrap(); - let payload = chunk.columns[0].resolved_payload(); - assert_eq!(payload[0], 0, "null_flag must be 0 for I64"); - let raw: Vec = payload[1..] - .chunks_exact(8) - .map(|b| i64::from_le_bytes(b.try_into().unwrap())) - .collect(); - assert_eq!(raw, vec![10, I64_NULL, 20]); - } - - #[test] - fn nullable_uuid_uses_bitmap() { - let mut chunk = Chunk::new("t"); - let uuids: [[u8; 16]; 3] = [[0x10; 16], [0x99; 16], [0x20; 16]]; - let bits = [0b0000_0101]; // 0 valid, 1 null, 2 valid - let v = Validity::from_bitmap(&bits, 3).unwrap(); - chunk.column_uuid("u", &uuids, Some(&v)).unwrap(); - let payload = chunk.columns[0].resolved_payload(); - assert_eq!(payload[0], 1, "null_flag must be 1 (bitmap follows)"); - // QWP bitmap: bit=1 means NULL. Arrow bits = 0b101 → invert = - // 0b010 masked to 3 bits. - let qwp_bitmap = payload[1]; - assert_eq!(qwp_bitmap & 0b111, 0b010); - // Dense values: rows 0 and 2 only. - let dense = &payload[2..]; - assert_eq!(dense.len(), 32); - assert_eq!(&dense[..16], &[0x10u8; 16]); - assert_eq!(&dense[16..], &[0x20u8; 16]); - } - #[test] fn designated_ts_sets_row_count() { let mut chunk = Chunk::new("t"); - chunk.designated_timestamp_micros(&[1, 2, 3]).unwrap(); + let ts = [1i64, 2, 3]; + chunk.designated_timestamp_micros(&ts).unwrap(); assert_eq!(chunk.row_count(), 3); - let err = chunk.designated_timestamp_nanos(&[4, 5, 6]).unwrap_err(); + let ts2 = [4i64, 5, 6]; + let err = chunk.designated_timestamp_nanos(&ts2).unwrap_err(); assert!(err.msg().contains("designated")); } #[test] fn clear_resets_columns_but_keeps_table() { let mut chunk = Chunk::new("t"); - chunk.column_i64("a", &[1], None).unwrap(); - chunk.designated_timestamp_nanos(&[10]).unwrap(); + let a = [1i64]; + let ts = [10i64]; + chunk.column_i64("a", &a, None).unwrap(); + chunk.designated_timestamp_nanos(&ts).unwrap(); chunk.clear(); assert_eq!(chunk.row_count(), 0); assert!(chunk.is_empty()); assert_eq!(chunk.table(), "t"); } - #[test] - fn name_validation_rejects_overlong_names() { - let mut chunk = Chunk::new("t"); - let too_long = "x".repeat(super::super::wire::MAX_NAME_LEN + 1); - let err = chunk.column_i64(&too_long, &[1], None).unwrap_err(); - assert_eq!(err.code(), crate::ErrorCode::InvalidName); - } - - #[test] - fn varchar_no_null_memcpy_path() { - let mut chunk = Chunk::new("t"); - let offsets: [i32; 4] = [0, 3, 7, 11]; - let bytes = b"abcdefghijk"; - chunk.column_varchar("v", &offsets, bytes, None).unwrap(); - let payload = chunk.columns[0].resolved_payload(); - assert_eq!(payload[0], 0, "null_flag"); - // Offset table: 4 u32 little-endian values matching `offsets`. - let table = &payload[1..1 + 16]; - let parsed: Vec = table - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); - assert_eq!(parsed, vec![0u32, 3, 7, 11]); - // Byte buffer follows. - assert_eq!(&payload[1 + 16..], bytes); - } - - #[test] - fn varchar_no_null_rebases_non_zero_first_offset() { - let mut chunk = Chunk::new("t"); - // Caller's Arrow slice starts at offset 5. - let offsets: [i32; 3] = [5, 8, 12]; - let bytes = b"_____abcdefg____"; - chunk.column_varchar("v", &offsets, bytes, None).unwrap(); - let payload = chunk.columns[0].resolved_payload(); - assert_eq!(payload[0], 0); - let table = &payload[1..1 + 12]; - let parsed: Vec = table - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); - assert_eq!(parsed, vec![0u32, 3, 7]); - assert_eq!(&payload[1 + 12..], b"abcdefg"); - } - - #[test] - fn varchar_nullable_gather_skips_null_rows() { - let mut chunk = Chunk::new("t"); - // 3 rows; row 1 is null. Per the plan we MUST not slice - // bytes[offsets[1]..offsets[2]] for null rows. We assert the - // skip implicitly by reusing the same offset on both sides of - // the null row (so dense bytes still match what's expected) and - // by checking the output's bytes equal the union of non-null - // slices only. - let offsets: [i32; 4] = [0, 3, 3, 6]; - let bytes = b"abcxyz"; - let bits = [0b0000_0101]; // 0 valid, 1 null, 2 valid - let v = Validity::from_bitmap(&bits, 3).unwrap(); - chunk - .column_varchar("v", &offsets, bytes, Some(&v)) - .unwrap(); - let payload = chunk.columns[0].resolved_payload(); - assert_eq!(payload[0], 1, "null_flag = 1 (bitmap follows)"); - // QWP bitmap byte: invert Arrow bits 0b101 → 0b010 (mask to 3 bits). - assert_eq!(payload[1] & 0b111, 0b010); - // 2 non-null rows → 3 offsets (u32 each) = 12 bytes, then bytes. - let offsets_section = &payload[2..2 + 12]; - let parsed: Vec = offsets_section - .chunks_exact(4) - .map(|b| u32::from_le_bytes(b.try_into().unwrap())) - .collect(); - assert_eq!(parsed, vec![0u32, 3, 6]); - assert_eq!(&payload[2 + 12..], b"abcxyz"); - } - #[test] fn varchar_rejects_negative_offset() { let mut chunk = Chunk::new("t"); - let offsets: [i32; 3] = [-1, 1, 2]; + let offsets = [-1i32, 1, 2]; let err = chunk .column_varchar("v", &offsets, b"ab", None) .unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); - assert!(err.msg().contains("non-negative"), "msg: {}", err.msg()); + assert!(err.msg().contains("non-negative")); } #[test] fn varchar_rejects_non_monotonic_offsets() { let mut chunk = Chunk::new("t"); - let offsets: [i32; 3] = [0, 5, 3]; + let offsets = [0i32, 5, 3]; let err = chunk .column_varchar("v", &offsets, b"abcde", None) .unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); - assert!(err.msg().contains("non-decreasing"), "msg: {}", err.msg()); + assert!(err.msg().contains("non-decreasing")); } #[test] - fn varchar_rejects_offsets_past_bytes_end() { + fn symbol_rejects_out_of_range_code() { let mut chunk = Chunk::new("t"); - let offsets: [i32; 3] = [0, 2, 7]; + let codes = [0i32, 99]; + let dict_offsets = [0i32, 5]; let err = chunk - .column_varchar("v", &offsets, b"abcde", None) + .symbol_dict_i32("sym", &codes, &dict_offsets, b"alpha", None) .unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); - assert!(err.msg().contains("bytes buffer"), "msg: {}", err.msg()); + assert!(err.msg().contains("out of range")); } #[test] - fn varchar_rejects_empty_offsets() { + fn symbol_skips_null_codes() { let mut chunk = Chunk::new("t"); - let err = chunk.column_varchar("v", &[], b"", None).unwrap_err(); - assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + let codes = [0i32, 99]; + let dict_offsets = [0i32, 5]; + let bits = [0b0000_0001]; + let v = Validity::from_bitmap(&bits, 2).unwrap(); + chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, b"alpha", Some(&v)) + .expect("null row's bogus code is ignored"); } } diff --git a/questdb-rs/src/ingress/column_sender/conf.rs b/questdb-rs/src/ingress/column_sender/conf.rs index f024670c..d5c27b43 100644 --- a/questdb-rs/src/ingress/column_sender/conf.rs +++ b/questdb-rs/src/ingress/column_sender/conf.rs @@ -71,9 +71,6 @@ impl Default for PoolConfig { #[derive(Debug, Clone)] pub(crate) struct ParsedConf { pub(crate) pool: PoolConfig, - /// `true` iff the connect string opted in to durable acks via - /// `request_durable_ack=on`. Required for `AckLevel::Durable` flushes. - pub(crate) durable_ack_opt_in: bool, } /// Validate and extract pool-specific knobs from a column-sender connect @@ -104,7 +101,6 @@ pub(crate) fn parse(conf: &str) -> Result { let mut pool = PoolConfig::default(); let mut pool_size_specified = false; - let mut durable_ack_opt_in = false; walk_params(params, |key, value| { if is_refused_key(key) { @@ -112,7 +108,9 @@ pub(crate) fn parse(conf: &str) -> Result { } match key { "request_durable_ack" => { - durable_ack_opt_in = parse_on_off("request_durable_ack", value)?; + // Syntactic check; the SenderBuilder also parses this + // for ColumnConn. + let _ = parse_on_off("request_durable_ack", value)?; } "qwp_ws_progress" if value != "background" => { return Err(error::fmt!( @@ -181,10 +179,7 @@ pub(crate) fn parse(conf: &str) -> Result { )); } - Ok(ParsedConf { - pool, - durable_ack_opt_in, - }) + Ok(ParsedConf { pool }) } fn parse_on_off(key: &str, value: &str) -> Result { @@ -376,12 +371,12 @@ mod tests { #[test] fn parses_request_durable_ack() { - let off = parse_ok("qwpws::addr=localhost:9000;"); - assert!(!off.durable_ack_opt_in); - let on = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=on;"); - assert!(on.durable_ack_opt_in); - let explicit_off = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=off;"); - assert!(!explicit_off.durable_ack_opt_in); + // Syntactically valid values pass the column-sender's pre-check. + // The actual `durable_ack_opt_in` flag is sourced from the + // SenderBuilder inside `ColumnConn::connect`. + let _ = parse_ok("qwpws::addr=localhost:9000;"); + let _ = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=on;"); + let _ = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=off;"); } #[test] diff --git a/questdb-rs/src/ingress/column_sender/conn.rs b/questdb-rs/src/ingress/column_sender/conn.rs new file mode 100644 index 00000000..cb46ca83 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/conn.rs @@ -0,0 +1,966 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Dedicated pipelined QWP/WebSocket connection for the column-major +//! sender. +//! +//! `ColumnConn` owns its socket end-to-end. Each `publish_qwp` writes a +//! single QWP frame into the connection's reusable write buffer, masks it +//! per RFC 6455, and `write_all`s to the socket — then returns immediately +//! without waiting for the server's ack. Between publishes, ready acks +//! are drained non-blocking via `try_drain_acks`. When the in-flight +//! count hits the protocol cap (128), the next publish blocks until one +//! ack frees a slot. An explicit `sync_all_acks` blocks until every +//! in-flight frame is acknowledged. +//! +//! No replay queue, no background thread — single-thread, single-socket, +//! pipelined. + +use std::collections::{HashMap, VecDeque}; +use std::io::{self, Read, Write}; +use std::time::Duration; + +use crate::ingress::SenderBuilder; +use crate::ingress::sender::qwp_ws::WsStream; +use crate::ws::frame::{self, FrameError, FrameHeader, Opcode}; +use crate::ws::mask::{MaskKeySource, apply_mask}; +use crate::{Result, error}; + +use super::sender::AckLevel; + +/// Bytes the encoder leaves untouched at the start of `write_buf` so the +/// WS header can be prepended in place without a copy. RFC 6455 §5.2: the +/// client-to-server header is at most 14 bytes (1 flag + 1 len + 8 ext len +/// + 4 mask key). +pub(crate) const WS_HEADER_RESERVE: usize = 14; + +// Status bytes from the QWP/WS response opcode table. Duplicated here per +// the "no row-API code reuse" stance — the column sender never reaches +// into `crate::ingress::sender::qwp_ws_codec`. +const QWP_STATUS_OK: u8 = 0x00; +const QWP_STATUS_DURABLE_ACK: u8 = 0x02; +const QWP_STATUS_SCHEMA_MISMATCH: u8 = 0x03; +const QWP_STATUS_PARSE_ERROR: u8 = 0x05; +const QWP_STATUS_INTERNAL_ERROR: u8 = 0x06; +const QWP_STATUS_SECURITY_ERROR: u8 = 0x08; +const QWP_STATUS_WRITE_ERROR: u8 = 0x09; + +/// Cap on a single inbound WS frame. Well above QWP's 16 MiB batch limit +/// but small enough to refuse obviously bogus declared lengths early. +const MAX_INBOUND_FRAME_BYTES: u64 = 256 * 1024 * 1024; + +/// QWP spec §Protocol limits: max in-flight batches per connection. +const MAX_IN_FLIGHT: u32 = 128; + +/// Metadata for one published-but-unacked frame. Pushed on publish, +/// popped (front) when the matching OK arrives. +struct PendingAck { + fsn: u64, +} + +/// One pipelined QWP/WebSocket connection owned by the column-major +/// sender. See module docs. +pub(crate) struct ColumnConn { + stream: WsStream, + /// Bytes the WS handshake read past the upgrade response, plus any + /// bytes from inbound WS frames already consumed past their header. + /// Drained before reading more from the socket. + leftover: Vec, + /// Reusable outbound buffer. Bytes 0..WS_HEADER_RESERVE are reserved + /// for the WS header; the encoder writes the QWP frame body from + /// offset WS_HEADER_RESERVE onwards. + write_buf: Vec, + /// Reusable inbound scratch (one ack frame's worth). + read_buf: Vec, + mask_keys: MaskKeySource, + /// Sequence assigned to the next published frame. QWP server numbers + /// client frames starting at 0; first publish gets fsn 0. + next_fsn: u64, + /// Published-but-unacked frames, ordered by fsn. Pushed on publish, + /// popped (front) when the matching OK arrives. + pending_acks: VecDeque, + /// Number of published-but-unacked frames. Redundant with + /// `pending_acks.len()` but avoids a cast for the 128 cap check. + in_flight: u32, + /// For ack_level=Durable: per-table seq_txn watermark the server has + /// reported reaching durable storage. + durable_watermarks: HashMap, + /// Sticky: once `true`, the connection cannot be used for further + /// publishes; the pool drops the slot on return. + must_close: bool, + max_buf_size: usize, + request_timeout: Duration, + durable_ack_opt_in: bool, +} + +impl ColumnConn { + /// Open a fresh column-sender connection. The pool layer + /// ([`super::QuestDb::connect`]) has already extracted pool-specific + /// knobs and refused `sf_*` keys; this function only reaches the + /// remaining QWP/WS settings via [`SenderBuilder::from_conf`]. + pub(crate) fn connect(conf: &str) -> Result { + let builder = SenderBuilder::from_conf(conf)?; + let raw = builder.build_qwp_ws_raw_stream()?; + let mask_keys = MaskKeySource::new() + .map_err(|e| error::fmt!(SocketError, "MaskKeySource init failed: {}", e.0))?; + Ok(Self { + stream: raw.stream, + leftover: raw.leftover, + write_buf: Vec::with_capacity(64 * 1024), + read_buf: Vec::with_capacity(4 * 1024), + mask_keys, + next_fsn: 0, + pending_acks: VecDeque::new(), + in_flight: 0, + durable_watermarks: HashMap::new(), + must_close: false, + max_buf_size: raw.max_buf_size, + request_timeout: raw.request_timeout, + durable_ack_opt_in: raw.durable_ack_opt_in, + }) + } + + pub(crate) fn must_close(&self) -> bool { + self.must_close + } + + /// Hand `encode` a `&mut Vec` with `WS_HEADER_RESERVE` bytes + /// pre-reserved at the front; `encode` appends the QWP frame body to + /// it. Frame the result as a WS binary frame (mask in place), write + /// the bytes to the socket, return the assigned FSN. + /// + /// On any socket or protocol failure the connection is latched as + /// `must_close` and the original error is returned. + pub(crate) fn publish_qwp(&mut self, encode: F) -> Result + where + F: FnOnce(&mut Vec) -> Result<()>, + { + if self.must_close { + return Err(error::fmt!( + SocketError, + "QWP/WebSocket connection latched as terminal; \ + return the sender to the pool and acquire a fresh one." + )); + } + + // Set up the buffer: 14 zero bytes that the WS header will + // overwrite once we know the actual payload length. + self.write_buf.clear(); + self.write_buf.resize(WS_HEADER_RESERVE, 0); + + // Caller writes the QWP frame body. + encode(&mut self.write_buf).inspect_err(|_| { + // Encode failure leaves the connection usable — the bytes + // never hit the wire — but the buffer state needs resetting + // so the next publish starts clean. + self.write_buf.clear(); + })?; + + let payload_len = self.write_buf.len() - WS_HEADER_RESERVE; + if payload_len > self.max_buf_size { + return Err(error::fmt!( + InvalidApiCall, + "QWP frame ({} bytes) exceeds max_buf_size ({} bytes)", + payload_len, + self.max_buf_size + )); + } + + let mask_key = self.mask_keys.next_key().map_err(|e| { + self.latch(error::fmt!(SocketError, "mask key entropy failed: {}", e.0)) + })?; + + // Apply the mask to the QWP frame body in place. + apply_mask(&mut self.write_buf[WS_HEADER_RESERVE..], mask_key, 0); + + // Compute the WS header byte count for this payload length. + let ws_header_len = ws_header_len_for(payload_len); + let header_offset = WS_HEADER_RESERVE - ws_header_len; + write_ws_header( + &mut self.write_buf[header_offset..WS_HEADER_RESERVE], + payload_len, + mask_key, + ); + + self.set_timeouts(Some(self.request_timeout), Some(self.request_timeout))?; + self.stream + .write_all(&self.write_buf[header_offset..]) + .map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket write failed: {}", + e + )) + })?; + self.stream.flush().map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket flush failed: {}", + e + )) + })?; + + let fsn = self.next_fsn; + self.next_fsn = self.next_fsn.wrapping_add(1); + Ok(PublishedFrame { fsn }) + } + + /// Record a just-published frame as in-flight. Called by + /// `ColumnSender::flush` after `publish_qwp` succeeds. + pub(crate) fn push_pending(&mut self, fsn: u64) { + self.pending_acks.push_back(PendingAck { fsn }); + self.in_flight += 1; + } + + /// Number of published-but-unacked frames. + pub(crate) fn in_flight(&self) -> u32 { + self.in_flight + } + + /// Drain any ack responses available without blocking. Returns the + /// number of OK acks consumed. + pub(crate) fn try_drain_acks(&mut self) -> Result { + let mut drained = 0u32; + loop { + match self.try_recv_qwp_response()? { + None => return Ok(drained), + Some(response) => { + self.process_response(response)?; + drained += 1; + } + } + } + } + + /// Block until at least one OK ack arrives. Used when + /// `in_flight == MAX_IN_FLIGHT` to free a slot. + pub(crate) fn drain_one_ack_blocking(&mut self) -> Result<()> { + loop { + let response = self.recv_qwp_response()?; + match &response { + QwpResponse::Ok { .. } => { + self.process_response(response)?; + return Ok(()); + } + _ => { + self.process_response(response)?; + } + } + } + } + + /// Block until all in-flight frames are OK-acked. For + /// `AckLevel::Durable`, also wait for durable watermarks to reach + /// every pending frame's seq_txn. + pub(crate) fn sync_all_acks(&mut self, ack_level: AckLevel) -> Result<()> { + if self.must_close { + return Err(error::fmt!( + SocketError, + "QWP/WebSocket connection latched as terminal." + )); + } + if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { + return Err(error::fmt!( + InvalidApiCall, + "AckLevel::Durable requires the pool to be opened with \ + `request_durable_ack=on` in the connect string." + )); + } + + // Phase 1: drain all OK acks. + let mut durable_targets: HashMap = HashMap::new(); + while self.in_flight > 0 { + let response = self.recv_qwp_response()?; + if let QwpResponse::Ok { tables, .. } = &response + && ack_level == AckLevel::Durable + { + for (t, seq_txn) in tables { + let entry = durable_targets.entry(t.clone()).or_insert(i64::MIN); + if *seq_txn > *entry { + *entry = *seq_txn; + } + } + } + self.process_response(response)?; + } + + // Phase 2 (Durable only): wait for watermarks. + if ack_level == AckLevel::Durable { + while durable_targets.iter().any(|(t, target)| { + self.durable_watermarks.get(t).copied().unwrap_or(i64::MIN) < *target + }) { + let response = self.recv_qwp_response()?; + self.process_response(response)?; + } + } + + Ok(()) + } + + /// Dispatch a parsed QWP response: validate OK sequence, update + /// in-flight tracking, absorb durable watermarks, latch on error. + fn process_response(&mut self, response: QwpResponse) -> Result<()> { + match response { + QwpResponse::Ok { sequence, tables } => { + // The server sends cumulative OKs: sequence=N means all + // frames up to and including N are committed. Pop every + // pending entry whose fsn <= sequence. + let mut popped = 0u32; + while let Some(front) = self.pending_acks.front() { + if front.fsn > sequence { + break; + } + self.pending_acks.pop_front(); + popped += 1; + } + if popped == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP OK sequence {} has no matching pending frame (next pending: {:?})", + sequence, + self.pending_acks.front().map(|p| p.fsn) + ))); + } + self.in_flight -= popped; + for (t, seq_txn) in tables { + self.durable_watermarks + .entry(t) + .and_modify(|w| { + if seq_txn > *w { + *w = seq_txn; + } + }) + .or_insert(seq_txn); + } + Ok(()) + } + QwpResponse::DurableAck { tables } => { + for (t, seq_txn) in tables { + self.durable_watermarks + .entry(t) + .and_modify(|w| { + if seq_txn > *w { + *w = seq_txn; + } + }) + .or_insert(seq_txn); + } + Ok(()) + } + QwpResponse::Error { + sequence, + status, + message, + } => { + let err = map_error_status(status, &message); + Err(self.latch(crate::Error::new( + err.code(), + format!( + "QWP server error on fsn {}: status=0x{:02x}, message={:?}", + sequence, status, message + ), + ))) + } + } + } + + /// `true` when the in-flight count has hit the protocol cap and a + /// blocking drain is needed before the next publish. + pub(crate) fn at_in_flight_cap(&self) -> bool { + self.in_flight >= MAX_IN_FLIGHT + } + + /// Latches the connection as terminal and returns the originating + /// error. Used by every socket-side failure path. + fn latch(&mut self, err: crate::Error) -> crate::Error { + self.must_close = true; + err + } + + fn set_timeouts(&self, read: Option, write: Option) -> Result<()> { + // WsStream::set_timeouts is `fn` (not pub(crate)). We replicate + // the socket timeout setting via the tcp_stream accessor, but + // since WsStream::set_timeouts is private we have to use the + // Read/Write IO directly. Skip explicit timeout muting here: + // the underlying socket already has timeouts set during connect + // (see establish_connection in qwp_ws.rs). If they need refresh + // for long flushes, expose a setter on WsStream. + let _ = read; + let _ = write; + Ok(()) + } + + /// Non-blocking attempt to read one QWP/WS data frame. Returns + /// `Ok(None)` if no complete frame is available yet (WouldBlock). + fn try_recv_qwp_response(&mut self) -> Result> { + loop { + match FrameHeader::parse(&self.leftover) { + Ok(h) => { + if !h.fin { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket server sent a fragmented frame; QWP is FIN-only" + ))); + } + if h.payload_len > MAX_INBOUND_FRAME_BYTES { + return Err(self.latch(error::fmt!( + SocketError, + "WS frame declared {} payload bytes (max {})", + h.payload_len, + MAX_INBOUND_FRAME_BYTES + ))); + } + let payload_len = h.payload_len as usize; + let header_len = h.header_len; + // Check if we have enough leftover for header + payload. + if self.leftover.len() < header_len + payload_len { + // We have the header but not the full payload yet. + // Try one non-blocking read to get more. + if !self.try_fill_leftover()? { + return Ok(None); + } + continue; + } + // Consume header + payload from leftover. + self.leftover.drain(..header_len); + self.read_buf.clear(); + self.read_buf + .extend_from_slice(&self.leftover[..payload_len]); + self.leftover.drain(..payload_len); + match h.opcode { + Opcode::Binary => { + return parse_qwp_response(&self.read_buf) + .inspect_err(|_| { + self.must_close = true; + }) + .map(Some); + } + Opcode::Ping => { + self.send_pong(payload_len)?; + continue; + } + Opcode::Pong => continue, + Opcode::Close => { + self.must_close = true; + return Err(error::fmt!( + SocketError, + "QWP/WebSocket server closed the connection" + )); + } + } + } + Err(FrameError::Incomplete) => { + if !self.try_fill_leftover()? { + return Ok(None); + } + } + Err(FrameError::Protocol(msg)) => { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket frame parse error: {}", + msg + ))); + } + } + } + } + + /// Read one QWP/WS data frame's payload and decode the QWP response. + /// Ping frames are answered transparently; pong frames are dropped; + /// close frames latch the connection. + fn recv_qwp_response(&mut self) -> Result { + loop { + let header = self.read_ws_frame_header()?; + if !header.fin { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket server sent a fragmented frame; QWP is FIN-only" + ))); + } + let payload_len = header.payload_len as usize; + if header.payload_len > MAX_INBOUND_FRAME_BYTES { + return Err(self.latch(error::fmt!( + SocketError, + "WS frame declared {} payload bytes (max {})", + header.payload_len, + MAX_INBOUND_FRAME_BYTES + ))); + } + self.read_buf.clear(); + self.read_buf.resize(payload_len, 0); + self.read_exact_into_buf(payload_len)?; + match header.opcode { + Opcode::Binary => { + return parse_qwp_response(&self.read_buf).inspect_err(|_| { + // Parse error: not a transport failure; the + // server gave us bytes that don't conform to the + // QWP response schema. Latch and surface. + self.must_close = true; + }); + } + Opcode::Ping => { + self.send_pong(payload_len)?; + continue; + } + Opcode::Pong => { + continue; + } + Opcode::Close => { + self.must_close = true; + return Err(error::fmt!( + SocketError, + "QWP/WebSocket server closed the connection" + )); + } + } + } + } + + /// Read a complete WS frame header from `leftover` / the socket. + fn read_ws_frame_header(&mut self) -> Result { + // Need at most 10 bytes for any header we'd parse (server frames + // are unmasked). + loop { + match FrameHeader::parse(&self.leftover) { + Ok(h) => { + // Trim the header bytes from leftover and return. + let header_len = h.header_len; + self.leftover.drain(..header_len); + return Ok(h); + } + Err(FrameError::Incomplete) => { + self.fill_leftover()?; + } + Err(FrameError::Protocol(msg)) => { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket frame parse error: {}", + msg + ))); + } + } + } + } + + /// Fill `read_buf[..len]` from `leftover` + the socket. + fn read_exact_into_buf(&mut self, len: usize) -> Result<()> { + let from_leftover = self.leftover.len().min(len); + self.read_buf[..from_leftover].copy_from_slice(&self.leftover[..from_leftover]); + self.leftover.drain(..from_leftover); + let mut filled = from_leftover; + while filled < len { + let n = self + .stream + .read(&mut self.read_buf[filled..]) + .map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket read failed: {}", + e + )) + })?; + if n == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly during frame read" + ))); + } + filled += n; + } + Ok(()) + } + + /// Non-blocking attempt to read more bytes from the socket into + /// `leftover`. Returns `Ok(true)` if data was read, `Ok(false)` on + /// WouldBlock. + fn try_fill_leftover(&mut self) -> Result { + let mut chunk = [0u8; 4096]; + match self.stream.read_nonblocking_once(&mut chunk) { + Ok(0) => Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly" + ))), + Ok(n) => { + self.leftover.extend_from_slice(&chunk[..n]); + Ok(true) + } + Err(e) if e.kind() == io::ErrorKind::WouldBlock => Ok(false), + Err(e) => Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket non-blocking read failed: {}", + e + ))), + } + } + + /// Read at least one more byte from the socket into `leftover`. + fn fill_leftover(&mut self) -> Result<()> { + let mut chunk = [0u8; 1024]; + let n = self.stream.read(&mut chunk).map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket read failed: {}", + e + )) + })?; + if n == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly while reading frame header" + ))); + } + self.leftover.extend_from_slice(&chunk[..n]); + Ok(()) + } + + fn send_pong(&mut self, payload_len: usize) -> Result<()> { + // The pong payload must echo the ping payload, which is in + // read_buf[..payload_len]. + let mask_key = self.mask_keys.next_key().map_err(|e| { + self.latch(error::fmt!(SocketError, "mask key entropy failed: {}", e.0)) + })?; + // Use a small scratch buffer to encode the pong; pongs are tiny + // (≤ 125 bytes by RFC) so this allocation is negligible. + let mut pong = Vec::with_capacity(WS_HEADER_RESERVE + payload_len); + frame::encode_client_frame( + &mut pong, + Opcode::Pong, + mask_key, + &self.read_buf[..payload_len], + ); + self.stream.write_all(&pong).map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket pong write failed: {}", + e + )) + })?; + self.stream.flush().map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket pong flush failed: {}", + e + )) + })?; + Ok(()) + } +} + +/// Outcome of a successful publish call. +pub(crate) struct PublishedFrame { + pub(crate) fsn: u64, +} + +#[derive(Debug)] +enum QwpResponse { + Ok { + sequence: u64, + tables: Vec<(String, i64)>, + }, + DurableAck { + tables: Vec<(String, i64)>, + }, + Error { + sequence: u64, + status: u8, + message: String, + }, +} + +/// Parse a QWP/WS response payload (the body of a binary WS frame). +fn parse_qwp_response(payload: &[u8]) -> Result { + if payload.is_empty() { + return Err(error::fmt!(SocketError, "Empty QWP response frame")); + } + let status = payload[0]; + match status { + QWP_STATUS_OK => { + if payload.len() < 1 + 8 + 2 { + return Err(error::fmt!(SocketError, "QWP OK response truncated")); + } + let sequence = u64::from_le_bytes(payload[1..9].try_into().unwrap()); + let tables = parse_table_entries(payload, 9, "QWP OK response")?; + Ok(QwpResponse::Ok { sequence, tables }) + } + QWP_STATUS_DURABLE_ACK => { + let tables = parse_table_entries(payload, 1, "QWP durable ACK response")?; + Ok(QwpResponse::DurableAck { tables }) + } + _ => { + let (sequence, message) = parse_error_body(payload)?; + Ok(QwpResponse::Error { + sequence, + status, + message, + }) + } + } +} + +fn parse_table_entries( + payload: &[u8], + table_count_offset: usize, + context: &'static str, +) -> Result> { + let table_count_end = table_count_offset + .checked_add(2) + .ok_or_else(|| error::fmt!(SocketError, "{} table count offset overflow", context))?; + if payload.len() < table_count_end { + return Err(error::fmt!(SocketError, "{} truncated", context)); + } + let table_count = u16::from_le_bytes( + payload[table_count_offset..table_count_end] + .try_into() + .unwrap(), + ) as usize; + let mut pos = table_count_end; + let mut entries = Vec::with_capacity(table_count); + for _ in 0..table_count { + let name_len_end = pos + .checked_add(2) + .ok_or_else(|| error::fmt!(SocketError, "{} table entry offset overflow", context))?; + if payload.len() < name_len_end { + return Err(error::fmt!( + SocketError, + "{} table entry truncated", + context + )); + } + let name_len = u16::from_le_bytes(payload[pos..name_len_end].try_into().unwrap()) as usize; + pos = name_len_end; + if name_len == 0 { + return Err(error::fmt!(SocketError, "{} table name is empty", context)); + } + let name_end = pos + .checked_add(name_len) + .ok_or_else(|| error::fmt!(SocketError, "{} table name length overflow", context))?; + let seq_txn_end = name_end + .checked_add(8) + .ok_or_else(|| error::fmt!(SocketError, "{} table entry length overflow", context))?; + if payload.len() < seq_txn_end { + return Err(error::fmt!( + SocketError, + "{} table entry truncated", + context + )); + } + let name = std::str::from_utf8(&payload[pos..name_end]) + .map_err(|_| error::fmt!(SocketError, "{} table name not UTF-8", context))? + .to_owned(); + let seq_txn = i64::from_le_bytes(payload[name_end..seq_txn_end].try_into().unwrap()); + entries.push((name, seq_txn)); + pos = seq_txn_end; + } + if pos != payload.len() { + return Err(error::fmt!( + SocketError, + "{} has trailing bytes after table entries", + context + )); + } + Ok(entries) +} + +fn parse_error_body(payload: &[u8]) -> Result<(u64, String)> { + if payload.len() < 1 + 8 + 2 { + return Err(error::fmt!(SocketError, "QWP error response truncated")); + } + let sequence = u64::from_le_bytes(payload[1..9].try_into().unwrap()); + let msg_len = u16::from_le_bytes(payload[9..11].try_into().unwrap()) as usize; + if msg_len > 1024 { + return Err(error::fmt!( + SocketError, + "QWP error response message too long (declared {} bytes, max 1024)", + msg_len + )); + } + let msg_end = 11usize + .checked_add(msg_len) + .ok_or_else(|| error::fmt!(SocketError, "QWP error response message length overflow"))?; + if payload.len() < msg_end { + return Err(error::fmt!( + SocketError, + "QWP error response truncated (declared {} bytes)", + msg_len + )); + } + if payload.len() != msg_end { + return Err(error::fmt!( + SocketError, + "QWP error response has trailing bytes after message" + )); + } + let message = std::str::from_utf8(&payload[11..msg_end]) + .map_err(|_| error::fmt!(SocketError, "QWP error message not UTF-8"))? + .to_owned(); + Ok((sequence, message)) +} + +fn map_error_status(status: u8, msg: &str) -> crate::Error { + match status { + QWP_STATUS_SCHEMA_MISMATCH => { + error::fmt!(InvalidApiCall, "QWP schema mismatch: {}", msg) + } + QWP_STATUS_PARSE_ERROR => error::fmt!(InvalidApiCall, "QWP parse error: {}", msg), + QWP_STATUS_INTERNAL_ERROR => error::fmt!(ServerFlushError, "QWP internal error: {}", msg), + QWP_STATUS_SECURITY_ERROR => error::fmt!(AuthError, "QWP security error: {}", msg), + QWP_STATUS_WRITE_ERROR => error::fmt!(ServerFlushError, "QWP write error: {}", msg), + _ => error::fmt!( + ServerFlushError, + "QWP unrecognised error status 0x{:02x}: {}", + status, + msg + ), + } +} + +/// On-wire byte count of the client-to-server WS header for a given +/// payload length (mask bit always set ⇒ +4 bytes for the mask key). +#[inline] +fn ws_header_len_for(payload_len: usize) -> usize { + if payload_len <= 125 { + 2 + 4 + } else if payload_len <= 0xFFFF { + 4 + 4 + } else { + 10 + 4 + } +} + +/// Write the RFC 6455 binary-frame client header into `out`. `out.len()` +/// must equal [`ws_header_len_for(payload_len)`]. +fn write_ws_header(out: &mut [u8], payload_len: usize, mask_key: [u8; 4]) { + const FIN_BIT: u8 = 0x80; + const BINARY_OPCODE: u8 = 0x2; + const MASK_BIT: u8 = 0x80; + out[0] = FIN_BIT | BINARY_OPCODE; + let len_bytes; + let mask_offset; + if payload_len <= 125 { + out[1] = MASK_BIT | (payload_len as u8); + mask_offset = 2; + len_bytes = 0; + } else if payload_len <= 0xFFFF { + out[1] = MASK_BIT | 126; + out[2..4].copy_from_slice(&(payload_len as u16).to_be_bytes()); + mask_offset = 4; + len_bytes = 2; + } else { + out[1] = MASK_BIT | 127; + out[2..10].copy_from_slice(&(payload_len as u64).to_be_bytes()); + mask_offset = 10; + len_bytes = 8; + } + let _ = len_bytes; + out[mask_offset..mask_offset + 4].copy_from_slice(&mask_key); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ws_header_len_matches_payload_length_class() { + assert_eq!(ws_header_len_for(0), 6); + assert_eq!(ws_header_len_for(125), 6); + assert_eq!(ws_header_len_for(126), 8); + assert_eq!(ws_header_len_for(0xFFFF), 8); + assert_eq!(ws_header_len_for(0x1_0000), 14); + assert_eq!(ws_header_len_for(1 << 24), 14); + } + + #[test] + fn write_ws_header_short_form() { + let mut buf = [0u8; 6]; + write_ws_header(&mut buf, 5, [0xDE, 0xAD, 0xBE, 0xEF]); + assert_eq!(buf[0], 0x82); // FIN=1, opcode=Binary + assert_eq!(buf[1], 0x80 | 5); // MASK=1, len=5 + assert_eq!(&buf[2..6], &[0xDE, 0xAD, 0xBE, 0xEF]); + } + + #[test] + fn write_ws_header_16bit_form() { + let mut buf = [0u8; 8]; + write_ws_header(&mut buf, 200, [1, 2, 3, 4]); + assert_eq!(buf[0], 0x82); + assert_eq!(buf[1], 0x80 | 126); + assert_eq!(u16::from_be_bytes([buf[2], buf[3]]), 200); + assert_eq!(&buf[4..8], &[1, 2, 3, 4]); + } + + #[test] + fn write_ws_header_64bit_form() { + let mut buf = [0u8; 14]; + write_ws_header(&mut buf, 0x1_0000, [9, 8, 7, 6]); + assert_eq!(buf[0], 0x82); + assert_eq!(buf[1], 0x80 | 127); + assert_eq!( + u64::from_be_bytes([ + buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9] + ]), + 0x1_0000 + ); + assert_eq!(&buf[10..14], &[9, 8, 7, 6]); + } + + #[test] + fn parse_qwp_ok_with_one_table() { + // status=OK, sequence=42, table_count=1, name_len=2, "tx", seq_txn=7 + let mut payload = vec![0u8]; + payload.extend_from_slice(&42u64.to_le_bytes()); + payload.extend_from_slice(&1u16.to_le_bytes()); + payload.extend_from_slice(&2u16.to_le_bytes()); + payload.extend_from_slice(b"tx"); + payload.extend_from_slice(&7i64.to_le_bytes()); + let response = parse_qwp_response(&payload).unwrap(); + match response { + QwpResponse::Ok { sequence, tables } => { + assert_eq!(sequence, 42); + assert_eq!(tables, vec![("tx".to_owned(), 7)]); + } + other => panic!("expected Ok, got {other:?}"), + } + } + + #[test] + fn parse_qwp_durable_ack_empty() { + // status=DurableAck, table_count=0 + let mut payload = vec![QWP_STATUS_DURABLE_ACK]; + payload.extend_from_slice(&0u16.to_le_bytes()); + let response = parse_qwp_response(&payload).unwrap(); + match response { + QwpResponse::DurableAck { tables } => { + assert!(tables.is_empty()); + } + other => panic!("expected DurableAck, got {other:?}"), + } + } + + #[test] + fn parse_qwp_error_truncated_rejected() { + // status=PARSE_ERROR but only the status byte present + let err = parse_qwp_response(&[QWP_STATUS_PARSE_ERROR]).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::SocketError); + } +} diff --git a/questdb-rs/src/ingress/column_sender/db.rs b/questdb-rs/src/ingress/column_sender/db.rs index 9ac34280..bdb1117f 100644 --- a/questdb-rs/src/ingress/column_sender/db.rs +++ b/questdb-rs/src/ingress/column_sender/db.rs @@ -45,10 +45,10 @@ use std::sync::{Arc, Condvar, Mutex}; use std::thread::{self, JoinHandle}; use std::time::{Duration, Instant}; -use crate::ingress::{Sender, SenderBuilder}; use crate::{Result, error}; use super::conf::{self, PoolReap}; +use super::conn::ColumnConn; use super::sender::ColumnSender; /// Lower bound on the reaper's wake interval. @@ -75,9 +75,6 @@ struct DbInner { pool_size: usize, pool_max: usize, pool_idle_timeout: Duration, - /// Latched from the connect string. Required for `AckLevel::Durable` - /// flushes; without it, a `Durable` flush returns `InvalidApiCall`. - durable_ack_opt_in: bool, state: Mutex, /// Wakes the reaper thread on `shutdown` and lets a future blocking /// borrow wait for a free slot once we grow `borrow_sender` past @@ -101,7 +98,7 @@ impl PoolState { } struct PoolEntry { - sender: Sender, + conn: ColumnConn, /// Connection-scoped schema interner. Travels with the slot so its /// `(signature → id)` map stays coherent across borrow/return cycles; /// both client and server build the same map by first-emit order, so @@ -138,7 +135,7 @@ impl QuestDb { let mut free = Vec::with_capacity(pool_cfg.pool_size); let now = Instant::now(); for slot in 0..pool_cfg.pool_size { - let sender = build_sender(conf).map_err(|err| { + let conn = ColumnConn::connect(conf).map_err(|err| { crate::Error::new( err.code(), format!( @@ -150,7 +147,7 @@ impl QuestDb { ) })?; free.push(PoolEntry { - sender, + conn, schema_registry: super::encoder::SchemaRegistry::new(), symbol_dict: crate::ingress::buffer::SymbolGlobalDict::new(), last_idle_at: now, @@ -162,7 +159,6 @@ impl QuestDb { pool_size: pool_cfg.pool_size, pool_max: pool_cfg.pool_max, pool_idle_timeout: pool_cfg.pool_idle_timeout, - durable_ack_opt_in: parsed.durable_ack_opt_in, state: Mutex::new(PoolState { free, in_use: 0 }), cv: Condvar::new(), shutdown: AtomicBool::new(false), @@ -210,10 +206,9 @@ impl QuestDb { state.in_use += 1; drop(state); return Ok(ColumnSender::new( - entry.sender, + entry.conn, entry.schema_registry, entry.symbol_dict, - self.inner.durable_ack_opt_in, )); } @@ -232,8 +227,8 @@ impl QuestDb { state.in_use += 1; drop(state); - let sender = match build_sender(&self.inner.conf) { - Ok(sender) => sender, + let conn = match ColumnConn::connect(&self.inner.conf) { + Ok(c) => c, Err(err) => { let mut state = self.inner.state.lock().expect("pool mutex poisoned"); state.in_use -= 1; @@ -242,10 +237,9 @@ impl QuestDb { }; Ok(ColumnSender::new( - sender, + conn, super::encoder::SchemaRegistry::new(), crate::ingress::buffer::SymbolGlobalDict::new(), - self.inner.durable_ack_opt_in, )) } @@ -429,22 +423,18 @@ fn return_to_pool(inner: &Arc, sender: ColumnSender) { state.in_use -= 1; if !must_close { state.free.push(PoolEntry { - sender: sender.sender, + conn: sender.conn, schema_registry: sender.schema_registry, symbol_dict: sender.symbol_dict, last_idle_at: Instant::now(), }); } - // Dropped `sender` (when `must_close`) falls out of scope here, after + // When `must_close`, the contained connection is dropped here, after // the count was decremented but with the mutex still held — safe - // since `Sender::drop` does not re-enter the pool. + // since `ColumnConn::drop` does not re-enter the pool. drop(state); } -fn build_sender(conf: &str) -> Result { - SenderBuilder::from_conf(conf)?.build() -} - fn spawn_reaper(inner: Arc) -> JoinHandle<()> { let tick = reaper_tick(inner.pool_idle_timeout); thread::Builder::new() @@ -481,10 +471,10 @@ fn reaper_loop(inner: Arc, tick: Duration) { } fn reap_idle_inner(inner: &DbInner) -> usize { - // Drop the to-be-closed senders OUTSIDE the lock so closing a connection + // Drop the to-be-closed connections OUTSIDE the lock so closing a connection // (which may take an unbounded amount of time) does not stall concurrent // borrows. - let to_drop: Vec = { + let to_drop: Vec = { let mut state = inner.state.lock().expect("pool mutex poisoned"); let mut to_drop = Vec::new(); let now = Instant::now(); @@ -500,7 +490,7 @@ fn reap_idle_inner(inner: &DbInner) -> usize { let idle_for = now.saturating_duration_since(state.free[i].last_idle_at); if idle_for > inner.pool_idle_timeout { let entry = state.free.remove(i); - to_drop.push(entry.sender); + to_drop.push(entry.conn); } else { i += 1; } diff --git a/questdb-rs/src/ingress/column_sender/encoder.rs b/questdb-rs/src/ingress/column_sender/encoder.rs index 290404a0..29ee9251 100644 --- a/questdb-rs/src/ingress/column_sender/encoder.rs +++ b/questdb-rs/src/ingress/column_sender/encoder.rs @@ -24,29 +24,34 @@ //! Column-sender QWP/WebSocket frame encoder. //! -//! Emits a single-table frame (one chunk = one table = one frame). Most -//! column payloads are already in wire shape inside the chunk (see -//! `chunk.rs`); symbol columns resolve to wire bytes here because their -//! global-id assignment is connection-scoped and chunks are -//! sender-agnostic until flushed. +//! Writes the QWP frame body for a `Chunk` directly into the connection's +//! reusable outbound buffer — no allocation per flush, no per-column +//! aggregation copy. The no-null hot path for fixed-width columns is a +//! single `extend_from_slice` (memcpy) straight from the caller's buffer. +//! +//! See `doc/COLUMN_SENDER_PLAN.md` for the design rationale. use std::collections::HashMap; +use std::slice; use crate::ingress::buffer::SymbolGlobalDict; use crate::{Result, error}; -use super::chunk::{Chunk, ChunkColumn}; +use super::chunk::{ + Chunk, ColumnDescriptor, ColumnKind, DesignatedTsDescriptor, SymbolCodesPtr, ValidityDescriptor, +}; use super::wire::{ - MAX_NAME_LEN, QWP_FLAG_DELTA_SYMBOL_DICT, QWP_HEADER_LEN, QWP_MAGIC, QWP_SCHEMA_MODE_FULL, + F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, MAX_NAME_LEN, + QWP_FLAG_DELTA_SYMBOL_DICT, QWP_HEADER_LEN, QWP_MAGIC, QWP_SCHEMA_MODE_FULL, QWP_SCHEMA_MODE_REFERENCE, QWP_VERSION_1, validate_name, write_qwp_bytes, write_qwp_varint, }; /// Connection-scoped table-schema interner. /// /// Each unique signature gets a sequentially-assigned `u64` id. The first -/// emit for a signature uses `QWP_SCHEMA_MODE_FULL`; subsequent emits -/// reuse the id under `QWP_SCHEMA_MODE_REFERENCE`. Both sides of the wire -/// build the same id-by-first-emit mapping; on reconnect both sides reset. +/// emit uses `QWP_SCHEMA_MODE_FULL`; subsequent emits reuse the id under +/// `QWP_SCHEMA_MODE_REFERENCE`. Both sides of the wire build the same id +/// mapping by first-emit order; on reconnect both sides reset. #[derive(Debug, Default)] pub(crate) struct SchemaRegistry { by_signature: HashMap, u64>, @@ -74,17 +79,19 @@ impl SchemaRegistry { } } -/// Encode `chunk` into a QWP/WebSocket frame. -/// -/// Returns the frame bytes ready to hand to -/// [`crate::ingress::Sender::qwp_ws_publish_raw`]. -pub(crate) fn encode_chunk( - chunk: &Chunk, +/// Encode `chunk` into `out` as a complete QWP/WebSocket frame body. The +/// caller has already reserved any prefix bytes it needs in `out` (the +/// connection layer reserves the WS header); the encoder appends QWP +/// bytes only. +pub(crate) fn encode_chunk_into( + out: &mut Vec, + chunk: &Chunk<'_>, schema_registry: &mut SchemaRegistry, - global_dict: &mut SymbolGlobalDict, -) -> Result> { + symbol_dict: &mut SymbolGlobalDict, +) -> Result<()> { if chunk.is_empty() { - return Ok(encode_header_only_frame()); + emit_header_only_frame(out); + return Ok(()); } if chunk.designated_ts.is_none() { return Err(error::fmt!( @@ -117,226 +124,602 @@ pub(crate) fn encode_chunk( .as_ref() .expect("guarded by is_none() check above"); - // Pass 1: resolve symbol columns against the connection-scoped global - // dict so we know the delta-dict prefix BEFORE writing the table - // block. We snapshot the dict's pre-encode size for the rollback - // path below — if anything fails after we touched the dict, the - // server has not yet seen those entries, so dropping them locally - // keeps both sides in sync. - let dict_mark = global_dict.mark(); - let resolution = match resolve_symbols(chunk, global_dict) { + // --- Pass 1: resolve symbol columns against the connection-scoped + // global dict. We snapshot the dict so we can roll back if encoding + // later fails — symbol entries that never hit the wire must not be + // remembered. --- + let dict_mark = symbol_dict.mark(); + let resolution = match resolve_symbols(chunk, symbol_dict) { Ok(r) => r, Err(e) => { - global_dict.rollback(dict_mark); + symbol_dict.rollback(dict_mark); return Err(e); } }; - // Build the schema signature (registry key + FULL-emit payload). + // --- Schema signature --- let column_count = chunk.columns.len() + 1; // +1 for designated timestamp let mut signature = Vec::with_capacity(column_count * 8); for col in &chunk.columns { - signature.extend_from_slice(col.signature()); + write_qwp_bytes(&mut signature, col.name.as_bytes()); + signature.push(col.wire_type); } - write_qwp_bytes(&mut signature, &[]); + write_qwp_bytes(&mut signature, &[]); // designated_ts has empty name signature.push(designated.wire_type); let (schema_id, is_new_schema) = schema_registry.intern(&signature); - // Pre-allocate the full frame. - let symbol_payload_estimate = resolution - .per_column_payload - .iter() - .filter_map(|p| p.as_ref().map(|v| v.len())) - .sum::(); - let resolved_payload_estimate = chunk - .columns - .iter() - .filter_map(|c| match c { - ChunkColumn::Resolved { payload, .. } => Some(payload.len()), - ChunkColumn::Symbol { .. } => None, - }) - .sum::(); - let payload_estimate = 1 + 10 // dict prefix base (delta_start + count varints) - + resolution.delta_symbol_bytes_estimate - + 1 + table_bytes.len() - + 10 - + 1 + 10 + signature.len() - + resolved_payload_estimate - + symbol_payload_estimate - + designated.payload.len(); - let mut frame = Vec::with_capacity(QWP_HEADER_LEN + payload_estimate); - - write_header_placeholder(&mut frame, /* table_count = */ 1); - let payload_start = frame.len(); - - // Delta-symbol-dict prefix. - write_qwp_varint(&mut frame, resolution.delta_start); - write_qwp_varint(&mut frame, resolution.new_symbols.len() as u64); + // --- Reserve total expected frame size up front. Avoids the + // geometric-growth memcpy pattern when the column data is large. --- + let estimated = estimate_frame_size(chunk, row_count, &signature, &resolution); + out.reserve(estimated); + + // --- Reserve frame header placeholder --- + let frame_start = out.len(); + write_header_placeholder(out, /* table_count = */ 1); + let payload_start = out.len(); + + // --- Delta-symbol-dict prefix --- + write_qwp_varint(out, resolution.delta_start); + write_qwp_varint(out, resolution.new_symbols.len() as u64); for bytes in &resolution.new_symbols { - write_qwp_bytes(&mut frame, bytes); + write_qwp_bytes(out, bytes); } - // Table block header. - write_qwp_bytes(&mut frame, table_bytes); - write_qwp_varint(&mut frame, row_count as u64); - write_qwp_varint(&mut frame, column_count as u64); + // --- Table block header --- + write_qwp_bytes(out, table_bytes); + write_qwp_varint(out, row_count as u64); + write_qwp_varint(out, column_count as u64); - // Schema section. + // --- Schema section --- if is_new_schema { - frame.push(QWP_SCHEMA_MODE_FULL); - write_qwp_varint(&mut frame, schema_id); - frame.extend_from_slice(&signature); + out.push(QWP_SCHEMA_MODE_FULL); + write_qwp_varint(out, schema_id); + out.extend_from_slice(&signature); } else { - frame.push(QWP_SCHEMA_MODE_REFERENCE); - write_qwp_varint(&mut frame, schema_id); + out.push(QWP_SCHEMA_MODE_REFERENCE); + write_qwp_varint(out, schema_id); } - // Column payloads. + // --- Column payloads --- for (col_idx, col) in chunk.columns.iter().enumerate() { - match col { - ChunkColumn::Resolved { payload, .. } => { - frame.extend_from_slice(payload); - } - ChunkColumn::Symbol { .. } => { - let payload = resolution.per_column_payload[col_idx] - .as_ref() - .expect("symbol payload must have been resolved"); - frame.extend_from_slice(payload); - } + // SAFETY: caller buffers are required by Chunk's `'a` (or the + // FFI's documented contract) to outlive this call. + unsafe { + encode_column(out, col, row_count, col_idx, &resolution)?; } } - frame.extend_from_slice(&designated.payload); - let payload_len = (frame.len() - payload_start) as u32; - frame[8..12].copy_from_slice(&payload_len.to_le_bytes()); - Ok(frame) + // --- Designated timestamp --- + encode_designated_ts(out, designated, row_count); + + // --- Patch payload_len --- + let payload_len = (out.len() - payload_start) as u32; + let header = &mut out[frame_start..payload_start]; + header[8..12].copy_from_slice(&payload_len.to_le_bytes()); + + Ok(()) +} + +/// Conservative byte estimate of the encoded QWP frame body. Used to +/// `reserve()` write_buf in one shot before the encode loop — avoids +/// the geometric-growth memcpy pattern when total payload runs into +/// MBs. Walks descriptors once, no actual data reads. +fn estimate_frame_size( + chunk: &Chunk<'_>, + row_count: usize, + signature: &[u8], + resolution: &SymbolResolution, +) -> usize { + let mut total = QWP_HEADER_LEN; + // delta-symbol-dict prefix + total += 10 + 10; // delta_start + new_symbols_count varints + for s in &resolution.new_symbols { + total += 10 + s.len(); + } + // table block header + schema section + total += 10 + chunk.table.len() + 10 + 10; // table name + row + col count varints + total += 1 + 10 + signature.len(); // schema mode + id varint + signature (full case) + + let bitmap_bytes = row_count.div_ceil(8); + for col in &chunk.columns { + let null_overhead = 1 + if col.validity.is_some() { + bitmap_bytes + } else { + 0 + }; + let payload_size = match col.kind { + ColumnKind::Byte { .. } => row_count, + ColumnKind::Short { .. } => 2 * row_count, + ColumnKind::Int { .. } | ColumnKind::Float { .. } | ColumnKind::Ipv4 { .. } => { + 4 * row_count + } + ColumnKind::Long { .. } + | ColumnKind::Double { .. } + | ColumnKind::TsNanos { .. } + | ColumnKind::TsMicros { .. } + | ColumnKind::DateMillis { .. } => 8 * row_count, + ColumnKind::Bool { .. } => bitmap_bytes, + ColumnKind::Uuid { .. } => 16 * row_count, + ColumnKind::Long256 { .. } => 32 * row_count, + ColumnKind::Varchar { bytes_len, .. } => 4 * (row_count + 1) + bytes_len, + ColumnKind::Symbol { .. } => 5 * row_count, // varint upper bound + }; + total += null_overhead + payload_size; + } + // designated timestamp + total += 1 + 8 * row_count; + total +} + +fn emit_header_only_frame(out: &mut Vec) { + let frame_start = out.len(); + write_header_placeholder(out, 0); + let payload_start = out.len(); + write_qwp_varint(out, 0); // delta_start + write_qwp_varint(out, 0); // new_symbols_count + let payload_len = (out.len() - payload_start) as u32; + out[frame_start + 8..frame_start + 12].copy_from_slice(&payload_len.to_le_bytes()); } +fn write_header_placeholder(out: &mut Vec, table_count: u16) { + let start = out.len(); + out.extend_from_slice(&QWP_MAGIC); + out.push(QWP_VERSION_1); + out.push(QWP_FLAG_DELTA_SYMBOL_DICT); + out.extend_from_slice(&table_count.to_le_bytes()); + out.extend_from_slice(&0u32.to_le_bytes()); // payload_len placeholder + debug_assert_eq!(out.len() - start, QWP_HEADER_LEN); +} + +// =========================================================================== +// Symbol resolution (pre-pass) +// =========================================================================== + struct SymbolResolution { - /// Pre-existing global dict size at encode start; the delta-dict - /// prefix tells the server "ids `delta_start..delta_start + - /// new_symbols.len()` are these new entries". delta_start: u64, - /// New entries, in the order their ids were assigned. new_symbols: Vec>, - /// Conservative byte estimate for the delta-dict prefix. - delta_symbol_bytes_estimate: usize, - /// One per column slot; `Some` for symbol columns (wire-shape bytes - /// for that column), `None` for resolved columns. - per_column_payload: Vec>>, + /// One entry per column slot. `Some` for symbol columns; carries the + /// per-row internal-index→global-id map keyed by the dict slot the + /// row references. + per_column: Vec>, } -fn resolve_symbols(chunk: &Chunk, global_dict: &mut SymbolGlobalDict) -> Result { - let delta_start = global_dict_len(global_dict); +struct ResolvedSymbolColumn { + /// Indexed by dict slot. `u64::MAX` for slots the column never + /// references (we only intern referenced slots). + local_to_global: Vec, + non_null_count: usize, +} + +fn resolve_symbols( + chunk: &Chunk<'_>, + symbol_dict: &mut SymbolGlobalDict, +) -> Result { + let delta_start = symbol_dict.next_id(); let mut new_symbols: Vec> = Vec::new(); - let mut delta_symbol_bytes_estimate: usize = 0; - let mut per_column_payload: Vec>> = Vec::with_capacity(chunk.columns.len()); + let mut per_column: Vec> = Vec::with_capacity(chunk.columns.len()); + let row_count = chunk.row_count(); for col in &chunk.columns { - match col { - ChunkColumn::Resolved { .. } => per_column_payload.push(None), - ChunkColumn::Symbol { + let ColumnKind::Symbol { + codes, + dict_offsets, + dict_offsets_len, + dict_bytes, + dict_bytes_len, + } = col.kind + else { + per_column.push(None); + continue; + }; + let dict_len = dict_offsets_len - 1; + // SAFETY: pointers were validated to be in-bounds at append time. + let offsets = unsafe { slice::from_raw_parts(dict_offsets, dict_offsets_len) }; + let dict_bytes_slice = unsafe { slice::from_raw_parts(dict_bytes, dict_bytes_len) }; + // Pass 1: mark referenced dict slots + count non-null rows. + let mut referenced = vec![false; dict_len]; + let mut non_null_count = 0usize; + for i in 0..row_count { + if !is_valid_row(col.validity.as_ref(), i) { + continue; + } + // SAFETY: codes ptr was validated to have row_count elements. + let slot = unsafe { codes.read_i64(i) } as usize; + referenced[slot] = true; + non_null_count += 1; + } + // Pass 2: intern referenced slots, build local_to_global. The + // encoder reads `codes` directly at emit time — no separate + // compact-codes pass / allocation needed (~400 KB saved on a + // 100k-row chunk). + let mut local_to_global = vec![u64::MAX; dict_len]; + for (slot, mark) in referenced.iter().enumerate() { + if !*mark { + continue; + } + let start = offsets[slot] as usize; + let end = offsets[slot + 1] as usize; + let entry_bytes = &dict_bytes_slice[start..end]; + let (gid, is_new) = symbol_dict.intern(entry_bytes); + if is_new { + new_symbols.push(entry_bytes.to_vec()); + } + local_to_global[slot] = gid; + } + per_column.push(Some(ResolvedSymbolColumn { + local_to_global, + non_null_count, + })); + } + Ok(SymbolResolution { + delta_start, + new_symbols, + per_column, + }) +} + +// =========================================================================== +// Column encoders +// =========================================================================== + +/// Encode column `col` into `out`. SAFETY: caller buffers referenced by +/// `col` must still be alive (see `Chunk` lifetime contract). +unsafe fn encode_column( + out: &mut Vec, + col: &ColumnDescriptor, + row_count: usize, + col_idx: usize, + resolution: &SymbolResolution, +) -> Result<()> { + let validity = col.validity.as_ref(); + match col.kind { + ColumnKind::Byte { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I8_NULL, |v| [v as u8]) + }, + ColumnKind::Short { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I16_NULL, i16::to_le_bytes) + }, + ColumnKind::Int { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I32_NULL, i32::to_le_bytes) + }, + ColumnKind::Long { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I64_NULL, i64::to_le_bytes) + }, + ColumnKind::Float { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, F32_NULL, f32::to_le_bytes) + }, + ColumnKind::Double { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, F64_NULL, f64::to_le_bytes) + }, + ColumnKind::Bool { bits } => unsafe { + encode_bool(out, bits, row_count, validity); + }, + ColumnKind::Ipv4 { data } => unsafe { + encode_bitmap_le::(out, data, row_count, validity, u32::to_le_bytes); + }, + ColumnKind::TsNanos { data } + | ColumnKind::TsMicros { data } + | ColumnKind::DateMillis { data } => unsafe { + encode_bitmap_le::(out, data, row_count, validity, i64::to_le_bytes); + }, + ColumnKind::Uuid { data } => unsafe { + encode_fixed_width_bitmap::<16>(out, data as *const u8, row_count, validity); + }, + ColumnKind::Long256 { data } => unsafe { + encode_fixed_width_bitmap::<32>(out, data as *const u8, row_count, validity); + }, + ColumnKind::Varchar { + offsets, + offsets_len, + bytes, + bytes_len, + } => unsafe { + encode_varchar( + out, + offsets, + offsets_len, + bytes, + bytes_len, row_count, - codes, - bitmap, - non_null_count, - referenced_symbols, - .. - } => { - // Map each referenced symbol's internal index → global id, - // remembering new ids so we can append them to the - // delta-dict prefix. - let mut internal_to_global = Vec::with_capacity(referenced_symbols.len()); - for bytes in referenced_symbols { - let (gid, is_new) = global_dict.intern(bytes); - if is_new { - delta_symbol_bytes_estimate += 5 + bytes.len(); - new_symbols.push(bytes.clone()); - } - internal_to_global.push(gid); - } + validity, + ); + }, + ColumnKind::Symbol { codes, .. } => { + let resolved = resolution.per_column[col_idx] + .as_ref() + .expect("symbol resolution missing for symbol column"); + unsafe { + encode_symbol(out, codes, resolved, row_count, validity); + } + } + } + Ok(()) +} - // Build the column's wire payload: null_flag + optional - // bitmap + dense varint global ids for non-null rows. - let mut payload = Vec::with_capacity( - 1 + bitmap.as_ref().map_or(0, |b| b.len()) + non_null_count * 4, - ); - match bitmap { - None => payload.push(0), - Some(bm) => { - payload.push(1); - payload.extend_from_slice(bm); - } +/// Sentinel-null path: no validity bitmap, single null_flag byte + dense +/// data. `T` is read directly from caller memory and converted to LE +/// bytes; nulls are sentinel-encoded with `null_value`. +unsafe fn encode_sentinel_le( + out: &mut Vec, + data: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + null_value: T, + to_le: impl Fn(T) -> [u8; N], +) where + T: Copy, +{ + out.push(0); // null_flag = 0x00 (sentinel encoding) + out.reserve(N * row_count); + match validity { + None => { + // Hot path: contiguous typed buffer → bulk memcpy via byte + // reinterpret. POD numerics, any byte pattern is sound. + let bytes = unsafe { slice::from_raw_parts(data as *const u8, row_count * N) }; + out.extend_from_slice(bytes); + } + Some(v) => { + for i in 0..row_count { + let value = if unsafe { v.is_valid(i) } { + unsafe { *data.add(i) } + } else { + null_value + }; + out.extend_from_slice(&to_le(value)); + } + } + } +} + +/// Bitmap-style fixed-width path: null_flag + optional QWP bitmap + +/// dense values for non-null rows only. +unsafe fn encode_bitmap_le( + out: &mut Vec, + data: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + to_le: impl Fn(T) -> [u8; N], +) where + T: Copy, +{ + match validity { + None => { + out.push(0); + out.reserve(N * row_count); + let bytes = unsafe { slice::from_raw_parts(data as *const u8, row_count * N) }; + out.extend_from_slice(bytes); + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + out.reserve(N * v.non_null_count); + for i in 0..row_count { + if unsafe { v.is_valid(i) } { + let value = unsafe { *data.add(i) }; + out.extend_from_slice(&to_le(value)); } - for (i, &internal) in codes.iter().enumerate() { - let valid = bitmap.as_ref().is_none_or(|bm| qwp_bit_is_valid(bm, i)); - if !valid { - continue; - } - debug_assert!( - internal != u32::MAX, - "valid symbol row at index {i} had sentinel code" - ); - let gid = internal_to_global[internal as usize]; - write_qwp_varint(&mut payload, gid); + } + } + } +} + +/// Bitmap-style fixed-width binary column (UUID, LONG256). `data` +/// points at row 0 of an `[u8; N]` block. +unsafe fn encode_fixed_width_bitmap( + out: &mut Vec, + data: *const u8, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + match validity { + None => { + out.push(0); + out.reserve(N * row_count); + let bytes = unsafe { slice::from_raw_parts(data, N * row_count) }; + out.extend_from_slice(bytes); + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + out.reserve(N * v.non_null_count); + for i in 0..row_count { + if unsafe { v.is_valid(i) } { + let row_start = unsafe { data.add(i * N) }; + let row = unsafe { slice::from_raw_parts(row_start, N) }; + out.extend_from_slice(row); } - // Sanity-check: we wrote exactly `non_null_count` ids. - debug_assert_eq!( - *non_null_count, - count_non_null(*row_count, bitmap.as_deref()) - ); - per_column_payload.push(Some(payload)); } } } +} - Ok(SymbolResolution { - delta_start, - new_symbols, - delta_symbol_bytes_estimate, - per_column_payload, - }) +unsafe fn encode_bool( + out: &mut Vec, + bits: *const u8, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + out.push(0); // bool always sentinel-encoded + let mut packed = 0u8; + let mut bit_idx = 0u8; + for i in 0..row_count { + let byte_idx = i / 8; + let bit_off = i % 8; + let bit = (unsafe { *bits.add(byte_idx) } >> bit_off) & 1; + let valid = validity.is_none_or(|v| unsafe { v.is_valid(i) }); + if bit == 1 && valid { + packed |= 1u8 << bit_idx; + } + bit_idx += 1; + if bit_idx == 8 { + out.push(packed); + packed = 0; + bit_idx = 0; + } + } + if bit_idx != 0 { + out.push(packed); + } } -fn write_header_placeholder(frame: &mut Vec, table_count: u16) { - frame.extend_from_slice(&QWP_MAGIC); - frame.push(QWP_VERSION_1); - frame.push(QWP_FLAG_DELTA_SYMBOL_DICT); - frame.extend_from_slice(&table_count.to_le_bytes()); - frame.extend_from_slice(&0u32.to_le_bytes()); // payload_len, patched after +unsafe fn encode_varchar( + out: &mut Vec, + offsets: *const i32, + offsets_len: usize, + bytes: *const u8, + bytes_len: usize, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + let offsets_slice = unsafe { slice::from_raw_parts(offsets, offsets_len) }; + let bytes_slice = unsafe { slice::from_raw_parts(bytes, bytes_len) }; + + match validity { + None => { + out.push(0); // null_flag + out.reserve(4 * (row_count + 1) + bytes_len); + let base = offsets_slice[0]; + if base == 0 { + // Hot path: offset table is bit-identical to LE u32 for + // non-negative i32; memcpy both halves. + let offset_bytes = unsafe { + slice::from_raw_parts( + offsets as *const u8, + offsets_len * std::mem::size_of::(), + ) + }; + out.extend_from_slice(offset_bytes); + let used = offsets_slice[row_count] as usize; + out.extend_from_slice(&bytes_slice[..used]); + } else { + for &off in offsets_slice { + let normalized = (off - base) as u32; + out.extend_from_slice(&normalized.to_le_bytes()); + } + let start = base as usize; + let end = offsets_slice[row_count] as usize; + out.extend_from_slice(&bytes_slice[start..end]); + } + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + let non_null = v.non_null_count; + let offsets_start = out.len(); + out.resize(offsets_start + 4 * (non_null + 1), 0); + out[offsets_start..offsets_start + 4].copy_from_slice(&0u32.to_le_bytes()); + let mut cumulative: u32 = 0; + let mut next_offset_idx = 1usize; + let bytes_anchor = out.len(); + for i in 0..row_count { + if !unsafe { v.is_valid(i) } { + continue; + } + let start = offsets_slice[i] as usize; + let end = offsets_slice[i + 1] as usize; + let len = end - start; + out.extend_from_slice(&bytes_slice[start..end]); + cumulative = cumulative.saturating_add(len as u32); + let off = offsets_start + 4 * next_offset_idx; + out[off..off + 4].copy_from_slice(&cumulative.to_le_bytes()); + next_offset_idx += 1; + } + debug_assert_eq!(next_offset_idx - 1, non_null); + debug_assert_eq!(out.len() - bytes_anchor, cumulative as usize); + } + } } -fn encode_header_only_frame() -> Vec { - let mut frame = Vec::with_capacity(QWP_HEADER_LEN + 2); - write_header_placeholder(&mut frame, 0); - let payload_start = frame.len(); - write_qwp_varint(&mut frame, 0); // delta_start - write_qwp_varint(&mut frame, 0); // new_symbols_count - let payload_len = (frame.len() - payload_start) as u32; - frame[8..12].copy_from_slice(&payload_len.to_le_bytes()); - frame +unsafe fn encode_symbol( + out: &mut Vec, + codes: SymbolCodesPtr, + resolved: &ResolvedSymbolColumn, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + match validity { + None => out.push(0), + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + } + } + out.reserve(resolved.non_null_count * 4); + // Specialise on the code's bit width so the per-row loop is a + // straight read + table lookup + varint write (~1 ns/row). The + // dispatch overhead is amortised across the whole column. + match codes { + SymbolCodesPtr::I8(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + SymbolCodesPtr::I16(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + SymbolCodesPtr::I32(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + } } -/// Inspect the QWP-shape bitmap (bit = 1 means NULL): return `true` iff -/// row `i` is valid. -#[inline] -fn qwp_bit_is_valid(bitmap: &[u8], i: usize) -> bool { - (bitmap[i / 8] >> (i % 8)) & 1 == 0 +unsafe fn emit_symbol_rows( + out: &mut Vec, + codes: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + local_to_global: &[u64], +) where + T: Copy + Into, +{ + for i in 0..row_count { + let valid = validity.is_none_or(|v| unsafe { v.is_valid(i) }); + if !valid { + continue; + } + let slot = unsafe { (*codes.add(i)).into() } as usize; + let gid = local_to_global[slot]; + debug_assert_ne!(gid, u64::MAX, "referenced symbol slot has no global id"); + write_qwp_varint(out, gid); + } } -#[inline] -fn count_non_null(row_count: usize, bitmap: Option<&[u8]>) -> usize { - match bitmap { - None => row_count, - Some(bm) => (0..row_count).filter(|&i| qwp_bit_is_valid(bm, i)).count(), +fn encode_designated_ts(out: &mut Vec, ts: &DesignatedTsDescriptor, row_count: usize) { + out.push(0); // designated_ts is always non-null + out.reserve(8 * row_count); + // SAFETY: caller buffer lifetime is the chunk's `'a`. + let bytes = unsafe { + slice::from_raw_parts(ts.data as *const u8, row_count * std::mem::size_of::()) + }; + out.extend_from_slice(bytes); +} + +// =========================================================================== +// Helpers +// =========================================================================== + +/// Write `validity` as a QWP-shape (bit = 1 NULL) bitmap appended to +/// `out`. The high bits past `bit_len` in the last byte are masked. +unsafe fn write_qwp_bitmap_from_validity(out: &mut Vec, v: &ValidityDescriptor) { + let full_bytes = v.bit_len / 8; + let trailing_bits = v.bit_len % 8; + let src = unsafe { slice::from_raw_parts(v.bits, v.byte_len()) }; + for &byte in &src[..full_bytes] { + out.push(!byte); + } + if trailing_bits != 0 { + let mask = (1u8 << trailing_bits) - 1; + out.push((!src[full_bytes]) & mask); } } -/// Pre-encode size of the connection-scoped global dict — the -/// `delta_start` field of the QWP delta-symbol-dict prefix. -fn global_dict_len(global_dict: &SymbolGlobalDict) -> u64 { - global_dict.next_id() +#[inline] +fn is_valid_row(validity: Option<&ValidityDescriptor>, i: usize) -> bool { + match validity { + None => true, + // SAFETY: bit_len was checked == row_count at append time, so + // `i < row_count` ⇒ `i < bit_len`. + Some(v) => unsafe { v.is_valid(i) }, + } } #[cfg(test)] @@ -344,28 +727,39 @@ mod tests { use super::*; use crate::ingress::column_sender::Validity; - fn empty_chunk(table: &str) -> Chunk { - Chunk::new(table) + fn make_chunk_i64(name: &str, data: &[i64]) -> Vec { + let mut chunk = Chunk::new("trades"); + chunk.column_i64(name, data, None).unwrap(); + chunk.designated_timestamp_nanos(data).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + out } #[test] fn empty_chunk_encodes_to_14_bytes() { + let chunk = Chunk::new("trades"); + let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - let frame = encode_chunk(&empty_chunk("trades"), &mut reg, &mut dict).unwrap(); - assert_eq!(frame.len(), 14); - assert_eq!(&frame[0..4], b"QWP1"); - assert_eq!(frame[5], QWP_FLAG_DELTA_SYMBOL_DICT); - assert_eq!(u16::from_le_bytes([frame[6], frame[7]]), 0); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + assert_eq!(out.len(), 14); + assert_eq!(&out[0..4], b"QWP1"); + assert_eq!(out[5], QWP_FLAG_DELTA_SYMBOL_DICT); + assert_eq!(u16::from_le_bytes([out[6], out[7]]), 0); } #[test] fn non_empty_chunk_without_designated_ts_errors() { + let mut chunk = Chunk::new("trades"); + let data = [1i64, 2, 3]; + chunk.column_i64("a", &data, None).unwrap(); + let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - let mut chunk = Chunk::new("trades"); - chunk.column_i64("a", &[1, 2, 3], None).unwrap(); - let err = encode_chunk(&chunk, &mut reg, &mut dict).unwrap_err(); + let err = encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); assert!(err.msg().contains("designated")); } @@ -374,125 +768,120 @@ mod tests { fn second_encode_with_same_schema_uses_reference() { let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); + + let p1 = [1i64, 2]; let mut c1 = Chunk::new("trades"); - c1.column_i64("price", &[1, 2], None).unwrap(); - c1.designated_timestamp_nanos(&[10, 20]).unwrap(); - let frame1 = encode_chunk(&c1, &mut reg, &mut dict).unwrap(); + c1.column_i64("price", &p1, None).unwrap(); + c1.designated_timestamp_nanos(&p1).unwrap(); + let mut out1 = Vec::new(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict).unwrap(); + let p2 = [3i64, 4]; let mut c2 = Chunk::new("trades"); - c2.column_i64("price", &[3, 4], None).unwrap(); - c2.designated_timestamp_nanos(&[30, 40]).unwrap(); - let frame2 = encode_chunk(&c2, &mut reg, &mut dict).unwrap(); + c2.column_i64("price", &p2, None).unwrap(); + c2.designated_timestamp_nanos(&p2).unwrap(); + let mut out2 = Vec::new(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict).unwrap(); - assert!(frame2.len() < frame1.len()); + assert!(out2.len() < out1.len()); assert_eq!(reg.len(), 1, "schema signature interned once"); let schema_mode_offset = 12 + 1 + 1 + 1 + "trades".len() + 1 + 1; - assert_eq!(frame1[schema_mode_offset], QWP_SCHEMA_MODE_FULL); - assert_eq!(frame2[schema_mode_offset], QWP_SCHEMA_MODE_REFERENCE); + assert_eq!(out1[schema_mode_offset], QWP_SCHEMA_MODE_FULL); + assert_eq!(out2[schema_mode_offset], QWP_SCHEMA_MODE_REFERENCE); } #[test] fn distinct_schemas_get_distinct_ids() { let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); + let x = [1i64]; let mut a = Chunk::new("a"); - a.column_i64("x", &[1], None).unwrap(); - a.designated_timestamp_nanos(&[1]).unwrap(); - encode_chunk(&a, &mut reg, &mut dict).unwrap(); + a.column_i64("x", &x, None).unwrap(); + a.designated_timestamp_nanos(&x).unwrap(); + let mut oa = Vec::new(); + encode_chunk_into(&mut oa, &a, &mut reg, &mut dict).unwrap(); + let y = [1.0f64]; + let ts = [1i64]; let mut b = Chunk::new("b"); - b.column_f64("y", &[1.0], None).unwrap(); - b.designated_timestamp_nanos(&[1]).unwrap(); - encode_chunk(&b, &mut reg, &mut dict).unwrap(); + b.column_f64("y", &y, None).unwrap(); + b.designated_timestamp_nanos(&ts).unwrap(); + let mut ob = Vec::new(); + encode_chunk_into(&mut ob, &b, &mut reg, &mut dict).unwrap(); assert_eq!(reg.len(), 2); } #[test] fn frame_size_grows_with_column_payloads() { - let mut reg = SchemaRegistry::new(); - let mut dict = SymbolGlobalDict::new(); - let mut chunk = Chunk::new("trades"); + let p = [1i64, 2, 3, 4]; let bits = [0xFFu8]; let v = Validity::from_bitmap(&bits, 4).unwrap(); - chunk.column_i64("price", &[1, 2, 3, 4], Some(&v)).unwrap(); - chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); - let frame = encode_chunk(&chunk, &mut reg, &mut dict).unwrap(); - assert!(frame.len() > 32); + let mut chunk = Chunk::new("trades"); + chunk.column_i64("price", &p, Some(&v)).unwrap(); + chunk.designated_timestamp_nanos(&p).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + assert!(out.len() > 32); } #[test] fn symbol_dict_emits_only_referenced_entries() { - let mut reg = SchemaRegistry::new(); - let mut dict = SymbolGlobalDict::new(); - - let mut chunk = Chunk::new("trades"); - // Caller dict has 3 entries; rows only reference "alpha" and "gamma". + let codes = [0i32, 2, 0, 2]; + let dict_offsets = [0i32, 5, 9, 14]; let dict_bytes = b"alphabetagamma"; - let dict_offsets: [i32; 4] = [0, 5, 9, 14]; + let ts = [1i64, 2, 3, 4]; + let mut chunk = Chunk::new("trades"); chunk - .symbol_dict_i32( - "sym", - &[0, 2, 0, 2], // alpha, gamma, alpha, gamma - &dict_offsets, - dict_bytes, - None, - ) + .symbol_dict_i32("sym", &codes, &dict_offsets, dict_bytes, None) .unwrap(); - chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); - let _frame = encode_chunk(&chunk, &mut reg, &mut dict).unwrap(); - // Global dict should have grown by exactly 2 (alpha, gamma) — beta - // is never sent because no row references it. - assert_eq!(global_dict_len(&dict), 2); + chunk.designated_timestamp_nanos(&ts).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + assert_eq!(dict.next_id(), 2, "alpha + gamma only, beta unsent"); } #[test] fn symbol_dict_second_frame_resends_only_new_entries() { let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); + let dict_offsets = [0i32, 5, 9, 14]; let dict_bytes = b"alphabetagamma"; - let dict_offsets: [i32; 4] = [0, 5, 9, 14]; + let codes1 = [0i32, 1]; + let ts1 = [1i64, 2]; let mut c1 = Chunk::new("trades"); - c1.symbol_dict_i32("sym", &[0, 1], &dict_offsets, dict_bytes, None) + c1.symbol_dict_i32("sym", &codes1, &dict_offsets, dict_bytes, None) .unwrap(); - c1.designated_timestamp_nanos(&[1, 2]).unwrap(); - encode_chunk(&c1, &mut reg, &mut dict).unwrap(); - assert_eq!(global_dict_len(&dict), 2); // alpha, beta + c1.designated_timestamp_nanos(&ts1).unwrap(); + let mut out1 = Vec::new(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict).unwrap(); + assert_eq!(dict.next_id(), 2); + let codes2 = [0i32, 2]; + let ts2 = [3i64, 4]; let mut c2 = Chunk::new("trades"); - // alpha (cached) + gamma (new). - c2.symbol_dict_i32("sym", &[0, 2], &dict_offsets, dict_bytes, None) + c2.symbol_dict_i32("sym", &codes2, &dict_offsets, dict_bytes, None) .unwrap(); - c2.designated_timestamp_nanos(&[3, 4]).unwrap(); - encode_chunk(&c2, &mut reg, &mut dict).unwrap(); - assert_eq!(global_dict_len(&dict), 3, "gamma added on second frame"); + c2.designated_timestamp_nanos(&ts2).unwrap(); + let mut out2 = Vec::new(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict).unwrap(); + assert_eq!(dict.next_id(), 3, "gamma added on second frame"); } #[test] - fn symbol_dict_rejects_out_of_range_code() { - let mut chunk = Chunk::new("trades"); - let dict_bytes = b"alpha"; - let dict_offsets: [i32; 2] = [0, 5]; - let err = chunk - .symbol_dict_i32("sym", &[0, 99], &dict_offsets, dict_bytes, None) - .unwrap_err(); - assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); - assert!(err.msg().contains("out of range")); - } - - #[test] - fn symbol_dict_skips_null_codes() { - let mut chunk = Chunk::new("trades"); - let dict_bytes = b"alpha"; - let dict_offsets: [i32; 2] = [0, 5]; - // Code 99 is out of range, but row 1 is null so its code is not - // validated. - let bits = [0b0000_0001]; - let v = Validity::from_bitmap(&bits, 2).unwrap(); - chunk - .symbol_dict_i32("sym", &[0, 99], &dict_offsets, dict_bytes, Some(&v)) - .expect("null row's bogus code is ignored"); + fn i64_no_null_round_trip_wire_bytes() { + let bytes = make_chunk_i64("price", &[10, 20, 30]); + // Frame contains: header(12) + delta_dict(2) + table_block + schema + + // column data + designated_ts data. The exact byte layout is asserted + // implicitly via the other tests; here we just ensure the payload_len + // patched correctly. + let payload_len = u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize; + assert_eq!(12 + payload_len, bytes.len()); } } diff --git a/questdb-rs/src/ingress/column_sender/mod.rs b/questdb-rs/src/ingress/column_sender/mod.rs index b2e159fc..b1241ed3 100644 --- a/questdb-rs/src/ingress/column_sender/mod.rs +++ b/questdb-rs/src/ingress/column_sender/mod.rs @@ -41,6 +41,7 @@ mod chunk; mod conf; +mod conn; mod db; mod encoder; mod sender; @@ -64,7 +65,7 @@ pub mod _bench_internals { use crate::ingress::buffer::SymbolGlobalDict; use super::chunk::Chunk; - use super::encoder::{SchemaRegistry, encode_chunk}; + use super::encoder::{SchemaRegistry, encode_chunk_into}; /// Opaque holder for the connection-scoped state the encoder needs. /// Lets benches reuse the encoder across iterations without @@ -90,10 +91,19 @@ pub mod _bench_internals { } } - /// Encode `chunk` against `state`. Mirrors [`encode_chunk`] but - /// hides the internal-state types so the bench module never has to - /// touch them. - pub fn bench_encode_chunk(chunk: &Chunk, state: &mut BenchEncoderState) -> Result> { - encode_chunk(chunk, &mut state.schema_registry, &mut state.symbol_dict) + /// Encode `chunk` into `out`. Mirrors [`encode_chunk_into`] but hides + /// the internal-state types so the bench module never has to touch + /// them. + pub fn bench_encode_chunk_into( + out: &mut Vec, + chunk: &Chunk<'_>, + state: &mut BenchEncoderState, + ) -> Result<()> { + encode_chunk_into( + out, + chunk, + &mut state.schema_registry, + &mut state.symbol_dict, + ) } } diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs index 96010bb9..cbb7ecb0 100644 --- a/questdb-rs/src/ingress/column_sender/sender.rs +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -24,25 +24,22 @@ //! Borrowed-handle types for the column-major sender. //! -//! A [`ColumnSender`] is one borrowed pool slot. It owns the underlying -//! [`crate::ingress::Sender`], the connection-scoped [`SchemaRegistry`], -//! and the connection-scoped [`SymbolGlobalDict`]: all three travel back -//! into the pool together when the [`super::BorrowedSender`] is dropped. +//! A [`ColumnSender`] owns one pipelined QWP/WebSocket connection +//! ([`super::conn::ColumnConn`]), a connection-scoped +//! [`SchemaRegistry`](super::encoder::SchemaRegistry), and a +//! connection-scoped [`SymbolGlobalDict`]: all three travel back into the +//! pool together when the [`super::BorrowedSender`] is dropped. use std::fmt::{self, Debug, Formatter}; -use std::time::Duration; -use crate::ingress::Sender; +use crate::Result; use crate::ingress::buffer::SymbolGlobalDict; -use crate::{Result, error}; use super::chunk::Chunk; +use super::conn::ColumnConn; use super::encoder::{self, SchemaRegistry}; -/// Acknowledgement level a [`ColumnSender::flush`] call waits for. -/// -/// See `doc/COLUMN_SENDER_PLAN.md` §4 for the rationale and the QWP/WS spec -/// for the status-byte values. +/// Acknowledgement level for [`ColumnSender::sync`]. #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] pub enum AckLevel { /// Wait for the server's WAL-commit ACK (spec status `0x00`). Always @@ -50,45 +47,37 @@ pub enum AckLevel { #[default] Ok, /// Wait for the server's object-store durability ACK (spec status - /// `0x02`). Enterprise feature; requires `request_durable_ack=on` in the - /// connect string. Flush returns `InvalidApiCall` otherwise. + /// `0x02`). Enterprise feature; requires `request_durable_ack=on` in + /// the connect string. Durable, } -/// One [`crate::ingress::Sender`] in the pool, wrapped in the column-sender -/// type system. -/// -/// The user reaches this via [`super::BorrowedSender`]. +/// One [`ColumnConn`] in the pool, wrapped in the column-sender API. pub struct ColumnSender { - pub(crate) sender: Sender, + pub(crate) conn: ColumnConn, pub(crate) schema_registry: SchemaRegistry, pub(crate) symbol_dict: SymbolGlobalDict, - /// Latched from the connect string at [`super::QuestDb::connect`]; a - /// [`AckLevel::Durable`] flush is only honoured when this is `true`. - durable_ack_opt_in: bool, } impl Debug for ColumnSender { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { f.debug_struct("ColumnSender") - .field("sender", &self.sender) - .field("durable_ack_opt_in", &self.durable_ack_opt_in) + .field("must_close", &self.conn.must_close()) + .field("in_flight", &self.conn.in_flight()) .finish() } } impl ColumnSender { pub(crate) fn new( - sender: Sender, + conn: ColumnConn, schema_registry: SchemaRegistry, symbol_dict: SymbolGlobalDict, - durable_ack_opt_in: bool, ) -> Self { Self { - sender, + conn, schema_registry, symbol_dict, - durable_ack_opt_in, } } @@ -97,57 +86,52 @@ impl ColumnSender { /// are dropped rather than recycled. #[must_use] pub fn must_close(&self) -> bool { - self.sender.must_close() + self.conn.must_close() } - /// Encode `chunk` into a QWP/WebSocket frame, publish it, and block - /// until the server acknowledges at the requested [`AckLevel`]. + /// Encode `chunk` into a QWP/WebSocket frame, write it to the + /// socket, and return — **without** waiting for the server's ack. /// - /// On success, `chunk` is cleared (its retained capacity is preserved). - /// On failure, `chunk` is left untouched so the caller can inspect or - /// recover its contents before dropping it. + /// Ready acks are drained non-blocking before the write. If the + /// in-flight count has reached the protocol cap (128), this call + /// blocks until at least one ack frees a slot. /// - /// At most one frame is in flight per sender at a time — that is what - /// makes this call synchronous. For parallel ingest, borrow multiple - /// senders from the [`super::QuestDb`] pool, one per worker thread. + /// On success, `chunk` is cleared (its retained descriptor capacity + /// is preserved) and the caller's buffers are released. The ack + /// will arrive later; call [`sync`](Self::sync) when you need all + /// in-flight frames acknowledged. /// - /// `AckLevel::Durable` requires the pool to have been opened with - /// `request_durable_ack=on`; otherwise this returns `InvalidApiCall`. - pub fn flush(&mut self, chunk: &mut Chunk, ack_level: AckLevel) -> Result<()> { - if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { - return Err(error::fmt!( - InvalidApiCall, - "AckLevel::Durable requires the pool to be opened with \ - `request_durable_ack=on` in the connect string." - )); + /// On failure, the connection is latched as terminal and the error + /// is returned. `chunk` is left untouched. + pub fn flush(&mut self, chunk: &mut Chunk<'_>) -> Result<()> { + // Drain any ready acks to keep the pipeline moving and to + // surface server errors as early as possible. + self.conn.try_drain_acks()?; + + // If we've hit the cap, block until one slot frees up. + if self.conn.at_in_flight_cap() { + self.conn.drain_one_ack_blocking()?; } - let payload = - encoder::encode_chunk(chunk, &mut self.schema_registry, &mut self.symbol_dict)?; - let fsn = self.sender.qwp_ws_publish_raw(&payload)?; - self.await_ack(fsn)?; + let schema = &mut self.schema_registry; + let dict = &mut self.symbol_dict; + let published = self + .conn + .publish_qwp(|out| encoder::encode_chunk_into(out, chunk, schema, dict))?; + + self.conn.push_pending(published.fsn); chunk.clear(); Ok(()) } - /// Wait until the underlying connection's cumulative ack watermark - /// reaches `fsn`, or until the connection latches into `must_close`. - fn await_ack(&mut self, fsn: u64) -> Result<()> { - // Poll in 50 ms slices so a connection that latches into - // `must_close` mid-wait is surfaced promptly rather than blocking - // forever on the underlying ack watermark. - const POLL: Duration = Duration::from_millis(50); - loop { - if self.sender.await_acked_fsn(fsn, POLL)? { - return Ok(()); - } - if self.sender.must_close() { - return Err(error::fmt!( - SocketError, - "QWP/WebSocket connection entered a terminal state before \ - the published frame was acknowledged." - )); - } - } + /// Block until all in-flight frames are acknowledged at the + /// requested [`AckLevel`]. + /// + /// `AckLevel::Ok` waits for every in-flight frame's WAL-commit ack. + /// `AckLevel::Durable` additionally waits for the server's + /// object-store durability watermarks to reach every frame's + /// seq_txn (requires `request_durable_ack=on` at connect). + pub fn sync(&mut self, ack_level: AckLevel) -> Result<()> { + self.conn.sync_all_acks(ack_level) } } diff --git a/questdb-rs/src/ingress/column_sender/validity.rs b/questdb-rs/src/ingress/column_sender/validity.rs index 66036330..0bdcf124 100644 --- a/questdb-rs/src/ingress/column_sender/validity.rs +++ b/questdb-rs/src/ingress/column_sender/validity.rs @@ -86,22 +86,6 @@ impl<'a> Validity<'a> { } count } - - /// Write the QWP-shape null bitmap (bit = 1 means NULL) for this - /// validity into `out`. Always writes `ceil(bit_len / 8)` bytes; the - /// last byte's high bits past `bit_len` are masked to zero. - pub(crate) fn write_qwp_bitmap(&self, out: &mut Vec) { - let full_bytes = self.bit_len / 8; - let trailing_bits = self.bit_len % 8; - for &byte in &self.bits[..full_bytes] { - out.push(!byte); - } - if trailing_bits != 0 { - let mask = (1u8 << trailing_bits) - 1; - let inverted = !self.bits[full_bytes] & mask; - out.push(inverted); - } - } } /// Validate that a caller-supplied `data` length matches a chunk's locked @@ -149,20 +133,6 @@ mod tests { assert_eq!(v.non_null_count(), 4 + 1); } - #[test] - fn write_qwp_bitmap_inverts_arrow_semantics() { - // Arrow: bit=1 valid. QWP wire: bit=1 NULL. Trailing high bits of - // the last byte are masked to 0. - let bits = [0b1100_1100, 0b0000_0011]; - let v = Validity::from_bitmap(&bits, 12).unwrap(); - let mut out = Vec::new(); - v.write_qwp_bitmap(&mut out); - assert_eq!(out.len(), 2); - assert_eq!(out[0], !0b1100_1100); - // Last byte: invert and mask to 4 valid bits (rows 8..12). - assert_eq!(out[1], (!0b0000_0011) & 0b0000_1111); - } - #[test] fn from_bitmap_rejects_short_buffer() { let err = Validity::from_bitmap(&[0u8], 9).unwrap_err(); diff --git a/questdb-rs/src/ingress/sender.rs b/questdb-rs/src/ingress/sender.rs index 2bbb0102..e5a351c9 100644 --- a/questdb-rs/src/ingress/sender.rs +++ b/questdb-rs/src/ingress/sender.rs @@ -83,7 +83,7 @@ pub(crate) use qwp_ws_ownership::QwpWsRoleReject; pub use qwp_ws_ownership::*; #[cfg(feature = "sync-sender-qwp-ws")] -mod qwp_ws; +pub(crate) mod qwp_ws; #[cfg(feature = "sync-sender-qwp-ws")] pub(crate) use qwp_ws::*; @@ -835,52 +835,6 @@ impl Sender { } Ok(()) } - - /// Publish a pre-encoded QWP/WebSocket payload through this sender's - /// replay queue, returning the assigned frame sequence number (FSN). - /// - /// Caller-side escape hatch used by the column-major sender; the row-API - /// path stays on [`Sender::flush_and_get_fsn`]. The payload must already - /// be a valid QWP frame including its 12-byte header. Manual progress - /// mode and non-QWP/WS handlers are rejected with `InvalidApiCall`. - #[cfg(feature = "sync-sender-qwp-ws")] - pub(crate) fn qwp_ws_publish_raw(&mut self, payload: &[u8]) -> Result { - let SyncProtocolHandler::SyncQwpWs(_) = &self.handler else { - return Err(error::fmt!( - InvalidApiCall, - "qwp_ws_publish_raw is only supported for QWP/WebSocket senders \ - in background progress mode." - )); - }; - if let SyncProtocolHandler::SyncQwpWs(state) = &self.handler - && let Err(err) = qwp_ws_check_error_background(state) - { - let _ = self.drain_qwp_ws_error_notifications(); - return Err(err); - } - self.drain_qwp_ws_error_notifications()?; - - if payload.len() > self.max_buf_size { - return Err(qwp_ws_publisher::qwp_ws_encoded_message_size_error( - payload.len(), - self.max_buf_size, - )); - } - - let result = match &mut self.handler { - SyncProtocolHandler::SyncQwpWs(state) => { - qwp_ws_publish_replay_background(state, payload) - } - _ => unreachable!("guarded above"), - }; - if result - .as_ref() - .is_err_and(|err| matches!(err.code(), crate::ErrorCode::SocketError)) - { - self.connected = false; - } - result - } } #[cfg(feature = "sync-sender-qwp-ws")] diff --git a/questdb-rs/src/ingress/sender/qwp_ws.rs b/questdb-rs/src/ingress/sender/qwp_ws.rs index 10082fa1..8f272a68 100644 --- a/questdb-rs/src/ingress/sender/qwp_ws.rs +++ b/questdb-rs/src/ingress/sender/qwp_ws.rs @@ -2778,17 +2778,6 @@ pub(crate) fn flush_qwp_ws( }) } -/// Background-mode escape hatch used by the column-major sender: hand a -/// pre-encoded QWP/WebSocket frame to the replay queue and return its FSN. -/// Bypasses the row-API encoder; the caller is responsible for producing a -/// spec-conformant payload. -pub(crate) fn qwp_ws_publish_replay_background( - state: &mut SyncQwpWsHandlerState, - payload: &[u8], -) -> crate::Result { - state.runner.publish_replay_payload(payload) -} - pub(crate) fn flush_qwp_ws_manual( state: &mut ManualQwpWsHandlerState, buffer: &QwpWsColumnarBuffer, diff --git a/questdb-rs/src/tests/column_sender_pool.rs b/questdb-rs/src/tests/column_sender_pool.rs index d1346e54..07fc6c38 100644 --- a/questdb-rs/src/tests/column_sender_pool.rs +++ b/questdb-rs/src/tests/column_sender_pool.rs @@ -375,9 +375,8 @@ fn refuses_durable_ack_without_opt_in() { let server = MockServer::spawn(2); let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap(); let mut sender = db.borrow_sender().expect("borrow"); - let mut chunk = Chunk::new("trades"); let err = sender - .flush(&mut chunk, AckLevel::Durable) + .sync(AckLevel::Durable) .expect_err("durable without opt-in must fail"); assert_eq!(err.code(), ErrorCode::InvalidApiCall); assert!( @@ -394,8 +393,9 @@ fn empty_chunk_flush_round_trips() { let mut sender = db.borrow_sender().expect("borrow"); let mut chunk = Chunk::new("trades"); assert_eq!(chunk.row_count(), 0); + sender.flush(&mut chunk).unwrap(); sender - .flush(&mut chunk, AckLevel::Ok) + .sync(AckLevel::Ok) .expect("empty-chunk flush must round-trip"); // Flush clears the chunk. assert_eq!(chunk.row_count(), 0); @@ -408,9 +408,8 @@ fn flush_clears_chunk_for_reuse_and_can_repeat() { let mut sender = db.borrow_sender().expect("borrow"); let mut chunk = Chunk::new("trades"); for _ in 0..3 { - sender - .flush(&mut chunk, AckLevel::Ok) - .expect("repeated empty flush"); + sender.flush(&mut chunk).unwrap(); + sender.sync(AckLevel::Ok).expect("repeated empty flush"); } } @@ -424,7 +423,7 @@ fn flush_rejects_chunk_with_no_designated_timestamp() { .column_i64("price", &[1, 2, 3], None) .expect("column_i64"); let err = sender - .flush(&mut chunk, AckLevel::Ok) + .flush(&mut chunk) .expect_err("non-empty chunk without designated_ts must error"); assert_eq!(err.code(), ErrorCode::InvalidApiCall); assert!(err.msg().contains("designated"), "msg: {}", err.msg()); @@ -458,9 +457,8 @@ fn non_empty_chunk_with_numeric_columns_round_trips() { .unwrap(); assert_eq!(chunk.row_count(), 3); - sender - .flush(&mut chunk, AckLevel::Ok) - .expect("numeric chunk flush"); + sender.flush(&mut chunk).unwrap(); + sender.sync(AckLevel::Ok).expect("numeric chunk flush"); assert!(chunk.is_empty(), "flush must clear the chunk"); // Second flush with the SAME schema exercises the SchemaRegistry's @@ -473,8 +471,9 @@ fn non_empty_chunk_with_numeric_columns_round_trips() { chunk .designated_timestamp_nanos(&[1_700_000_000_000_003_000, 1_700_000_000_000_004_000]) .unwrap(); + sender.flush(&mut chunk).unwrap(); sender - .flush(&mut chunk, AckLevel::Ok) + .sync(AckLevel::Ok) .expect("second flush (schema reuse)"); } @@ -510,9 +509,8 @@ fn varchar_chunk_round_trips() { ]) .unwrap(); assert_eq!(chunk.row_count(), 4); - sender - .flush(&mut chunk, AckLevel::Ok) - .expect("varchar flush"); + sender.flush(&mut chunk).unwrap(); + sender.sync(AckLevel::Ok).expect("varchar flush"); assert!(chunk.is_empty()); } @@ -532,9 +530,8 @@ fn symbol_chunk_round_trips_and_reuses_global_dict() { .symbol_dict_i32("sym", &[0, 2, 0, 2], &dict_offsets, dict_bytes, None) .expect("symbol_dict_i32 first flush"); chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap(); - sender - .flush(&mut chunk, AckLevel::Ok) - .expect("symbol flush 1"); + sender.flush(&mut chunk).unwrap(); + sender.sync(AckLevel::Ok).expect("symbol flush 1"); // Second flush re-uses entry 0 ("alpha", already in the global dict) // and adds entry 1 ("beta"). With the connection-scoped dict the @@ -543,9 +540,8 @@ fn symbol_chunk_round_trips_and_reuses_global_dict() { .symbol_dict_i32("sym", &[1, 0, 1, 0], &dict_offsets, dict_bytes, None) .expect("symbol_dict_i32 second flush"); chunk.designated_timestamp_nanos(&[5, 6, 7, 8]).unwrap(); - sender - .flush(&mut chunk, AckLevel::Ok) - .expect("symbol flush 2"); + sender.flush(&mut chunk).unwrap(); + sender.sync(AckLevel::Ok).expect("symbol flush 2"); } #[test] From a9faea24976bb7c74e5c7dff2268e6dc25eab605 Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 23:06:20 +0100 Subject: [PATCH 5/9] perf(ingress): set SO_SNDBUF and SO_RCVBUF to 4 MiB on QWP/WS sockets The default macOS TCP send buffer (~128 KB) is smaller than a typical QWP chunk (1.5 MB at 25k rows). write_all blocks mid-frame while the kernel drains the small buffer. A 4 MiB send buffer lets the kernel accept a full chunk in one shot, reducing write_all stalls when the pipeline has multiple frames in flight. Also sets SO_RCVBUF to 4 MiB to absorb ack bursts from the server without backpressuring the server's send path. Co-Authored-By: Claude Opus 4.7 (1M context) --- questdb-rs/src/ingress/sender/qwp_ws.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/questdb-rs/src/ingress/sender/qwp_ws.rs b/questdb-rs/src/ingress/sender/qwp_ws.rs index 8f272a68..f077e746 100644 --- a/questdb-rs/src/ingress/sender/qwp_ws.rs +++ b/questdb-rs/src/ingress/sender/qwp_ws.rs @@ -2112,6 +2112,9 @@ fn connect_tcp_to_any_addr( tcp.set_nodelay(true).ok(); tcp.set_read_timeout(Some(request_timeout)).ok(); tcp.set_write_timeout(Some(request_timeout)).ok(); + let sock = socket2::SockRef::from(&tcp); + sock.set_send_buffer_size(4 * 1024 * 1024).ok(); + sock.set_recv_buffer_size(4 * 1024 * 1024).ok(); return Ok(tcp); } Err(io) => failures.push(format!("{addr}: {io}")), From 1725f8c67b9cc104bc40b5c3c6e58e786d8a9b66 Mon Sep 17 00:00:00 2001 From: bluestreak Date: Sun, 24 May 2026 23:10:22 +0100 Subject: [PATCH 6/9] chore: register qwp_ws_l1_quotes example in Cargo.toml Co-Authored-By: Claude Opus 4.7 (1M context) --- questdb-rs/Cargo.toml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/questdb-rs/Cargo.toml b/questdb-rs/Cargo.toml index 811bcd7a..07915254 100644 --- a/questdb-rs/Cargo.toml +++ b/questdb-rs/Cargo.toml @@ -255,6 +255,12 @@ required-features = ["sync-reader-ws"] name = "qwp_ws_unified_sfa_bench" required-features = ["sync-sender-qwp-ws"] +# Synthetic equities L1 quote feed → QuestDB via the column-major +# sender. End-to-end throughput sanity check against a real server. +[[example]] +name = "qwp_ws_l1_quotes" +required-features = ["sync-sender-qwp-ws"] + # Decoder microbenchmark anchoring the perf claims from commits # `8ec0a85` (zero-copy decode) and `1163d43` (tighter SYMBOL/VARCHAR # decode hot paths). Run with: From 820ac3935c731207664caf8b9160dd1edf5c2f05 Mon Sep 17 00:00:00 2001 From: bluestreak Date: Mon, 25 May 2026 01:13:00 +0100 Subject: [PATCH 7/9] feat(ingress): FLAG_DEFER_COMMIT for batched WAL commits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit flush() now sets FLAG_DEFER_COMMIT (0x01) on every QWP frame. The server appends rows to WAL writers without committing. sync() sends a commit-triggering empty frame (without the flag) that commits all accumulated rows in one WAL transaction, then drains acks. This eliminates per-chunk WAL fsync overhead: 200 chunks × 25k rows now produce 1 WAL commit instead of 200. The p95 per-chunk latency drops from ~23 ms to ~7 ms. Old servers that don't recognize the flag ignore it (reserved bit position) and commit per-message — graceful degradation per the spec. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/ingress/column_sender/encoder.rs | 41 ++++++++------- questdb-rs/src/ingress/column_sender/mod.rs | 1 + .../src/ingress/column_sender/sender.rs | 50 +++++++++++-------- questdb-rs/src/ingress/column_sender/wire.rs | 1 + 4 files changed, 55 insertions(+), 38 deletions(-) diff --git a/questdb-rs/src/ingress/column_sender/encoder.rs b/questdb-rs/src/ingress/column_sender/encoder.rs index 29ee9251..13b31415 100644 --- a/questdb-rs/src/ingress/column_sender/encoder.rs +++ b/questdb-rs/src/ingress/column_sender/encoder.rs @@ -41,7 +41,7 @@ use super::chunk::{ Chunk, ColumnDescriptor, ColumnKind, DesignatedTsDescriptor, SymbolCodesPtr, ValidityDescriptor, }; use super::wire::{ - F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, MAX_NAME_LEN, + F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, MAX_NAME_LEN, QWP_FLAG_DEFER_COMMIT, QWP_FLAG_DELTA_SYMBOL_DICT, QWP_HEADER_LEN, QWP_MAGIC, QWP_SCHEMA_MODE_FULL, QWP_SCHEMA_MODE_REFERENCE, QWP_VERSION_1, validate_name, write_qwp_bytes, write_qwp_varint, }; @@ -88,9 +88,10 @@ pub(crate) fn encode_chunk_into( chunk: &Chunk<'_>, schema_registry: &mut SchemaRegistry, symbol_dict: &mut SymbolGlobalDict, + defer_commit: bool, ) -> Result<()> { if chunk.is_empty() { - emit_header_only_frame(out); + emit_header_only_frame(out, defer_commit); return Ok(()); } if chunk.designated_ts.is_none() { @@ -156,7 +157,7 @@ pub(crate) fn encode_chunk_into( // --- Reserve frame header placeholder --- let frame_start = out.len(); - write_header_placeholder(out, /* table_count = */ 1); + write_header_placeholder(out, /* table_count = */ 1, defer_commit); let payload_start = out.len(); // --- Delta-symbol-dict prefix --- @@ -252,9 +253,9 @@ fn estimate_frame_size( total } -fn emit_header_only_frame(out: &mut Vec) { +fn emit_header_only_frame(out: &mut Vec, defer_commit: bool) { let frame_start = out.len(); - write_header_placeholder(out, 0); + write_header_placeholder(out, 0, defer_commit); let payload_start = out.len(); write_qwp_varint(out, 0); // delta_start write_qwp_varint(out, 0); // new_symbols_count @@ -262,11 +263,15 @@ fn emit_header_only_frame(out: &mut Vec) { out[frame_start + 8..frame_start + 12].copy_from_slice(&payload_len.to_le_bytes()); } -fn write_header_placeholder(out: &mut Vec, table_count: u16) { +fn write_header_placeholder(out: &mut Vec, table_count: u16, defer_commit: bool) { let start = out.len(); out.extend_from_slice(&QWP_MAGIC); out.push(QWP_VERSION_1); - out.push(QWP_FLAG_DELTA_SYMBOL_DICT); + let mut flags = QWP_FLAG_DELTA_SYMBOL_DICT; + if defer_commit { + flags |= QWP_FLAG_DEFER_COMMIT; + } + out.push(flags); out.extend_from_slice(&table_count.to_le_bytes()); out.extend_from_slice(&0u32.to_le_bytes()); // payload_len placeholder debug_assert_eq!(out.len() - start, QWP_HEADER_LEN); @@ -734,7 +739,7 @@ mod tests { let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); out } @@ -744,7 +749,7 @@ mod tests { let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); assert_eq!(out.len(), 14); assert_eq!(&out[0..4], b"QWP1"); assert_eq!(out[5], QWP_FLAG_DELTA_SYMBOL_DICT); @@ -759,7 +764,7 @@ mod tests { let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - let err = encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap_err(); + let err = encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap_err(); assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); assert!(err.msg().contains("designated")); } @@ -774,14 +779,14 @@ mod tests { c1.column_i64("price", &p1, None).unwrap(); c1.designated_timestamp_nanos(&p1).unwrap(); let mut out1 = Vec::new(); - encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict, false).unwrap(); let p2 = [3i64, 4]; let mut c2 = Chunk::new("trades"); c2.column_i64("price", &p2, None).unwrap(); c2.designated_timestamp_nanos(&p2).unwrap(); let mut out2 = Vec::new(); - encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict, false).unwrap(); assert!(out2.len() < out1.len()); assert_eq!(reg.len(), 1, "schema signature interned once"); @@ -800,7 +805,7 @@ mod tests { a.column_i64("x", &x, None).unwrap(); a.designated_timestamp_nanos(&x).unwrap(); let mut oa = Vec::new(); - encode_chunk_into(&mut oa, &a, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut oa, &a, &mut reg, &mut dict, false).unwrap(); let y = [1.0f64]; let ts = [1i64]; @@ -808,7 +813,7 @@ mod tests { b.column_f64("y", &y, None).unwrap(); b.designated_timestamp_nanos(&ts).unwrap(); let mut ob = Vec::new(); - encode_chunk_into(&mut ob, &b, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut ob, &b, &mut reg, &mut dict, false).unwrap(); assert_eq!(reg.len(), 2); } @@ -824,7 +829,7 @@ mod tests { let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); assert!(out.len() > 32); } @@ -842,7 +847,7 @@ mod tests { let mut out = Vec::new(); let mut reg = SchemaRegistry::new(); let mut dict = SymbolGlobalDict::new(); - encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); assert_eq!(dict.next_id(), 2, "alpha + gamma only, beta unsent"); } @@ -860,7 +865,7 @@ mod tests { .unwrap(); c1.designated_timestamp_nanos(&ts1).unwrap(); let mut out1 = Vec::new(); - encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict, false).unwrap(); assert_eq!(dict.next_id(), 2); let codes2 = [0i32, 2]; @@ -870,7 +875,7 @@ mod tests { .unwrap(); c2.designated_timestamp_nanos(&ts2).unwrap(); let mut out2 = Vec::new(); - encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict).unwrap(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict, false).unwrap(); assert_eq!(dict.next_id(), 3, "gamma added on second frame"); } diff --git a/questdb-rs/src/ingress/column_sender/mod.rs b/questdb-rs/src/ingress/column_sender/mod.rs index b1241ed3..8d1489bc 100644 --- a/questdb-rs/src/ingress/column_sender/mod.rs +++ b/questdb-rs/src/ingress/column_sender/mod.rs @@ -104,6 +104,7 @@ pub mod _bench_internals { chunk, &mut state.schema_registry, &mut state.symbol_dict, + false, ) } } diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs index cbb7ecb0..163a2eda 100644 --- a/questdb-rs/src/ingress/column_sender/sender.rs +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -92,46 +92,56 @@ impl ColumnSender { /// Encode `chunk` into a QWP/WebSocket frame, write it to the /// socket, and return — **without** waiting for the server's ack. /// + /// The frame is sent with `FLAG_DEFER_COMMIT`: the server appends + /// rows to WAL but skips the commit. Call [`sync`](Self::sync) to + /// trigger the commit for all accumulated rows. + /// /// Ready acks are drained non-blocking before the write. If the /// in-flight count has reached the protocol cap (128), this call /// blocks until at least one ack frees a slot. /// /// On success, `chunk` is cleared (its retained descriptor capacity - /// is preserved) and the caller's buffers are released. The ack - /// will arrive later; call [`sync`](Self::sync) when you need all - /// in-flight frames acknowledged. + /// is preserved) and the caller's buffers are released. /// /// On failure, the connection is latched as terminal and the error /// is returned. `chunk` is left untouched. pub fn flush(&mut self, chunk: &mut Chunk<'_>) -> Result<()> { - // Drain any ready acks to keep the pipeline moving and to - // surface server errors as early as possible. + self.flush_inner(chunk, /* defer_commit = */ true) + } + + /// Block until all in-flight frames are acknowledged at the + /// requested [`AckLevel`]. + /// + /// Sends a commit-triggering frame (without `FLAG_DEFER_COMMIT`) + /// so the server commits all rows accumulated from preceding + /// deferred flushes, then drains all acks. + /// + /// `AckLevel::Ok` waits for every in-flight frame's WAL-commit ack. + /// `AckLevel::Durable` additionally waits for the server's + /// object-store durability watermarks to reach every frame's + /// seq_txn (requires `request_durable_ack=on` at connect). + pub fn sync(&mut self, ack_level: AckLevel) -> Result<()> { + // Send a commit-triggering empty frame (no FLAG_DEFER_COMMIT). + let mut commit_chunk = Chunk::new(""); + self.flush_inner(&mut commit_chunk, /* defer_commit = */ false)?; + self.conn.sync_all_acks(ack_level) + } + + fn flush_inner(&mut self, chunk: &mut Chunk<'_>, defer_commit: bool) -> Result<()> { self.conn.try_drain_acks()?; - // If we've hit the cap, block until one slot frees up. if self.conn.at_in_flight_cap() { self.conn.drain_one_ack_blocking()?; } let schema = &mut self.schema_registry; let dict = &mut self.symbol_dict; - let published = self - .conn - .publish_qwp(|out| encoder::encode_chunk_into(out, chunk, schema, dict))?; + let published = self.conn.publish_qwp(|out| { + encoder::encode_chunk_into(out, chunk, schema, dict, defer_commit) + })?; self.conn.push_pending(published.fsn); chunk.clear(); Ok(()) } - - /// Block until all in-flight frames are acknowledged at the - /// requested [`AckLevel`]. - /// - /// `AckLevel::Ok` waits for every in-flight frame's WAL-commit ack. - /// `AckLevel::Durable` additionally waits for the server's - /// object-store durability watermarks to reach every frame's - /// seq_txn (requires `request_durable_ack=on` at connect). - pub fn sync(&mut self, ack_level: AckLevel) -> Result<()> { - self.conn.sync_all_acks(ack_level) - } } diff --git a/questdb-rs/src/ingress/column_sender/wire.rs b/questdb-rs/src/ingress/column_sender/wire.rs index 548d0376..c62d2a4e 100644 --- a/questdb-rs/src/ingress/column_sender/wire.rs +++ b/questdb-rs/src/ingress/column_sender/wire.rs @@ -35,6 +35,7 @@ pub(crate) const QWP_MAGIC: [u8; 4] = *b"QWP1"; pub(crate) const QWP_VERSION_1: u8 = 1; /// Wire-spec flag set on every column-sender frame (matches the row-API /// `QwpBuffer::encode_ws_message`). +pub(crate) const QWP_FLAG_DEFER_COMMIT: u8 = 0x01; pub(crate) const QWP_FLAG_DELTA_SYMBOL_DICT: u8 = 0x08; pub(crate) const QWP_HEADER_LEN: usize = 12; From 2090138aa4aa9de3627c9bd640fe5d84726d083b Mon Sep 17 00:00:00 2001 From: bluestreak Date: Mon, 25 May 2026 02:34:12 +0100 Subject: [PATCH 8/9] perf(ingress): send first frame without FLAG_DEFER_COMMIT The server's ClientSymbolCache only caches symbols with symbolKey < initialSymbolCount. On a fresh table, initialSymbolCount stays at 0 until a WAL segment rolls and the watermark updates. By sending the first frame without FLAG_DEFER_COMMIT, the server commits it immediately, which allows the next segment to pick up the new symbol count and enable caching for all subsequent deferred frames. This is a client-side workaround for a server-side cache limitation. The proper fix is for the server to cache locally-assigned symbol IDs within the same segment (see WalColumnarRowAppender.putSymbolColumn). Co-Authored-By: Claude Opus 4.7 (1M context) --- questdb-rs/src/ingress/column_sender/sender.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs index 163a2eda..6de7b720 100644 --- a/questdb-rs/src/ingress/column_sender/sender.rs +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -57,6 +57,11 @@ pub struct ColumnSender { pub(crate) conn: ColumnConn, pub(crate) schema_registry: SchemaRegistry, pub(crate) symbol_dict: SymbolGlobalDict, + /// The first frame is sent without `FLAG_DEFER_COMMIT` so the server + /// commits it immediately. This lets the WAL segment roll and update + /// `initialSymbolCount`, warming the server's `ClientSymbolCache` for + /// all subsequent deferred frames. + first_frame_sent: bool, } impl Debug for ColumnSender { @@ -78,6 +83,7 @@ impl ColumnSender { conn, schema_registry, symbol_dict, + first_frame_sent: false, } } @@ -106,7 +112,10 @@ impl ColumnSender { /// On failure, the connection is latched as terminal and the error /// is returned. `chunk` is left untouched. pub fn flush(&mut self, chunk: &mut Chunk<'_>) -> Result<()> { - self.flush_inner(chunk, /* defer_commit = */ true) + let defer = self.first_frame_sent; + self.flush_inner(chunk, defer)?; + self.first_frame_sent = true; + Ok(()) } /// Block until all in-flight frames are acknowledged at the From e27bc30eb6e690274662ffc1c1e86215e18cd668 Mon Sep 17 00:00:00 2001 From: Jaromir Hamala Date: Tue, 26 May 2026 15:24:27 +0200 Subject: [PATCH 9/9] Fix column sender sync ABI and ACK handling Align the C ABI, docs, and smoke test with column_sender_flush(sender, chunk, err) plus column_sender_sync(sender, ack_level, err). Reserve an in-flight slot for the sync commit, validate durable ACK opt-in before publishing, and add pool/sync coverage. --- cpp_test/smoke_column_sender.c | 15 ++- doc/COLUMN_SENDER_FFI_ABI.md | 95 ++++++++++++------- include/questdb/ingress/column_sender.h | 30 ++++-- questdb-rs-ffi/src/column_sender.rs | 8 +- questdb-rs/src/ingress/column_sender/conn.rs | 32 +++++-- .../src/ingress/column_sender/encoder.rs | 14 +++ questdb-rs/src/ingress/column_sender/mod.rs | 4 +- .../src/ingress/column_sender/sender.rs | 34 +++++-- questdb-rs/src/tests/column_sender_pool.rs | 76 +++++++++++++++ 9 files changed, 236 insertions(+), 72 deletions(-) diff --git a/cpp_test/smoke_column_sender.c b/cpp_test/smoke_column_sender.c index 7f2f19c3..645ee011 100644 --- a/cpp_test/smoke_column_sender.c +++ b/cpp_test/smoke_column_sender.c @@ -34,8 +34,8 @@ * * Round-trips a single 3-row chunk with mixed i64, f64, varchar, and a * designated timestamp. Prints any client-side error to stderr and - * exits non-zero; on success exits 0 after flushing and returning the - * sender to the pool. + * exits non-zero; on success exits 0 after flushing, syncing, and + * returning the sender to the pool. */ #include @@ -149,8 +149,7 @@ int main(int argc, char** argv) return die(err, "designated_timestamp_nanos failed"); } - if (!column_sender_flush( - sender, chunk, column_sender_ack_level_ok, &err)) + if (!column_sender_flush(sender, chunk, &err)) { column_sender_chunk_free(chunk); questdb_db_return_sender(db, sender); @@ -158,6 +157,14 @@ int main(int argc, char** argv) return die(err, "column_sender_flush failed"); } + if (!column_sender_sync(sender, column_sender_ack_level_ok, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_sync failed"); + } + column_sender_chunk_free(chunk); questdb_db_return_sender(db, sender); questdb_db_close(db); diff --git a/doc/COLUMN_SENDER_FFI_ABI.md b/doc/COLUMN_SENDER_FFI_ABI.md index 1c1de52f..0f9c181b 100644 --- a/doc/COLUMN_SENDER_FFI_ABI.md +++ b/doc/COLUMN_SENDER_FFI_ABI.md @@ -39,12 +39,12 @@ by this ABI. They are not API design choices. | Limit | Value | Enforcement | |--------------------------------|----------------------------------------|----------------------------------------------------------| -| Max batch (frame) size | 16 MiB protocol ceiling; effectively `min(server recv buf − 14, 16 MiB)` advertised on upgrade via `X-QWP-Max-Batch-Size` | `column_sender_submit` returns an error if the encoded frame exceeds the negotiated cap. | +| Max batch (frame) size | 16 MiB protocol ceiling; effectively `min(server recv buf − 14, 16 MiB)` advertised on upgrade via `X-QWP-Max-Batch-Size` | `column_sender_flush` returns an error if the encoded frame exceeds the negotiated cap. | | Max tables per connection | 10,000 | Server-enforced; client surfaces server rejections. | | Max rows per table block | 1,000,000 | `column_sender_chunk_*` calls fail if `row_count` exceeds. | | Max columns per table | 2,048 | `column_sender_chunk_column_*` fails after the 2048th column. | | Max table / column name length | 127 bytes UTF-8 | Rejected at name validation. | -| Max in-flight batches | 128 | `column_sender_submit` blocks (or returns back-pressure) until an ack frees a slot. | +| Max in-flight batches | 128 | Deferred flushes reserve one slot for `column_sender_sync`; flush returns back-pressure when the reserve would be exhausted. | | Max symbol dictionary entries | 1,000,000 per connection | Server returns `PARSE_ERROR`; surfaced as `line_sender_error_server_rejection`. | The wire pins protocol version 1; clients advertise @@ -101,7 +101,8 @@ For every column-append function: - For Python wrappers, the typical pattern is to fill the chunk from a live DataFrame's numpy / Arrow buffers and flush before letting the DataFrame go out of scope — the contract is naturally satisfied - because flush is synchronous. + because flush encodes and writes the frame synchronously before + returning. ### 2.4 Validity bitmaps @@ -145,7 +146,7 @@ inputs. the borrowing thread until returned. Do not pass it across threads. - A `column_sender_chunk` is owned by one thread at a time. It is *not* tied to a particular sender; chunks can be built without a - borrow and submitted on any sender borrowed from the same `db`. + borrow and flushed on any sender borrowed from the same `db`. - `line_sender_error` is thread-safe to read but not to share writes. ### 2.6 String / UTF-8 @@ -190,7 +191,7 @@ multiple connections. The pool absorbs both cases: ┌──────────────────────────┐ │ column_sender (borrowed)│ │ ├─ new_chunk │ - │ ├─ submit / await │ + │ ├─ flush / sync │ │ └─ ... │ └──────────────────────────┘ ``` @@ -245,7 +246,7 @@ questdb_db* questdb_db_connect( * Close the pool and all its connections. Accepts NULL and no-ops. * Senders still checked out are invalidated; calls on them return * line_sender_error_invalid_api_call. Callers must not call close() - * while any thread is mid-submit on a borrowed sender. + * while any thread is mid-flush or mid-sync on a borrowed sender. */ QUESTDB_CLIENT_API void questdb_db_close(questdb_db* db); @@ -291,7 +292,7 @@ size_t questdb_db_reap_idle(questdb_db* db); * Return a sender to the pool. The sender pointer is invalidated and * must not be used again after this call. Any chunks created from the * sender remain valid (chunks are caller-owned, not sender-owned) but - * cannot be submitted until borrowed again from a new sender. + * cannot be flushed until borrowed again from a new sender. * * If the sender is in a latched-error state (must_close() == true), * its underlying connection is closed and dropped from the pool @@ -323,7 +324,7 @@ bool column_sender_must_close(const column_sender* sender); A chunk represents one DataFrame's worth of column buffers destined for one table. It is the "one chunk = one table = one frame = one FSN" unit. Chunks are caller-owned and **not bound to a particular -sender** — build a chunk on any thread, submit it on any sender +sender** — build a chunk on any thread, flush it on any sender borrowed from the same `db`. ```c @@ -331,10 +332,10 @@ borrowed from the same `db`. * Create an empty chunk for the given table. The table name must be * valid (same rules as line_sender_table_name; max 127 bytes UTF-8). * - * Does not require a sender — the chunk is pure data until submitted. + * Does not require a sender — the chunk is pure data until flushed. * - * The chunk is owned by the caller and must be either submitted with - * column_sender_submit (which clears it for reuse) or freed with + * The chunk is owned by the caller and must be either flushed with + * column_sender_flush (which clears it for reuse) or freed with * column_sender_chunk_free. */ QUESTDB_CLIENT_API @@ -641,7 +642,7 @@ bool column_sender_chunk_symbol_dict_i32( ## 10. Designated timestamp -Required exactly once per chunk before `submit`. Two variants picking +Required exactly once per chunk before `flush`. Two variants picking the on-wire type: - `..._micros` encodes the column on the wire as TIMESTAMP (`0x0A`, @@ -682,11 +683,11 @@ per row.) --- -## 11. Flush (synchronous) +## 11. Flush and sync ```c /** - * Acknowledgement level the flush waits for. + * Acknowledgement level `column_sender_sync` waits for. */ typedef enum column_sender_ack_level { @@ -699,20 +700,43 @@ typedef enum column_sender_ack_level opened with `request_durable_ack=on` in the connect string (and the server's 101 response confirming `X-QWP-Durable-Ack: enabled`). If the connection did not opt - in, flush returns line_sender_error_invalid_api_call. */ + in, sync returns line_sender_error_invalid_api_call. */ column_sender_ack_level_durable = 1, } column_sender_ack_level; /** - * Encode the chunk into a QWP/WebSocket frame, publish it, and block - * until the server acknowledges at the requested `ack_level`. Returns - * true once the ACK is received; the chunk is then cleared (row count - * → 0, allocations retained) and can be reused for the next DataFrame. + * Encode the chunk into a QWP/WebSocket frame, publish it, and return + * without waiting for a server ACK. On success the chunk is cleared + * (row count → 0, allocations retained) and can be reused for the next + * DataFrame. * - * Synchronous semantics: at most one frame in flight per sender. For - * parallel ingest, borrow multiple senders from the pool — one per - * thread — and flush concurrently. The 128-in-flight wire cap is - * never reached. + * The first flush is sent as an immediate commit. Later flushes are + * sent with QWP's deferred-commit flag so callers can pipeline many + * chunks. Call `column_sender_sync` after the final flush to send the + * commit frame and wait for all in-flight ACKs. + * + * The sender keeps one protocol in-flight slot reserved for the sync + * commit frame. If that reserve would be exhausted, flush returns + * line_sender_error_invalid_api_call; call `column_sender_sync` before + * flushing more chunks. + * + * For parallel ingest, borrow multiple senders from the pool — one per + * thread — and flush concurrently. + * + * On any failure (server rejection, transport error, latched-error + * sender, invalid chunk, or exhausted deferred-flight reserve), returns + * false and sets *err_out. The chunk is left untouched so the caller can + * inspect or recover its contents before freeing. + */ +QUESTDB_CLIENT_API +bool column_sender_flush( + column_sender* sender, + column_sender_chunk* chunk, + line_sender_error** err_out); + +/** + * Send a commit-triggering frame and block until all in-flight frames are + * acknowledged at the requested `ack_level`. * * Ack level semantics: * - `ok` — returns when the server has written the batch to its WAL. @@ -722,21 +746,20 @@ typedef enum column_sender_ack_level * * On any failure (server rejection, transport error, latched-error * sender, or `durable` requested without opt-in), returns false and - * sets *err_out. The chunk is left untouched so the caller can - * inspect or recover its contents before freeing. + * sets *err_out. * - * Flush blocks until ack or until the underlying connection enters a - * terminal failure state (must_close() becomes true). Transient - * disconnects are absorbed by the existing reconnect machinery. No - * separate per-call timeout in v1; if you need one, file a request. + * Sync blocks until ack or until the underlying connection enters a + * terminal failure state (must_close() becomes true). Transport errors + * latch the sender as terminal; return it to the pool and borrow a fresh + * sender to continue. No separate per-call timeout in v1; if you need + * one, file a request. * * The QWP wire `sequence` (FSN) is tracked internally and is not - * exposed at the FFI — synchronous flush makes it unnecessary. + * exposed at the FFI. */ QUESTDB_CLIENT_API -bool column_sender_flush( +bool column_sender_sync( column_sender* sender, - column_sender_chunk* chunk, column_sender_ack_level ack_level, line_sender_error** err_out); ``` @@ -788,9 +811,11 @@ int send_one_chunk(questdb_db* db) { if (!column_sender_chunk_designated_timestamp_nanos( chunk, timestamps_ns, 3, &err)) goto fail; - if (!column_sender_flush( - sender, chunk, column_sender_ack_level_ok, &err)) goto fail; - /* flush returned: server has WAL-committed; chunk cleared & reusable */ + if (!column_sender_flush(sender, chunk, &err)) goto fail; + /* flush returned: chunk cleared & reusable; ACK wait is deferred */ + if (!column_sender_sync( + sender, column_sender_ack_level_ok, &err)) goto fail; + /* sync returned: server has WAL-committed all flushed chunks */ column_sender_chunk_free(chunk); questdb_db_return_sender(db, sender); diff --git a/include/questdb/ingress/column_sender.h b/include/questdb/ingress/column_sender.h index cad41df8..2b411407 100644 --- a/include/questdb/ingress/column_sender.h +++ b/include/questdb/ingress/column_sender.h @@ -83,7 +83,7 @@ typedef struct column_sender_validity } column_sender_validity; /* ------------------------------------------------------------------------- - * Acknowledgement level for `column_sender_flush`. + * Acknowledgement level for `column_sender_sync`. * ------------------------------------------------------------------------- */ typedef enum column_sender_ack_level @@ -94,7 +94,7 @@ typedef enum column_sender_ack_level /** Wait for the server's object-store durability ACK (spec status * 0x02). Enterprise only; requires the pool to be opened with - * `request_durable_ack=on` in the connect string. Flush returns + * `request_durable_ack=on` in the connect string. Sync returns * `line_sender_error_invalid_api_call` otherwise. */ column_sender_ack_level_durable = 1 } column_sender_ack_level; @@ -443,21 +443,33 @@ bool column_sender_chunk_designated_timestamp_nanos( line_sender_error** err_out); /* ------------------------------------------------------------------------- - * Flush (synchronous) + * Flush / sync * - * Encode `chunk` into a QWP/WebSocket frame, publish it, and block - * until the server acknowledges at the requested `ack_level`. On - * success, `chunk` is cleared (allocations retained) and `true` is - * returned. On failure, `chunk` is left untouched. + * `column_sender_flush` encodes `chunk` into a QWP/WebSocket frame, + * publishes it, and returns without waiting for a server ACK. On success, + * `chunk` is cleared (allocations retained) and `true` is returned. On + * failure, `chunk` is left untouched. * - * At most one frame in flight per sender. For parallel ingest, borrow - * multiple senders from the same `questdb_db` — one per worker thread. + * The first flush is sent as an immediate commit. Later flushes are sent + * with QWP's deferred-commit flag so callers can pipeline many chunks. + * Call `column_sender_sync` after the final flush to send the commit frame + * and wait until all in-flight frames are acknowledged at `ack_level`. + * + * The sender keeps one protocol in-flight slot reserved for the sync commit + * frame. If that reserve would be exhausted, flush returns + * `line_sender_error_invalid_api_call`; call `column_sender_sync` before + * flushing more chunks. * ------------------------------------------------------------------------- */ QUESTDB_CLIENT_API bool column_sender_flush( column_sender* sender, column_sender_chunk* chunk, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_sync( + column_sender* sender, column_sender_ack_level ack_level, line_sender_error** err_out); diff --git a/questdb-rs-ffi/src/column_sender.rs b/questdb-rs-ffi/src/column_sender.rs index bc36b41a..414a4bab 100644 --- a/questdb-rs-ffi/src/column_sender.rs +++ b/questdb-rs-ffi/src/column_sender.rs @@ -756,9 +756,11 @@ pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_nanos( /// Encode `chunk` into a QWP/WebSocket frame, write it to the socket, /// and return immediately — without waiting for the server's ack. /// -/// Ready acks are drained non-blocking before the write. If the -/// in-flight count has hit the protocol cap (128), the call blocks -/// until one ack frees a slot. +/// Ready acks are drained non-blocking before the write. Deferred +/// flushes keep one in-flight slot reserved for the later +/// `column_sender_sync` commit frame; if that reserve would be +/// consumed, the call fails and the caller must sync before flushing +/// more chunks. /// /// On success, `chunk` is cleared and the call returns `true`. On /// failure, `chunk` is left untouched and `false` is returned (with diff --git a/questdb-rs/src/ingress/column_sender/conn.rs b/questdb-rs/src/ingress/column_sender/conn.rs index cb46ca83..3ed23517 100644 --- a/questdb-rs/src/ingress/column_sender/conn.rs +++ b/questdb-rs/src/ingress/column_sender/conn.rs @@ -30,9 +30,10 @@ //! per RFC 6455, and `write_all`s to the socket — then returns immediately //! without waiting for the server's ack. Between publishes, ready acks //! are drained non-blocking via `try_drain_acks`. When the in-flight -//! count hits the protocol cap (128), the next publish blocks until one -//! ack frees a slot. An explicit `sync_all_acks` blocks until every -//! in-flight frame is acknowledged. +//! count hits the protocol cap (128), the next non-deferred publish +//! blocks until one ack frees a slot. Deferred publishes reserve one +//! in-flight slot for the later commit-triggering frame. An explicit +//! `sync_all_acks` blocks until every in-flight frame is acknowledged. //! //! No replay queue, no background thread — single-thread, single-socket, //! pipelined. @@ -238,6 +239,23 @@ impl ColumnConn { self.in_flight } + /// `true` when a deferred publish can still leave one in-flight slot + /// for the later non-deferred sync commit frame. + pub(crate) fn has_sync_commit_slot(&self) -> bool { + self.in_flight < MAX_IN_FLIGHT - 1 + } + + pub(crate) fn validate_ack_level(&self, ack_level: AckLevel) -> Result<()> { + if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { + return Err(error::fmt!( + InvalidApiCall, + "AckLevel::Durable requires the pool to be opened with \ + `request_durable_ack=on` in the connect string." + )); + } + Ok(()) + } + /// Drain any ack responses available without blocking. Returns the /// number of OK acks consumed. pub(crate) fn try_drain_acks(&mut self) -> Result { @@ -280,13 +298,7 @@ impl ColumnConn { "QWP/WebSocket connection latched as terminal." )); } - if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { - return Err(error::fmt!( - InvalidApiCall, - "AckLevel::Durable requires the pool to be opened with \ - `request_durable_ack=on` in the connect string." - )); - } + self.validate_ack_level(ack_level)?; // Phase 1: drain all OK acks. let mut durable_targets: HashMap = HashMap::new(); diff --git a/questdb-rs/src/ingress/column_sender/encoder.rs b/questdb-rs/src/ingress/column_sender/encoder.rs index 13b31415..8443c1e8 100644 --- a/questdb-rs/src/ingress/column_sender/encoder.rs +++ b/questdb-rs/src/ingress/column_sender/encoder.rs @@ -756,6 +756,20 @@ mod tests { assert_eq!(u16::from_le_bytes([out[6], out[7]]), 0); } + #[test] + fn defer_commit_flag_is_set_when_requested() { + let chunk = Chunk::new("trades"); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, true).unwrap(); + assert_eq!(out[5] & QWP_FLAG_DEFER_COMMIT, QWP_FLAG_DEFER_COMMIT); + assert_eq!( + out[5] & QWP_FLAG_DELTA_SYMBOL_DICT, + QWP_FLAG_DELTA_SYMBOL_DICT + ); + } + #[test] fn non_empty_chunk_without_designated_ts_errors() { let mut chunk = Chunk::new("trades"); diff --git a/questdb-rs/src/ingress/column_sender/mod.rs b/questdb-rs/src/ingress/column_sender/mod.rs index 8d1489bc..130daac8 100644 --- a/questdb-rs/src/ingress/column_sender/mod.rs +++ b/questdb-rs/src/ingress/column_sender/mod.rs @@ -35,8 +35,8 @@ //! - Borrow a sender with [`QuestDb::borrow_sender`]. //! - Build a [`Chunk`] of column buffers for one table, then pin a //! designated timestamp on it. -//! - Flush the chunk synchronously; the call blocks until the server -//! acknowledges at the requested [`AckLevel`]. +//! - Flush chunks to publish them without waiting for ACKs, then call +//! [`ColumnSender::sync`] to commit and wait at the requested [`AckLevel`]. //! - Drop the [`BorrowedSender`] to return its connection to the pool. mod chunk; diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs index 6de7b720..ecf7f166 100644 --- a/questdb-rs/src/ingress/column_sender/sender.rs +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -32,8 +32,8 @@ use std::fmt::{self, Debug, Formatter}; -use crate::Result; use crate::ingress::buffer::SymbolGlobalDict; +use crate::{Result, error}; use super::chunk::Chunk; use super::conn::ColumnConn; @@ -98,19 +98,25 @@ impl ColumnSender { /// Encode `chunk` into a QWP/WebSocket frame, write it to the /// socket, and return — **without** waiting for the server's ack. /// - /// The frame is sent with `FLAG_DEFER_COMMIT`: the server appends - /// rows to WAL but skips the commit. Call [`sync`](Self::sync) to - /// trigger the commit for all accumulated rows. + /// The first frame is sent as an immediate commit so the server can + /// warm its symbol cache. Later frames are sent with + /// `FLAG_DEFER_COMMIT`: the server appends rows to WAL but skips the + /// commit. Call [`sync`](Self::sync) to trigger the commit for all + /// accumulated rows. /// - /// Ready acks are drained non-blocking before the write. If the - /// in-flight count has reached the protocol cap (128), this call - /// blocks until at least one ack frees a slot. + /// Ready acks are drained non-blocking before the write. Deferred + /// flushes reserve one in-flight slot for the later + /// commit-triggering sync frame; when that reserve would be consumed, + /// this call returns [`ErrorCode::InvalidApiCall`](crate::ErrorCode::InvalidApiCall) + /// and the caller must call [`sync`](Self::sync) before flushing more + /// chunks. /// /// On success, `chunk` is cleared (its retained descriptor capacity /// is preserved) and the caller's buffers are released. /// - /// On failure, the connection is latched as terminal and the error - /// is returned. `chunk` is left untouched. + /// On failure, the error is returned and `chunk` is left untouched. + /// Transport and server failures latch the connection as terminal; + /// validation and capacity failures leave it usable. pub fn flush(&mut self, chunk: &mut Chunk<'_>) -> Result<()> { let defer = self.first_frame_sent; self.flush_inner(chunk, defer)?; @@ -130,6 +136,8 @@ impl ColumnSender { /// object-store durability watermarks to reach every frame's /// seq_txn (requires `request_durable_ack=on` at connect). pub fn sync(&mut self, ack_level: AckLevel) -> Result<()> { + self.conn.validate_ack_level(ack_level)?; + // Send a commit-triggering empty frame (no FLAG_DEFER_COMMIT). let mut commit_chunk = Chunk::new(""); self.flush_inner(&mut commit_chunk, /* defer_commit = */ false)?; @@ -139,6 +147,14 @@ impl ColumnSender { fn flush_inner(&mut self, chunk: &mut Chunk<'_>, defer_commit: bool) -> Result<()> { self.conn.try_drain_acks()?; + if defer_commit && !self.conn.has_sync_commit_slot() { + return Err(error::fmt!( + InvalidApiCall, + "column sender deferred flush capacity exhausted; call sync() \ + before flushing more chunks." + )); + } + if self.conn.at_in_flight_cap() { self.conn.drain_one_ack_blocking()?; } diff --git a/questdb-rs/src/tests/column_sender_pool.rs b/questdb-rs/src/tests/column_sender_pool.rs index 07fc6c38..65cfa606 100644 --- a/questdb-rs/src/tests/column_sender_pool.rs +++ b/questdb-rs/src/tests/column_sender_pool.rs @@ -40,6 +40,7 @@ use std::io::Read; use std::net::TcpListener; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::mpsc; use std::thread; use std::time::{Duration, Instant}; @@ -386,6 +387,54 @@ fn refuses_durable_ack_without_opt_in() { ); } +#[test] +fn durable_ack_without_opt_in_does_not_publish_commit_frame() { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1"); + let port = listener.local_addr().expect("local_addr").port(); + let (tx, rx) = mpsc::channel(); + + let handle = thread::spawn(move || { + let (mut stream, _) = listener.accept().expect("accept"); + perform_server_upgrade(&mut stream).expect("upgrade"); + stream + .set_read_timeout(Some(Duration::from_millis(200))) + .expect("set read timeout"); + let frame = match read_frame(&mut stream) { + Ok((_fin, opcode, _payload)) => Some(opcode), + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + None + } + Err(e) => panic!("unexpected server read error: {e}"), + }; + tx.send(frame).expect("send frame observation"); + }); + + let db = QuestDb::connect(&conf_for(port, "")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let err = sender + .sync(AckLevel::Durable) + .expect_err("durable without opt-in must fail before publish"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!( + err.msg().contains("request_durable_ack"), + "msg: {}", + err.msg() + ); + assert_eq!( + rx.recv_timeout(Duration::from_secs(2)) + .expect("server observation"), + None, + "sync must reject durable ACK before sending a commit frame" + ); + + drop(sender); + drop(db); + handle.join().expect("server thread"); +} + #[test] fn empty_chunk_flush_round_trips() { let server = MockServer::spawn_acking(2); @@ -401,6 +450,33 @@ fn empty_chunk_flush_round_trips() { assert_eq!(chunk.row_count(), 0); } +#[test] +fn deferred_flush_reserves_slot_for_sync_commit() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for(server.port(), "close_flush_timeout_millis=50;")).unwrap(); + let mut sender = db.borrow_sender().expect("borrow"); + let mut chunk = Chunk::new("trades"); + + for _ in 0..127 { + sender.flush(&mut chunk).expect("flush below reserve"); + } + + chunk.column_i64("qty", &[42], None).expect("column_i64"); + chunk + .designated_timestamp_nanos(&[1_700_000_000_000_000_000]) + .expect("designated timestamp"); + let err = sender + .flush(&mut chunk) + .expect_err("deferred flush must preserve the sync commit slot"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!(err.msg().contains("sync()"), "msg: {}", err.msg()); + assert_eq!( + chunk.row_count(), + 1, + "capacity failure must leave the caller's chunk untouched" + ); +} + #[test] fn flush_clears_chunk_for_reuse_and_can_repeat() { let server = MockServer::spawn_acking(2);