diff --git a/cpp_test/smoke_column_sender.c b/cpp_test/smoke_column_sender.c new file mode 100644 index 00000000..645ee011 --- /dev/null +++ b/cpp_test/smoke_column_sender.c @@ -0,0 +1,173 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + ******************************************************************************/ + +/* + * Hand-runnable smoke test for the column-major sender C ABI. + * + * Not wired into CMake — the in-tree CMake build does not yet build the + * column-sender ABI surface as a C test (the existing `smoke_line_reader` + * pattern wires through ctest; we'll follow it once the C test matrix + * for the column sender is fleshed out). + * + * Build manually against a real QuestDB instance, e.g.: + * + * gcc -std=c11 cpp_test/smoke_column_sender.c \ + * -I include -L target/debug -lquestdb_client \ + * -o smoke_column_sender + * + * ./smoke_column_sender "qwpws::addr=localhost:9000;" + * + * Round-trips a single 3-row chunk with mixed i64, f64, varchar, and a + * designated timestamp. Prints any client-side error to stderr and + * exits non-zero; on success exits 0 after flushing, syncing, and + * returning the sender to the pool. 
+ */ + +#include +#include +#include +#include + +#include "questdb/ingress/column_sender.h" + +static int die(line_sender_error* err, const char* what) +{ + if (err) { + size_t msg_len = 0; + const char* msg = line_sender_error_msg(err, &msg_len); + fprintf(stderr, "%s: %.*s\n", what, (int)msg_len, msg); + line_sender_error_free(err); + } else { + fprintf(stderr, "%s\n", what); + } + return 1; +} + +int main(int argc, char** argv) +{ + if (argc < 2) { + fprintf(stderr, + "usage: %s 'qwpws::addr=host:port;[options]'\n", + argv[0]); + return 2; + } + const char* conf = argv[1]; + + line_sender_error* err = NULL; + questdb_db* db = questdb_db_connect(conf, strlen(conf), &err); + if (!db) + return die(err, "questdb_db_connect failed"); + + column_sender* sender = questdb_db_borrow_sender(db, &err); + if (!sender) { + questdb_db_close(db); + return die(err, "questdb_db_borrow_sender failed"); + } + + const char* table = "smoke_column_sender"; + column_sender_chunk* chunk = + column_sender_chunk_new(table, strlen(table), &err); + if (!chunk) { + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_chunk_new failed"); + } + + const char* qty_name = "qty"; + const int64_t qty[3] = { 10, 20, 30 }; + if (!column_sender_chunk_column_i64( + chunk, qty_name, strlen(qty_name), + qty, 3, NULL, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_i64(qty) failed"); + } + + const char* price_name = "price"; + const double price[3] = { 1.1, 2.2, 3.3 }; + if (!column_sender_chunk_column_f64( + chunk, price_name, strlen(price_name), + price, 3, NULL, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_f64(price) failed"); + } + + /* Arrow Utf8: 3 rows of varchar with one null in the middle. 
+ offsets length = row_count + 1; null row's slice is ignored by + the encoder (we set it to zero length here to keep offsets + monotonic). */ + const char* msg_name = "msg"; + const int32_t msg_offsets[4] = { 0, 5, 5, 10 }; + const uint8_t msg_bytes[] = { 'a','l','p','h','a', + 'g','a','m','m','a' }; + const uint8_t msg_validity_bits = 0x05u; /* rows 0 + 2 valid, row 1 null */ + const column_sender_validity msg_validity = { + &msg_validity_bits, 3 + }; + if (!column_sender_chunk_column_varchar( + chunk, msg_name, strlen(msg_name), + msg_offsets, msg_bytes, sizeof(msg_bytes), + 3, &msg_validity, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_varchar(msg) failed"); + } + + const int64_t ts_nanos[3] = { + (int64_t)1700000000000000000LL, + (int64_t)1700000000000001000LL, + (int64_t)1700000000000002000LL + }; + if (!column_sender_chunk_designated_timestamp_nanos( + chunk, ts_nanos, 3, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "designated_timestamp_nanos failed"); + } + + if (!column_sender_flush(sender, chunk, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_flush failed"); + } + + if (!column_sender_sync(sender, column_sender_ack_level_ok, &err)) + { + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + return die(err, "column_sender_sync failed"); + } + + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + questdb_db_close(db); + fprintf(stdout, "ok\n"); + return 0; +} diff --git a/doc/COLUMN_SENDER_FFI_ABI.md b/doc/COLUMN_SENDER_FFI_ABI.md new file mode 100644 index 00000000..0f9c181b --- /dev/null +++ b/doc/COLUMN_SENDER_FFI_ABI.md @@ -0,0 +1,879 @@ +# Column-Major Sender — C ABI Specification + +**Status:** draft, pending approval 
+**Header:** `include/questdb/ingress/column_sender.h` (to be added) +**Sibling header:** `include/questdb/ingress/line_sender.h` (existing, +shares error types) +**Audience:** the Python wrapper repo, and anyone writing a C/C++ +client against this API. + +This document is self-contained. It is the contract between +`c-questdb-client` (Rust core) and the Python wrapper repo. The Python +repo can be implemented from this spec without reading any Rust code. + +--- + +## 1. Scope + +This ABI exposes a column-major writer that ingests **per-column typed +buffers** into QuestDB via QWP/WebSocket. Optimised for sending +Pandas/Polars DataFrames at maximum throughput. One submission = +one QWP frame = one logical batch of rows for one table. + +**This is a client for the existing QuestDB server implementing the QWP +ingress (WebSocket) v1 wire specification.** The spec is at +`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md` +in the documentation repo. The protocol is fixed and the wire types, +null encoding, schema model, framing, and limits are not up for +negotiation in this API. The FFI's job is to expose that wire as +ergonomic, zero-overhead-where-possible calls for the Python wrapper. + +Out of scope: the existing row-major `line_sender_*` ABI is unaffected; +this is an additional, orthogonal API. The two coexist on different +opaque types. + +### 1.1 Spec-derived constraints (non-negotiable) + +These come from the QWP/WS v1 wire spec and are enforced or surfaced +by this ABI. They are not API design choices. + +| Limit | Value | Enforcement | +|--------------------------------|----------------------------------------|----------------------------------------------------------| +| Max batch (frame) size | 16 MiB protocol ceiling; effectively `min(server recv buf − 14, 16 MiB)` advertised on upgrade via `X-QWP-Max-Batch-Size` | `column_sender_flush` returns an error if the encoded frame exceeds the negotiated cap. 
| +| Max tables per connection | 10,000 | Server-enforced; client surfaces server rejections. | +| Max rows per table block | 1,000,000 | `column_sender_chunk_*` calls fail if `row_count` exceeds. | +| Max columns per table | 2,048 | `column_sender_chunk_column_*` fails after the 2048th column. | +| Max table / column name length | 127 bytes UTF-8 | Rejected at name validation. | +| Max in-flight batches | 128 | Deferred flushes reserve one slot for `column_sender_sync`; flush returns back-pressure when the reserve would be exhausted. | +| Max symbol dictionary entries | 1,000,000 per connection | Server returns `PARSE_ERROR`; surfaced as `line_sender_error_server_rejection`. | + +The wire pins protocol version 1; clients advertise +`X-QWP-Max-Version: 1`. + +--- + +## 2. Universal conventions + +### 2.1 Errors + +Errors use the existing `line_sender_error*` type from +`line_sender.h` — same codes, same accessors (`line_sender_error_msg`, +`line_sender_error_get_code`, `line_sender_error_free`). + +Every fallible function takes a trailing `line_sender_error** err_out`: + +- On success, returns `true` and does not touch `*err_out`. +- On failure, returns `false` and, if `err_out != NULL`, sets + `*err_out` to a heap-allocated error the caller must free with + `line_sender_error_free`. + +Pass `err_out = NULL` to discard the error. + +### 2.2 Pointer conventions + +Same as `line_sender.h`: opaque handles must be non-NULL. `err_out` may +be NULL. Lifecycle "free" functions accept NULL and no-op. + +### 2.3 Buffer conventions + +For every column-append function: + +- `data` is a pointer to a **contiguous, full-length** typed array + with one slot per row, **including null rows**. The slot value for + a null row is ignored — it can hold anything. This matches the + Arrow / Pandas / Polars layout, where data buffers are full-length + and null status lives in a separate bitmap. +- Strided buffers are **not** supported in v1. 
The Python wrapper must + materialise contiguous data before calling. (Pandas + `Series.to_numpy(copy=False)` and Polars Arrow buffers are + contiguous in the common case.) +- All column buffers passed in one chunk must have the same `row_count` + — the chunk's row count, set by the first column-append call. +- **Buffer lifetime contract.** Buffers passed to a `column_sender_chunk_*` + function (numeric columns, varchar offsets/bytes, symbol codes/dict + offsets/dict bytes, designated timestamps, validity bitmaps) **must + remain alive and unchanged until the next `column_sender_flush` call + on the chunk returns** (or until `column_sender_chunk_free` / + `column_sender_chunk_clear` is called without a flush). The FFI stores + raw pointers into the caller's buffers; it does **not** copy at + append time. This is required to hit memcpy-bandwidth throughput on + the no-null hot path — see `doc/COLUMN_SENDER_PLAN.md` §2. +- For Python wrappers, the typical pattern is to fill the chunk from a + live DataFrame's numpy / Arrow buffers and flush before letting the + DataFrame go out of scope — the contract is naturally satisfied + because flush encodes and writes the frame synchronously before + returning. + +### 2.4 Validity bitmaps + +The FFI accepts validity bitmaps in **Arrow semantics** (bit = 1 means +**valid**, bit = 0 means NULL). This is directly compatible with PyArrow +buffers, Polars Arrow buffers, and bitmaps produced by +`numpy.packbits(..., bitorder='little')`. + +- Layout: one bit per row. Byte `i` holds rows `8*i .. 8*i+7`. +- Bit ordering is **LSB-first** within each byte (bit 0 of byte 0 is row 0). +- **Bit = 1 means VALID. Bit = 0 means NULL.** +- Buffer length in bytes must be at least `ceil(row_count / 8)`. Bits + past `row_count` are ignored. +- Pass `validity = NULL` when the column has no nulls. 
+ +```c +typedef struct column_sender_validity { + const uint8_t* bits; // NULL = no nulls + size_t bit_len; // must equal chunk row_count +} column_sender_validity; +``` + +If `validity != NULL`, `validity->bit_len` must equal the chunk's row +count. Mismatches return `line_sender_error_invalid_api_call`. + +**Wire-format note (informative).** The QWP wire format uses the +*inverted* semantics — bit = 1 means NULL — and column data after the +bitmap is **densely packed** (only non-null values, count = +`row_count − null_count`). See spec §Null handling. The FFI accepts +the Arrow shape so PyArrow / Pandas / Polars buffers hand off +zero-copy; the library inverts the bitmap and gathers non-null values +when encoding the QWP frame. Callers never construct QWP-shaped +inputs. + +### 2.5 Threading + +- A `questdb_db` (the pool) is **thread-safe**. Share it across + threads. `questdb_db_borrow_sender` and `questdb_db_return_sender` + are safe to call concurrently. +- A `column_sender` (a borrow) is **not thread-safe**. It belongs to + the borrowing thread until returned. Do not pass it across threads. +- A `column_sender_chunk` is owned by one thread at a time. It is + *not* tied to a particular sender; chunks can be built without a + borrow and flushed on any sender borrowed from the same `db`. +- `line_sender_error` is thread-safe to read but not to share writes. + +### 2.6 String / UTF-8 + +String and symbol-dict bytes must be valid UTF-8. The library trusts the +caller by default (no per-row validation). Invalid UTF-8 will be +detected by the server and rejected. The Python wrapper is responsible +for ensuring valid UTF-8 from Pandas/Polars. + +--- + +## 3. Opaque types + +```c +typedef struct questdb_db questdb_db; /* connection pool */ +typedef struct column_sender column_sender; /* borrowed handle */ +typedef struct column_sender_chunk column_sender_chunk; +``` + +Errors reuse `line_sender_error*` (from `line_sender.h`). + +--- + +## 4. 
Connection pool and sender borrow + +### 4.1 Conceptual shape + +The user thinks `DataFrame → Table`: a script holds one connection to +the database and pushes DataFrames at it. Under the hood, sending is +not thread-safe per connection, so multi-threaded ingest needs +multiple connections. The pool absorbs both cases: + +``` + ┌──────────────────────────┐ + questdb_db_connect ───► │ questdb_db (pool) │ + │ ├─ connection #1 │ + │ ├─ connection #2 (lazy) │ + │ └─ ... │ + └──────────┬────────────────┘ + │ borrow_sender / return_sender + ▼ + ┌──────────────────────────┐ + │ column_sender (borrowed)│ + │ ├─ new_chunk │ + │ ├─ flush / sync │ + │ └─ ... │ + └──────────────────────────┘ +``` + +Single-threaded scripts get pool size 1 by default — one borrow held +for the lifetime of the script. Multi-threaded callers borrow and +return per work unit (or per thread). + +### 4.2 Connect-string keys (pool) + +| Key | Default | Description | +|------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------| +| `pool_size` | 1 | Warm / minimum connections, opened eagerly at `questdb_db_connect`. All N go through the full WS upgrade before `connect` returns. The pool never shrinks below this. | +| `pool_max` | 64 | Hard cap on auto-grow. When all current senders are checked out and pool size < `pool_max`, a new connection is opened on demand. When at `pool_max`, `borrow_sender` fails fast (see §4.3). | +| `pool_idle_timeout_ms` | 60000 | Connections *above* `pool_size` are closed after this much idle time in the pool's free list. Set to 0 to disable shrink (the pool only grows). | +| `pool_reap` | `auto` | `auto` — pool spawns a background thread that periodically reaps idle connections per `pool_idle_timeout_ms`. `manual` — no background thread; caller invokes `questdb_db_reap_idle` on its own cadence. 
| + +All other connect-string keys are inherited from the existing +`qwpws::` configuration (auth, TLS, `auth_timeout_ms`, retry, +durable-ack opt-in, etc.). See `doc/CONSIDERATIONS.md` and the +row-API connect-string reference. + +**Not accepted in v1:** `sf_dir` and the other `sf_*` store-and- +forward keys (`sender_id`, `sf_max_bytes`, `sf_max_total_bytes`, +`sf_durability`, `sf_append_deadline_millis`). Passing any of them to +`questdb_db_connect` returns `line_sender_error_config_error` with a +message pointing to the row-major `line_sender` API for users who +need SF semantics. SF is fundamentally single-writer per slot and +interacts awkwardly with the pool's auto-grow; revisit only if a +real user needs both throughput and on-disk durability. + +Validity: `pool_size <= pool_max` must hold; otherwise +`questdb_db_connect` returns `line_sender_error_config_error`. + +### 4.3 Pool functions + +```c +/** + * Open a connection pool. Eagerly opens `pool_size` connections; any + * server/auth/TLS error during those opens fails the call. + * + * `conf` is a standard `qwpws::` connect string. Non-WS schemes return + * line_sender_error_config_error — the column-sender path is QWP/WS + * only. + */ +QUESTDB_CLIENT_API +questdb_db* questdb_db_connect( + const char* conf, + line_sender_error** err_out); + +/** + * Close the pool and all its connections. Accepts NULL and no-ops. + * Senders still checked out are invalidated; calls on them return + * line_sender_error_invalid_api_call. Callers must not call close() + * while any thread is mid-flush or mid-sync on a borrowed sender. + */ +QUESTDB_CLIENT_API +void questdb_db_close(questdb_db* db); + +/** + * Borrow a sender from the pool. + * + * Selection rules: + * 1. If a previously-returned sender is in the free list, hand it out. + * 2. Otherwise, if pool size < `pool_max`, open a new connection on + * demand (auto-grow) and hand out a sender bound to it. + * 3. 
Otherwise (at `pool_max` cap, all checked out), return + * line_sender_error_invalid_api_call. This is fail-fast: hitting + * the cap signals either a leaked borrow or a `pool_max` set too + * low — both want an error rather than silent blocking. Caller may + * retry after returning senders. + * + * The returned sender is bound to the calling thread until returned. + * Do not share across threads. + */ +QUESTDB_CLIENT_API +column_sender* questdb_db_borrow_sender( + questdb_db* db, + line_sender_error** err_out); + +/** + * Manually reap idle connections. Closes connections in the pool's + * free list whose idle time exceeds `pool_idle_timeout_ms`, never + * shrinking pool size below `pool_size`. + * + * When `pool_reap=auto` (the default), the pool runs an internal + * background thread that calls this logic periodically; calling this + * function manually is harmless. When `pool_reap=manual`, callers that + * want shrinking must invoke this function on their own cadence (e.g. + * from a daemon thread in the host language). + * + * Returns the number of connections closed by this invocation. + */ +QUESTDB_CLIENT_API +size_t questdb_db_reap_idle(questdb_db* db); + +/** + * Return a sender to the pool. The sender pointer is invalidated and + * must not be used again after this call. Any chunks built while this + * sender was held remain valid (chunks are caller-owned, not sender-owned) + * but cannot be flushed until the caller borrows a sender again. + * + * If the sender is in a latched-error state (must_close() == true), + * its underlying connection is closed and dropped from the pool + * instead of returned. + */ +QUESTDB_CLIENT_API +void questdb_db_return_sender( + questdb_db* db, + column_sender* sender); +``` + +### 4.4 Sender state inspection + +```c +/** + * True if the sender's underlying connection is in a permanently- + * unusable state (a QWP halt rejection, terminal WS protocol + * violation, etc.). 
On return to the pool, such senders are dropped, + * not recycled. + */ +QUESTDB_CLIENT_API +bool column_sender_must_close(const column_sender* sender); +``` + +--- + +## 5. Chunk lifecycle + +A chunk represents one DataFrame's worth of column buffers destined +for one table. It is the "one chunk = one table = one frame = one +FSN" unit. Chunks are caller-owned and **not bound to a particular +sender** — build a chunk on any thread, flush it on any sender +borrowed from the same `db`. + +```c +/** + * Create an empty chunk for the given table. The table name must be + * valid (same rules as line_sender_table_name; max 127 bytes UTF-8). + * + * Does not require a sender — the chunk is pure data until flushed. + * + * The chunk is owned by the caller and must be either flushed with + * column_sender_flush (which clears it for reuse) or freed with + * column_sender_chunk_free. + */ +QUESTDB_CLIENT_API +column_sender_chunk* column_sender_chunk_new( + const char* table_name, + size_t table_name_len, + line_sender_error** err_out); + +/** + * Discard the chunk and all retained capacity. Accepts NULL and no-ops. + */ +QUESTDB_CLIENT_API +void column_sender_chunk_free(column_sender_chunk* chunk); + +/** + * Clear the chunk's content, keeping retained capacity for reuse. + */ +QUESTDB_CLIENT_API +void column_sender_chunk_clear(column_sender_chunk* chunk); + +/** + * Current row count of the chunk, as locked in by the first column + * append. Zero if no columns have been added yet. + */ +QUESTDB_CLIENT_API +size_t column_sender_chunk_row_count(const column_sender_chunk* chunk); +``` + +--- + +## 6. Numeric and fixed-width column appends + +All have the shape (`<type>` is the column-type suffix, `<T>` the matching C element type): + +```c +bool column_sender_chunk_column_<type>( + column_sender_chunk* chunk, + const char* name, + size_t name_len, + const <T>* data, + size_t row_count, + const column_sender_validity* validity, // NULL if no nulls + line_sender_error** err_out); +``` + +The first column-append call locks the chunk's `row_count`. 
Subsequent +calls must pass the same `row_count` value or return +`line_sender_error_invalid_api_call`. + +```c +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const float* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const double* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * Boolean column. `data` is an Arrow-style packed bitmap (LSB-first, + * 1=true). Length is row_count bits, so `data` must be at least + * ceil(row_count/8) bytes long. 
+ */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_bool( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * UUID column. `data` points to row_count * 16 bytes. Each 16-byte + * group is one UUID; bytes 0..8 are the lo half (little-endian), + * bytes 8..16 are the hi half (little-endian). Matches the + * existing line_sender_buffer_column_uuid layout. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_uuid( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * LONG256 column. `data` points to row_count * 32 bytes. Each + * 32-byte group is one LONG256: four 64-bit limbs little-endian, + * least-significant limb first. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_long256( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * IPv4 column. `data` is a packed uint32 per row, encoded as + * u32::from(Ipv4Addr).to_le_bytes() (octet 0 in the high byte). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ipv4( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 7. Timestamp columns + +```c +/** + * TIMESTAMP column, nanoseconds since the Unix epoch. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_nanos( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * TIMESTAMP column, microseconds since the Unix epoch. 
Equivalent to + * passing nanoseconds = micros * 1000 through ts_nanos, but the FFI + * does the scale-up so the caller does not have to materialise a + * second buffer. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_micros( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * DATE column, milliseconds since the Unix epoch. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_date_millis( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 8. Variable-width text column (VARCHAR) + +QWP has exactly one variable-width text type: VARCHAR (wire code +`0x0F`). The wire format is `uint32` offsets + concatenated bytes. The +older STRING wire type (`0x08`) has been removed from the spec and is +not exposed here. + +Input is in Arrow Utf8 shape: a full-length offsets array of +`row_count + 1` entries where `offsets[i]..offsets[i+1]` slices `bytes` +for row `i`. Null rows are signalled via the validity bitmap; their +offset slice is ignored (typically a zero-length slice, but the FFI +makes no assumption). + +```c +/** + * VARCHAR column (QWP wire type 0x0F). + * + * Input layout matches Arrow Utf8: + * - offsets has row_count + 1 entries. Monotonically non-decreasing. + * The first entry is typically 0 and the last is typically + * bytes_len; the FFI does not require those exactly, but every + * offset must be ≤ bytes_len. + * - bytes is a single contiguous UTF-8 buffer. + * - validity is Arrow-shape (1 = valid, see §2.4). NULL rows' + * offset slices are ignored. + * + * Wire output: the library compresses to QWP's dense layout + * (only non-null values, uint32 offsets matching the wire spec). 
+ * + * UTF-8 validity is the caller's responsibility; invalid UTF-8 is + * detected by the server and surfaced as line_sender_error_server_rejection. + * + * Input offsets are int32_t because that is the Arrow Utf8 layout + * (signed 32-bit). Negative values are rejected. Polars LargeUtf8 + * (int64 offsets, >2 GiB) is the Python wrapper's concern: split the + * column or copy down to int32 offsets before calling. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_varchar( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* offsets, // length = row_count + 1 + const uint8_t* bytes, + size_t bytes_len, + size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 9. Symbol columns (dictionary fast path) + +Symbol columns take dictionary-encoded input: a `codes` array of +per-row indices and a dict (`dict_offsets` + `dict_bytes` in Arrow +Utf8 layout). + +This is **the canonical symbol input** because it matches: +- Pandas `Categorical` (`.codes` + `.categories`), +- Polars `Categorical` / Arrow `Dictionary`. + +The implementation interns the dict against the connection-scoped +symbol table once (cost ∝ dict cardinality, not row count) and then +remaps codes in bulk. + +For each `symbol_dict_<width>` variant (`i8`, `i16`, `i32`), `codes[i]` is the index into the +dict for row `i`. Codes must be in range `0..dict_len` for valid rows; +behaviour is undefined for out-of-range codes when validity is NULL. +When a row's validity bit is 0, its code is ignored. + +`dict_offsets` has `dict_len + 1` entries; `dict_offsets[d]..dict_offsets[d+1]` +slices `dict_bytes` for dict entry `d`. `dict_len` is implicit: +`dict_len == (dict_offsets length) - 1`. The FFI takes +`dict_offsets_len` explicitly to compute `dict_len = dict_offsets_len - 1`. 
+ +```c +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); +``` + +--- + +## 10. Designated timestamp + +Required exactly once per chunk before `flush`. Two variants picking +the on-wire type: + +- `..._micros` encodes the column on the wire as TIMESTAMP (`0x0A`, + microseconds since Unix epoch). +- `..._nanos` encodes the column on the wire as TIMESTAMP_NANOS + (`0x10`, nanoseconds since Unix epoch). + +Exactly one of the two may be called per chunk. The designated +timestamp is emitted on the wire as a schema column with an empty +name (per spec §Full schema mode). + +```c +/** + * Designated-timestamp column, microseconds since the Unix epoch. + * Encoded on the wire as TIMESTAMP (0x0A). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_micros( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/** + * Designated-timestamp column, nanoseconds since the Unix epoch. 
+ * Encoded on the wire as TIMESTAMP_NANOS (0x10). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_nanos( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); +``` + +(No `validity` parameter — the designated timestamp must be non-null +per row.) + +--- + +## 11. Flush and sync + +```c +/** + * Acknowledgement level `column_sender_sync` waits for. + */ +typedef enum column_sender_ack_level +{ + /** Wait for the server's WAL-commit ACK (spec status 0x00). + Always available. */ + column_sender_ack_level_ok = 0, + + /** Wait for the server's object-store durability ACK + (spec status 0x02). Enterprise only. Requires the pool to be + opened with `request_durable_ack=on` in the connect string + (and the server's 101 response confirming + `X-QWP-Durable-Ack: enabled`). If the connection did not opt + in, sync returns line_sender_error_invalid_api_call. */ + column_sender_ack_level_durable = 1, +} column_sender_ack_level; + +/** + * Encode the chunk into a QWP/WebSocket frame, publish it, and return + * without waiting for a server ACK. On success the chunk is cleared + * (row count → 0, allocations retained) and can be reused for the next + * DataFrame. + * + * The first flush is sent as an immediate commit. Later flushes are + * sent with QWP's deferred-commit flag so callers can pipeline many + * chunks. Call `column_sender_sync` after the final flush to send the + * commit frame and wait for all in-flight ACKs. + * + * The sender keeps one protocol in-flight slot reserved for the sync + * commit frame. If that reserve would be exhausted, flush returns + * line_sender_error_invalid_api_call; call `column_sender_sync` before + * flushing more chunks. + * + * For parallel ingest, borrow multiple senders from the pool — one per + * thread — and flush concurrently. 
+ * + * On any failure (server rejection, transport error, latched-error + * sender, invalid chunk, or exhausted deferred-flight reserve), returns + * false and sets *err_out. The chunk is left untouched so the caller can + * inspect or recover its contents before freeing. + */ +QUESTDB_CLIENT_API +bool column_sender_flush( + column_sender* sender, + column_sender_chunk* chunk, + line_sender_error** err_out); + +/** + * Send a commit-triggering frame and block until all in-flight frames are + * acknowledged at the requested `ack_level`. + * + * Ack level semantics: + * - `ok` — returns when the server has written the batch to its WAL. + * - `durable` — returns when the WAL segment is durably uploaded to + * the configured object store. Strictly later than the OK + * watermark; can be significantly later under upload pressure. + * + * On any failure (server rejection, transport error, latched-error + * sender, or `durable` requested without opt-in), returns false and + * sets *err_out. + * + * Sync blocks until ack or until the underlying connection enters a + * terminal failure state (must_close() becomes true). Transport errors + * latch the sender as terminal; return it to the pool and borrow a fresh + * sender to continue. No separate per-call timeout in v1; if you need + * one, file a request. + * + * The QWP wire `sequence` (FSN) is tracked internally and is not + * exposed at the FFI. + */ +QUESTDB_CLIENT_API +bool column_sender_sync( + column_sender* sender, + column_sender_ack_level ack_level, + line_sender_error** err_out); +``` + +--- + +## 12. Versioning + +This API is **draft / unstable** until first ship. Once shipped: + +- The C ABI is versioned alongside the rest of `c-questdb-client`. +- Breaking changes follow the same SemVer policy as the existing + `line_sender_*` ABI. +- The wire format is the existing QWP v1 spec (no new wire types + introduced). + +--- + +## 13. 
Minimal C example + +Pool/borrow shape: one `questdb_db` per process, borrow a sender per +unit of work, return it when done. + +```c +#include <stdio.h> +#include <string.h> + +#include "questdb/ingress/line_sender.h" +#include "questdb/ingress/column_sender.h" + +int send_one_chunk(questdb_db* db) { + line_sender_error* err = NULL; + column_sender* sender = NULL; + column_sender_chunk* chunk = NULL; + + sender = questdb_db_borrow_sender(db, &err); + if (!sender) goto fail; + + chunk = column_sender_chunk_new("trades", 6, &err); + if (!chunk) goto fail; + + const double prices[] = { 2615.54, 2615.60, 2615.50 }; + const double amounts[] = { 0.00044, 0.00021, 0.00073 }; + const int64_t timestamps_ns[] = { 1700000000000000000LL, + 1700000000000001000LL, + 1700000000000002000LL }; + + if (!column_sender_chunk_column_f64( + chunk, "price", 5, prices, 3, NULL, &err)) goto fail; + if (!column_sender_chunk_column_f64( + chunk, "amount", 6, amounts, 3, NULL, &err)) goto fail; + if (!column_sender_chunk_designated_timestamp_nanos( + chunk, timestamps_ns, 3, &err)) goto fail; + + if (!column_sender_flush(sender, chunk, &err)) goto fail; + /* flush returned: chunk cleared & reusable; ACK wait is deferred */ + if (!column_sender_sync( + sender, column_sender_ack_level_ok, &err)) goto fail; + /* sync returned: server has WAL-committed all flushed chunks */ + + column_sender_chunk_free(chunk); + questdb_db_return_sender(db, sender); + return 0; + +fail: + if (err) { + /* The error message is not NUL-terminated: print it by length. */ + size_t msg_len = 0; + const char* msg = line_sender_error_msg(err, &msg_len); + fprintf(stderr, "%.*s\n", (int)msg_len, msg); + line_sender_error_free(err); + } + column_sender_chunk_free(chunk); + if (sender) questdb_db_return_sender(db, sender); + return 1; +} + +int main(void) { + line_sender_error* err = NULL; + const char* conf = "qwpws::addr=localhost:9000;pool_size=1;"; + questdb_db* db = questdb_db_connect(conf, strlen(conf), &err); + if (!db) { + if (err) line_sender_error_free(err); + return 1; + } + int rc = send_one_chunk(db); + questdb_db_close(db); + return rc; +} +``` + +--- + +## 14. 
Notes for the Python wrapper + +These are not part of the C ABI; they are guidance for the Python repo +agent. + +- **Pandas numeric columns** → `Series.to_numpy(copy=False)` gives a + contiguous `np.ndarray` whose `.ctypes.data` pointer goes straight + to FFI. No copy. +- **Pandas nulls** → `Series.notna().to_numpy()` is a `np.ndarray[bool]` + (True = valid, matching the API's bit = 1 valid semantics); pack it + LSB-first into a `uint8_t*` bitmap (provide a vectorised helper + using `numpy.packbits(..., bitorder='little')`). Do not pack + `Series.isna()` directly — that yields the inverted (1 = NULL) + bitmap. +- **Pandas datetime64** → already an int64 view via + `series.view('int64')`. For `[ns]` use `column_ts_nanos`; for + `[us]` use `column_ts_micros`; for `[ms]` use `column_date_millis` + (or scale up to ns). +- **Pandas `Categorical`** → `cat.codes.to_numpy()` for `codes`; + `cat.categories.to_numpy()` then encode to Arrow Utf8 layout + (build `offsets` + `bytes`) for the dict. Or roundtrip via PyArrow + for less manual work. +- **Polars** → `series.to_arrow()` yields a `pyarrow.Array` whose + buffers (`array.buffers()`) include the validity bitmap (already + LSB-first 1=valid) and the data buffer. Direct pointer handoff. +- **Pandas object-dtype strings** are the slow path: materialise into + Arrow Utf8 via `pyarrow.array(series)` then forward. The FFI + does not have a fast path for object dtype — that's a deliberate + choice. Document this. +- **Object lifetimes** — keep the source `np.ndarray` / `pa.Array` + (and any bitmap or offsets buffer built for the call) alive until + the chunk has been flushed or freed. The chunk stores raw pointers + into the caller's buffers and only reads them at flush time (see + `doc/COLUMN_SENDER_PLAN.md` §2.1.1), so dropping a buffer as soon + as the `column_*` call returns is a use-after-free. diff --git a/doc/COLUMN_SENDER_PERF.md b/doc/COLUMN_SENDER_PERF.md new file mode 100644 index 00000000..c7c382ce --- /dev/null +++ b/doc/COLUMN_SENDER_PERF.md @@ -0,0 +1,100 @@ +# Column-Major Sender — Performance Notes + +Tracks the bench results that anchor `doc/COLUMN_SENDER_PLAN.md` §2.1 +("encode is a header + extend_from_slice per column") and §2.2 ("no-null += memcpy; nullable = invert+gather"). 
+ +The Criterion bench lives at `questdb-rs/benches/column_sender.rs`. It +covers three families: + +1. **Per-column bulk append** — each column-type's hot path vs a raw + `extend_from_slice` baseline. +2. **Symbol bulk-intern** — `Chunk::symbol_dict_i32` vs a naïve per-row + HashMap probe that mirrors what a row-API symbol cell pays. +3. **End-to-end encode** — populate a 100k-row chunk with a + representative column mix and time the encoder body. + +Pure encoder cost — no network, no real server. + +## Running + +```sh +cargo bench --features sync-sender-qwp-ws --bench column_sender + +# Larger workload (anchors the headline 10M-rows-per-batch number from +# the WS-2/WS-4 plan): +QUESTDB_COLUMN_BENCH_ROWS=10000000 \ + cargo bench --features sync-sender-qwp-ws --bench column_sender + +# Knobs: +# QUESTDB_COLUMN_BENCH_ROWS default 100_000 +# QUESTDB_COLUMN_BENCH_VARCHAR_LEN default 16 +# QUESTDB_COLUMN_BENCH_SYM_CARD default 1_000 +``` + +## Numbers after the borrow-not-copy rewrite + +Captured on an Apple Silicon laptop, default workload +(`rows = 100_000`, `varchar_len = 16`, `sym_card = 1_000`), +`cargo bench ... -- --quick --noplot`. The big change vs the first +baseline: `Chunk` now holds raw pointers into the caller's buffers; +all wire-formatting is deferred to flush time and writes directly into +the connection's reusable write buffer. + +| Bench | Median time | Notes | +|-------------------------------------|------------:|-------| +| `column_i64/column_sender_no_null` | ~57 ns | Descriptor store only — no data copy at append time. | +| `column_i64/column_sender_nullable` | ~289 ns | Descriptor store + `non_null_count` precompute over the bitmap. | +| `column_f64/column_sender_no_null` | ~57 ns | Same as i64 — `Chunk` never touches the caller's bytes. | +| `encode_chunk/populate_only` | ~76 µs | Chunk-fill for the 5-column workload (was ~294 µs in the pre-rewrite baseline). 
**~4× faster.** | +| `encode_chunk/encode_only` | ~500 µs | Full encode: header + dict-delta + table block + per-column wire encode straight into a reusable buffer (was ~437 µs in the pre-rewrite baseline; now does the per-row work that previously happened during populate). | +| `encode_chunk/populate_plus_encode` | ~575 µs | **End-to-end flush time (no network) was ~718 µs pre-rewrite → ~575 µs after. ~20 % faster.** | + +A second-pass `encode_chunk/encode_only` on the same workload should +land in **REFERENCE mode** for the schema (because the registry caches +the signature from the first encode), shaving off the FULL-mode +signature bytes — see `doc/COLUMN_SENDER_PLAN.md` §2.1. + +The per-column microbenches no longer measure data movement: with raw +pointers stored, `column_iN`/`column_fN` are essentially constant-time +in `row_count`. The honest end-to-end metric is +`encode_chunk/populate_plus_encode`, which is what a single flush +costs (chunk-fill + frame encode into the WS write buffer, before +masking/socket-write). + +## Interpreting the numbers + +- The **`encode_chunk/populate_plus_encode` ~20 % win** is the + load-bearing claim: end-to-end CPU time per flush is lower than the + pre-rewrite design that copied each column into per-column `Vec` + staging and then aggregated those into a fresh per-frame `Vec`. + We now do exactly one memcpy per fixed-width column — straight from + the caller's buffer into the connection's reusable write buffer. +- The **`encode_only` is *slightly* slower in isolation** (~500 µs vs + ~437 µs) because the per-row work that used to be amortised into + `populate_only` is now done at encode time. `populate_only` dropped + from ~294 µs to ~76 µs, and the sum is what matters. +- The encoder pre-sizes the write buffer in one shot via + `estimate_frame_size(...)` to avoid the geometric-growth memcpy + pattern when payloads exceed the default 64 KiB capacity. 
Without + this, end-to-end flush time would be ~880 µs (worse than the + baseline). +- The **symbol bulk-intern** still runs the WS-4 three-pass design + (referenced bitset, intern only referenced slots, then per-row + emit). At 100 k rows × 1 000-card dict the encoder runs ≤ 1 000 + interns + 100 k varint writes — the per-row HashMap probe of the + row-API path remains ~16× slower. + +## Out of scope here + +- **End-to-end Pandas → QuestDB throughput** lives in the Python + wrapper repo (WS-7); add the `pandas_to_questdb_throughput` bench + there once a real server is wired into its CI. +- **1-hour soak** belongs in nightly CI rather than the in-tree + Criterion suite; track that as a follow-up alongside WS-7. +- **Microbench against the row-API encoder** is intentionally absent. + The row API's `Buffer::column_i64` is a per-cell call (it appends a + single value per invocation); comparing it cell-by-cell against the + column sender's bulk append would be apples vs oranges and is + already qualitatively captured by the `symbol_dict/naive_per_row_*` + comparison. diff --git a/doc/COLUMN_SENDER_PLAN.md b/doc/COLUMN_SENDER_PLAN.md new file mode 100644 index 00000000..1bf882b5 --- /dev/null +++ b/doc/COLUMN_SENDER_PLAN.md @@ -0,0 +1,654 @@ +# Column-Major Sender — Implementation Plan + +**Status:** draft, pending approval +**Owner:** TBD +**Audience:** engineers implementing the Rust core, the C FFI, and the +separate Python wrapper repo. + +--- + +## 1. Goal + +Ship a column-major writer that ingests **Pandas and Polars DataFrames into +QuestDB at the maximum throughput the QWP/WebSocket wire allows.** + +That is the whole goal. Every design choice in this plan is justified by +"does it make `df → QuestDB` faster?" Anything else is out of scope. 
+ +**This is a client for an existing server implementing the QWP ingress +(WebSocket) v1 wire specification.** The spec lives at +`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md` +in the documentation repo. Wire framing, column types, null encoding +(bit = 1 NULL, dense values), schema model, symbol delta dictionary, +ack/sequence semantics, and protocol limits are all fixed by the spec. +We invent nothing the spec covers; the design freedom is purely in how +the FFI exposes the wire to Pandas/Polars callers efficiently. + +### Non-goals + +- A generic columnar ingestion library. No Arrow C Data Interface, no + generic column-source traits, no support for "hypothetical other column + formats." If/when those are needed they live above the FFI, in a + language-specific wrapper. +- Replacing the row-major `Sender`/`Buffer` path. The row API stays as-is + for users who think in rows. +- QWP/UDP support. UDP's internal buffer is row-major and unreliable; the + column-major path targets QWP/WS only. +- A Python binding inside this repo. Python lives in its own repo and + consumes the C ABI defined in `COLUMN_SENDER_FFI_ABI.md`. +- New wire-protocol work. The wire format already is column-major. + +--- + +## 2. Why this is a small change to the wire and a big change to the API + +The QWP/WS wire format is **already column-major.** The row-API path +(`Buffer` / `QwpWsColumnarBuffer`) pays per-cell name-lookup and +op-state validation: for 50M rows × 6 columns that's 300M name lookups ++ 300M op-state checks before any actual encoding happens. The +column-major API replaces all of that with **6 bulk appends per chunk ++ 1 encode pass**. + +### 2.1 Decoupled from the existing row encoder *and the row publisher* + +Performance is the goal; **code reuse is a non-goal**. The column +sender does **not** reuse `QwpWsColumnarBuffer`, the row API's +encoder, **or the row API's publisher / driver / queue stack**. 
It +owns its own QWP/WebSocket socket end-to-end via a dedicated +`ColumnConn` type (`questdb-rs/src/ingress/column_sender/conn.rs`): + +- one write buffer reused across flushes (no per-frame allocation); +- the encoder writes the QWP frame body directly into that buffer at + offset `WS_HEADER_RESERVE = 14`, leaving room to prepend the WS + header in place once the payload length is known; +- the buffer is masked in place per RFC 6455 §5.3 and `write_all`'d to + the socket — at most one frame in flight by construction; +- the ack reader synchronously parses the QWP response inline (no + replay queue, no background thread). + +What is shared with the row API is only what *must* stay coherent at +connection scope: + +- `SymbolGlobalDict` (`questdb-rs/src/ingress/buffer/qwp.rs:5041`) — + the connection-scoped symbol intern table the wire requires. A + fresh instance per `ColumnConn`. +- The shared RFC 6455 WS plumbing in `crate::ws::{frame, mask, + handshake, crypto}` (handshake, frame header parse, + client-frame encode, mask key source). +- TCP connect + TLS setup + WS handshake, reached via + `SenderBuilder::build_qwp_ws_raw_stream` which returns a + `RawQwpWsStream` and never assembles the row-API publisher / + driver / queue. + +Note that `SchemaRegistry` is now **column-sender-local** (defined in +`column_sender/encoder.rs`), not shared. Each `ColumnConn` carries its +own registry through the pool; the row API has its own, separate +registry inside `QwpWsReplayEncoder`. + +What is *not* shared, and is duplicated verbatim where simplest, is +the QWP response parser (one binary OK / DurableAck / error frame at +a time) and the wire-formatting helper surface (varint writers, +type-byte tables, schema-signature construction). These are stable per +the QWP v1 spec; duplicating costs ~150 lines and removes one layer +of indirection from the hot path. 
+ +### 2.1.1 Borrow-not-copy + +`Chunk<'a>` holds **raw pointers** into the caller's column buffers, +not copied wire-shape bytes. Each `column_*` call validates input +(name, lengths, varchar offset monotonicity, symbol-code range) and +stores a descriptor; the encoder dereferences the pointers at flush +time. The caller's buffers must outlive flush. + +On the Rust API, the lifetime parameter `'a` ties the chunk to every +borrowed buffer, so the borrow checker catches use-after-free at +compile time. The FFI layer carries the same shape via +`Chunk<'static>` and an explicit ABI contract — see +`doc/COLUMN_SENDER_FFI_ABI.md` §2.3. + +### 2.2 Two code paths per type + +For every numeric/fixed-width column, the bulk-append function +branches on validity at the top: + +- **`validity == NULL`** (no nulls): single `extend_from_slice` / + `memcpy` from the caller's buffer into the column's wire-shape + storage. Emit `null_flag = 0x00`. +- **`validity != NULL`**: one pass that (a) inverts the Arrow bitmap + to QWP wire semantics (bit=1 means NULL) and (b) gathers non-null + values densely into the wire buffer. Emit `null_flag != 0x00` and + the bitmap. + +The first path is the common case for pandas/polars numeric columns +and should bottleneck on `memcpy` bandwidth. The second is a tight +loop with a branch on the validity bit, suitable for SIMD where the +types allow. + +--- + +## 3. Architecture + +``` +Python repo (separate) c-questdb-client (this repo) +───────────────────── ───────────────────────────── + Rust core + pandas / polars DataFrame ──┐ + ▼ │ ┌─────────────────────────────┐ + Python wrapper │ C ABI │ QuestDb (pool, shareable) │ + - extract typed buffers ├────────►│ ├─ conn #1 ┐ │ + - extract validity bitmap │ │ ├─ conn #2 ├─ each owns: │ + - extract category codes & │ │ └─ ... 
│ publisher, │ + dict for symbols │ │ │ SchemaReg, │ + │ │ │ SymbolDict │ + │ │ borrow_sender / return │ + │ │ │ │ + │ │ ▼ │ + │ │ ColumnSender (borrowed) │ + │ │ ├─ new_chunk │ + │ │ └─ flush (sync, blocks │ + │ │ until server ACK) │ + │ └─────────┬───────────────────┘ + │ + ▼ (BulkChunk encoder, + a new module) + QWP/WS frame → server +``` + +Layering rules: + +- **The C ABI must be expressible as a thin wrapper around typed Rust + slices.** Per-column-append functions take `ptr + len + optional + validity bitmap`. Nothing else. +- **The user thinks `DataFrame → Table`.** One chunk = one table = one + DataFrame = one QWP frame = one FSN. +- **A `QuestDb` is shareable across threads; a borrowed `ColumnSender` + is not.** The pool absorbs the per-connection thread-safety + constraint. + +--- + +## 4. Rust API (public surface) + +New module: `questdb-rs/src/ingress/column_sender/` with submodules +`db.rs`, `sender.rs`, `chunk.rs`, `validity.rs`, `encoder.rs`, +`error.rs`. Re-exported under +`questdb::ingress::column_sender::{QuestDb, ColumnSender, Chunk, Validity}`. + +```rust +/// Connection pool. Shareable across threads. One `QuestDb` per +/// connect string per process (typical usage). +pub struct QuestDb { /* pool of Connection (private) */ } + +impl QuestDb { + /// Open a pool. Eagerly opens `pool_size` connections (default 1). + /// Pool knobs: `pool_size=N` (default 1), `pool_max=M` (default 64), + /// `pool_idle_timeout_ms=T` (default 60000), `pool_reap=auto|manual` + /// (default auto). Plus all standard `qwpws::` keys. + pub fn connect(conf: &str) -> Result; + + /// Borrow a sender. If a previously-returned sender is free, hand + /// it out; else, if pool size < `pool_max`, open a new connection + /// and hand out a sender bound to it; else return InvalidApiCall + /// (fail-fast at cap). + pub fn borrow_sender(&self) -> Result>; + + /// Manually reap idle connections (closes those above `pool_size` + /// idle longer than `pool_idle_timeout_ms`). 
Returns the count + /// closed. Background reaper does this for you under `pool_reap=auto`. + pub fn reap_idle(&self) -> usize; + + pub fn close(self); +} + +/// Borrowed sender. Returns to the pool on `Drop`. Not `Send`/`Sync` — +/// belongs to the borrowing thread. +pub struct BorrowedSender<'a> { /* borrow handle into QuestDb */ } + +impl<'a> std::ops::Deref for BorrowedSender<'a> { type Target = ColumnSender; … } +impl<'a> std::ops::DerefMut for BorrowedSender<'a> { … } +impl<'a> Drop for BorrowedSender<'a> { … } // returns to pool + +/// Thin handle over a borrowed connection. +pub struct ColumnSender { /* &mut Connection (lifetime-bound) */ } + +impl ColumnSender { + /// Create a chunk for a given table. Doesn't touch the connection + /// — chunks are pure data until flushed. + pub fn new_chunk(&self, table: TableName) -> Chunk; + + /// Synchronously flush a chunk: encode → publish → block until the + /// server ACK at the requested level arrives. On success the chunk + /// is cleared (allocations retained) ready for the next DataFrame. + /// On failure the chunk is left untouched. + /// + /// `ack_level`: + /// - `AckLevel::Ok` — wait for WAL-commit ACK (spec status `0x00`). + /// Always available. + /// - `AckLevel::Durable` — wait for object-store durability ACK + /// (spec status `0x02`). Enterprise feature; requires the pool + /// to be opened with `request_durable_ack=on` in the connect + /// string. If the connection did not opt in, returns + /// `InvalidApiCall`. + /// + /// At most one frame in flight per sender; for parallel ingest, + /// borrow multiple senders from the `QuestDb` pool. + pub fn flush(&mut self, chunk: &mut Chunk, ack_level: AckLevel) -> Result<()>; + + pub fn must_close(&self) -> bool; +} + +#[derive(Clone, Copy, Debug, Default)] +pub enum AckLevel { + /// Server's WAL commit (spec status `0x00`). Always available. + #[default] + Ok, + /// Server's object-store durability (spec status `0x02`). 
+ /// Enterprise + requires durable-ack opt-in at connect. + Durable, +} + +pub struct Chunk { /* table name + Vec + row_count */ } + +impl Chunk { + /// First call locks `row_count`. All subsequent column appends + /// MUST have the same length (counted in logical rows, not bytes). + + // Numeric columns — zero-copy from contiguous typed slice. + pub fn column_i8 (&mut self, name: ColumnName, data: &[i8 ], v: Option<&Validity>) -> Result<()>; + pub fn column_i16(&mut self, name: ColumnName, data: &[i16], v: Option<&Validity>) -> Result<()>; + pub fn column_i32(&mut self, name: ColumnName, data: &[i32], v: Option<&Validity>) -> Result<()>; + pub fn column_i64(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_f32(&mut self, name: ColumnName, data: &[f32], v: Option<&Validity>) -> Result<()>; + pub fn column_f64(&mut self, name: ColumnName, data: &[f64], v: Option<&Validity>) -> Result<()>; + pub fn column_bool(&mut self, name: ColumnName, data: &[u8] /* arrow bitmap */, v: Option<&Validity>) -> Result<()>; + + // Fixed-width binary columns. + pub fn column_uuid (&mut self, name: ColumnName, data: &[[u8;16]], v: Option<&Validity>) -> Result<()>; + pub fn column_long256(&mut self, name: ColumnName, data: &[[u8;32]], v: Option<&Validity>) -> Result<()>; + pub fn column_ipv4 (&mut self, name: ColumnName, data: &[u32], v: Option<&Validity>) -> Result<()>; + + // Time columns. + pub fn column_ts_nanos (&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_ts_micros(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + pub fn column_date_millis(&mut self, name: ColumnName, data: &[i64], v: Option<&Validity>) -> Result<()>; + + // Variable-width text — QWP has exactly one text type, VARCHAR + // (wire 0x0F, uint32 offsets). The older STRING (0x08) was + // removed from the spec. 
+ // Input is Arrow Utf8 shape: i32 offsets + bytes; library + // compresses to dense uint32-offset layout on the wire. + pub fn column_varchar(&mut self, name: ColumnName, offsets: &[i32], data: &[u8], v: Option<&Validity>) -> Result<()>; + + // Symbol fast path: dictionary-encoded. + // `codes` are per-row indices into `dict_offsets`/`dict_data` (Arrow Utf8). + // The implementation interns the dict against SymbolGlobalDict once + // and remaps codes in bulk — no per-row HashMap probe. + pub fn symbol_dict_i8 (&mut self, name: ColumnName, codes: &[i8 ], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + pub fn symbol_dict_i16(&mut self, name: ColumnName, codes: &[i16], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + pub fn symbol_dict_i32(&mut self, name: ColumnName, codes: &[i32], dict_offsets: &[i32], dict_data: &[u8], v: Option<&Validity>) -> Result<()>; + + // Designated timestamp (required, exactly once per chunk; pick one). + // Emitted on the wire as an empty-name column of type + // TIMESTAMP (0x0A) for micros, TIMESTAMP_NANOS (0x10) for nanos. + pub fn designated_timestamp_micros(&mut self, data: &[i64]) -> Result<()>; + pub fn designated_timestamp_nanos (&mut self, data: &[i64]) -> Result<()>; + + // Lifecycle. + pub fn row_count(&self) -> usize; + pub fn clear(&mut self); // retains capacity for reuse +} + +/// Validity bitmap. Public API accepts **Arrow semantics** +/// (bit = 1 means valid, LSB-first within each byte) to enable +/// zero-copy from PyArrow / Polars / Pandas buffers. Length in bits +/// must equal the chunk's row_count. +/// +/// The QWP wire uses the inverted semantics (bit = 1 means NULL) and +/// dense data (only non-null values). The library inverts the bitmap +/// and gathers when encoding; callers never construct QWP-shaped +/// input. 
+pub struct Validity<'a> { bits: &'a [u8] } +impl<'a> Validity<'a> { + pub fn from_bitmap(bits: &'a [u8], bit_len: usize) -> Result<Self>; +} +``` + +### What `column_*` does internally + +1. Validate name (or skip when `ColumnName` already validated). +2. Look up or create the column slot in the chunk's `Vec`. + **Once per column per chunk, not per row.** +3. Append data to the column's storage: + - For numeric/fixed-width columns where the chunk's internal storage + is `Vec<T>` of the same `T`, this is a single `Vec::extend_from_slice`. + - For columns with null-bitmap representation, also OR the validity + bitmap into the column's null bitmap (bulk, byte-aligned where + possible). +4. Bump the per-column row counter; assert it matches `chunk.row_count`. + +### Symbol bulk-intern + +The expensive part of symbol handling today is per-row +`SymbolGlobalDict::intern` (qwp.rs:5041). The fast path: + +1. Walk `dict_offsets`/`dict_data` once: build a small + `Vec<u64>` of length `dict_len` mapping each dict entry's local + index → global id (one `intern()` per *unique* symbol value, not per + row). +2. Walk `codes` once, writing the mapped global ids into the column's + storage — a tight loop, branch-predictable, ~1ns/row. + +For a 10M-row symbol column with cardinality 1000, this drops from 10M +HashMap probes to 1000. + +--- + +## 5. Workstreams + +Designed so multiple engineers can work in parallel after WS-0 + WS-1 +land. + +### WS-0 — QuestDb pool, sender borrow, idle reaper (blocking dependency) + +- Create `questdb-rs/src/ingress/column_sender/db.rs` with the pool + type, eagerly opening `pool_size` connections at `connect()`. +- Connect-string parsing: lift the existing `qwpws::` parser; add + `pool_size` (default 1), `pool_max` (default 64), + `pool_idle_timeout_ms` (default 60000), `pool_reap` + (`auto`|`manual`, default `auto`). Reject configs with + `pool_size > pool_max`. 
+- `borrow_sender()` semantics: pull from free list if any; else if + pool size < `pool_max`, open a new connection; else return + `InvalidApiCall` (fail-fast). +- `BorrowedSender<'_>` returns the connection to the pool on `Drop` + with a `last_idle_at = Instant::now()` stamp. If + `must_close()` is true on return, drop the connection. +- **Idle reaper.** Under `pool_reap=auto`, the pool spawns one + background `std::thread` on `connect`. The thread wakes on a ticker + (~5s or `pool_idle_timeout_ms / 12`, whichever is larger), scans the + free list, closes connections idle longer than + `pool_idle_timeout_ms`, **never shrinking below `pool_size`**. The + thread is joined on `close()`. Manual mode skips the thread entirely; + `db.reap_idle()` runs the same scan on demand and is exposed on + the FFI. +- Thread-safety: the pool's internal state (free list, total count, + per-connection idle stamp) is guarded by a `Mutex`. Borrow/return/ + reap/close are all safe concurrent. +- Owner: 1 engineer. +- Done when: + - multi-thread test borrows and returns N senders concurrently + without deadlock or leak, + - pool fails-fast at `pool_max`, + - idle reaper (auto and manual) closes excess connections after the + timeout while keeping `pool_size` warm, + - `close()` joins the reaper cleanly. + +### WS-1 — `ColumnSender` thin handle & synchronous flush plumbing + +- Define `ColumnSender` as a `&mut Connection` lifetime-bound borrow + handle. Implement `flush(chunk)` that calls the new encoder + (WS-2/3/4), hands the encoded frame to the existing publisher + (`questdb-rs/src/ingress/sender/qwp_ws_publisher.rs`), and blocks + until the server ACK arrives. +- Internally the publisher still tracks the wire `sequence` (FSN); + `flush` waits on that FSN. FSN is not exposed at the public API. +- Hook up `must_close`. +- Refuse `sf_dir` (and other `sf_*` keys) at `QuestDb::connect`-time + with `ConfigError`. Update WS-0's connect-string parser + accordingly. 
+- Stub `flush()` on an empty chunk: produces a header-only QWP frame + end-to-end (no columns; pure framing), server accepts and ACKs. +- Owner: 1 engineer. +- Depends on: WS-0. +- Done when: empty-chunk `flush` round-trips against a real server and + returns on ACK; `sf_dir` in the connect string is rejected with a + clear error. + +### WS-2 — `Chunk`, `BulkChunk` encoder, numeric/fixed-width columns + +- Define `Chunk` (caller-owned, table-bound) and the internal + `BulkChunk` wire-shape storage: per-column `Vec` already in QWP + wire layout (dense values + optional null bitmap with QWP + semantics) so encode is a header + `extend_from_slice` per column. +- Implement the **two code paths per type** (see §2.2): no-null + fast-memcpy; nullable invert+gather. Both produce identical + on-wire shape modulo the null_flag byte. +- Implement `column_i8`/`i16`/`i32`/`i64`/`f32`/`f64`/`bool`/`uuid`/ + `long256`/`ipv4`/`ts_nanos`/`ts_micros`/`date_millis` + + `designated_timestamp_micros` + `designated_timestamp_nanos`. +- Implement `Validity` (Arrow-shape in: 1=valid, LSB-first). Library + masks trailing bits beyond row_count. +- Implement the table-header + schema-section emit. Schema interning + goes through the existing connection-shared `SchemaRegistry`. +- Owner: 1 engineer. +- Depends on: WS-1. +- Done when: round-trip test for each type passes against a real + server and a benchmark shows the per-row cost is dominated by + memcpy bandwidth, not API overhead. + +### WS-3 — VARCHAR column + +- Implement `column_varchar`. Input is Arrow Utf8 shape (i32 offsets + + bytes). Wire output is dense (only non-null) with uint32 offsets per + QWP spec §VARCHAR. +- Two code paths per §2.2: + - No-null: copy all `row_count + 1` offsets unchanged (caller's i32 + fits trivially in wire u32) + copy the full byte buffer. + - Nullable: walk validity bitmap; for each non-null row, compute + `slice_len = offsets[i+1] − offsets[i]`, append dense offsets and + bytes for that slice. 
**Skip slicing for null rows** — do not + trust caller's offset values for null rows. +- UTF-8 is trusted; server rejects invalid UTF-8 with PARSE_ERROR. +- Owner: 1 engineer. +- Depends on: WS-1, +reads WS-2's `Chunk` shape. +- Done when: round-trip + null handling test passes; benchmark within + ~2× of f64 column throughput for short strings (varchar is + fundamentally variable-width so equal-throughput is unrealistic). + +### WS-4 — Symbol bulk-intern fast path + +- Implement `symbol_dict_{i8,i16,i32}`. +- Share the connection-scoped `SymbolGlobalDict` (qwp.rs:5041). New + code interns through it; emits the new symbols in the delta-dict + prefix of the QWP frame. +- **Intern only referenced dict entries.** Pandas/polars `Categorical` + carries every category ever observed (often 100k+) but a typical + chunk references a small subset. The implementation: + 1. One pass over `codes` to mark referenced dict indices in a + bitset (sized `dict_len`). + 2. One pass over the bitset: intern each referenced dict entry, + build a `Vec<u64>` of length `dict_len` mapping local → global + (unreferenced slots get `u64::MAX` sentinel). + 3. One pass over `codes` writing global IDs into the wire buffer. + This protects the 1M-per-connection wire limit and avoids + polluting `SymbolGlobalDict` with never-sent values. +- Validate codes are in `0..dict_len` for non-null rows; out-of-range + is `InvalidApiCall`. Codes for null rows are not inspected. +- Owner: 1 engineer. +- Depends on: WS-1; can develop in parallel with WS-2/3. +- Done when: 10M-row × 1000-card benchmark shows symbol throughput + within 2× of f64 throughput (today, symbol throughput is much worse). + +### WS-5 — C FFI surface + +- Implement the ABI defined in `COLUMN_SENDER_FFI_ABI.md`. Two FFI + namespaces: + - `questdb_db_*` — pool/borrow (`connect`, `close`, `borrow_sender`, + `return_sender`). Lands once WS-0 lands. + - `column_sender_chunk_*` + `column_sender_flush` / + `column_sender_sync` — chunk fill, flush, and ACK wait. 
Each column function + ships the moment its Rust counterpart lands. +- Code lives in `questdb-rs-ffi/src/column_sender.rs`, re-exported from + `lib.rs`. +- Header lives at `include/questdb/ingress/column_sender.h`. Defer the + `.hpp` until someone needs a C++ wrapper — the Python wrapper does + not. +- `cbindgen.toml` updates if the column sender is exposed by cbindgen. +- Owner: 1 engineer. +- Depends on: WS-0/2/3/4 land in parallel. +- Done when: a C test program (in `cpp_test/` or `system_test/`) opens + a pool, borrows a sender, submits a chunk, returns the sender, and + the server stores the rows. + +### WS-6 — Benchmarks & soak tests + +- Microbench (Criterion in `questdb-rs/benches/`): + - per-column bulk append, vs the row-API equivalent, vs raw memcpy + baseline, for each type; + - symbol intern (dict path) vs per-row symbol intern (row API); + - end-to-end "10M rows × N columns" chunk submit (in-memory, no + network), to measure pure encoder + populate cost. +- End-to-end throughput test against a local QuestDB: Pandas DataFrame + → submit → ack, varying row counts, column counts, dtypes. Report + GB/s in and rows/s. +- Soak: 1-hour run sending random chunks; assert no leaks, no + reconnects, latched-error handling works. +- Owner: 1 engineer. +- Depends on: WS-2 minimum. +- Done when: benchmark numbers documented in `doc/DEV_NOTES.md` or a + new `doc/COLUMN_SENDER_PERF.md`. + +### WS-7 — Python repo coordination (out-of-tree, tracked here) + +- The Python repo wraps `column_sender.h`. The Python repo's agent + works from `COLUMN_SENDER_FFI_ABI.md` alone. +- Python repo TODOs (tracked there, listed here for visibility): + - Build a thin ctypes/cffi/pyo3 wrapper around the C ABI. + - For Pandas: extract numpy buffers per column via `Series.to_numpy()` + (zero-copy for native dtypes), build validity bitmaps from + `Series.isna()` (LSB-first packing — provide a vectorised helper). 
+ - For Polars: extract Arrow buffers via `Series.to_arrow()`; the + Arrow buffer pointers and validity bitmaps go straight to FFI. + - For Pandas `Categorical` / Polars `Categorical`: use + `symbol_dict_*`. + - Document the slow paths (object-dtype strings, mixed dtypes, + extension types) and the fallbacks (materialise to a contiguous + typed array). + +--- + +## 6. Type mapping reference + +| QWP wire type | Rust API | Pandas dtype | Polars / Arrow dtype | FFI shape | +|---------------|--------------------|------------------------------|----------------------------|--------------------------| +| BOOL | `column_bool` | `bool` (numpy) | `Boolean` (Arrow bitmap) | `uint8_t*` (bitmap) | +| BYTE | `column_i8` | `int8` | `Int8` | `int8_t*` | +| SHORT | `column_i16` | `int16` | `Int16` | `int16_t*` | +| INT | `column_i32` | `int32` | `Int32` | `int32_t*` | +| LONG | `column_i64` | `int64` | `Int64` | `int64_t*` | +| FLOAT | `column_f32` | `float32` | `Float32` | `float*` | +| DOUBLE | `column_f64` | `float64` | `Float64` | `double*` | +| VARCHAR | `column_varchar` | `string` / object (fallback) | `Utf8` (Polars `LargeUtf8` → wrapper splits) | `int32_t*` + `uint8_t*` | +| SYMBOL | `symbol_dict_iN` | `Categorical` | `Categorical` / Dict | codes + dict offsets+bytes | +| TIMESTAMP | `column_ts_nanos`/`_micros` | `datetime64[ns]`/`[us]` | `Datetime(ns/us)` | `int64_t*` | +| DATE | `column_date_millis` | `datetime64[ms]` | `Date` (after cast) | `int64_t*` | +| UUID | `column_uuid` | bytes (no native) | Arrow `FixedSizeBinary(16)`| `uint8_t*` (16N) | +| IPV4 | `column_ipv4` | uint32 (no native) | `UInt32` | `uint32_t*` | +| LONG256 | `column_long256` | bytes (no native) | Arrow `FixedSizeBinary(32)`| `uint8_t*` (32N) | + +**Out of v1 scope:** `DECIMAL64/128/256`, `LONG_ARRAY`, `DOUBLE_ARRAY`, +`GEOHASH`, `CHAR`, `BINARY`. Add in a follow-up milestone driven by +actual user demand from the Python wrapper. + +--- + +## 7. 
Threading & error model (inherited) + +- One `ColumnSender` is bound to one connection. Not `Sync`. Use + multiple senders for parallel ingestion. +- `Chunk` is owned by one thread. After `submit`, the chunk can be + cleared and reused. +- Error model is identical to the existing QWP/WS sender (see + `questdb-rs/src/ingress/mod.md` §"QWP/WebSocket"): drop-and-continue + vs halt; `must_close()`; FSN ack semantics. +- The Java client (`../java-questdb-client`, see memory + [[reference-java-questdb-client]]) is the posture reference for + parser-vs-writer trust split. The column-major API is the *writer* + side — it trusts its caller and panics nowhere + (memory [[feedback-client-no-panic]]). + +--- + +## 8. Decisions log + +All architectural decisions are locked. Anyone implementing should +flag a deviation rather than re-litigate silently. + +### Settled by the QWP/WS v1 spec (non-negotiable) + +- Wire framing, column type codes, schema model, sequence numbering, + symbol delta-dictionary, durable-ack opt-in, version negotiation, + protocol limits. +- Null encoding on the wire: bit = 1 means NULL, LSB-first; data after + the bitmap is dense. Internal encoder matches; FFI exposes the + inverted (Arrow-style) semantics for zero-copy from Pandas/Polars + and does the invert+gather internally. +- Wire is contiguous-per-column; strided input is the wrapper's + problem. +- UTF-8 validation: server enforces; we trust by default. +- Text type: VARCHAR only (`0x0F`, uint32 offsets). STRING is gone. +- Designated timestamp: empty-name column of type TIMESTAMP (`0x0A`, + µs) or TIMESTAMP_NANOS (`0x10`, ns). +- DATE on ingress is plain int64. +- FSN = wire `sequence` / `wireSeq`. + +### Settled by user direction + +- **API shape:** new top-level types, separate from `Buffer`/`Sender`. + Naming: `QuestDb`, `ColumnSender`, `Chunk`, `Validity`. +- **Mental model:** `DataFrame → Table`. One chunk = one table = one + DataFrame = one QWP frame = one FSN. 
+- **Send is synchronous.** `sender.flush(&mut chunk, ack_level)` + blocks until the server ACK at the requested level arrives. Two + levels: `Ok` (WAL commit, always available) and `Durable` + (object-store durability — Enterprise; requires durable-ack opt-in + at connect). At most one frame in flight per sender. Parallelism is + expressed by borrowing multiple senders from the pool, one per + thread. The wire's 128-in-flight cap is never reached. The QWP + `sequence` / FSN is tracked internally and not exposed at the API + or FFI surface. +- **Store-and-forward (`sf_dir`) is refused in v1.** Passing `sf_dir` + or any other `sf_*` key to `QuestDb::connect` returns `ConfigError`. + SF is single-writer per slot and interacts awkwardly with pool + auto-grow. Users who need on-disk durability across crashes can use + the existing row-major `Sender` API. Revisit if a real user needs + both throughput and SF. +- **Connection layer:** pool (`QuestDb::connect`), borrow/return + (`db.borrow_sender()` → drop returns to pool). Defaults: + `pool_size=1`, `pool_max=64`, `pool_idle_timeout_ms=60000`. Eager + open at connect, auto-grow on exhaustion, fail-fast at cap. +- **Idle shrinking:** Rust-side background reaper per pool + (`pool_reap=auto`, default) closes excess-over-`pool_size` + connections after `pool_idle_timeout_ms` idle. Manual mode + (`pool_reap=manual`) disables the thread; `db.reap_idle()` / + `questdb_db_reap_idle()` exposed for caller-driven reaping. Reaper + lives in Rust so every binding (C/C++/Python) inherits the + behaviour without re-implementing. +- **Encoder:** fresh `BulkChunk` encoder, no reuse of + `QwpWsColumnarBuffer` or row-API encoder. Shares only connection- + scoped state (`SymbolGlobalDict`, `SchemaRegistry`, publisher). + Code reuse is a non-goal; perf is the goal. +- **Two code paths per type:** no-null = `memcpy`; nullable = invert + + gather in one pass. +- **Symbol intern:** scan codes first, intern only referenced dict + entries. 
+- **Validity trailing bits:** library masks; caller need not zero. +- **VARCHAR null offsets:** library skips slicing; caller's value for + null rows is ignored. +- **FFI:** raw pointers per column. No Arrow C Data Interface, no + strides, no generic column-source traits. +- **Python:** lives in a separate repo; this repo provides the C ABI. + +### Out of v1 scope (deferred) + +- Multi-table-per-frame batching at the API. Wire supports it; v1 API + is one chunk = one table. Revisit if the Python wrapper has a + multi-table use case. +- DECIMAL64/128/256. Wire is defined (1-byte column-wide scale + + dense unscaled ints). Defer until Polars-decimal demand surfaces. +- `LONG_ARRAY` / `DOUBLE_ARRAY` per-row, `GEOHASH`, `CHAR`, `BINARY`. +- C++ header wrapper (`column_sender.hpp`). Python wrapper does not + need it. +- (Removed in this revision: durable-ack as deferred. See settled + decisions for ack-level handling.) + diff --git a/include/questdb/ingress/column_sender.h b/include/questdb/ingress/column_sender.h new file mode 100644 index 00000000..2b411407 --- /dev/null +++ b/include/questdb/ingress/column_sender.h @@ -0,0 +1,478 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +/* + * Column-major sender for QuestDB QWP/WebSocket. + * + * Mirrors doc/COLUMN_SENDER_FFI_ABI.md. Reuses `line_sender_error*` from + * `line_sender.h` for fallible-call error reporting; all opaque handles + * are heap-allocated and freed through their dedicated entry points. + * + * Conventions: + * - Opaque handles must be non-NULL unless the function documentation + * states otherwise. + * - `err_out` is optional on every fallible call: pass NULL to discard + * error information. + * - `column_sender_chunk` is owned by the caller and not bound to a + * particular sender; chunks can be built on any thread and flushed + * through any sender borrowed from the same `questdb_db`. + */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +#include "line_sender.h" + +/* ------------------------------------------------------------------------- + * Opaque handles + * ------------------------------------------------------------------------- */ + +/** Connection pool. Thread-safe; share across threads. */ +typedef struct questdb_db questdb_db; + +/** Borrowed sender. Not thread-safe; belongs to the borrowing thread + * until returned via `questdb_db_return_sender`. */ +typedef struct column_sender column_sender; + +/** One DataFrame's worth of column buffers destined for one QuestDB table. + * Owned by the caller. */ +typedef struct column_sender_chunk column_sender_chunk; + +/* ------------------------------------------------------------------------- + * Validity bitmap + * + * Arrow shape: bit = 1 means VALID, bit = 0 means NULL. LSB-first within + * each byte. `bit_len` must equal the chunk's row count; `bits` must + * point to at least `ceil(bit_len / 8)` bytes. 
Pass `bits=NULL, + * bit_len=0` to signal "no nulls" (or pass a `NULL` pointer to the + * column function's `validity` parameter). + * ------------------------------------------------------------------------- */ + +typedef struct column_sender_validity +{ + const uint8_t* bits; + size_t bit_len; +} column_sender_validity; + +/* ------------------------------------------------------------------------- + * Acknowledgement level for `column_sender_sync`. + * ------------------------------------------------------------------------- */ + +typedef enum column_sender_ack_level +{ + /** Wait for the server's WAL-commit ACK (spec status 0x00). Always + * available. */ + column_sender_ack_level_ok = 0, + + /** Wait for the server's object-store durability ACK (spec status + * 0x02). Enterprise only; requires the pool to be opened with + * `request_durable_ack=on` in the connect string. Sync returns + * `line_sender_error_invalid_api_call` otherwise. */ + column_sender_ack_level_durable = 1 +} column_sender_ack_level; + +/* ------------------------------------------------------------------------- + * Pool and sender borrow + * ------------------------------------------------------------------------- */ + +/** + * Open a connection pool. Eagerly opens `pool_size` connections (default + * 1); any auth / TLS / connect error during those opens fails the call. + * + * `conf` is a `qwpws::` / `qwpwss::` connect string. Pool-specific keys: + * `pool_size` (default 1) warm/min connections; + * `pool_max` (default 64) hard cap on auto-grow; + * `pool_idle_timeout_ms` (default 60000) + * reap above-pool_size idle conns; + * `pool_reap` (`auto`|`manual`, default `auto`) + * background reaper opt-in. + * + * Store-and-forward keys (`sf_*`, `sender_id`) are refused — use the + * row-major `line_sender_*` API for on-disk durability. 
+ */ +QUESTDB_CLIENT_API +questdb_db* questdb_db_connect( + const char* conf, + size_t conf_len, + line_sender_error** err_out); + +/** + * Close the pool and all its connections. Accepts NULL and no-ops. + * Outstanding `column_sender` handles remain valid and return their + * connections on `questdb_db_return_sender` — the pool's state is + * reference-counted internally. + */ +QUESTDB_CLIENT_API +void questdb_db_close(questdb_db* db); + +/** + * Borrow a sender. Selection rules: + * 1. If a previously-returned sender is in the free list, hand it out. + * 2. Otherwise, if pool size < `pool_max`, open a new connection. + * 3. Otherwise (at cap), return NULL + `line_sender_error_invalid_api_call`. + * + * The returned sender is bound to the calling thread until returned. + */ +QUESTDB_CLIENT_API +column_sender* questdb_db_borrow_sender( + questdb_db* db, + line_sender_error** err_out); + +/** + * Return a sender to the pool. Accepts NULL `sender` and no-ops. + * Invalidates the `sender` pointer; do not use it after this call. + * + * `db` is currently ignored — the sender carries its own reference to + * the pool — but accepted for symmetry with the borrow call. + */ +QUESTDB_CLIENT_API +void questdb_db_return_sender( + questdb_db* db, + column_sender* sender); + +/** + * Manually reap idle connections (closes free-list entries idle longer + * than `pool_idle_timeout_ms`, never shrinking below `pool_size`). + * Returns the number of connections closed. + */ +QUESTDB_CLIENT_API +size_t questdb_db_reap_idle(questdb_db* db); + +/* ------------------------------------------------------------------------- + * Sender state inspection + * ------------------------------------------------------------------------- */ + +/** + * `true` if the sender's underlying connection is in a permanently- + * unusable state. On return to the pool such senders are dropped, not + * recycled. 
+ */
+QUESTDB_CLIENT_API
+bool column_sender_must_close(const column_sender* sender);
+
+/* -------------------------------------------------------------------------
+ * Chunk lifecycle
+ * ------------------------------------------------------------------------- */
+
+/**
+ * Create an empty chunk for the given table. Returns NULL on failure
+ * (for example when `table_name` is not valid UTF-8).
+ *
+ * The chunk is caller-owned and must eventually be released with
+ * `column_sender_chunk_free`. A successful `column_sender_flush` only
+ * clears the chunk for reuse; it does not free it.
+ */
+QUESTDB_CLIENT_API
+column_sender_chunk* column_sender_chunk_new(
+    const char* table_name,
+    size_t table_name_len,
+    line_sender_error** err_out);
+
+/** Discard the chunk and release its allocations. Accepts NULL. */
+QUESTDB_CLIENT_API
+void column_sender_chunk_free(column_sender_chunk* chunk);
+
+/** Clear the chunk's content, keeping retained capacity for reuse. */
+QUESTDB_CLIENT_API
+void column_sender_chunk_clear(column_sender_chunk* chunk);
+
+/** Current row count of the chunk; 0 if no column has been appended. */
+QUESTDB_CLIENT_API
+size_t column_sender_chunk_row_count(const column_sender_chunk* chunk);
+
+/* -------------------------------------------------------------------------
+ * Numeric / fixed-width column appends
+ *
+ * Every column-append function locks the chunk's row count on the first
+ * call. Subsequent columns must agree on row count. `data` is a
+ * contiguous, full-length typed array with one slot per row (including
+ * null rows — their slot value is ignored). `validity` is optional;
+ * pass NULL when the column has no nulls.
+ * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_i64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const float* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_column_f64( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const double* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap + * (1 = true). `data` must point to at least `ceil(row_count / 8)` bytes. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_bool( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `UUID` column. 
`data` points to `row_count * 16` bytes; each 16-byte + * group is one UUID (bytes 0..8 lo half LE, 8..16 hi half LE). + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_uuid( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `LONG256` column. `data` points to `row_count * 32` bytes — four + * little-endian 64-bit limbs per row, least-significant limb first. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_long256( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint8_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** + * `IPV4` column. Each `data[i]` is `u32::from(Ipv4Addr)` (octet 0 in + * the high byte), encoded little-endian on the wire. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ipv4( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const uint32_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Timestamp columns (non-designated) + * ------------------------------------------------------------------------- */ + +/** `TIMESTAMP_NANOS` column, nanoseconds since the Unix epoch. */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_nanos( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** `TIMESTAMP` column, microseconds since the Unix epoch. 
*/ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_ts_micros( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/** `DATE` column, milliseconds since the Unix epoch. */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_date_millis( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int64_t* data, size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Variable-width text (VARCHAR) + * ------------------------------------------------------------------------- */ + +/** + * `VARCHAR` column (QWP wire type 0x0F). + * + * Input layout matches Arrow Utf8: + * - `offsets` has `row_count + 1` entries, monotonically non-decreasing. + * - `bytes` is a single contiguous UTF-8 buffer; offsets are absolute + * byte offsets into it (the column encoder rebases to 0 on the wire + * when the first offset is non-zero). + * - `validity` is Arrow-shape; NULL-row offset slices are not + * inspected. + * + * Wire output: dense (only non-null values), `non_null_count + 1` + * little-endian uint32 offsets followed by the concatenated bytes. + * + * UTF-8 validity is the caller's responsibility; invalid UTF-8 is + * detected by the server and surfaced as + * `line_sender_error_server_rejection`. + */ +QUESTDB_CLIENT_API +bool column_sender_chunk_column_varchar( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* offsets, + const uint8_t* bytes, + size_t bytes_len, + size_t row_count, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Symbol columns (dictionary fast path) + * + * `codes` is per-row dictionary indices. 
`dict_offsets` (length + * `dict_offsets_len`) and `dict_bytes` (length `dict_bytes_len`) + * describe the dictionary in Arrow Utf8 layout. The library interns + * only referenced dict entries against the connection-scoped global + * symbol table — `dict_offsets_len - 1` may be huge (Pandas + * `Categorical`) without paying the cost for unused entries. + * + * `codes[i]` must be in `0 .. dict_len` for non-null rows; null-row + * codes are not inspected. + * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i8( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int8_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i16( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int16_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_chunk_symbol_dict_i32( + column_sender_chunk* chunk, + const char* name, size_t name_len, + const int32_t* codes, size_t row_count, + const int32_t* dict_offsets, size_t dict_offsets_len, + const uint8_t* dict_bytes, size_t dict_bytes_len, + const column_sender_validity* validity, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Designated timestamp + * + * Required exactly once per chunk before flush. Always non-null per the + * QWP wire spec — no `validity` parameter. + * ------------------------------------------------------------------------- */ + +/** Designated timestamp in microseconds (wire type TIMESTAMP, 0x0A). 
*/ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_micros( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/** Designated timestamp in nanoseconds (wire type TIMESTAMP_NANOS, 0x10). */ +QUESTDB_CLIENT_API +bool column_sender_chunk_designated_timestamp_nanos( + column_sender_chunk* chunk, + const int64_t* data, + size_t row_count, + line_sender_error** err_out); + +/* ------------------------------------------------------------------------- + * Flush / sync + * + * `column_sender_flush` encodes `chunk` into a QWP/WebSocket frame, + * publishes it, and returns without waiting for a server ACK. On success, + * `chunk` is cleared (allocations retained) and `true` is returned. On + * failure, `chunk` is left untouched. + * + * The first flush is sent as an immediate commit. Later flushes are sent + * with QWP's deferred-commit flag so callers can pipeline many chunks. + * Call `column_sender_sync` after the final flush to send the commit frame + * and wait until all in-flight frames are acknowledged at `ack_level`. + * + * The sender keeps one protocol in-flight slot reserved for the sync commit + * frame. If that reserve would be exhausted, flush returns + * `line_sender_error_invalid_api_call`; call `column_sender_sync` before + * flushing more chunks. 
+ * ------------------------------------------------------------------------- */ + +QUESTDB_CLIENT_API +bool column_sender_flush( + column_sender* sender, + column_sender_chunk* chunk, + line_sender_error** err_out); + +QUESTDB_CLIENT_API +bool column_sender_sync( + column_sender* sender, + column_sender_ack_level ack_level, + line_sender_error** err_out); + +#ifdef __cplusplus +} /* extern "C" */ +#endif diff --git a/questdb-rs-ffi/src/column_sender.rs b/questdb-rs-ffi/src/column_sender.rs new file mode 100644 index 00000000..414a4bab --- /dev/null +++ b/questdb-rs-ffi/src/column_sender.rs @@ -0,0 +1,1024 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! C ABI for the column-major sender. +//! +//! Mirrors `doc/COLUMN_SENDER_FFI_ABI.md`. The ABI re-uses +//! `line_sender_error*` for fallible-call error reporting; opaque types +//! (`questdb_db`, `column_sender`, `column_sender_chunk`) are heap-allocated +//! and freed through their dedicated `_close` / `_free` / `_return_sender` +//! entry points. 
+
+use libc::{c_char, size_t};
+use std::slice;
+use std::str;
+
+use questdb::ingress::column_sender::{AckLevel, Chunk, OwnedSender, QuestDb, Validity};
+use questdb::{Error, ErrorCode};
+
+use crate::{line_sender_error, set_err_out_from_error};
+
+// ===========================================================================
+// Opaque handles
+// ===========================================================================
+
+/// Connection pool. Thread-safe; share across threads.
+///
+/// Boxed and handed to C by `questdb_db_connect`; the box is dropped by
+/// `questdb_db_close`.
+pub struct questdb_db(QuestDb);
+
+/// Borrowed sender. Owns a pool slot until `questdb_db_return_sender` is
+/// called. Not thread-safe.
+///
+/// Boxed and handed to C by `questdb_db_borrow_sender`; the box is dropped
+/// by `questdb_db_return_sender`.
+pub struct column_sender(OwnedSender);
+
+/// One DataFrame's worth of column buffers destined for one QuestDB table.
+/// Owned by the caller; not bound to a sender.
+///
+/// Holds raw pointers into caller buffers (no copy). Per the FFI ABI
+/// doc §2.3, the caller MUST keep every column buffer passed in via
+/// `column_sender_chunk_column_*` / `column_sender_chunk_symbol_dict_*`
+/// alive until the next `column_sender_flush` call returns. We hide the
+/// chunk's lifetime by promoting its inner type to `'static`; the lifetime
+/// is enforced by the caller, not the borrow checker.
+///
+/// Boxed and handed to C by `column_sender_chunk_new`; the box is dropped
+/// by `column_sender_chunk_free`.
+pub struct column_sender_chunk(Chunk<'static>);
+
+// ===========================================================================
+// Validity bitmap (Arrow shape: bit = 1 means valid, LSB-first).
+// =========================================================================== + +#[repr(C)] +#[derive(Copy, Clone)] +pub struct column_sender_validity { + pub bits: *const u8, + pub bit_len: size_t, +} + +unsafe fn as_validity<'a>( + v: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> Option>> { + if v.is_null() { + return Some(None); + } + let v = unsafe { &*v }; + let required = v.bit_len.div_ceil(8); + if v.bits.is_null() && v.bit_len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_validity has null bits but bit_len != 0".to_string(), + ), + ); + } + return None; + } + let bytes: &[u8] = if v.bit_len == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(v.bits, required) } + }; + match Validity::from_bitmap(bytes, v.bit_len) { + Ok(parsed) => Some(Some(parsed)), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + None + } + } +} + +// =========================================================================== +// Ack level +// =========================================================================== + +#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum column_sender_ack_level { + column_sender_ack_level_ok = 0, + column_sender_ack_level_durable = 1, +} + +impl From for AckLevel { + fn from(value: column_sender_ack_level) -> Self { + match value { + column_sender_ack_level::column_sender_ack_level_ok => AckLevel::Ok, + column_sender_ack_level::column_sender_ack_level_durable => AckLevel::Durable, + } + } +} + +// =========================================================================== +// Conversion helpers +// =========================================================================== + +unsafe fn name_str<'a>( + name: *const c_char, + name_len: size_t, + err_out: *mut *mut line_sender_error, +) -> Option<&'a str> { + if name.is_null() && name_len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + 
ErrorCode::InvalidApiCall, + "name pointer is NULL with non-zero length".to_string(), + ), + ); + } + return None; + } + let slice = if name_len == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(name as *const u8, name_len) } + }; + match str::from_utf8(slice) { + Ok(s) => Some(s), + Err(_) => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidUtf8, + "name is not valid UTF-8".to_string(), + ), + ); + } + None + } + } +} + +unsafe fn typed_slice<'a, T>( + data: *const T, + len: size_t, + err_out: *mut *mut line_sender_error, + what: &'static str, +) -> Option<&'a [T]> { + if data.is_null() && len != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + format!("{what} pointer is NULL with non-zero length"), + ), + ); + } + return None; + } + if len == 0 { + return Some(&[]); + } + Some(unsafe { slice::from_raw_parts(data, len) }) +} + +macro_rules! bubble { + ($err_out:expr, $expr:expr) => { + match $expr { + Ok(value) => value, + Err(err) => { + unsafe { set_err_out_from_error($err_out, err) }; + return false; + } + } + }; +} + +// =========================================================================== +// Pool +// =========================================================================== + +/// Open a connection pool. Eagerly opens `pool_size` connections; any +/// server/auth/TLS error during those opens fails the call. `conf` is a +/// NUL-terminated UTF-8 string. +/// +/// Returns NULL on failure. When `err_out != NULL`, the error is placed +/// in `*err_out` and ownership transfers to the caller (release with +/// `line_sender_error_free`). 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_connect( + conf: *const c_char, + conf_len: size_t, + err_out: *mut *mut line_sender_error, +) -> *mut questdb_db { + let conf = match unsafe { name_str(conf, conf_len, err_out) } { + Some(s) => s, + None => return std::ptr::null_mut(), + }; + match QuestDb::connect(conf) { + Ok(db) => Box::into_raw(Box::new(questdb_db(db))), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + std::ptr::null_mut() + } + } +} + +/// Close the pool and all its connections. Accepts NULL and no-ops. +/// +/// Outstanding `column_sender` handles remain valid (they hold an +/// internal reference to the pool's state) and return themselves on +/// `questdb_db_return_sender`. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_close(db: *mut questdb_db) { + if !db.is_null() { + unsafe { drop(Box::from_raw(db)) }; + } +} + +/// Borrow a sender from the pool. See +/// `doc/COLUMN_SENDER_FFI_ABI.md` §4.3 for the selection rules. Returns +/// NULL on failure; sets `*err_out` if provided. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_borrow_sender( + db: *mut questdb_db, + err_out: *mut *mut line_sender_error, +) -> *mut column_sender { + if db.is_null() { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "questdb_db_borrow_sender: db pointer is NULL".to_string(), + ), + ); + } + return std::ptr::null_mut(); + } + let db_ref = unsafe { &*db }; + match db_ref.0.borrow_sender_owned() { + Ok(owned) => Box::into_raw(Box::new(column_sender(owned))), + Err(err) => { + unsafe { set_err_out_from_error(err_out, err) }; + std::ptr::null_mut() + } + } +} + +/// Return a borrowed sender to the pool. Invalidates `sender`. Accepts +/// NULL `sender` and no-ops. `db` is ignored — the sender carries its +/// own reference to the pool — but kept in the ABI for symmetry with the +/// borrow call and to allow future runtime checks. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_return_sender( + _db: *mut questdb_db, + sender: *mut column_sender, +) { + if !sender.is_null() { + unsafe { drop(Box::from_raw(sender)) }; + } +} + +/// Manually reap idle connections. Returns the number of connections +/// closed by this invocation. `db` must be non-NULL. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn questdb_db_reap_idle(db: *mut questdb_db) -> size_t { + if db.is_null() { + return 0; + } + let db_ref = unsafe { &*db }; + db_ref.0.reap_idle() +} + +// =========================================================================== +// Sender state +// =========================================================================== + +/// `true` if the sender's underlying connection is in a permanently- +/// unusable state. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_must_close(sender: *const column_sender) -> bool { + if sender.is_null() { + return true; + } + unsafe { (*sender).0.get().must_close() } +} + +// =========================================================================== +// Chunk lifecycle +// =========================================================================== + +/// Create an empty chunk for `table_name` (validated UTF-8, ≤ 127 bytes). +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_new( + table_name: *const c_char, + table_name_len: size_t, + err_out: *mut *mut line_sender_error, +) -> *mut column_sender_chunk { + let table = match unsafe { name_str(table_name, table_name_len, err_out) } { + Some(s) => s, + None => return std::ptr::null_mut(), + }; + Box::into_raw(Box::new(column_sender_chunk(Chunk::new(table)))) +} + +/// Free a chunk. Accepts NULL and no-ops. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_free(chunk: *mut column_sender_chunk) { + if !chunk.is_null() { + unsafe { drop(Box::from_raw(chunk)) }; + } +} + +/// Clear a chunk's content, keeping its retained capacity for reuse. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_clear(chunk: *mut column_sender_chunk) { + if !chunk.is_null() { + unsafe { (*chunk).0.clear() }; + } +} + +/// Current row count of the chunk; 0 if no column has been appended. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_row_count( + chunk: *const column_sender_chunk, +) -> size_t { + if chunk.is_null() { + return 0; + } + unsafe { (*chunk).0.row_count() } +} + +// =========================================================================== +// Numeric / fixed-width column appends +// =========================================================================== + +macro_rules! column_fn { + ($fn_name:ident, $c_ty:ty, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const $c_ty, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let data = match unsafe { typed_slice(data, row_count, err_out, $what) } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!(err_out, chunk.$rust_method(name, data, validity.as_ref())); + true + } + }; +} + +column_fn!( + column_sender_chunk_column_i8, + i8, + column_i8, + "i8 column data" +); +column_fn!( + column_sender_chunk_column_i16, + i16, + column_i16, + "i16 column data" +); +column_fn!( + column_sender_chunk_column_i32, + i32, + column_i32, + "i32 column data" +); +column_fn!( + column_sender_chunk_column_i64, + i64, + column_i64, + "i64 column data" +); +column_fn!( + 
column_sender_chunk_column_f32, + f32, + column_f32, + "f32 column data" +); +column_fn!( + column_sender_chunk_column_f64, + f64, + column_f64, + "f64 column data" +); +column_fn!( + column_sender_chunk_column_ipv4, + u32, + column_ipv4, + "ipv4 column data" +); +column_fn!( + column_sender_chunk_column_ts_nanos, + i64, + column_ts_nanos, + "ts_nanos column data" +); +column_fn!( + column_sender_chunk_column_ts_micros, + i64, + column_ts_micros, + "ts_micros column data" +); +column_fn!( + column_sender_chunk_column_date_millis, + i64, + column_date_millis, + "date_millis column data" +); + +/// `BOOLEAN` column. `data` is an Arrow-style LSB-first packed bitmap; +/// must be at least `ceil(row_count / 8)` bytes long. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_column_bool( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const u8, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let bytes_required = row_count.div_ceil(8); + let data_slice = match unsafe { typed_slice(data, bytes_required, err_out, "bool column data") } + { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.column_bool(name, data_slice, row_count, validity.as_ref()) + ); + true +} + +macro_rules! 
fixed_width_byte_column_fn { + ($fn_name:ident, $n:literal, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + data: *const u8, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + if data.is_null() && row_count != 0 { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + format!( + "{} column data pointer is NULL with non-zero row_count", + $what + ), + ), + ); + } + return false; + } + // SAFETY: the caller promises `data` points to `row_count * + // N` bytes (FFI-ABI §6) and that the buffer outlives the call. + let data_slice: &[[u8; $n]] = if row_count == 0 { + &[] + } else { + unsafe { slice::from_raw_parts(data as *const [u8; $n], row_count) } + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.$rust_method(name, data_slice, validity.as_ref()) + ); + true + } + }; +} + +// `UUID` column. `data` is `row_count * 16` bytes; the FFI takes a +// `uint8_t*` and slices it into 16-byte rows. +fixed_width_byte_column_fn!(column_sender_chunk_column_uuid, 16, column_uuid, "uuid"); + +// `LONG256` column. `data` is `row_count * 32` bytes. +fixed_width_byte_column_fn!( + column_sender_chunk_column_long256, + 32, + column_long256, + "long256" +); + +// =========================================================================== +// VARCHAR (variable-width text) +// =========================================================================== + +/// `VARCHAR` column. 
Inputs are Arrow Utf8 shape: `offsets` length +/// `row_count + 1`, monotonically non-decreasing; `bytes` is the +/// concatenated UTF-8 buffer. +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_column_varchar( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + offsets: *const i32, + bytes: *const u8, + bytes_len: size_t, + row_count: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let offsets_len = match row_count.checked_add(1) { + Some(n) => n, + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "row_count overflow when computing offsets length".to_string(), + ), + ); + } + return false; + } + }; + let offsets = match unsafe { typed_slice(offsets, offsets_len, err_out, "varchar offsets") } { + Some(s) => s, + None => return false, + }; + let bytes = match unsafe { typed_slice(bytes, bytes_len, err_out, "varchar bytes") } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.column_varchar(name, offsets, bytes, validity.as_ref()) + ); + true +} + +// =========================================================================== +// Symbol dictionary columns +// =========================================================================== + +macro_rules! 
symbol_fn { + ($fn_name:ident, $code_ty:ty, $rust_method:ident, $what:literal) => { + #[unsafe(no_mangle)] + pub unsafe extern "C" fn $fn_name( + chunk: *mut column_sender_chunk, + name: *const c_char, + name_len: size_t, + codes: *const $code_ty, + row_count: size_t, + dict_offsets: *const i32, + dict_offsets_len: size_t, + dict_bytes: *const u8, + dict_bytes_len: size_t, + validity: *const column_sender_validity, + err_out: *mut *mut line_sender_error, + ) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let name = match unsafe { name_str(name, name_len, err_out) } { + Some(s) => s, + None => return false, + }; + let codes = match unsafe { typed_slice(codes, row_count, err_out, $what) } { + Some(s) => s, + None => return false, + }; + let dict_offsets = match unsafe { + typed_slice( + dict_offsets, + dict_offsets_len, + err_out, + "symbol dict offsets", + ) + } { + Some(s) => s, + None => return false, + }; + let dict_bytes = match unsafe { + typed_slice(dict_bytes, dict_bytes_len, err_out, "symbol dict bytes") + } { + Some(s) => s, + None => return false, + }; + let validity = match unsafe { as_validity(validity, err_out) } { + Some(v) => v, + None => return false, + }; + bubble!( + err_out, + chunk.$rust_method(name, codes, dict_offsets, dict_bytes, validity.as_ref()) + ); + true + } + }; +} + +symbol_fn!( + column_sender_chunk_symbol_dict_i8, + i8, + symbol_dict_i8, + "symbol codes (i8)" +); +symbol_fn!( + column_sender_chunk_symbol_dict_i16, + i16, + symbol_dict_i16, + "symbol codes (i16)" +); +symbol_fn!( + column_sender_chunk_symbol_dict_i32, + i32, + symbol_dict_i32, + "symbol codes (i32)" +); + +// =========================================================================== +// Designated timestamp +// =========================================================================== + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_micros( 
+ chunk: *mut column_sender_chunk, + data: *const i64, + row_count: size_t, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let data = match unsafe { typed_slice(data, row_count, err_out, "designated_ts micros") } { + Some(s) => s, + None => return false, + }; + bubble!(err_out, chunk.designated_timestamp_micros(data)); + true +} + +#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_chunk_designated_timestamp_nanos( + chunk: *mut column_sender_chunk, + data: *const i64, + row_count: size_t, + err_out: *mut *mut line_sender_error, +) -> bool { + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + let data = match unsafe { typed_slice(data, row_count, err_out, "designated_ts nanos") } { + Some(s) => s, + None => return false, + }; + bubble!(err_out, chunk.designated_timestamp_nanos(data)); + true +} + +// =========================================================================== +// Flush +// =========================================================================== + +/// Encode `chunk` into a QWP/WebSocket frame, write it to the socket, +/// and return immediately — without waiting for the server's ack. +/// +/// Ready acks are drained non-blocking before the write. Deferred +/// flushes keep one in-flight slot reserved for the later +/// `column_sender_sync` commit frame; if that reserve would be +/// consumed, the call fails and the caller must sync before flushing +/// more chunks. +/// +/// On success, `chunk` is cleared and the call returns `true`. On +/// failure, `chunk` is left untouched and `false` is returned (with +/// `*err_out` set if provided). +/// +/// Call [`column_sender_sync`] after the last flush to drain all +/// remaining in-flight acks. 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_flush( + sender: *mut column_sender, + chunk: *mut column_sender_chunk, + err_out: *mut *mut line_sender_error, +) -> bool { + let sender = match unsafe { sender.as_mut() } { + Some(s) => s.0.get_mut(), + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_flush: sender pointer is NULL".to_string(), + ), + ); + } + return false; + } + }; + let chunk = match unsafe { chunk.as_mut() } { + Some(c) => &mut c.0, + None => return reject_null_chunk(err_out), + }; + bubble!(err_out, sender.flush(chunk)); + true +} + +/// Block until all in-flight frames are acknowledged at the requested +/// `ack_level`. +/// +/// `column_sender_ack_level_ok` waits for every in-flight frame's +/// WAL-commit ack. `column_sender_ack_level_durable` additionally waits +/// for the server's object-store durability watermarks. +/// +/// Returns `true` on success, `false` on error (with `*err_out` set). 
+#[unsafe(no_mangle)] +pub unsafe extern "C" fn column_sender_sync( + sender: *mut column_sender, + ack_level: column_sender_ack_level, + err_out: *mut *mut line_sender_error, +) -> bool { + let sender = match unsafe { sender.as_mut() } { + Some(s) => s.0.get_mut(), + None => { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_sync: sender pointer is NULL".to_string(), + ), + ); + } + return false; + } + }; + bubble!(err_out, sender.sync(ack_level.into())); + true +} + +// =========================================================================== +// Helpers +// =========================================================================== + +fn reject_null_chunk(err_out: *mut *mut line_sender_error) -> bool { + unsafe { + set_err_out_from_error( + err_out, + Error::new( + ErrorCode::InvalidApiCall, + "column_sender_chunk pointer is NULL".to_string(), + ), + ); + } + false +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::line_sender_error_free; + + // Most behaviour is already covered by the questdb-rs lib tests; this + // module's tests focus on the FFI surface — pointer handling, NULL + // guards, lifetime of error objects, etc. + + #[test] + fn connect_rejects_non_qwp_ws_schema() { + let conf = b"http::addr=localhost:9000;"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let db = + unsafe { questdb_db_connect(conf.as_ptr() as *const c_char, conf.len(), &mut err) }; + assert!(db.is_null()); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn chunk_new_validates_table_name() { + let mut err: *mut line_sender_error = std::ptr::null_mut(); + // 128-byte name: exceeds the 127-byte QWP cap, but the public + // `Chunk::new` does not validate eagerly — validation happens at + // flush time. So this constructor succeeds. 
+ let table = "x".repeat(128); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + assert!(!chunk.is_null()); + assert!(err.is_null()); + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn chunk_new_rejects_invalid_utf8() { + let bad: [u8; 3] = [0xFF, 0xFE, 0xFD]; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = + unsafe { column_sender_chunk_new(bad.as_ptr() as *const c_char, bad.len(), &mut err) }; + assert!(chunk.is_null()); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn column_i64_round_trip_on_pure_data_path() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + assert!(!chunk.is_null()); + + let name = b"price"; + let data: [i64; 3] = [1, 2, 3]; + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(ok, "column_i64 should succeed"); + assert_eq!(unsafe { column_sender_chunk_row_count(chunk) }, 3); + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn column_i64_rejects_row_count_mismatch() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + let name_a = b"a"; + let name_b = b"b"; + let data_a: [i64; 3] = [1, 2, 3]; + let data_b: [i64; 2] = [4, 5]; + assert!(unsafe { + column_sender_chunk_column_i64( + chunk, + name_a.as_ptr() as *const c_char, + name_a.len(), + data_a.as_ptr(), + data_a.len(), + std::ptr::null(), + &mut err, + ) + }); + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name_b.as_ptr() as *const c_char, + name_b.len(), + data_b.as_ptr(), + 
data_b.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn validity_null_bits_with_nonzero_len_errors() { + let table = b"trades"; + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let chunk = unsafe { + column_sender_chunk_new(table.as_ptr() as *const c_char, table.len(), &mut err) + }; + let name = b"a"; + let data: [i64; 2] = [1, 2]; + let v = column_sender_validity { + bits: std::ptr::null(), + bit_len: 2, + }; + let ok = unsafe { + column_sender_chunk_column_i64( + chunk, + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + &v, + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + unsafe { column_sender_chunk_free(chunk) }; + } + + #[test] + fn null_chunk_pointer_is_handled() { + let mut err: *mut line_sender_error = std::ptr::null_mut(); + let name = b"a"; + let data: [i64; 1] = [1]; + let ok = unsafe { + column_sender_chunk_column_i64( + std::ptr::null_mut(), + name.as_ptr() as *const c_char, + name.len(), + data.as_ptr(), + data.len(), + std::ptr::null(), + &mut err, + ) + }; + assert!(!ok); + assert!(!err.is_null()); + unsafe { line_sender_error_free(err) }; + } + + #[test] + fn ack_level_enum_maps_correctly() { + assert_eq!( + AckLevel::from(column_sender_ack_level::column_sender_ack_level_ok), + AckLevel::Ok + ); + assert_eq!( + AckLevel::from(column_sender_ack_level::column_sender_ack_level_durable), + AckLevel::Durable + ); + } +} diff --git a/questdb-rs-ffi/src/lib.rs b/questdb-rs-ffi/src/lib.rs index 4cf0f6f0..c107b4a4 100644 --- a/questdb-rs-ffi/src/lib.rs +++ b/questdb-rs-ffi/src/lib.rs @@ -76,6 +76,9 @@ use ndarr::StrideArrayView; #[cfg(feature = "sync-reader-ws")] mod egress; +pub mod column_sender; +pub use column_sender::*; + macro_rules! 
bubble_err_to_c { ($err_out:expr, $expression:expr) => { bubble_err_to_c!($err_out, $expression, false) diff --git a/questdb-rs/Cargo.toml b/questdb-rs/Cargo.toml index 70aac7a2..07915254 100644 --- a/questdb-rs/Cargo.toml +++ b/questdb-rs/Cargo.toml @@ -255,6 +255,12 @@ required-features = ["sync-reader-ws"] name = "qwp_ws_unified_sfa_bench" required-features = ["sync-sender-qwp-ws"] +# Synthetic equities L1 quote feed → QuestDB via the column-major +# sender. End-to-end throughput sanity check against a real server. +[[example]] +name = "qwp_ws_l1_quotes" +required-features = ["sync-sender-qwp-ws"] + # Decoder microbenchmark anchoring the perf claims from commits # `8ec0a85` (zero-copy decode) and `1163d43` (tighter SYMBOL/VARCHAR # decode hot paths). Run with: @@ -269,3 +275,14 @@ required-features = ["sync-sender-qwp-ws"] name = "decoder" harness = false required-features = ["sync-reader-ws"] + +# Column-major sender hot-path bench. Anchors the perf claims from +# `doc/COLUMN_SENDER_PLAN.md` §2 (memcpy-bound no-null path, +# referenced-only symbol intern). 
Run with: +# +# cargo bench --features sync-sender-qwp-ws --bench column_sender +# QUESTDB_COLUMN_BENCH_ROWS=10000000 cargo bench --features sync-sender-qwp-ws --bench column_sender +[[bench]] +name = "column_sender" +harness = false +required-features = ["sync-sender-qwp-ws"] diff --git a/questdb-rs/benches/column_sender.rs b/questdb-rs/benches/column_sender.rs new file mode 100644 index 00000000..f430d05b --- /dev/null +++ b/questdb-rs/benches/column_sender.rs @@ -0,0 +1,446 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-major sender hot-path bench (`questdb-rs/benches/column_sender.rs`). +//! +//! Anchors the perf claims in `doc/COLUMN_SENDER_PLAN.md` §2.1 +//! ("encode is a header + extend_from_slice per column") and §2.2 +//! ("no-null = memcpy; nullable = invert+gather"). Each bench reports +//! throughput in rows/s and bytes/s so a regression shows up as either +//! a row-rate or bandwidth drop. +//! +//! Three families: +//! +//! 1. **Per-column bulk append** — exercises [`Chunk::column_i64`], +//! 
[`Chunk::column_f64`], [`Chunk::column_varchar`], and +//! [`Chunk::symbol_dict_i32`] in both no-null and nullable shapes. +//! Baseline: a raw `extend_from_slice` from the caller's typed +//! buffer into a fresh `Vec`, the absolute floor any +//! column-sender hot path is competing with. +//! +//! 2. **Symbol bulk-intern** — compares the column path +//! ([`Chunk::symbol_dict_i32`] + flush-time interning) with a +//! naive per-row HashMap lookup that mirrors what the row API pays +//! on the same cardinality, to anchor the WS-4 plan claim ("10M +//! rows × 1000-card drops from 10M probes to 1000"). +//! +//! 3. **Encode-only end-to-end** — populate a 10M-row chunk with a +//! representative column mix, then time +//! [`bench_encode_chunk`](_bench_internals::bench_encode_chunk). +//! Pure encoder cost (no network) so a regression in +//! `encode_chunk` or in any per-column append shows up here. +//! +//! Run: +//! +//! ```text +//! cargo bench --features sync-sender-qwp-ws --bench column_sender +//! QUESTDB_COLUMN_BENCH_ROWS=10000000 cargo bench --features sync-sender-qwp-ws --bench column_sender +//! ``` + +use std::collections::HashMap; +use std::time::Duration; + +use criterion::{BatchSize, Criterion, Throughput, black_box, criterion_group, criterion_main}; + +use questdb::ingress::column_sender::_bench_internals::{ + BenchEncoderState, bench_encode_chunk_into, +}; +use questdb::ingress::column_sender::{Chunk, Validity}; + +// --------------------------------------------------------------------------- +// Workload sizes. Defaults are tuned for sub-second criterion samples so the +// bench runs in CI; bump via `QUESTDB_COLUMN_BENCH_ROWS` for headline numbers. 
+// --------------------------------------------------------------------------- + +fn row_count() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_ROWS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(100_000) +} + +fn varchar_len() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_VARCHAR_LEN") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(16) +} + +fn symbol_cardinality() -> usize { + std::env::var("QUESTDB_COLUMN_BENCH_SYM_CARD") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(1_000) +} + +// --------------------------------------------------------------------------- +// Workload generators +// --------------------------------------------------------------------------- + +fn make_i64_data(rows: usize) -> Vec { + (0..rows as i64).collect() +} + +fn make_f64_data(rows: usize) -> Vec { + (0..rows).map(|i| i as f64 * 1.5).collect() +} + +/// Arrow-shape validity: every 16th row is null, all others valid. +fn make_validity_bits(rows: usize) -> Vec { + let bytes = rows.div_ceil(8); + let mut out = vec![0xFFu8; bytes]; + for (row_idx, byte) in (0..rows).zip(0..) { + let _ = byte; // pacify clippy if unused + if row_idx % 16 == 0 { + out[row_idx / 8] &= !(1u8 << (row_idx % 8)); + } + } + out +} + +fn make_varchar(rows: usize, len: usize) -> (Vec, Vec) { + let mut offsets = Vec::with_capacity(rows + 1); + let mut bytes = Vec::with_capacity(rows * len); + let alphabet = b"abcdefghijklmnopqrstuvwxyz"; + offsets.push(0); + for row in 0..rows { + for i in 0..len { + bytes.push(alphabet[(row + i) % alphabet.len()]); + } + offsets.push(bytes.len() as i32); + } + (offsets, bytes) +} + +fn make_symbol_workload(rows: usize, cardinality: usize) -> (Vec, Vec, Vec) { + let mut dict_offsets = Vec::with_capacity(cardinality + 1); + let mut dict_bytes = Vec::new(); + dict_offsets.push(0); + for i in 0..cardinality { + // Short distinct strings: "sym-12345". 
+ let entry = format!("sym-{i:08}"); + dict_bytes.extend_from_slice(entry.as_bytes()); + dict_offsets.push(dict_bytes.len() as i32); + } + // Splitmix-style spread of codes across the dict so the encoder's + // intern + gather path sees a realistic distribution. + let mut codes = Vec::with_capacity(rows); + let mut state = 0x9E37_79B9_7F4A_7C15u64; + for _ in 0..rows { + state = state.wrapping_mul(0x9E37_79B9_7F4A_7C15); + state ^= state >> 27; + codes.push((state as usize % cardinality) as i32); + } + (codes, dict_offsets, dict_bytes) +} + +// --------------------------------------------------------------------------- +// Bench helpers +// --------------------------------------------------------------------------- + +fn fresh_chunk<'a>(table: &str) -> Chunk<'a> { + Chunk::new(table) +} + +// --------------------------------------------------------------------------- +// Per-column bulk-append benchmarks +// --------------------------------------------------------------------------- + +fn bench_column_i64(c: &mut Criterion) { + let rows = row_count(); + let data = make_i64_data(rows); + let mut group = c.benchmark_group("column_i64"); + group.throughput(Throughput::Bytes((rows * 8) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(rows * 8 + 1), + |mut out| { + out.push(0); + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + data.as_ptr().cast::(), + std::mem::size_of_val(data.as_slice()), + ) + }; + out.extend_from_slice(bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("trades"), + |mut chunk| { + chunk.column_i64("v", &data, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + let bits = make_validity_bits(rows); + let validity = Validity::from_bitmap(&bits, rows).unwrap(); + group.bench_function("column_sender_nullable", |b| { + b.iter_batched( + || 
fresh_chunk("trades"), + |mut chunk| { + chunk.column_i64("v", &data, Some(&validity)).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_column_f64(c: &mut Criterion) { + let rows = row_count(); + let data = make_f64_data(rows); + let mut group = c.benchmark_group("column_f64"); + group.throughput(Throughput::Bytes((rows * 8) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(rows * 8 + 1), + |mut out| { + out.push(0); + let bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + data.as_ptr().cast::(), + std::mem::size_of_val(data.as_slice()), + ) + }; + out.extend_from_slice(bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("trades"), + |mut chunk| { + chunk.column_f64("v", &data, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +fn bench_column_varchar(c: &mut Criterion) { + let rows = row_count(); + let len = varchar_len(); + let (offsets, bytes) = make_varchar(rows, len); + let mut group = c.benchmark_group("column_varchar"); + group.throughput(Throughput::Bytes((4 * (rows + 1) + bytes.len()) as u64)); + + group.bench_function("memcpy_baseline", |b| { + b.iter_batched( + || Vec::::with_capacity(4 * (rows + 1) + bytes.len() + 1), + |mut out| { + out.push(0); + let offset_bytes: &[u8] = unsafe { + std::slice::from_raw_parts( + offsets.as_ptr().cast::(), + std::mem::size_of_val(offsets.as_slice()), + ) + }; + out.extend_from_slice(offset_bytes); + out.extend_from_slice(&bytes); + black_box(out); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("column_sender_no_null", |b| { + b.iter_batched( + || fresh_chunk("logs"), + |mut chunk| { + chunk.column_varchar("msg", &offsets, &bytes, None).unwrap(); + black_box(&chunk); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} 
+
+// ---------------------------------------------------------------------------
+// Symbol bulk-intern: column path vs naïve per-row HashMap
+// ---------------------------------------------------------------------------
+
+fn bench_symbol_dict(c: &mut Criterion) {
+    let rows = row_count();
+    let card = symbol_cardinality();
+    let (codes, dict_offsets, dict_bytes) = make_symbol_workload(rows, card);
+    let mut group = c.benchmark_group("symbol_dict");
+    group.throughput(Throughput::Elements(rows as u64));
+
+    // Column-sender path: bulk three-pass intern at append time.
+    group.bench_function("column_sender", |b| {
+        b.iter_batched(
+            || fresh_chunk("ticks"),
+            |mut chunk| {
+                chunk
+                    .symbol_dict_i32("sym", &codes, &dict_offsets, &dict_bytes, None)
+                    .unwrap();
+                black_box(&chunk);
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    // Row-API analogue: per-row HashMap probe. Mimics what the legacy
+    // path pays for each symbol cell. We don't use the actual row
+    // encoder because it owns much more state than this measurement
+    // is trying to isolate — the point here is the per-row HashMap
+    // hit, which dominates symbol-column cost on the row path.
+    group.bench_function("naive_per_row_hashmap", |b| {
+        b.iter_batched(
+            || {
+                let map: HashMap<&[u8], u64> = HashMap::new();
+                (map, Vec::<u64>::with_capacity(rows))
+            },
+            |(mut map, mut gids)| {
+                let mut next_id: u64 = 0;
+                for &code in &codes {
+                    let start = dict_offsets[code as usize] as usize;
+                    let end = dict_offsets[code as usize + 1] as usize;
+                    let entry: &[u8] = &dict_bytes[start..end];
+                    let gid = *map.entry(entry).or_insert_with(|| { // first sighting mints the next id
+                        let id = next_id;
+                        next_id += 1;
+                        id
+                    });
+                    gids.push(gid);
+                }
+                black_box(&gids);
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.finish();
+}
+
+// ---------------------------------------------------------------------------
+// End-to-end encode (no network)
+// ---------------------------------------------------------------------------
+
+fn encode_chunk_group(c: &mut Criterion) {
+    let rows = row_count();
+    let i64_data = make_i64_data(rows);
+    let f64_data = make_f64_data(rows);
+    let (offsets, varchar_bytes) = make_varchar(rows, varchar_len());
+    let (codes, dict_offsets, dict_bytes) = make_symbol_workload(rows, symbol_cardinality());
+    let ts_data = make_i64_data(rows);
+
+    let mut group = c.benchmark_group("encode_chunk");
+    group.sample_size(20); // larger workload — fewer samples
+    group.measurement_time(Duration::from_secs(5));
+    group.throughput(Throughput::Elements(rows as u64));
+
+    let build_chunk = || { // five-column chunk shared by all three measurements below
+        let mut chunk = Chunk::new("ticks");
+        chunk.column_i64("qty", &i64_data, None).unwrap();
+        chunk.column_f64("price", &f64_data, None).unwrap();
+        chunk
+            .column_varchar("msg", &offsets, &varchar_bytes, None)
+            .unwrap();
+        chunk
+            .symbol_dict_i32("sym", &codes, &dict_offsets, &dict_bytes, None)
+            .unwrap();
+        chunk.designated_timestamp_nanos(&ts_data).unwrap();
+        chunk
+    };
+
+    group.bench_function("populate_only", |b| {
+        b.iter_batched(
+            || (),
+            |_| {
+                let chunk = build_chunk();
+                black_box(&chunk);
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    let prebuilt = build_chunk();
+    group.bench_function("encode_only", |b| {
+        b.iter_batched(
+            || {
+                (
+                    BenchEncoderState::new(),
+                    Vec::<u8>::with_capacity(64 * 1024),
+                )
+            },
+            |(mut state, mut out)| {
+                out.clear();
+                bench_encode_chunk_into(&mut out, &prebuilt, &mut state).unwrap();
+                black_box(&out);
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.bench_function("populate_plus_encode", |b| {
+        b.iter_batched(
+            || {
+                (
+                    BenchEncoderState::new(),
+                    Vec::<u8>::with_capacity(64 * 1024),
+                )
+            },
+            |(mut state, mut out)| {
+                let chunk = build_chunk();
+                out.clear();
+                bench_encode_chunk_into(&mut out, &chunk, &mut state).unwrap();
+                black_box(&out);
+            },
+            BatchSize::SmallInput,
+        );
+    });
+
+    group.finish();
+}
+
+criterion_group!(
+    benches,
+    bench_column_i64,
+    bench_column_f64,
+    bench_column_varchar,
+    bench_symbol_dict,
+    encode_chunk_group,
+);
+criterion_main!(benches);
diff --git a/questdb-rs/examples/qwp_ws_l1_quotes.rs b/questdb-rs/examples/qwp_ws_l1_quotes.rs new file mode 100644 index 00000000..1ee1e373 --- /dev/null +++ b/questdb-rs/examples/qwp_ws_l1_quotes.rs @@ -0,0 +1,295 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + ******************************************************************************/ + +//! Synthetic equities L1 quote feed → QuestDB via the column-major sender. +//! +//! Generates a 5M-row dataset that mimics a Level-1 order book stream +//! (per-symbol top-of-book bid/ask with a trailing last-trade) and +//! 
ingests it into a single QuestDB table. Reports end-to-end +//! throughput (rows/s, MB/s) and the average per-chunk flush latency. +//! +//! Default schema: +//! ts TIMESTAMP_NANOS (designated) +//! symbol SYMBOL (~500 tickers) +//! exchange SYMBOL (5 venues) +//! bid_px DOUBLE +//! ask_px DOUBLE +//! last_px DOUBLE +//! bid_sz LONG +//! ask_sz LONG +//! last_sz LONG +//! +//! Run against a local QuestDB instance: +//! cargo run --release --features sync-sender-qwp-ws \ +//! --example qwp_ws_l1_quotes +//! +//! Positional args: +//! 1: connect string (default `qwpws::addr=localhost:9000;`) +//! 2: table name (default `l1_quotes`) +//! 3: row count (default 5_000_000) +//! +//! Pre-create the table (paste into the QuestDB Web Console at +//! http://localhost:9000 or post via curl): +//! +//! CREATE TABLE l1_quotes ( +//! ts TIMESTAMP, +//! symbol SYMBOL CAPACITY 512 NOCACHE, +//! exchange SYMBOL CAPACITY 8 NOCACHE, +//! bid_px DOUBLE, +//! ask_px DOUBLE, +//! last_px DOUBLE, +//! bid_sz LONG, +//! ask_sz LONG, +//! last_sz LONG +//! ) TIMESTAMP(ts) PARTITION BY HOUR WAL; +//! +//! Verify after run: +//! curl 'http://localhost:9000/exec?query=SELECT%20count()%20FROM%20l1_quotes' +//! curl 'http://localhost:9000/exec?query=SELECT%20*%20FROM%20l1_quotes%20LIMIT%2010' + +use std::time::Instant; + +use questdb::ingress::column_sender::{AckLevel, Chunk, QuestDb}; + +const DEFAULT_TOTAL_ROWS: usize = 5_000_000; +/// 25 000 rows × ~60 bytes/row ≈ 1.5 MB. Stays under the QuestDB server's +/// default 2 MiB WebSocket receive buffer (the server logs +/// `QwpIngressUpgradeProcessor … frame too large` and closes the +/// connection for larger frames; the spec's 16 MiB cap is only relevant +/// when the server's buffer is sized for it). 
+const CHUNK_ROWS: usize = 25_000;
+const SYMBOL_CARDINALITY: usize = 500;
+const EXCHANGES: &[&str] = &["NYSE", "NASDAQ", "BATS", "ARCA", "IEX"];
+
+fn main() -> questdb::Result<()> {
+    let conf = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| "qwpws::addr=localhost:9000;".to_string());
+    let table_name = std::env::args()
+        .nth(2)
+        .unwrap_or_else(|| "l1_quotes".to_string());
+    let total_rows: usize = std::env::args()
+        .nth(3)
+        .and_then(|v| v.parse().ok()) // an unparsable count silently falls back to the default
+        .unwrap_or(DEFAULT_TOTAL_ROWS);
+
+    println!(
+        "Generating {} rows of L1 quote data ({} tickers × {} venues)...",
+        humanise(total_rows),
+        SYMBOL_CARDINALITY,
+        EXCHANGES.len()
+    );
+    let gen_start = Instant::now();
+
+    let symbol_dict_strings: Vec<String> = (0..SYMBOL_CARDINALITY)
+        .map(|i| format!("TICK{i:03}"))
+        .collect();
+    let (sym_dict_offsets, sym_dict_bytes) =
+        build_dict(symbol_dict_strings.iter().map(String::as_str));
+    let (ex_dict_offsets, ex_dict_bytes) = build_dict(EXCHANGES.iter().copied());
+
+    // Pre-allocate columnar buffers for the full dataset. At 5 M × 8 B per
+    // f64/i64 column the peak working set is ~280 MB; comfortable on any
+    // dev box.
+    let mut symbol_codes = Vec::with_capacity(total_rows);
+    let mut exchange_codes = Vec::with_capacity(total_rows);
+    let mut ts_ns = Vec::with_capacity(total_rows);
+    let mut bid_px = Vec::with_capacity(total_rows);
+    let mut ask_px = Vec::with_capacity(total_rows);
+    let mut last_px = Vec::with_capacity(total_rows);
+    let mut bid_sz = Vec::with_capacity(total_rows);
+    let mut ask_sz = Vec::with_capacity(total_rows);
+    let mut last_sz = Vec::with_capacity(total_rows);
+
+    let start_ts = std::time::SystemTime::now()
+        .duration_since(std::time::UNIX_EPOCH)
+        .unwrap()
+        .as_nanos() as i64;
+
+    // Splitmix-style RNG: avoids a dep on `rand` and produces a uniform
+    // enough spread for the symbol distribution.
+    let mut state: u64 = 0x9E37_79B9_7F4A_7C15;
+    let mut step = || {
+        state = state.wrapping_mul(0x9E37_79B9_7F4A_7C15);
+        state ^= state >> 27;
+        state
+    };
+
+    for i in 0..total_rows {
+        let r1 = step();
+        let r2 = step();
+
+        let sym = (r1 as usize % SYMBOL_CARDINALITY) as i32;
+        let ex = ((r1 >> 32) as usize % EXCHANGES.len()) as i8;
+        // Per-symbol base price so the L1 feed has realistic price strata.
+        let base = 100.0 + sym as f64;
+        let spread = 0.01 + (((r2 & 0xFFFF) as f64) / 65_535.0) * 0.05;
+        let drift = (((r2 >> 16) & 0xFFFF) as f64 - 32_768.0) / 1_000_000.0;
+        let mid = base + drift;
+        let bid = mid - spread / 2.0;
+        let ask = mid + spread / 2.0;
+        let last = mid + (((r2 >> 32) & 0xFFFF) as f64 - 32_768.0) / 1_000_000.0;
+        let sz_bid = 100 + ((r1 >> 8) & 0xFFFF) as i64;
+        let sz_ask = 100 + ((r1 >> 24) & 0xFFFF) as i64;
+        let sz_last = 100 + ((r2 >> 48) & 0x3FF) as i64;
+
+        symbol_codes.push(sym);
+        exchange_codes.push(ex);
+        // Monotonic 1 µs cadence — characteristic of a top-of-book feed
+        // even if individual events are slightly out of order in real
+        // life.
+        ts_ns.push(start_ts + (i as i64) * 1_000);
+        bid_px.push(bid);
+        ask_px.push(ask);
+        last_px.push(last);
+        bid_sz.push(sz_bid);
+        ask_sz.push(sz_ask);
+        last_sz.push(sz_last);
+    }
+    let gen_elapsed = gen_start.elapsed();
+    println!(
+        " generated in {:.2}s ({:.1} M rows/s)",
+        gen_elapsed.as_secs_f64(),
+        total_rows as f64 / gen_elapsed.as_secs_f64() / 1e6
+    );
+
+    println!("\nConnecting to {conf} ...");
+    let db = QuestDb::connect(&conf)?;
+    let mut sender = db.borrow_sender()?;
+
+    // One chunk reused across flushes — the bench design exists exactly
+    // for this case: per-column `Vec` capacity is retained across
+    // flush().
+    let mut chunk = Chunk::new(&table_name);
+
+    let mut chunk_micros: Vec<u128> = Vec::new();
+    let send_start = Instant::now();
+    let mut flushed = 0usize;
+    let mut chunk_idx = 0usize;
+    while flushed < total_rows {
+        let end = (flushed + CHUNK_ROWS).min(total_rows); // last chunk may be short
+
+        chunk.column_i64("bid_sz", &bid_sz[flushed..end], None)?;
+        chunk.column_i64("ask_sz", &ask_sz[flushed..end], None)?;
+        chunk.column_i64("last_sz", &last_sz[flushed..end], None)?;
+        chunk.column_f64("bid_px", &bid_px[flushed..end], None)?;
+        chunk.column_f64("ask_px", &ask_px[flushed..end], None)?;
+        chunk.column_f64("last_px", &last_px[flushed..end], None)?;
+        chunk.symbol_dict_i32(
+            "symbol",
+            &symbol_codes[flushed..end],
+            &sym_dict_offsets,
+            &sym_dict_bytes,
+            None,
+        )?;
+        chunk.symbol_dict_i8(
+            "exchange",
+            &exchange_codes[flushed..end],
+            &ex_dict_offsets,
+            &ex_dict_bytes,
+            None,
+        )?;
+        chunk.designated_timestamp_nanos(&ts_ns[flushed..end])?;
+
+        let t = Instant::now(); // times the flush call only, not the column appends above
+        sender.flush(&mut chunk)?;
+        chunk_micros.push(t.elapsed().as_micros());
+
+        flushed = end;
+        chunk_idx += 1;
+        eprint!(
+            "\r flushed chunk {chunk_idx:02} ({}/{} rows)",
+            humanise(flushed),
+            humanise(total_rows)
+        );
+    }
+    sender.sync(AckLevel::Ok)?;
+    eprintln!();
+    let send_elapsed = send_start.elapsed();
+
+    // Per-row wire payload estimate:
+    // 3 × f64 + 3 × i64 + 1 × i64 (ts) + 2 B symbol varint + 1 B exchange varint
+    // = 24 + 24 + 8 + 3 = 59 bytes. Schema/header overhead amortises away.
+    let bytes_per_row = 59usize;
+    let total_bytes = total_rows * bytes_per_row;
+
+    println!(
+        "\nFlushed {} rows in {:.2}s ({} chunks of up to {})",
+        humanise(total_rows),
+        send_elapsed.as_secs_f64(),
+        chunk_idx,
+        humanise(CHUNK_ROWS)
+    );
+    println!(
+        " throughput: {:>7.2} M rows/s",
+        total_rows as f64 / send_elapsed.as_secs_f64() / 1e6
+    );
+    println!(
+        " bandwidth: {:>7.1} MB/s (≈ {:.0} byte/row × rows/s)",
+        total_bytes as f64 / send_elapsed.as_secs_f64() / 1e6,
+        bytes_per_row
+    );
+    println!(
+        " per-chunk avg: {:>7.1} ms",
+        send_elapsed.as_secs_f64() * 1e3 / chunk_idx.max(1) as f64 // max(1): a zero-row run flushes no chunks
+    );
+    if let (Some(&min), Some(&max)) = (chunk_micros.iter().min(), chunk_micros.iter().max()) {
+        let mut sorted = chunk_micros.clone();
+        sorted.sort_unstable();
+        let p50 = sorted[sorted.len() / 2];
+        let p95 = sorted[(sorted.len() * 19) / 20]; // floor(0.95 * len) is always < len
+        println!(
+            " per-chunk min/p50/p95/max: {:.2} / {:.2} / {:.2} / {:.2} ms",
+            min as f64 / 1000.0,
+            p50 as f64 / 1000.0,
+            p95 as f64 / 1000.0,
+            max as f64 / 1000.0,
+        );
+    }
+
+    println!("\nVerify in QuestDB:");
+    println!(" curl 'http://localhost:9000/exec?query=SELECT%20count()%20FROM%20{table_name}'");
+    println!(
+        " curl 'http://localhost:9000/exec?query=SELECT%20*%20FROM%20{table_name}%20LIMIT%2010'"
+    );
+
+    Ok(())
+}
+
+fn build_dict<'a, I>(strings: I) -> (Vec<i32>, Vec<u8>) // Arrow-style (offsets, bytes); offsets[0] == 0
+where
+    I: IntoIterator<Item = &'a str>,
+{
+    let mut offsets: Vec<i32> = vec![0];
+    let mut bytes: Vec<u8> = Vec::new();
+    for s in strings {
+        bytes.extend_from_slice(s.as_bytes());
+        offsets.push(i32::try_from(bytes.len()).expect("dictionary exceeds i32::MAX bytes"));
+    }
+    (offsets, bytes)
+}
+
+fn humanise(n: usize) -> String { // e.g. 5_000_000 -> "5.00 M", 25_000 -> "25.0 k"
+    if n >= 1_000_000 {
+        format!("{:.2} M", n as f64 / 1e6)
+    } else if n >= 1_000 {
+        format!("{:.1} k", n as f64 / 1e3)
+    } else {
+        n.to_string()
+    }
+}
diff --git a/questdb-rs/src/ingress.rs b/questdb-rs/src/ingress.rs index b1569abf..09da8dd2 100644 --- a/questdb-rs/src/ingress.rs +++ b/questdb-rs/src/ingress.rs @@ -60,7 +60,7 @@ mod timestamp; mod buffer; pub use buffer::*; -mod sender; +pub(crate) mod
sender; #[cfg(feature = "_sender-qwp-ws")] pub(crate) use sender::QwpWsRoleReject; pub use sender::*; @@ -68,6 +68,9 @@ pub use sender::*; mod decimal; pub use decimal::DecimalView; +#[cfg(feature = "sync-sender-qwp-ws")] +pub mod column_sender; + const MAX_NAME_LEN_DEFAULT: usize = 127; /// The maximum allowed dimensions for arrays. @@ -389,6 +392,24 @@ pub(crate) struct QwpWsAddrScan { pub(crate) sanitized_conf: String, } +/// Raw QWP/WebSocket connection produced by +/// [`SenderBuilder::build_qwp_ws_raw_stream`]. The column-major sender uses +/// this as its sole entry point into the network — it does its own +/// synchronous frame I/O on the contained `WsStream` and never touches the +/// row-API publisher / driver / queue stack. +#[cfg(feature = "sync-sender-qwp-ws")] +pub(crate) struct RawQwpWsStream { + pub(crate) stream: sender::qwp_ws::WsStream, + /// Bytes already read past the HTTP upgrade response. The shared + /// handshake helper may consume more bytes than the response body + /// itself; those bytes are the start of the first server WS frame + /// and must be drained before reading more from the socket. + pub(crate) leftover: Vec, + pub(crate) max_buf_size: usize, + pub(crate) request_timeout: Duration, + pub(crate) durable_ack_opt_in: bool, +} + /// Pre-scan a raw connect string for repeated `addr=...` params. Returns the /// full list of addr values and a sanitized conf with duplicate `addr=` params /// removed (the first one is kept so the downstream `questdb_confstr` parser @@ -2384,6 +2405,92 @@ impl SenderBuilder { Ok(sender) } + /// Open a raw QWP/WebSocket connection (TCP + optional TLS + HTTP + /// upgrade) **without** assembling the row-API publisher, queue, or + /// background-thread machinery. + /// + /// Returned by reference, the [`crate::ingress::sender::qwp_ws::WsStream`] + /// is the only thing the column-major sender needs from this crate's + /// builder: it does its own synchronous frame writing and ack reading + /// from there. 
See `doc/COLUMN_SENDER_PLAN.md`. + #[cfg(feature = "sync-sender-qwp-ws")] + pub(crate) fn build_qwp_ws_raw_stream(&self) -> Result { + if self.init_buf_size.is_specified() && *self.init_buf_size > *self.max_buf_size { + return Err(error::fmt!( + ConfigError, + "init_buf_size ({}) cannot exceed max_buf_size ({})", + *self.init_buf_size, + *self.max_buf_size + )); + } + + if !matches!(self.protocol, Protocol::QwpWs | Protocol::QwpWss) { + return Err(error::fmt!( + ConfigError, + "Column-sender requires a QWP/WebSocket connect string \ + (got protocol {:?})", + self.protocol + )); + } + if self.net_interface.is_some() { + return Err(error::fmt!( + InvalidApiCall, + "net_interface is not supported for QWP over WebSocket." + )); + } + let Some(qwp_ws) = self.qwp_ws.as_ref() else { + return Err(error::fmt!( + ConfigError, + "QWP/WebSocket configuration is missing." + )); + }; + + #[cfg(feature = "insecure-skip-verify")] + let tls_verify = *self.tls_verify; + let tls_roots_password = self.tls_roots_password.deref().as_deref(); + + if tls_roots_password.is_some() && self.tls_roots.deref().is_none() { + return Err(error::fmt!( + ConfigError, + "\"tls_roots_password\" requires \"tls_roots\" \ + (the password unlocks the keystore at that path)" + )); + } + + let tls_settings = tls::TlsSettings::build( + self.protocol.tls_enabled(), + #[cfg(feature = "insecure-skip-verify")] + tls_verify, + *self.tls_ca, + self.tls_roots.deref().as_deref(), + tls_roots_password, + )?; + + let auth = self.build_auth()?; + let basic_auth = qwp_ws_auth_header(&auth)?; + let mut qwp_ws = qwp_ws.clone(); + qwp_ws.apply_reconnect_implies_initial_retry(); + reject_unsupported_qwp_ws_sf_config(&qwp_ws)?; + + let use_tls = matches!(self.protocol, Protocol::QwpWss); + let (stream, _negotiated_version, leftover) = sender::qwp_ws::establish_connection( + self.host.as_str(), + self.port.as_str(), + use_tls, + tls_settings, + &qwp_ws, + basic_auth.as_deref(), + )?; + + Ok(RawQwpWsStream { + stream, + 
leftover, + max_buf_size: *self.max_buf_size, + request_timeout: *qwp_ws.request_timeout, + durable_ack_opt_in: *qwp_ws.request_durable_ack, + }) + } + #[cfg(any(feature = "_sender-tcp", feature = "_sender-qwp-udp"))] fn ensure_supports_bind_interface(&self, param_name: &str) -> Result<()> { #[cfg(feature = "_sender-tcp")] diff --git a/questdb-rs/src/ingress/buffer/qwp.rs b/questdb-rs/src/ingress/buffer/qwp.rs index 7446fa25..afcce210 100644 --- a/questdb-rs/src/ingress/buffer/qwp.rs +++ b/questdb-rs/src/ingress/buffer/qwp.rs @@ -5066,6 +5066,13 @@ impl SymbolGlobalDict { self.next_id } + /// Number of global ids assigned so far. The column-sender encoder + /// uses this as the `delta_start` field of the delta-symbol-dict + /// prefix. + pub(crate) fn next_id(&self) -> u64 { + self.next_id + } + pub(crate) fn mark(&self) -> SymbolGlobalDictMark { SymbolGlobalDictMark { entries_len: self.entries.len(), @@ -5082,13 +5089,13 @@ impl SymbolGlobalDict { self.next_id = mark.next_id; } - fn entry(&self, id: u64) -> Option<&[u8]> { + pub(crate) fn entry(&self, id: u64) -> Option<&[u8]> { let index = usize::try_from(id).ok()?; self.entries.get(index).map(Vec::as_slice) } /// Returns `(global_id, is_new)`. 
- fn intern(&mut self, bytes: &[u8]) -> (u64, bool) { + pub(crate) fn intern(&mut self, bytes: &[u8]) -> (u64, bool) { if let Some(&id) = self.map.get(bytes) { return (id, false); } diff --git a/questdb-rs/src/ingress/column_sender/chunk.rs b/questdb-rs/src/ingress/column_sender/chunk.rs new file mode 100644 index 00000000..88a22471 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/chunk.rs @@ -0,0 +1,902 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-major chunk: one DataFrame's worth of borrowed column buffers +//! destined for a single QuestDB table. +//! +//! `Chunk<'a>` stores **descriptors** — raw pointers + lengths + an +//! optional validity bitmap — for each column. No data is copied at +//! append time. Caller buffers must remain alive from +//! [`ColumnSender::flush`](super::ColumnSender::flush) call setup until +//! the call returns; the lifetime parameter `'a` enforces this on the +//! safe Rust API. +//! +//! At flush time, the [`encoder`](super::encoder) walks the descriptors +//! 
and writes wire bytes straight into the connection's reusable write
+//! buffer. The no-null hot path is a single `memcpy` per column from the
+//! caller's buffer into that buffer.
+
+use std::fmt::{self, Debug, Formatter};
+use std::marker::PhantomData;
+
+use crate::{Result, error};
+
+use super::validity::{Validity, check_row_count};
+use super::wire::{
+    MAX_NAME_LEN, QWP_TYPE_BOOLEAN, QWP_TYPE_BYTE, QWP_TYPE_DATE, QWP_TYPE_DOUBLE, QWP_TYPE_FLOAT,
+    QWP_TYPE_INT, QWP_TYPE_IPV4, QWP_TYPE_LONG, QWP_TYPE_LONG256, QWP_TYPE_SHORT, QWP_TYPE_SYMBOL,
+    QWP_TYPE_TIMESTAMP, QWP_TYPE_TIMESTAMP_NANOS, QWP_TYPE_UUID, QWP_TYPE_VARCHAR, validate_name,
+};
+
+// ===========================================================================
+// Descriptors
+// ===========================================================================
+
+/// Validity bitmap descriptor (raw-ptr form, matching `Validity<'a>`).
+/// `non_null_count` is pre-computed at column-append time because several
+/// encoder paths (e.g. VARCHAR's dense offset table) size their output
+/// from it.
+#[derive(Clone, Copy)]
+pub(crate) struct ValidityDescriptor {
+    pub(crate) bits: *const u8,
+    pub(crate) bit_len: usize,
+    pub(crate) non_null_count: usize,
+}
+
+impl ValidityDescriptor {
+    fn from_validity(v: &Validity<'_>) -> Self { // snapshots ptr + length and counts non-nulls once
+        Self {
+            bits: v.bits.as_ptr(),
+            bit_len: v.bit_len,
+            non_null_count: v.non_null_count(),
+        }
+    }
+
+    /// Returns whether bit `idx` is set, reading bits LSB-first within each byte.
+    /// SAFETY: caller's buffer must still be alive and cover byte `idx / 8` (Chunk's
+    /// `'a` lifetime keeps it alive on the safe path; the FFI is responsible otherwise).
+    #[inline]
+    pub(crate) unsafe fn is_valid(&self, idx: usize) -> bool {
+        debug_assert!(idx < self.bit_len); // checked in debug builds only
+        let byte = unsafe { *self.bits.add(idx / 8) }; // SAFETY: see the method contract above
+        (byte >> (idx % 8)) & 1 == 1
+    }
+
+    /// Length in bytes of the underlying Arrow bitmap.
+    #[inline]
+    pub(crate) fn byte_len(&self) -> usize {
+        self.bit_len.div_ceil(8)
+    }
+}
+
+/// Per-column kind dispatch.
Each variant carries the raw pointer(s) the
+/// encoder dereferences at flush time.
+pub(crate) enum ColumnKind {
+    // ---- Sentinel-null fixed width (no bitmap; 0x00 null_flag) ----
+    Byte {
+        data: *const i8,
+    },
+    Short {
+        data: *const i16,
+    },
+    Int {
+        data: *const i32,
+    },
+    Long {
+        data: *const i64,
+    },
+    Float {
+        data: *const f32,
+    },
+    Double {
+        data: *const f64,
+    },
+    // Bool: Arrow LSB-first bitmap input. row_count is the Chunk's row count.
+    Bool {
+        bits: *const u8,
+    },
+
+    // ---- Bitmap-style fixed width (sparse null encoding) ----
+    Ipv4 {
+        data: *const u32,
+    },
+    TsNanos {
+        data: *const i64,
+    },
+    TsMicros {
+        data: *const i64,
+    },
+    DateMillis {
+        data: *const i64,
+    },
+    Uuid {
+        data: *const [u8; 16],
+    },
+    Long256 {
+        data: *const [u8; 32],
+    },
+
+    // ---- Variable-width text (VARCHAR) ----
+    Varchar {
+        offsets: *const i32,
+        /// row_count + 1
+        offsets_len: usize,
+        bytes: *const u8,
+        bytes_len: usize,
+    },
+
+    // ---- Symbol (dictionary-encoded) ----
+    Symbol {
+        codes: SymbolCodesPtr,
+        dict_offsets: *const i32,
+        /// dict cardinality + 1
+        dict_offsets_len: usize,
+        dict_bytes: *const u8,
+        dict_bytes_len: usize,
+    },
+}
+
+#[derive(Clone, Copy)]
+pub(crate) enum SymbolCodesPtr { // raw pointer to the per-row dictionary codes, tagged by code width
+    I8(*const i8),
+    I16(*const i16),
+    I32(*const i32),
+}
+
+impl SymbolCodesPtr {
+    /// Read the dict-index for row `i`, sign-extended to `i64` so the
+    /// encoder can range-check uniformly. SAFETY: caller's `codes`
+    /// buffer must still be alive and hold more than `i` elements.
+    #[inline]
+    pub(crate) unsafe fn read_i64(&self, i: usize) -> i64 {
+        unsafe {
+            match self {
+                SymbolCodesPtr::I8(p) => *p.add(i) as i64,
+                SymbolCodesPtr::I16(p) => *p.add(i) as i64,
+                SymbolCodesPtr::I32(p) => *p.add(i) as i64,
+            }
+        }
+    }
+}
+
+/// One column slot in a [`Chunk`]. `name` is owned (the chunk holds it
+/// for diagnostics + signature emission); everything else is borrowed.
+pub(crate) struct ColumnDescriptor {
+    pub(crate) name: String,
+    pub(crate) wire_type: u8,
+    pub(crate) kind: ColumnKind,
+    pub(crate) validity: Option<ValidityDescriptor>,
+}
+
+/// Designated timestamp descriptor. Required exactly once per chunk
+/// before flush. Designated timestamps are non-null by spec.
+pub(crate) struct DesignatedTsDescriptor {
+    pub(crate) wire_type: u8,
+    pub(crate) data: *const i64,
+}
+
+// ===========================================================================
+// Chunk
+// ===========================================================================
+
+/// One DataFrame's worth of borrowed column buffers destined for one
+/// QuestDB table.
+///
+/// The lifetime parameter `'a` ties the chunk to every column buffer
+/// passed in through `column_*` / `symbol_dict_*`. Each call validates
+/// inputs and stores a descriptor referencing the caller's buffer; no
+/// data is copied. The caller's buffers must outlive the chunk —
+/// concretely, they must remain alive from each column append through
+/// the next [`ColumnSender::flush`](super::ColumnSender::flush) call.
+pub struct Chunk<'a> {
+    pub(crate) table: String,
+    pub(crate) row_count: Option<usize>,
+    pub(crate) columns: Vec<ColumnDescriptor>,
+    pub(crate) designated_ts: Option<DesignatedTsDescriptor>,
+    _marker: PhantomData<&'a ()>,
+}
+
+impl<'a> Chunk<'a> {
+    /// Create a chunk for `table`. The table name is validated at flush
+    /// time against the QWP/Java client length cap (127 bytes UTF-8).
+    pub fn new(table: impl Into<String>) -> Self {
+        Self {
+            table: table.into(),
+            row_count: None,
+            columns: Vec::new(),
+            designated_ts: None,
+            _marker: PhantomData,
+        }
+    }
+
+    pub fn table(&self) -> &str { // destination table name, as passed to `new`
+        &self.table
+    }
+
+    pub fn row_count(&self) -> usize { // 0 until a row count has been recorded
+        self.row_count.unwrap_or(0)
+    }
+
+    pub fn is_empty(&self) -> bool { // true only while no row count and no designated timestamp is set
+        self.row_count.is_none() && self.designated_ts.is_none()
+    }
+
+    /// Reset the chunk for reuse.
Drops descriptors but keeps the + /// `Vec` capacity so the next chunk fills the same + /// slots without reallocating the outer Vec. + pub fn clear(&mut self) { + self.row_count = None; + self.columns.clear(); + self.designated_ts = None; + } + + // ------------------------------------------------------------------- + // Numeric & fixed-width columns + // ------------------------------------------------------------------- + + pub fn column_i8( + &mut self, + name: &str, + data: &'a [i8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_BYTE, + ColumnKind::Byte { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_i16( + &mut self, + name: &str, + data: &'a [i16], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_SHORT, + ColumnKind::Short { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_i32( + &mut self, + name: &str, + data: &'a [i32], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_INT, + ColumnKind::Int { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_i64( + &mut self, + name: &str, + data: &'a [i64], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_LONG, + ColumnKind::Long { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_f32( + &mut self, + name: &str, + data: &'a [f32], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + 
QWP_TYPE_FLOAT, + ColumnKind::Float { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_f64( + &mut self, + name: &str, + data: &'a [f64], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_DOUBLE, + ColumnKind::Double { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_bool( + &mut self, + name: &str, + data: &'a [u8], + row_count: usize, + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let bytes_required = row_count.div_ceil(8); + if data.len() < bytes_required { + return Err(error::fmt!( + InvalidApiCall, + "Boolean column data too short: {} bytes for {} rows (need at least {})", + data.len(), + row_count, + bytes_required + )); + } + let row_count = check_row_count(self.row_count, row_count, validity)?; + self.push_column( + name, + QWP_TYPE_BOOLEAN, + ColumnKind::Bool { + bits: data.as_ptr(), + }, + validity, + row_count, + ) + } + + // ------------------------------------------------------------------- + // Bitmap-style fixed-width columns + // ------------------------------------------------------------------- + + pub fn column_uuid( + &mut self, + name: &str, + data: &'a [[u8; 16]], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_UUID, + ColumnKind::Uuid { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_long256( + &mut self, + name: &str, + data: &'a [[u8; 32]], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_LONG256, + ColumnKind::Long256 { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_ipv4( + &mut self, + name: &str, + data: &'a [u32], + validity: 
Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_IPV4, + ColumnKind::Ipv4 { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_ts_nanos( + &mut self, + name: &str, + data: &'a [i64], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_TIMESTAMP_NANOS, + ColumnKind::TsNanos { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_ts_micros( + &mut self, + name: &str, + data: &'a [i64], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_TIMESTAMP, + ColumnKind::TsMicros { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + pub fn column_date_millis( + &mut self, + name: &str, + data: &'a [i64], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, data.len(), validity)?; + self.push_column( + name, + QWP_TYPE_DATE, + ColumnKind::DateMillis { + data: data.as_ptr(), + }, + validity, + row_count, + ) + } + + // ------------------------------------------------------------------- + // VARCHAR + // ------------------------------------------------------------------- + + pub fn column_varchar( + &mut self, + name: &str, + offsets: &'a [i32], + bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + if offsets.is_empty() { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must have at least one entry (row_count + 1)" + )); + } + let row_count = offsets.len() - 1; + let row_count = check_row_count(self.row_count, row_count, validity)?; + validate_varchar_offsets(offsets, bytes.len())?; + self.push_column( + name, + QWP_TYPE_VARCHAR, + ColumnKind::Varchar { + 
offsets: offsets.as_ptr(), + offsets_len: offsets.len(), + bytes: bytes.as_ptr(), + bytes_len: bytes.len(), + }, + validity, + row_count, + ) + } + + // ------------------------------------------------------------------- + // Symbol + // ------------------------------------------------------------------- + + pub fn symbol_dict_i8( + &mut self, + name: &str, + codes: &'a [i8], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + self.push_symbol( + name, + SymbolCodesPtr::I8(codes.as_ptr()), + codes.len(), + dict_offsets, + dict_bytes, + validity, + ) + } + + pub fn symbol_dict_i16( + &mut self, + name: &str, + codes: &'a [i16], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + self.push_symbol( + name, + SymbolCodesPtr::I16(codes.as_ptr()), + codes.len(), + dict_offsets, + dict_bytes, + validity, + ) + } + + pub fn symbol_dict_i32( + &mut self, + name: &str, + codes: &'a [i32], + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + self.push_symbol( + name, + SymbolCodesPtr::I32(codes.as_ptr()), + codes.len(), + dict_offsets, + dict_bytes, + validity, + ) + } + + fn push_symbol( + &mut self, + name: &str, + codes: SymbolCodesPtr, + codes_len: usize, + dict_offsets: &'a [i32], + dict_bytes: &'a [u8], + validity: Option<&Validity<'a>>, + ) -> Result<&mut Self> { + let row_count = check_row_count(self.row_count, codes_len, validity)?; + if dict_offsets.is_empty() { + return Err(error::fmt!( + InvalidApiCall, + "symbol dict offsets must have at least one entry (dict_len + 1)" + )); + } + validate_varchar_offsets(dict_offsets, dict_bytes.len())?; + let dict_len = dict_offsets.len() - 1; + + // Range-check codes for non-null rows. The encoder relies on + // every non-null code being a valid dict index, so we surface + // the failure here at append time. 
+ let bounds_check = match codes { + SymbolCodesPtr::I8(p) => unsafe { range_check_codes(p, codes_len, dict_len, validity) }, + SymbolCodesPtr::I16(p) => unsafe { + range_check_codes(p, codes_len, dict_len, validity) + }, + SymbolCodesPtr::I32(p) => unsafe { + range_check_codes(p, codes_len, dict_len, validity) + }, + }; + bounds_check?; + + self.push_column( + name, + QWP_TYPE_SYMBOL, + ColumnKind::Symbol { + codes, + dict_offsets: dict_offsets.as_ptr(), + dict_offsets_len: dict_offsets.len(), + dict_bytes: dict_bytes.as_ptr(), + dict_bytes_len: dict_bytes.len(), + }, + validity, + row_count, + ) + } + + // ------------------------------------------------------------------- + // Designated timestamp + // ------------------------------------------------------------------- + + pub fn designated_timestamp_micros(&mut self, data: &'a [i64]) -> Result<&mut Self> { + self.set_designated_ts(QWP_TYPE_TIMESTAMP, data) + } + + pub fn designated_timestamp_nanos(&mut self, data: &'a [i64]) -> Result<&mut Self> { + self.set_designated_ts(QWP_TYPE_TIMESTAMP_NANOS, data) + } + + fn set_designated_ts(&mut self, wire_type: u8, data: &'a [i64]) -> Result<&mut Self> { + if self.designated_ts.is_some() { + return Err(error::fmt!( + InvalidApiCall, + "designated timestamp already set on this chunk" + )); + } + let row_count = check_row_count(self.row_count, data.len(), None)?; + self.designated_ts = Some(DesignatedTsDescriptor { + wire_type, + data: data.as_ptr(), + }); + self.row_count = Some(row_count); + Ok(self) + } + + // ------------------------------------------------------------------- + // Internal + // ------------------------------------------------------------------- + + fn push_column( + &mut self, + name: &str, + wire_type: u8, + kind: ColumnKind, + validity: Option<&Validity<'_>>, + row_count: usize, + ) -> Result<&mut Self> { + validate_name("column", name)?; + if name.len() > MAX_NAME_LEN { + return Err(error::fmt!( + InvalidName, + "column name is too long: {} bytes 
(max {})", + name.len(), + MAX_NAME_LEN + )); + } + self.guard_unique_name(name)?; + let validity = validity.map(ValidityDescriptor::from_validity); + self.columns.push(ColumnDescriptor { + name: name.to_owned(), + wire_type, + kind, + validity, + }); + self.row_count = Some(row_count); + Ok(self) + } + + fn guard_unique_name(&self, name: &str) -> Result<()> { + if self.columns.iter().any(|c| c.name == name) { + return Err(error::fmt!( + InvalidApiCall, + "duplicate column name in chunk: {:?}", + name + )); + } + Ok(()) + } +} + +fn validate_varchar_offsets(offsets: &[i32], bytes_len: usize) -> Result<()> { + let mut prev = offsets[0]; + if prev < 0 { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must be non-negative (offsets[0] = {})", + prev + )); + } + for (i, &off) in offsets.iter().enumerate().skip(1) { + if off < prev { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets must be non-decreasing (offsets[{}] = {} < offsets[{}] = {})", + i, + off, + i - 1, + prev + )); + } + prev = off; + } + if (prev as usize) > bytes_len { + return Err(error::fmt!( + InvalidApiCall, + "VARCHAR offsets exceed bytes buffer: last offset = {}, bytes_len = {}", + prev, + bytes_len + )); + } + Ok(()) +} + +/// SAFETY: `p` must point to `codes_len` valid `T`s. `validity` (if any) +/// must have `bit_len == codes_len` and a bitmap of at least +/// `ceil(codes_len / 8)` bytes — both enforced by `check_row_count` and +/// `Validity::from_bitmap` before this is called. 
+unsafe fn range_check_codes( + p: *const T, + codes_len: usize, + dict_len: usize, + validity: Option<&Validity<'_>>, +) -> Result<()> +where + T: Copy + Into, +{ + for i in 0..codes_len { + if validity.is_some_and(|v| !v.is_valid(i)) { + continue; + } + let code = unsafe { (*p.add(i)).into() }; + if code < 0 || (code as usize) >= dict_len { + return Err(error::fmt!( + InvalidApiCall, + "symbol code out of range: row {} -> {} (dict_len = {})", + i, + code, + dict_len + )); + } + } + Ok(()) +} + +impl Debug for Chunk<'_> { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("Chunk") + .field("table", &self.table) + .field("row_count", &self.row_count()) + .field("columns", &self.columns.len()) + .field("has_designated_ts", &self.designated_ts.is_some()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn locks_row_count_on_first_column() { + let mut chunk = Chunk::new("t"); + let a = [1i64, 2, 3]; + chunk.column_i64("a", &a, None).unwrap(); + assert_eq!(chunk.row_count(), 3); + let b = [4i64, 5]; + let err = chunk.column_i64("b", &b, None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("row_count")); + } + + #[test] + fn rejects_duplicate_column_name() { + let mut chunk = Chunk::new("t"); + let a1 = [1i64]; + chunk.column_i64("a", &a1, None).unwrap(); + let a2 = [2i64]; + let err = chunk.column_i64("a", &a2, None).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("duplicate")); + } + + #[test] + fn rejects_invalid_validity_length() { + let mut chunk = Chunk::new("t"); + let bits = [0xFFu8]; + let v = Validity::from_bitmap(&bits, 8).unwrap(); + let data = [1i64, 2, 3]; + let err = chunk.column_i64("a", &data, Some(&v)).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("Validity bitmap")); + } + + #[test] + fn designated_ts_sets_row_count() { + let mut chunk 
= Chunk::new("t"); + let ts = [1i64, 2, 3]; + chunk.designated_timestamp_micros(&ts).unwrap(); + assert_eq!(chunk.row_count(), 3); + let ts2 = [4i64, 5, 6]; + let err = chunk.designated_timestamp_nanos(&ts2).unwrap_err(); + assert!(err.msg().contains("designated")); + } + + #[test] + fn clear_resets_columns_but_keeps_table() { + let mut chunk = Chunk::new("t"); + let a = [1i64]; + let ts = [10i64]; + chunk.column_i64("a", &a, None).unwrap(); + chunk.designated_timestamp_nanos(&ts).unwrap(); + chunk.clear(); + assert_eq!(chunk.row_count(), 0); + assert!(chunk.is_empty()); + assert_eq!(chunk.table(), "t"); + } + + #[test] + fn varchar_rejects_negative_offset() { + let mut chunk = Chunk::new("t"); + let offsets = [-1i32, 1, 2]; + let err = chunk + .column_varchar("v", &offsets, b"ab", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("non-negative")); + } + + #[test] + fn varchar_rejects_non_monotonic_offsets() { + let mut chunk = Chunk::new("t"); + let offsets = [0i32, 5, 3]; + let err = chunk + .column_varchar("v", &offsets, b"abcde", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("non-decreasing")); + } + + #[test] + fn symbol_rejects_out_of_range_code() { + let mut chunk = Chunk::new("t"); + let codes = [0i32, 99]; + let dict_offsets = [0i32, 5]; + let err = chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, b"alpha", None) + .unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("out of range")); + } + + #[test] + fn symbol_skips_null_codes() { + let mut chunk = Chunk::new("t"); + let codes = [0i32, 99]; + let dict_offsets = [0i32, 5]; + let bits = [0b0000_0001]; + let v = Validity::from_bitmap(&bits, 2).unwrap(); + chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, b"alpha", Some(&v)) + .expect("null row's bogus code is ignored"); + } +} diff --git 
a/questdb-rs/src/ingress/column_sender/conf.rs b/questdb-rs/src/ingress/column_sender/conf.rs new file mode 100644 index 00000000..d5c27b43 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/conf.rs @@ -0,0 +1,408 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender connect-string parsing. +//! +//! Extracts pool-specific keys (`pool_size`, `pool_max`, +//! `pool_idle_timeout_ms`, `pool_reap`), refuses store-and-forward keys +//! (`sf_*`, `sender_id`), enforces a QWP/WebSocket schema, and produces a +//! sanitized conf string that the underlying [`crate::ingress::SenderBuilder`] +//! can consume to build per-pool-slot connections. + +use std::time::Duration; + +use crate::{Result, error}; + +/// Default number of warm connections opened eagerly at +/// [`super::QuestDb::connect`]. +pub(crate) const DEFAULT_POOL_SIZE: usize = 1; +/// Default hard cap on auto-grow. +pub(crate) const DEFAULT_POOL_MAX: usize = 64; +/// Default idle timeout before the reaper closes an above-`pool_size` +/// connection. 
+pub(crate) const DEFAULT_POOL_IDLE_TIMEOUT: Duration = Duration::from_secs(60); + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum PoolReap { + Auto, + Manual, +} + +#[derive(Debug, Clone)] +pub(crate) struct PoolConfig { + pub(crate) pool_size: usize, + pub(crate) pool_max: usize, + pub(crate) pool_idle_timeout: Duration, + pub(crate) pool_reap: PoolReap, +} + +impl Default for PoolConfig { + fn default() -> Self { + Self { + pool_size: DEFAULT_POOL_SIZE, + pool_max: DEFAULT_POOL_MAX, + pool_idle_timeout: DEFAULT_POOL_IDLE_TIMEOUT, + pool_reap: PoolReap::Auto, + } + } +} + +#[derive(Debug, Clone)] +pub(crate) struct ParsedConf { + pub(crate) pool: PoolConfig, +} + +/// Validate and extract pool-specific knobs from a column-sender connect +/// string. +/// +/// The conf string itself is **not** rewritten — the underlying +/// `SenderBuilder` silently ignores the pool keys, so a single parse over the +/// original conf is enough. This function only sanity-checks the schema, +/// refuses store-and-forward keys, and returns the [`PoolConfig`] the pool +/// machinery needs. +pub(crate) fn parse(conf: &str) -> Result { + let Some((service, params)) = conf.split_once("::") else { + return Err(error::fmt!( + ConfigError, + "Invalid column-sender config: missing '::' service separator" + )); + }; + + if !is_qwp_ws_schema(service) { + return Err(error::fmt!( + ConfigError, + "Column-sender requires a QWP/WebSocket connect string \ + (schema must be one of 'qwpws', 'qwpwss', 'ws', or 'wss', \ + got {:?})", + service + )); + } + + let mut pool = PoolConfig::default(); + let mut pool_size_specified = false; + + walk_params(params, |key, value| { + if is_refused_key(key) { + return Err(refused_key_error(key)); + } + match key { + "request_durable_ack" => { + // Syntactic check; the SenderBuilder also parses this + // for ColumnConn. 
+ let _ = parse_on_off("request_durable_ack", value)?; + } + "qwp_ws_progress" if value != "background" => { + return Err(error::fmt!( + ConfigError, + "Column-sender requires \"qwp_ws_progress=background\" (got {:?})", + value + )); + } + "pool_size" => { + pool.pool_size = parse_pool_usize(key, value)?; + pool_size_specified = true; + } + "pool_max" => { + let value = parse_pool_usize(key, value)?; + if value == 0 { + return Err(error::fmt!( + ConfigError, + "\"pool_max\" must be greater than 0" + )); + } + pool.pool_max = value; + } + "pool_idle_timeout_ms" => { + let millis: u64 = value.parse().map_err(|_| { + error::fmt!( + ConfigError, + "Invalid value for \"pool_idle_timeout_ms\" (expected non-negative integer): {:?}", + value + ) + })?; + pool.pool_idle_timeout = Duration::from_millis(millis); + } + "pool_reap" => { + pool.pool_reap = match value { + "auto" => PoolReap::Auto, + "manual" => PoolReap::Manual, + other => { + return Err(error::fmt!( + ConfigError, + "Invalid value for \"pool_reap\" (expected 'auto' or 'manual'): {:?}", + other + )); + } + }; + } + _ => { + // Unknown / passthrough — leave the SenderBuilder to handle it. 
/// Returns `true` when `service` names one of the schemas the column sender
/// accepts: `qwpws`, `qwpwss`, `ws` or `wss`, compared ASCII
/// case-insensitively and in full (no prefix or whitespace tolerance).
fn is_qwp_ws_schema(service: &str) -> bool {
    const ACCEPTED: [&str; 4] = ["qwpws", "qwpwss", "ws", "wss"];
    ACCEPTED
        .iter()
        .any(|schema| service.eq_ignore_ascii_case(schema))
}
+fn walk_params(params: &str, mut visit: F) -> Result<()> +where + F: FnMut(&str, &str) -> Result<()>, +{ + let mut pos = 0usize; + while pos < params.len() { + let Some(eq_rel) = params[pos..].find('=') else { + return Err(error::fmt!( + ConfigError, + "Invalid column-sender config: parameter without '=' at position {}", + pos + )); + }; + let key = ¶ms[pos..pos + eq_rel]; + pos = pos + eq_rel + 1; + + let mut value = String::new(); + while pos < params.len() { + let rest = ¶ms[pos..]; + let mut chars = rest.char_indices(); + let (_, ch) = chars.next().expect("pos is within params"); + if ch == ';' { + let next_pos = pos + ch.len_utf8(); + if params[next_pos..].starts_with(';') { + value.push(';'); + pos = next_pos + 1; + continue; + } + pos = next_pos; + break; + } + value.push(ch); + pos += ch.len_utf8(); + } + + visit(key, value.as_str())?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ErrorCode; + + fn parse_ok(conf: &str) -> ParsedConf { + parse(conf).unwrap_or_else(|e| panic!("expected ok, got {e}")) + } + + fn parse_err(conf: &str) -> crate::Error { + match parse(conf) { + Ok(_) => panic!("expected error for {conf:?}"), + Err(e) => e, + } + } + + #[test] + fn defaults() { + let p = parse_ok("qwpws::addr=localhost:9000;"); + assert_eq!(p.pool.pool_size, DEFAULT_POOL_SIZE); + assert_eq!(p.pool.pool_max, DEFAULT_POOL_MAX); + assert_eq!(p.pool.pool_idle_timeout, DEFAULT_POOL_IDLE_TIMEOUT); + assert_eq!(p.pool.pool_reap, PoolReap::Auto); + } + + #[test] + fn parses_pool_knobs() { + let p = parse_ok( + "qwpws::addr=localhost:9000;pool_size=4;pool_max=8;pool_idle_timeout_ms=10000;pool_reap=manual;", + ); + assert_eq!(p.pool.pool_size, 4); + assert_eq!(p.pool.pool_max, 8); + assert_eq!(p.pool.pool_idle_timeout, Duration::from_secs(10)); + assert_eq!(p.pool.pool_reap, PoolReap::Manual); + } + + #[test] + fn refuses_non_qwp_ws_schema() { + let err = parse_err("http::addr=localhost:9000;"); + assert_eq!(err.code(), ErrorCode::ConfigError); 
+ assert!(err.msg().contains("QWP/WebSocket")); + } + + #[test] + fn refuses_sf_keys() { + for key in [ + "sf_dir", + "sender_id", + "sf_max_bytes", + "sf_max_total_bytes", + "sf_durability", + "sf_append_deadline_millis", + ] { + let conf = format!("qwpws::addr=localhost:9000;{key}=whatever;"); + let err = parse_err(&conf); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!( + err.msg().contains("store-and-forward") && err.msg().contains(key), + "{} -> {}", + key, + err.msg() + ); + } + } + + #[test] + fn refuses_pool_size_zero() { + let err = parse_err("qwpws::addr=localhost:9000;pool_size=0;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_size")); + } + + #[test] + fn refuses_pool_size_above_pool_max() { + let err = parse_err("qwpws::addr=localhost:9000;pool_size=10;pool_max=5;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_size") && err.msg().contains("pool_max")); + } + + #[test] + fn invalid_pool_reap_value() { + let err = parse_err("qwpws::addr=localhost:9000;pool_reap=sometimes;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("pool_reap")); + } + + #[test] + fn ignores_unknown_keys() { + // Unknown keys are passed through to the underlying SenderBuilder, + // which silently ignores its own unknowns. The column-sender layer + // must not error on them either. + let _ = parse_ok("qwpws::addr=localhost:9000;auth_timeout=5000;some_future_key=value;"); + } + + #[test] + fn parses_request_durable_ack() { + // Syntactically valid values pass the column-sender's pre-check. + // The actual `durable_ack_opt_in` flag is sourced from the + // SenderBuilder inside `ColumnConn::connect`. 
+ let _ = parse_ok("qwpws::addr=localhost:9000;"); + let _ = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=on;"); + let _ = parse_ok("qwpws::addr=localhost:9000;request_durable_ack=off;"); + } + + #[test] + fn refuses_invalid_request_durable_ack_value() { + let err = parse_err("qwpws::addr=localhost:9000;request_durable_ack=true;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("request_durable_ack")); + } + + #[test] + fn refuses_manual_progress_mode() { + let err = parse_err("qwpws::addr=localhost:9000;qwp_ws_progress=manual;"); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("qwp_ws_progress")); + } + + #[test] + fn accepts_explicit_background_progress_mode() { + let _ = parse_ok("qwpws::addr=localhost:9000;qwp_ws_progress=background;"); + } + + #[test] + fn doubled_semicolon_in_value() { + // `;;` inside a value should be parsed as a literal `;`, not as a + // record separator. Our walker mirrors `scan_qwp_ws_addr_params` so a + // value containing `;;` does not bleed into the next key. + let _ = parse_ok("qwpws::addr=localhost:9000;password=a;;b;pool_size=2;"); + } +} diff --git a/questdb-rs/src/ingress/column_sender/conn.rs b/questdb-rs/src/ingress/column_sender/conn.rs new file mode 100644 index 00000000..3ed23517 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/conn.rs @@ -0,0 +1,978 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Dedicated pipelined QWP/WebSocket connection for the column-major +//! sender. +//! +//! `ColumnConn` owns its socket end-to-end. Each `publish_qwp` writes a +//! single QWP frame into the connection's reusable write buffer, masks it +//! per RFC 6455, and `write_all`s to the socket — then returns immediately +//! without waiting for the server's ack. Between publishes, ready acks +//! are drained non-blocking via `try_drain_acks`. When the in-flight +//! count hits the protocol cap (128), the next non-deferred publish +//! blocks until one ack frees a slot. Deferred publishes reserve one +//! in-flight slot for the later commit-triggering frame. An explicit +//! `sync_all_acks` blocks until every in-flight frame is acknowledged. +//! +//! No replay queue, no background thread — single-thread, single-socket, +//! pipelined. + +use std::collections::{HashMap, VecDeque}; +use std::io::{self, Read, Write}; +use std::time::Duration; + +use crate::ingress::SenderBuilder; +use crate::ingress::sender::qwp_ws::WsStream; +use crate::ws::frame::{self, FrameError, FrameHeader, Opcode}; +use crate::ws::mask::{MaskKeySource, apply_mask}; +use crate::{Result, error}; + +use super::sender::AckLevel; + +/// Bytes the encoder leaves untouched at the start of `write_buf` so the +/// WS header can be prepended in place without a copy. RFC 6455 §5.2: the +/// client-to-server header is at most 14 bytes (1 flag + 1 len + 8 ext len +/// + 4 mask key). 
+pub(crate) const WS_HEADER_RESERVE: usize = 14; + +// Status bytes from the QWP/WS response opcode table. Duplicated here per +// the "no row-API code reuse" stance — the column sender never reaches +// into `crate::ingress::sender::qwp_ws_codec`. +const QWP_STATUS_OK: u8 = 0x00; +const QWP_STATUS_DURABLE_ACK: u8 = 0x02; +const QWP_STATUS_SCHEMA_MISMATCH: u8 = 0x03; +const QWP_STATUS_PARSE_ERROR: u8 = 0x05; +const QWP_STATUS_INTERNAL_ERROR: u8 = 0x06; +const QWP_STATUS_SECURITY_ERROR: u8 = 0x08; +const QWP_STATUS_WRITE_ERROR: u8 = 0x09; + +/// Cap on a single inbound WS frame. Well above QWP's 16 MiB batch limit +/// but small enough to refuse obviously bogus declared lengths early. +const MAX_INBOUND_FRAME_BYTES: u64 = 256 * 1024 * 1024; + +/// QWP spec §Protocol limits: max in-flight batches per connection. +const MAX_IN_FLIGHT: u32 = 128; + +/// Metadata for one published-but-unacked frame. Pushed on publish, +/// popped (front) when the matching OK arrives. +struct PendingAck { + fsn: u64, +} + +/// One pipelined QWP/WebSocket connection owned by the column-major +/// sender. See module docs. +pub(crate) struct ColumnConn { + stream: WsStream, + /// Bytes the WS handshake read past the upgrade response, plus any + /// bytes from inbound WS frames already consumed past their header. + /// Drained before reading more from the socket. + leftover: Vec, + /// Reusable outbound buffer. Bytes 0..WS_HEADER_RESERVE are reserved + /// for the WS header; the encoder writes the QWP frame body from + /// offset WS_HEADER_RESERVE onwards. + write_buf: Vec, + /// Reusable inbound scratch (one ack frame's worth). + read_buf: Vec, + mask_keys: MaskKeySource, + /// Sequence assigned to the next published frame. QWP server numbers + /// client frames starting at 0; first publish gets fsn 0. + next_fsn: u64, + /// Published-but-unacked frames, ordered by fsn. Pushed on publish, + /// popped (front) when the matching OK arrives. 
+ pending_acks: VecDeque, + /// Number of published-but-unacked frames. Redundant with + /// `pending_acks.len()` but avoids a cast for the 128 cap check. + in_flight: u32, + /// For ack_level=Durable: per-table seq_txn watermark the server has + /// reported reaching durable storage. + durable_watermarks: HashMap, + /// Sticky: once `true`, the connection cannot be used for further + /// publishes; the pool drops the slot on return. + must_close: bool, + max_buf_size: usize, + request_timeout: Duration, + durable_ack_opt_in: bool, +} + +impl ColumnConn { + /// Open a fresh column-sender connection. The pool layer + /// ([`super::QuestDb::connect`]) has already extracted pool-specific + /// knobs and refused `sf_*` keys; this function only reaches the + /// remaining QWP/WS settings via [`SenderBuilder::from_conf`]. + pub(crate) fn connect(conf: &str) -> Result { + let builder = SenderBuilder::from_conf(conf)?; + let raw = builder.build_qwp_ws_raw_stream()?; + let mask_keys = MaskKeySource::new() + .map_err(|e| error::fmt!(SocketError, "MaskKeySource init failed: {}", e.0))?; + Ok(Self { + stream: raw.stream, + leftover: raw.leftover, + write_buf: Vec::with_capacity(64 * 1024), + read_buf: Vec::with_capacity(4 * 1024), + mask_keys, + next_fsn: 0, + pending_acks: VecDeque::new(), + in_flight: 0, + durable_watermarks: HashMap::new(), + must_close: false, + max_buf_size: raw.max_buf_size, + request_timeout: raw.request_timeout, + durable_ack_opt_in: raw.durable_ack_opt_in, + }) + } + + pub(crate) fn must_close(&self) -> bool { + self.must_close + } + + /// Hand `encode` a `&mut Vec` with `WS_HEADER_RESERVE` bytes + /// pre-reserved at the front; `encode` appends the QWP frame body to + /// it. Frame the result as a WS binary frame (mask in place), write + /// the bytes to the socket, return the assigned FSN. + /// + /// On any socket or protocol failure the connection is latched as + /// `must_close` and the original error is returned. 
+ pub(crate) fn publish_qwp(&mut self, encode: F) -> Result + where + F: FnOnce(&mut Vec) -> Result<()>, + { + if self.must_close { + return Err(error::fmt!( + SocketError, + "QWP/WebSocket connection latched as terminal; \ + return the sender to the pool and acquire a fresh one." + )); + } + + // Set up the buffer: 14 zero bytes that the WS header will + // overwrite once we know the actual payload length. + self.write_buf.clear(); + self.write_buf.resize(WS_HEADER_RESERVE, 0); + + // Caller writes the QWP frame body. + encode(&mut self.write_buf).inspect_err(|_| { + // Encode failure leaves the connection usable — the bytes + // never hit the wire — but the buffer state needs resetting + // so the next publish starts clean. + self.write_buf.clear(); + })?; + + let payload_len = self.write_buf.len() - WS_HEADER_RESERVE; + if payload_len > self.max_buf_size { + return Err(error::fmt!( + InvalidApiCall, + "QWP frame ({} bytes) exceeds max_buf_size ({} bytes)", + payload_len, + self.max_buf_size + )); + } + + let mask_key = self.mask_keys.next_key().map_err(|e| { + self.latch(error::fmt!(SocketError, "mask key entropy failed: {}", e.0)) + })?; + + // Apply the mask to the QWP frame body in place. + apply_mask(&mut self.write_buf[WS_HEADER_RESERVE..], mask_key, 0); + + // Compute the WS header byte count for this payload length. 
+ let ws_header_len = ws_header_len_for(payload_len); + let header_offset = WS_HEADER_RESERVE - ws_header_len; + write_ws_header( + &mut self.write_buf[header_offset..WS_HEADER_RESERVE], + payload_len, + mask_key, + ); + + self.set_timeouts(Some(self.request_timeout), Some(self.request_timeout))?; + self.stream + .write_all(&self.write_buf[header_offset..]) + .map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket write failed: {}", + e + )) + })?; + self.stream.flush().map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket flush failed: {}", + e + )) + })?; + + let fsn = self.next_fsn; + self.next_fsn = self.next_fsn.wrapping_add(1); + Ok(PublishedFrame { fsn }) + } + + /// Record a just-published frame as in-flight. Called by + /// `ColumnSender::flush` after `publish_qwp` succeeds. + pub(crate) fn push_pending(&mut self, fsn: u64) { + self.pending_acks.push_back(PendingAck { fsn }); + self.in_flight += 1; + } + + /// Number of published-but-unacked frames. + pub(crate) fn in_flight(&self) -> u32 { + self.in_flight + } + + /// `true` when a deferred publish can still leave one in-flight slot + /// for the later non-deferred sync commit frame. + pub(crate) fn has_sync_commit_slot(&self) -> bool { + self.in_flight < MAX_IN_FLIGHT - 1 + } + + pub(crate) fn validate_ack_level(&self, ack_level: AckLevel) -> Result<()> { + if ack_level == AckLevel::Durable && !self.durable_ack_opt_in { + return Err(error::fmt!( + InvalidApiCall, + "AckLevel::Durable requires the pool to be opened with \ + `request_durable_ack=on` in the connect string." + )); + } + Ok(()) + } + + /// Drain any ack responses available without blocking. Returns the + /// number of OK acks consumed. + pub(crate) fn try_drain_acks(&mut self) -> Result { + let mut drained = 0u32; + loop { + match self.try_recv_qwp_response()? 
{ + None => return Ok(drained), + Some(response) => { + self.process_response(response)?; + drained += 1; + } + } + } + } + + /// Block until at least one OK ack arrives. Used when + /// `in_flight == MAX_IN_FLIGHT` to free a slot. + pub(crate) fn drain_one_ack_blocking(&mut self) -> Result<()> { + loop { + let response = self.recv_qwp_response()?; + match &response { + QwpResponse::Ok { .. } => { + self.process_response(response)?; + return Ok(()); + } + _ => { + self.process_response(response)?; + } + } + } + } + + /// Block until all in-flight frames are OK-acked. For + /// `AckLevel::Durable`, also wait for durable watermarks to reach + /// every pending frame's seq_txn. + pub(crate) fn sync_all_acks(&mut self, ack_level: AckLevel) -> Result<()> { + if self.must_close { + return Err(error::fmt!( + SocketError, + "QWP/WebSocket connection latched as terminal." + )); + } + self.validate_ack_level(ack_level)?; + + // Phase 1: drain all OK acks. + let mut durable_targets: HashMap = HashMap::new(); + while self.in_flight > 0 { + let response = self.recv_qwp_response()?; + if let QwpResponse::Ok { tables, .. } = &response + && ack_level == AckLevel::Durable + { + for (t, seq_txn) in tables { + let entry = durable_targets.entry(t.clone()).or_insert(i64::MIN); + if *seq_txn > *entry { + *entry = *seq_txn; + } + } + } + self.process_response(response)?; + } + + // Phase 2 (Durable only): wait for watermarks. + if ack_level == AckLevel::Durable { + while durable_targets.iter().any(|(t, target)| { + self.durable_watermarks.get(t).copied().unwrap_or(i64::MIN) < *target + }) { + let response = self.recv_qwp_response()?; + self.process_response(response)?; + } + } + + Ok(()) + } + + /// Dispatch a parsed QWP response: validate OK sequence, update + /// in-flight tracking, absorb durable watermarks, latch on error. 
+ fn process_response(&mut self, response: QwpResponse) -> Result<()> { + match response { + QwpResponse::Ok { sequence, tables } => { + // The server sends cumulative OKs: sequence=N means all + // frames up to and including N are committed. Pop every + // pending entry whose fsn <= sequence. + let mut popped = 0u32; + while let Some(front) = self.pending_acks.front() { + if front.fsn > sequence { + break; + } + self.pending_acks.pop_front(); + popped += 1; + } + if popped == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP OK sequence {} has no matching pending frame (next pending: {:?})", + sequence, + self.pending_acks.front().map(|p| p.fsn) + ))); + } + self.in_flight -= popped; + for (t, seq_txn) in tables { + self.durable_watermarks + .entry(t) + .and_modify(|w| { + if seq_txn > *w { + *w = seq_txn; + } + }) + .or_insert(seq_txn); + } + Ok(()) + } + QwpResponse::DurableAck { tables } => { + for (t, seq_txn) in tables { + self.durable_watermarks + .entry(t) + .and_modify(|w| { + if seq_txn > *w { + *w = seq_txn; + } + }) + .or_insert(seq_txn); + } + Ok(()) + } + QwpResponse::Error { + sequence, + status, + message, + } => { + let err = map_error_status(status, &message); + Err(self.latch(crate::Error::new( + err.code(), + format!( + "QWP server error on fsn {}: status=0x{:02x}, message={:?}", + sequence, status, message + ), + ))) + } + } + } + + /// `true` when the in-flight count has hit the protocol cap and a + /// blocking drain is needed before the next publish. + pub(crate) fn at_in_flight_cap(&self) -> bool { + self.in_flight >= MAX_IN_FLIGHT + } + + /// Latches the connection as terminal and returns the originating + /// error. Used by every socket-side failure path. + fn latch(&mut self, err: crate::Error) -> crate::Error { + self.must_close = true; + err + } + + fn set_timeouts(&self, read: Option, write: Option) -> Result<()> { + // WsStream::set_timeouts is `fn` (not pub(crate)). 
We replicate + // the socket timeout setting via the tcp_stream accessor, but + // since WsStream::set_timeouts is private we have to use the + // Read/Write IO directly. Skip explicit timeout muting here: + // the underlying socket already has timeouts set during connect + // (see establish_connection in qwp_ws.rs). If they need refresh + // for long flushes, expose a setter on WsStream. + let _ = read; + let _ = write; + Ok(()) + } + + /// Non-blocking attempt to read one QWP/WS data frame. Returns + /// `Ok(None)` if no complete frame is available yet (WouldBlock). + fn try_recv_qwp_response(&mut self) -> Result> { + loop { + match FrameHeader::parse(&self.leftover) { + Ok(h) => { + if !h.fin { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket server sent a fragmented frame; QWP is FIN-only" + ))); + } + if h.payload_len > MAX_INBOUND_FRAME_BYTES { + return Err(self.latch(error::fmt!( + SocketError, + "WS frame declared {} payload bytes (max {})", + h.payload_len, + MAX_INBOUND_FRAME_BYTES + ))); + } + let payload_len = h.payload_len as usize; + let header_len = h.header_len; + // Check if we have enough leftover for header + payload. + if self.leftover.len() < header_len + payload_len { + // We have the header but not the full payload yet. + // Try one non-blocking read to get more. + if !self.try_fill_leftover()? { + return Ok(None); + } + continue; + } + // Consume header + payload from leftover. 
+ self.leftover.drain(..header_len); + self.read_buf.clear(); + self.read_buf + .extend_from_slice(&self.leftover[..payload_len]); + self.leftover.drain(..payload_len); + match h.opcode { + Opcode::Binary => { + return parse_qwp_response(&self.read_buf) + .inspect_err(|_| { + self.must_close = true; + }) + .map(Some); + } + Opcode::Ping => { + self.send_pong(payload_len)?; + continue; + } + Opcode::Pong => continue, + Opcode::Close => { + self.must_close = true; + return Err(error::fmt!( + SocketError, + "QWP/WebSocket server closed the connection" + )); + } + } + } + Err(FrameError::Incomplete) => { + if !self.try_fill_leftover()? { + return Ok(None); + } + } + Err(FrameError::Protocol(msg)) => { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket frame parse error: {}", + msg + ))); + } + } + } + } + + /// Read one QWP/WS data frame's payload and decode the QWP response. + /// Ping frames are answered transparently; pong frames are dropped; + /// close frames latch the connection. + fn recv_qwp_response(&mut self) -> Result { + loop { + let header = self.read_ws_frame_header()?; + if !header.fin { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket server sent a fragmented frame; QWP is FIN-only" + ))); + } + let payload_len = header.payload_len as usize; + if header.payload_len > MAX_INBOUND_FRAME_BYTES { + return Err(self.latch(error::fmt!( + SocketError, + "WS frame declared {} payload bytes (max {})", + header.payload_len, + MAX_INBOUND_FRAME_BYTES + ))); + } + self.read_buf.clear(); + self.read_buf.resize(payload_len, 0); + self.read_exact_into_buf(payload_len)?; + match header.opcode { + Opcode::Binary => { + return parse_qwp_response(&self.read_buf).inspect_err(|_| { + // Parse error: not a transport failure; the + // server gave us bytes that don't conform to the + // QWP response schema. Latch and surface. 
+ self.must_close = true; + }); + } + Opcode::Ping => { + self.send_pong(payload_len)?; + continue; + } + Opcode::Pong => { + continue; + } + Opcode::Close => { + self.must_close = true; + return Err(error::fmt!( + SocketError, + "QWP/WebSocket server closed the connection" + )); + } + } + } + } + + /// Read a complete WS frame header from `leftover` / the socket. + fn read_ws_frame_header(&mut self) -> Result { + // Need at most 10 bytes for any header we'd parse (server frames + // are unmasked). + loop { + match FrameHeader::parse(&self.leftover) { + Ok(h) => { + // Trim the header bytes from leftover and return. + let header_len = h.header_len; + self.leftover.drain(..header_len); + return Ok(h); + } + Err(FrameError::Incomplete) => { + self.fill_leftover()?; + } + Err(FrameError::Protocol(msg)) => { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket frame parse error: {}", + msg + ))); + } + } + } + } + + /// Fill `read_buf[..len]` from `leftover` + the socket. + fn read_exact_into_buf(&mut self, len: usize) -> Result<()> { + let from_leftover = self.leftover.len().min(len); + self.read_buf[..from_leftover].copy_from_slice(&self.leftover[..from_leftover]); + self.leftover.drain(..from_leftover); + let mut filled = from_leftover; + while filled < len { + let n = self + .stream + .read(&mut self.read_buf[filled..]) + .map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket read failed: {}", + e + )) + })?; + if n == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly during frame read" + ))); + } + filled += n; + } + Ok(()) + } + + /// Non-blocking attempt to read more bytes from the socket into + /// `leftover`. Returns `Ok(true)` if data was read, `Ok(false)` on + /// WouldBlock. 
+ fn try_fill_leftover(&mut self) -> Result { + let mut chunk = [0u8; 4096]; + match self.stream.read_nonblocking_once(&mut chunk) { + Ok(0) => Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly" + ))), + Ok(n) => { + self.leftover.extend_from_slice(&chunk[..n]); + Ok(true) + } + Err(e) if e.kind() == io::ErrorKind::WouldBlock => Ok(false), + Err(e) => Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket non-blocking read failed: {}", + e + ))), + } + } + + /// Read at least one more byte from the socket into `leftover`. + fn fill_leftover(&mut self) -> Result<()> { + let mut chunk = [0u8; 1024]; + let n = self.stream.read(&mut chunk).map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket read failed: {}", + e + )) + })?; + if n == 0 { + return Err(self.latch(error::fmt!( + SocketError, + "QWP/WebSocket socket closed unexpectedly while reading frame header" + ))); + } + self.leftover.extend_from_slice(&chunk[..n]); + Ok(()) + } + + fn send_pong(&mut self, payload_len: usize) -> Result<()> { + // The pong payload must echo the ping payload, which is in + // read_buf[..payload_len]. + let mask_key = self.mask_keys.next_key().map_err(|e| { + self.latch(error::fmt!(SocketError, "mask key entropy failed: {}", e.0)) + })?; + // Use a small scratch buffer to encode the pong; pongs are tiny + // (≤ 125 bytes by RFC) so this allocation is negligible. + let mut pong = Vec::with_capacity(WS_HEADER_RESERVE + payload_len); + frame::encode_client_frame( + &mut pong, + Opcode::Pong, + mask_key, + &self.read_buf[..payload_len], + ); + self.stream.write_all(&pong).map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket pong write failed: {}", + e + )) + })?; + self.stream.flush().map_err(|e| { + self.latch(error::fmt!( + SocketError, + "QWP/WebSocket pong flush failed: {}", + e + )) + })?; + Ok(()) + } +} + +/// Outcome of a successful publish call. 
+pub(crate) struct PublishedFrame { + pub(crate) fsn: u64, +} + +#[derive(Debug)] +enum QwpResponse { + Ok { + sequence: u64, + tables: Vec<(String, i64)>, + }, + DurableAck { + tables: Vec<(String, i64)>, + }, + Error { + sequence: u64, + status: u8, + message: String, + }, +} + +/// Parse a QWP/WS response payload (the body of a binary WS frame). +fn parse_qwp_response(payload: &[u8]) -> Result { + if payload.is_empty() { + return Err(error::fmt!(SocketError, "Empty QWP response frame")); + } + let status = payload[0]; + match status { + QWP_STATUS_OK => { + if payload.len() < 1 + 8 + 2 { + return Err(error::fmt!(SocketError, "QWP OK response truncated")); + } + let sequence = u64::from_le_bytes(payload[1..9].try_into().unwrap()); + let tables = parse_table_entries(payload, 9, "QWP OK response")?; + Ok(QwpResponse::Ok { sequence, tables }) + } + QWP_STATUS_DURABLE_ACK => { + let tables = parse_table_entries(payload, 1, "QWP durable ACK response")?; + Ok(QwpResponse::DurableAck { tables }) + } + _ => { + let (sequence, message) = parse_error_body(payload)?; + Ok(QwpResponse::Error { + sequence, + status, + message, + }) + } + } +} + +fn parse_table_entries( + payload: &[u8], + table_count_offset: usize, + context: &'static str, +) -> Result> { + let table_count_end = table_count_offset + .checked_add(2) + .ok_or_else(|| error::fmt!(SocketError, "{} table count offset overflow", context))?; + if payload.len() < table_count_end { + return Err(error::fmt!(SocketError, "{} truncated", context)); + } + let table_count = u16::from_le_bytes( + payload[table_count_offset..table_count_end] + .try_into() + .unwrap(), + ) as usize; + let mut pos = table_count_end; + let mut entries = Vec::with_capacity(table_count); + for _ in 0..table_count { + let name_len_end = pos + .checked_add(2) + .ok_or_else(|| error::fmt!(SocketError, "{} table entry offset overflow", context))?; + if payload.len() < name_len_end { + return Err(error::fmt!( + SocketError, + "{} table entry 
truncated", + context + )); + } + let name_len = u16::from_le_bytes(payload[pos..name_len_end].try_into().unwrap()) as usize; + pos = name_len_end; + if name_len == 0 { + return Err(error::fmt!(SocketError, "{} table name is empty", context)); + } + let name_end = pos + .checked_add(name_len) + .ok_or_else(|| error::fmt!(SocketError, "{} table name length overflow", context))?; + let seq_txn_end = name_end + .checked_add(8) + .ok_or_else(|| error::fmt!(SocketError, "{} table entry length overflow", context))?; + if payload.len() < seq_txn_end { + return Err(error::fmt!( + SocketError, + "{} table entry truncated", + context + )); + } + let name = std::str::from_utf8(&payload[pos..name_end]) + .map_err(|_| error::fmt!(SocketError, "{} table name not UTF-8", context))? + .to_owned(); + let seq_txn = i64::from_le_bytes(payload[name_end..seq_txn_end].try_into().unwrap()); + entries.push((name, seq_txn)); + pos = seq_txn_end; + } + if pos != payload.len() { + return Err(error::fmt!( + SocketError, + "{} has trailing bytes after table entries", + context + )); + } + Ok(entries) +} + +fn parse_error_body(payload: &[u8]) -> Result<(u64, String)> { + if payload.len() < 1 + 8 + 2 { + return Err(error::fmt!(SocketError, "QWP error response truncated")); + } + let sequence = u64::from_le_bytes(payload[1..9].try_into().unwrap()); + let msg_len = u16::from_le_bytes(payload[9..11].try_into().unwrap()) as usize; + if msg_len > 1024 { + return Err(error::fmt!( + SocketError, + "QWP error response message too long (declared {} bytes, max 1024)", + msg_len + )); + } + let msg_end = 11usize + .checked_add(msg_len) + .ok_or_else(|| error::fmt!(SocketError, "QWP error response message length overflow"))?; + if payload.len() < msg_end { + return Err(error::fmt!( + SocketError, + "QWP error response truncated (declared {} bytes)", + msg_len + )); + } + if payload.len() != msg_end { + return Err(error::fmt!( + SocketError, + "QWP error response has trailing bytes after message" + )); + } + 
let message = std::str::from_utf8(&payload[11..msg_end]) + .map_err(|_| error::fmt!(SocketError, "QWP error message not UTF-8"))? + .to_owned(); + Ok((sequence, message)) +} + +fn map_error_status(status: u8, msg: &str) -> crate::Error { + match status { + QWP_STATUS_SCHEMA_MISMATCH => { + error::fmt!(InvalidApiCall, "QWP schema mismatch: {}", msg) + } + QWP_STATUS_PARSE_ERROR => error::fmt!(InvalidApiCall, "QWP parse error: {}", msg), + QWP_STATUS_INTERNAL_ERROR => error::fmt!(ServerFlushError, "QWP internal error: {}", msg), + QWP_STATUS_SECURITY_ERROR => error::fmt!(AuthError, "QWP security error: {}", msg), + QWP_STATUS_WRITE_ERROR => error::fmt!(ServerFlushError, "QWP write error: {}", msg), + _ => error::fmt!( + ServerFlushError, + "QWP unrecognised error status 0x{:02x}: {}", + status, + msg + ), + } +} + +/// On-wire byte count of the client-to-server WS header for a given +/// payload length (mask bit always set ⇒ +4 bytes for the mask key). +#[inline] +fn ws_header_len_for(payload_len: usize) -> usize { + if payload_len <= 125 { + 2 + 4 + } else if payload_len <= 0xFFFF { + 4 + 4 + } else { + 10 + 4 + } +} + +/// Write the RFC 6455 binary-frame client header into `out`. `out.len()` +/// must equal [`ws_header_len_for(payload_len)`]. 
+fn write_ws_header(out: &mut [u8], payload_len: usize, mask_key: [u8; 4]) { + const FIN_BIT: u8 = 0x80; + const BINARY_OPCODE: u8 = 0x2; + const MASK_BIT: u8 = 0x80; + out[0] = FIN_BIT | BINARY_OPCODE; + let len_bytes; + let mask_offset; + if payload_len <= 125 { + out[1] = MASK_BIT | (payload_len as u8); + mask_offset = 2; + len_bytes = 0; + } else if payload_len <= 0xFFFF { + out[1] = MASK_BIT | 126; + out[2..4].copy_from_slice(&(payload_len as u16).to_be_bytes()); + mask_offset = 4; + len_bytes = 2; + } else { + out[1] = MASK_BIT | 127; + out[2..10].copy_from_slice(&(payload_len as u64).to_be_bytes()); + mask_offset = 10; + len_bytes = 8; + } + let _ = len_bytes; + out[mask_offset..mask_offset + 4].copy_from_slice(&mask_key); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ws_header_len_matches_payload_length_class() { + assert_eq!(ws_header_len_for(0), 6); + assert_eq!(ws_header_len_for(125), 6); + assert_eq!(ws_header_len_for(126), 8); + assert_eq!(ws_header_len_for(0xFFFF), 8); + assert_eq!(ws_header_len_for(0x1_0000), 14); + assert_eq!(ws_header_len_for(1 << 24), 14); + } + + #[test] + fn write_ws_header_short_form() { + let mut buf = [0u8; 6]; + write_ws_header(&mut buf, 5, [0xDE, 0xAD, 0xBE, 0xEF]); + assert_eq!(buf[0], 0x82); // FIN=1, opcode=Binary + assert_eq!(buf[1], 0x80 | 5); // MASK=1, len=5 + assert_eq!(&buf[2..6], &[0xDE, 0xAD, 0xBE, 0xEF]); + } + + #[test] + fn write_ws_header_16bit_form() { + let mut buf = [0u8; 8]; + write_ws_header(&mut buf, 200, [1, 2, 3, 4]); + assert_eq!(buf[0], 0x82); + assert_eq!(buf[1], 0x80 | 126); + assert_eq!(u16::from_be_bytes([buf[2], buf[3]]), 200); + assert_eq!(&buf[4..8], &[1, 2, 3, 4]); + } + + #[test] + fn write_ws_header_64bit_form() { + let mut buf = [0u8; 14]; + write_ws_header(&mut buf, 0x1_0000, [9, 8, 7, 6]); + assert_eq!(buf[0], 0x82); + assert_eq!(buf[1], 0x80 | 127); + assert_eq!( + u64::from_be_bytes([ + buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9] + ]), + 0x1_0000 
+ ); + assert_eq!(&buf[10..14], &[9, 8, 7, 6]); + } + + #[test] + fn parse_qwp_ok_with_one_table() { + // status=OK, sequence=42, table_count=1, name_len=2, "tx", seq_txn=7 + let mut payload = vec![0u8]; + payload.extend_from_slice(&42u64.to_le_bytes()); + payload.extend_from_slice(&1u16.to_le_bytes()); + payload.extend_from_slice(&2u16.to_le_bytes()); + payload.extend_from_slice(b"tx"); + payload.extend_from_slice(&7i64.to_le_bytes()); + let response = parse_qwp_response(&payload).unwrap(); + match response { + QwpResponse::Ok { sequence, tables } => { + assert_eq!(sequence, 42); + assert_eq!(tables, vec![("tx".to_owned(), 7)]); + } + other => panic!("expected Ok, got {other:?}"), + } + } + + #[test] + fn parse_qwp_durable_ack_empty() { + // status=DurableAck, table_count=0 + let mut payload = vec![QWP_STATUS_DURABLE_ACK]; + payload.extend_from_slice(&0u16.to_le_bytes()); + let response = parse_qwp_response(&payload).unwrap(); + match response { + QwpResponse::DurableAck { tables } => { + assert!(tables.is_empty()); + } + other => panic!("expected DurableAck, got {other:?}"), + } + } + + #[test] + fn parse_qwp_error_truncated_rejected() { + // status=PARSE_ERROR but only the status byte present + let err = parse_qwp_response(&[QWP_STATUS_PARSE_ERROR]).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::SocketError); + } +} diff --git a/questdb-rs/src/ingress/column_sender/db.rs b/questdb-rs/src/ingress/column_sender/db.rs new file mode 100644 index 00000000..bdb1117f --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/db.rs @@ -0,0 +1,503 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender connection pool. +//! +//! `QuestDb` is a thread-safe pool of [`crate::ingress::Sender`] handles to +//! a single QuestDB QWP/WebSocket endpoint. The pool eagerly opens +//! `pool_size` connections at `connect`, auto-grows up to `pool_max` on +//! demand, and (under `pool_reap=auto`) runs a background thread that closes +//! above-`pool_size` connections after they have been idle for +//! `pool_idle_timeout_ms`. +//! +//! Each pool slot is handed out as a [`BorrowedSender<'_>`] which returns +//! itself to the pool on `Drop`. Slots whose underlying connection has +//! latched into `must_close=true` are dropped on return instead of being +//! recycled. + +use std::fmt::{self, Debug, Formatter}; +use std::marker::PhantomData; +use std::ops::{Deref, DerefMut}; +use std::rc::Rc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::{Arc, Condvar, Mutex}; +use std::thread::{self, JoinHandle}; +use std::time::{Duration, Instant}; + +use crate::{Result, error}; + +use super::conf::{self, PoolReap}; +use super::conn::ColumnConn; +use super::sender::ColumnSender; + +/// Lower bound on the reaper's wake interval. +const REAPER_MIN_TICK: Duration = Duration::from_secs(5); + +/// Connection pool for the column-major sender API. +/// +/// Construct with [`QuestDb::connect`]. 
Share the pool across threads — its +/// internal state is `Mutex`-guarded so [`QuestDb::borrow_sender`] / +/// [`QuestDb::reap_idle`] / Drop-driven returns are safe to interleave. +/// +/// Each borrow ([`BorrowedSender`]) is **not** `Send` — it belongs to the +/// thread that borrowed it. To ingest in parallel, borrow one sender per +/// worker thread from the same `QuestDb`. +pub struct QuestDb { + inner: Arc, + reaper: Option>, +} + +struct DbInner { + /// Original connect string. Kept verbatim so auto-grow can spin up a new + /// connection with the same settings. + conf: String, + pool_size: usize, + pool_max: usize, + pool_idle_timeout: Duration, + state: Mutex, + /// Wakes the reaper thread on `shutdown` and lets a future blocking + /// borrow wait for a free slot once we grow `borrow_sender` past + /// fail-fast (not in v1). + cv: Condvar, + shutdown: AtomicBool, +} + +#[derive(Default)] +struct PoolState { + /// Idle connections, oldest-first (FIFO push/pop from the back). + free: Vec, + /// Sum of currently-borrowed senders + in-flight grow operations. + in_use: usize, +} + +impl PoolState { + fn total(&self) -> usize { + self.free.len() + self.in_use + } +} + +struct PoolEntry { + conn: ColumnConn, + /// Connection-scoped schema interner. Travels with the slot so its + /// `(signature → id)` map stays coherent across borrow/return cycles; + /// both client and server build the same map by first-emit order, so + /// dropping it would resync the next FULL emit at id 0 and corrupt + /// the server's schema table. + schema_registry: super::encoder::SchemaRegistry, + /// Connection-scoped global symbol dictionary — same coherence + /// argument: the server tracks ids by first-emit order over the life + /// of the WS connection, so the dict must travel with the slot. + symbol_dict: crate::ingress::buffer::SymbolGlobalDict, + last_idle_at: Instant, +} + +impl QuestDb { + /// Open a pool against `conf`. 
+ /// + /// The connect string must use a QWP/WebSocket schema (`qwpws::` / + /// `qwpwss::` / `ws::` / `wss::`). Pool-specific keys are recognised: + /// + /// | Key | Default | Meaning | + /// |------------------------|---------|----------------------------------------------------------------| + /// | `pool_size` | 1 | Warm / minimum connections, opened eagerly here. | + /// | `pool_max` | 64 | Hard cap on auto-grow. Borrow at the cap returns `InvalidApiCall`. | + /// | `pool_idle_timeout_ms` | 60000 | Above-`pool_size` idle connections are closed after this long. | + /// | `pool_reap` | `auto` | `auto` runs a background reaper; `manual` requires `reap_idle`. | + /// + /// Store-and-forward keys (`sf_*`, `sender_id`) are **refused** here — + /// see `doc/COLUMN_SENDER_PLAN.md` §8. Use the row-major + /// [`crate::ingress::Sender`] API if you need on-disk durability. + pub fn connect(conf: &str) -> Result { + let parsed = conf::parse(conf)?; + let pool_cfg = parsed.pool; + + let mut free = Vec::with_capacity(pool_cfg.pool_size); + let now = Instant::now(); + for slot in 0..pool_cfg.pool_size { + let conn = ColumnConn::connect(conf).map_err(|err| { + crate::Error::new( + err.code(), + format!( + "Failed to open pool slot {} of {}: {}", + slot + 1, + pool_cfg.pool_size, + err.msg() + ), + ) + })?; + free.push(PoolEntry { + conn, + schema_registry: super::encoder::SchemaRegistry::new(), + symbol_dict: crate::ingress::buffer::SymbolGlobalDict::new(), + last_idle_at: now, + }); + } + + let inner = Arc::new(DbInner { + conf: conf.to_owned(), + pool_size: pool_cfg.pool_size, + pool_max: pool_cfg.pool_max, + pool_idle_timeout: pool_cfg.pool_idle_timeout, + state: Mutex::new(PoolState { free, in_use: 0 }), + cv: Condvar::new(), + shutdown: AtomicBool::new(false), + }); + + let reaper = match pool_cfg.pool_reap { + PoolReap::Auto => Some(spawn_reaper(Arc::clone(&inner))), + PoolReap::Manual => None, + }; + + Ok(Self { inner, reaper }) + } + + /// Borrow a sender. 
+ /// + /// Selection: pop the most-recently-returned slot from the free list; + /// failing that, open a new connection if we are below `pool_max`; + /// failing that, return `InvalidApiCall` (fail-fast at cap). + pub fn borrow_sender(&self) -> Result> { + let cs = self.pick_sender()?; + Ok(BorrowedSender::new(self, cs)) + } + + /// FFI escape hatch: like [`Self::borrow_sender`] but the returned + /// handle is not lifetime-bound to `&self`. Carries an `Arc` + /// internally so it can outlive the user-facing `QuestDb` pointer + /// (the pool's free list and reaper stay alive as long as any + /// borrow is outstanding). + /// + /// Hidden from the Rust API because Rust callers should prefer the + /// lifetime-bound `borrow_sender`, which catches use-after-close at + /// compile time. C callers reach this through `questdb_db_borrow_sender`. + #[doc(hidden)] + pub fn borrow_sender_owned(&self) -> Result { + let cs = self.pick_sender()?; + Ok(OwnedSender { + inner: Arc::clone(&self.inner), + sender: Some(cs), + }) + } + + fn pick_sender(&self) -> Result { + let mut state = self.inner.state.lock().expect("pool mutex poisoned"); + if let Some(entry) = state.free.pop() { + state.in_use += 1; + drop(state); + return Ok(ColumnSender::new( + entry.conn, + entry.schema_registry, + entry.symbol_dict, + )); + } + + if state.total() >= self.inner.pool_max { + return Err(error::fmt!( + InvalidApiCall, + "Connection pool exhausted: {} connections are currently borrowed and \ + the pool is at its `pool_max` cap of {}. Return a sender or raise `pool_max`.", + state.in_use, + self.inner.pool_max + )); + } + + // Reserve the slot before releasing the lock so a concurrent + // `borrow_sender` cannot over-grow past `pool_max`. 
+ state.in_use += 1; + drop(state); + + let conn = match ColumnConn::connect(&self.inner.conf) { + Ok(c) => c, + Err(err) => { + let mut state = self.inner.state.lock().expect("pool mutex poisoned"); + state.in_use -= 1; + return Err(err); + } + }; + + Ok(ColumnSender::new( + conn, + super::encoder::SchemaRegistry::new(), + crate::ingress::buffer::SymbolGlobalDict::new(), + )) + } + + /// Manually reap idle connections. + /// + /// Closes free-list entries that have been idle longer than + /// `pool_idle_timeout_ms`, never shrinking total connection count below + /// `pool_size`. Returns the number of connections closed. + /// + /// Under the default `pool_reap=auto`, a background thread invokes this + /// logic periodically and this call is harmless. Under + /// `pool_reap=manual`, callers that want shrinking must invoke this on + /// their own cadence. + pub fn reap_idle(&self) -> usize { + reap_idle_inner(&self.inner) + } + + /// Close the pool: stop the reaper (if any), drop all idle connections, + /// and consume `self`. + /// + /// Drop has the same effect; `close` exists for parity with the C ABI + /// (where `Drop` is not available) and to give callers a place to handle + /// any reaper-join errors explicitly in the future. + pub fn close(self) { + drop(self); + } + + /// Snapshot the number of idle (free) connections currently in the pool. + #[doc(hidden)] + pub fn free_count(&self) -> usize { + self.inner + .state + .lock() + .expect("pool mutex poisoned") + .free + .len() + } + + /// Snapshot the number of currently-borrowed (or in-flight-being-built) + /// connections. 
+ #[doc(hidden)] + pub fn in_use_count(&self) -> usize { + self.inner.state.lock().expect("pool mutex poisoned").in_use + } +} + +impl Debug for QuestDb { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + let state = self.inner.state.lock(); + let (free, in_use) = match state { + Ok(s) => (s.free.len(), s.in_use), + Err(_) => (0, 0), + }; + f.debug_struct("QuestDb") + .field("pool_size", &self.inner.pool_size) + .field("pool_max", &self.inner.pool_max) + .field("free", &free) + .field("in_use", &in_use) + .finish() + } +} + +impl Drop for QuestDb { + fn drop(&mut self) { + // Wake the reaper and let it observe shutdown. + self.inner.shutdown.store(true, Ordering::SeqCst); + // Notifying under the mutex avoids the lost-wakeup race where the + // reaper has just released the lock and is about to wait. + { + let _g = self.inner.state.lock().expect("pool mutex poisoned"); + self.inner.cv.notify_all(); + } + if let Some(handle) = self.reaper.take() { + let _ = handle.join(); + } + // Remaining free senders are dropped when `inner` (Arc) hits 0. + } +} + +/// A sender borrowed from a [`QuestDb`] pool. +/// +/// On `Drop` the underlying connection is returned to the pool unless it +/// has latched into `must_close=true`, in which case it is dropped (and +/// auto-grow will open a fresh one for the next borrow). +/// +/// `BorrowedSender` is **not** `Send` or `Sync`. The borrowed connection +/// belongs to the borrowing thread for the duration of the borrow. +pub struct BorrowedSender<'a> { + db: &'a QuestDb, + sender: Option, + /// !Send / !Sync marker — `Rc<()>` poisons both auto traits without any + /// runtime cost. 
// NOTE(review): tail of the `BorrowedSender` struct, whose head lies before
// this chunk. The generic argument of `PhantomData` was lost when the patch
// text was extracted, so the fragment is kept exactly as found.
    _not_send: PhantomData>,
}

impl<'a> BorrowedSender<'a> {
    /// Wrap `sender` so that dropping the wrapper hands it back to `db`.
    fn new(db: &'a QuestDb, sender: ColumnSender) -> Self {
        Self {
            db,
            sender: Some(sender),
            _not_send: PhantomData,
        }
    }
}

impl Debug for BorrowedSender<'_> {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.debug_struct("BorrowedSender")
            .field("sender", &self.sender)
            .finish()
    }
}

impl Deref for BorrowedSender<'_> {
    type Target = ColumnSender;

    /// # Panics
    ///
    /// Panics if the sender was already taken, which only happens in `Drop`.
    fn deref(&self) -> &Self::Target {
        self.sender
            .as_ref()
            .expect("borrowed sender already returned")
    }
}

impl DerefMut for BorrowedSender<'_> {
    /// # Panics
    ///
    /// Panics if the sender was already taken, which only happens in `Drop`.
    fn deref_mut(&mut self) -> &mut Self::Target {
        self.sender
            .as_mut()
            .expect("borrowed sender already returned")
    }
}

impl Drop for BorrowedSender<'_> {
    /// Hand the wrapped sender back to the pool (no-op if already taken).
    fn drop(&mut self) {
        let Some(sender) = self.sender.take() else {
            return;
        };
        return_to_pool(&self.db.inner, sender);
    }
}

/// Owned (lifetime-free) variant of [`BorrowedSender`] used by the C FFI.
///
/// Holds an `Arc<DbInner>` so the pool's state outlives the user-facing
/// `QuestDb` pointer — the C ABI can free its `questdb_db*` before
/// dropping outstanding `column_sender*` handles without invalidating the
/// free list / mutex.
#[doc(hidden)]
pub struct OwnedSender {
    inner: Arc<DbInner>,
    sender: Option<ColumnSender>,
}

impl OwnedSender {
    /// Borrow the underlying [`ColumnSender`] mutably. Always returns a
    /// live reference until `Drop` runs.
    pub fn get_mut(&mut self) -> &mut ColumnSender {
        self.sender
            .as_mut()
            .expect("OwnedSender already returned to the pool")
    }

    /// Inspect the wrapped sender without taking ownership.
    pub fn get(&self) -> &ColumnSender {
        self.sender
            .as_ref()
            .expect("OwnedSender already returned to the pool")
    }
}

impl Drop for OwnedSender {
    /// Hand the wrapped sender back to the pool (no-op if already taken).
    fn drop(&mut self) {
        if let Some(sender) = self.sender.take() {
            return_to_pool(&self.inner, sender);
        }
    }
}

/// Give `sender` back to the pool behind `inner`.
///
/// Decrements `in_use` and, unless the sender reports `must_close()`, pushes
/// its connection, schema registry and symbol dictionary onto the free list
/// stamped with the current time.
///
/// # Panics
///
/// Panics if the pool mutex is poisoned.
fn return_to_pool(inner: &Arc<DbInner>, sender: ColumnSender) {
    let must_close = sender.must_close();
    let mut state = inner.state.lock().expect("pool mutex poisoned");
    state.in_use -= 1;
    if !must_close {
        state.free.push(PoolEntry {
            conn: sender.conn,
            schema_registry: sender.schema_registry,
            symbol_dict: sender.symbol_dict,
            last_idle_at: Instant::now(),
        });
    }
    // Release the lock explicitly. Whatever is still owned by `sender` (the
    // whole sender when `must_close`, otherwise the fields that were not
    // moved into the pool entry) is dropped when this function returns,
    // i.e. AFTER the mutex has been released — function parameters outlive
    // every statement in the body. Closing a connection therefore never
    // stalls concurrent borrows.
    drop(state);
}

/// Start the background thread that periodically reaps idle connections.
///
/// # Panics
///
/// Panics if the OS refuses to spawn the thread.
fn spawn_reaper(inner: Arc<DbInner>) -> JoinHandle<()> {
    let tick = reaper_tick(inner.pool_idle_timeout);
    thread::Builder::new()
        .name("questdb-column-sender-pool-reaper".to_string())
        .spawn(move || reaper_loop(inner, tick))
        .expect("failed to spawn pool reaper thread")
}

/// Reaper wake-up period: one twelfth of the idle timeout, but never less
/// than `REAPER_MIN_TICK`.
fn reaper_tick(idle_timeout: Duration) -> Duration {
    (idle_timeout / 12).max(REAPER_MIN_TICK)
}

/// Body of the reaper thread: sleep on the pool condvar for `tick`, then
/// reap idle connections, until `shutdown` is observed.
fn reaper_loop(inner: Arc<DbInner>, tick: Duration) {
    loop {
        if inner.shutdown.load(Ordering::SeqCst) {
            break;
        }
        // NOTE(review): `shutdown` is read before the mutex is taken; if the
        // flag is set and the condvar notified in between, this iteration
        // presumably still waits out a full `tick` — confirm against the
        // shutdown path.
        let state = inner.state.lock().expect("pool mutex poisoned");
        let (state, _) = inner
            .cv
            .wait_timeout(state, tick)
            .expect("pool mutex poisoned");
        drop(state);
        if inner.shutdown.load(Ordering::SeqCst) {
            break;
        }
        reap_idle_inner(&inner);
    }
}

/// Remove free-list entries that have been idle for longer than
/// `pool_idle_timeout`, never shrinking the pool to `pool_size` or below.
/// Returns the number of connections closed.
///
/// # Panics
///
/// Panics if the pool mutex is poisoned.
fn reap_idle_inner(inner: &DbInner) -> usize {
    // Drop the to-be-closed connections OUTSIDE the lock so closing a connection
    // (which may take an unbounded amount of time) does not stall concurrent
    // borrows.
    let to_drop = {
        let mut state = inner.state.lock().expect("pool mutex poisoned");
        let mut to_drop = Vec::new();
        let now = Instant::now();
        // Free-list is oldest at front, newest at back (push on return /
        // pop on borrow). `total()` is re-checked before every removal so
        // the pool is never reaped down to `pool_size` or below.
        let mut i = 0;
        while i < state.free.len() {
            if state.total() <= inner.pool_size {
                break;
            }
            let idle_for = now.saturating_duration_since(state.free[i].last_idle_at);
            if idle_for > inner.pool_idle_timeout {
                // `remove` shifts the tail down, so `i` already points at
                // the next candidate.
                let entry = state.free.remove(i);
                to_drop.push(entry.conn);
            } else {
                i += 1;
            }
        }
        to_drop
    };
    let dropped = to_drop.len();
    drop(to_drop);
    dropped
}
diff --git a/questdb-rs/src/ingress/column_sender/encoder.rs b/questdb-rs/src/ingress/column_sender/encoder.rs
new file mode 100644
index 00000000..8443c1e8
--- /dev/null
+++ b/questdb-rs/src/ingress/column_sender/encoder.rs
@@ -0,0 +1,906 @@
+/*******************************************************************************
+ * ___ _ ____ ____
+ * / _ \ _ _ ___ ___| |_| _ \| __ )
+ * | | | | | | |/ _ \/ __| __| | | | _ \
+ * | |_| | |_| | __/\__ \ |_| |_| | |_) |
+ * \__\_\\__,_|\___||___/\__|____/|____/
+ *
+ * Copyright (c) 2014-2019 Appsicle
+ * Copyright (c) 2019-2025 QuestDB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + ******************************************************************************/ + +//! Column-sender QWP/WebSocket frame encoder. +//! +//! Writes the QWP frame body for a `Chunk` directly into the connection's +//! reusable outbound buffer — no allocation per flush, no per-column +//! aggregation copy. The no-null hot path for fixed-width columns is a +//! single `extend_from_slice` (memcpy) straight from the caller's buffer. +//! +//! See `doc/COLUMN_SENDER_PLAN.md` for the design rationale. + +use std::collections::HashMap; +use std::slice; + +use crate::ingress::buffer::SymbolGlobalDict; +use crate::{Result, error}; + +use super::chunk::{ + Chunk, ColumnDescriptor, ColumnKind, DesignatedTsDescriptor, SymbolCodesPtr, ValidityDescriptor, +}; +use super::wire::{ + F32_NULL, F64_NULL, I8_NULL, I16_NULL, I32_NULL, I64_NULL, MAX_NAME_LEN, QWP_FLAG_DEFER_COMMIT, + QWP_FLAG_DELTA_SYMBOL_DICT, QWP_HEADER_LEN, QWP_MAGIC, QWP_SCHEMA_MODE_FULL, + QWP_SCHEMA_MODE_REFERENCE, QWP_VERSION_1, validate_name, write_qwp_bytes, write_qwp_varint, +}; + +/// Connection-scoped table-schema interner. +/// +/// Each unique signature gets a sequentially-assigned `u64` id. The first +/// emit uses `QWP_SCHEMA_MODE_FULL`; subsequent emits reuse the id under +/// `QWP_SCHEMA_MODE_REFERENCE`. Both sides of the wire build the same id +/// mapping by first-emit order; on reconnect both sides reset. 
+#[derive(Debug, Default)] +pub(crate) struct SchemaRegistry { + by_signature: HashMap, u64>, + next_id: u64, +} + +impl SchemaRegistry { + pub(crate) fn new() -> Self { + Self::default() + } + + fn intern(&mut self, signature: &[u8]) -> (u64, bool) { + if let Some(&id) = self.by_signature.get(signature) { + return (id, false); + } + let id = self.next_id; + self.next_id += 1; + self.by_signature.insert(signature.to_vec(), id); + (id, true) + } + + #[cfg(test)] + pub(crate) fn len(&self) -> usize { + self.by_signature.len() + } +} + +/// Encode `chunk` into `out` as a complete QWP/WebSocket frame body. The +/// caller has already reserved any prefix bytes it needs in `out` (the +/// connection layer reserves the WS header); the encoder appends QWP +/// bytes only. +pub(crate) fn encode_chunk_into( + out: &mut Vec, + chunk: &Chunk<'_>, + schema_registry: &mut SchemaRegistry, + symbol_dict: &mut SymbolGlobalDict, + defer_commit: bool, +) -> Result<()> { + if chunk.is_empty() { + emit_header_only_frame(out, defer_commit); + return Ok(()); + } + if chunk.designated_ts.is_none() { + return Err(error::fmt!( + InvalidApiCall, + "Chunk has no designated timestamp; \ + call designated_timestamp_micros or designated_timestamp_nanos before flush." + )); + } + let row_count = chunk.row_count(); + if row_count == 0 { + return Err(error::fmt!( + InvalidApiCall, + "Chunk row_count is 0; flush at least one row or hand back an empty chunk." + )); + } + validate_name("table", &chunk.table)?; + + let table_bytes = chunk.table.as_bytes(); + if table_bytes.len() > MAX_NAME_LEN { + return Err(error::fmt!( + InvalidName, + "table name is too long: {} bytes (max {})", + table_bytes.len(), + MAX_NAME_LEN + )); + } + + let designated = chunk + .designated_ts + .as_ref() + .expect("guarded by is_none() check above"); + + // --- Pass 1: resolve symbol columns against the connection-scoped + // global dict. 
We snapshot the dict so we can roll back if encoding + // later fails — symbol entries that never hit the wire must not be + // remembered. --- + let dict_mark = symbol_dict.mark(); + let resolution = match resolve_symbols(chunk, symbol_dict) { + Ok(r) => r, + Err(e) => { + symbol_dict.rollback(dict_mark); + return Err(e); + } + }; + + // --- Schema signature --- + let column_count = chunk.columns.len() + 1; // +1 for designated timestamp + let mut signature = Vec::with_capacity(column_count * 8); + for col in &chunk.columns { + write_qwp_bytes(&mut signature, col.name.as_bytes()); + signature.push(col.wire_type); + } + write_qwp_bytes(&mut signature, &[]); // designated_ts has empty name + signature.push(designated.wire_type); + + let (schema_id, is_new_schema) = schema_registry.intern(&signature); + + // --- Reserve total expected frame size up front. Avoids the + // geometric-growth memcpy pattern when the column data is large. --- + let estimated = estimate_frame_size(chunk, row_count, &signature, &resolution); + out.reserve(estimated); + + // --- Reserve frame header placeholder --- + let frame_start = out.len(); + write_header_placeholder(out, /* table_count = */ 1, defer_commit); + let payload_start = out.len(); + + // --- Delta-symbol-dict prefix --- + write_qwp_varint(out, resolution.delta_start); + write_qwp_varint(out, resolution.new_symbols.len() as u64); + for bytes in &resolution.new_symbols { + write_qwp_bytes(out, bytes); + } + + // --- Table block header --- + write_qwp_bytes(out, table_bytes); + write_qwp_varint(out, row_count as u64); + write_qwp_varint(out, column_count as u64); + + // --- Schema section --- + if is_new_schema { + out.push(QWP_SCHEMA_MODE_FULL); + write_qwp_varint(out, schema_id); + out.extend_from_slice(&signature); + } else { + out.push(QWP_SCHEMA_MODE_REFERENCE); + write_qwp_varint(out, schema_id); + } + + // --- Column payloads --- + for (col_idx, col) in chunk.columns.iter().enumerate() { + // SAFETY: caller buffers are 
required by Chunk's `'a` (or the + // FFI's documented contract) to outlive this call. + unsafe { + encode_column(out, col, row_count, col_idx, &resolution)?; + } + } + + // --- Designated timestamp --- + encode_designated_ts(out, designated, row_count); + + // --- Patch payload_len --- + let payload_len = (out.len() - payload_start) as u32; + let header = &mut out[frame_start..payload_start]; + header[8..12].copy_from_slice(&payload_len.to_le_bytes()); + + Ok(()) +} + +/// Conservative byte estimate of the encoded QWP frame body. Used to +/// `reserve()` write_buf in one shot before the encode loop — avoids +/// the geometric-growth memcpy pattern when total payload runs into +/// MBs. Walks descriptors once, no actual data reads. +fn estimate_frame_size( + chunk: &Chunk<'_>, + row_count: usize, + signature: &[u8], + resolution: &SymbolResolution, +) -> usize { + let mut total = QWP_HEADER_LEN; + // delta-symbol-dict prefix + total += 10 + 10; // delta_start + new_symbols_count varints + for s in &resolution.new_symbols { + total += 10 + s.len(); + } + // table block header + schema section + total += 10 + chunk.table.len() + 10 + 10; // table name + row + col count varints + total += 1 + 10 + signature.len(); // schema mode + id varint + signature (full case) + + let bitmap_bytes = row_count.div_ceil(8); + for col in &chunk.columns { + let null_overhead = 1 + if col.validity.is_some() { + bitmap_bytes + } else { + 0 + }; + let payload_size = match col.kind { + ColumnKind::Byte { .. } => row_count, + ColumnKind::Short { .. } => 2 * row_count, + ColumnKind::Int { .. } | ColumnKind::Float { .. } | ColumnKind::Ipv4 { .. } => { + 4 * row_count + } + ColumnKind::Long { .. } + | ColumnKind::Double { .. } + | ColumnKind::TsNanos { .. } + | ColumnKind::TsMicros { .. } + | ColumnKind::DateMillis { .. } => 8 * row_count, + ColumnKind::Bool { .. } => bitmap_bytes, + ColumnKind::Uuid { .. } => 16 * row_count, + ColumnKind::Long256 { .. 
} => 32 * row_count, + ColumnKind::Varchar { bytes_len, .. } => 4 * (row_count + 1) + bytes_len, + ColumnKind::Symbol { .. } => 5 * row_count, // varint upper bound + }; + total += null_overhead + payload_size; + } + // designated timestamp + total += 1 + 8 * row_count; + total +} + +fn emit_header_only_frame(out: &mut Vec, defer_commit: bool) { + let frame_start = out.len(); + write_header_placeholder(out, 0, defer_commit); + let payload_start = out.len(); + write_qwp_varint(out, 0); // delta_start + write_qwp_varint(out, 0); // new_symbols_count + let payload_len = (out.len() - payload_start) as u32; + out[frame_start + 8..frame_start + 12].copy_from_slice(&payload_len.to_le_bytes()); +} + +fn write_header_placeholder(out: &mut Vec, table_count: u16, defer_commit: bool) { + let start = out.len(); + out.extend_from_slice(&QWP_MAGIC); + out.push(QWP_VERSION_1); + let mut flags = QWP_FLAG_DELTA_SYMBOL_DICT; + if defer_commit { + flags |= QWP_FLAG_DEFER_COMMIT; + } + out.push(flags); + out.extend_from_slice(&table_count.to_le_bytes()); + out.extend_from_slice(&0u32.to_le_bytes()); // payload_len placeholder + debug_assert_eq!(out.len() - start, QWP_HEADER_LEN); +} + +// =========================================================================== +// Symbol resolution (pre-pass) +// =========================================================================== + +struct SymbolResolution { + delta_start: u64, + new_symbols: Vec>, + /// One entry per column slot. `Some` for symbol columns; carries the + /// per-row internal-index→global-id map keyed by the dict slot the + /// row references. + per_column: Vec>, +} + +struct ResolvedSymbolColumn { + /// Indexed by dict slot. `u64::MAX` for slots the column never + /// references (we only intern referenced slots). 
+ local_to_global: Vec, + non_null_count: usize, +} + +fn resolve_symbols( + chunk: &Chunk<'_>, + symbol_dict: &mut SymbolGlobalDict, +) -> Result { + let delta_start = symbol_dict.next_id(); + let mut new_symbols: Vec> = Vec::new(); + let mut per_column: Vec> = Vec::with_capacity(chunk.columns.len()); + let row_count = chunk.row_count(); + + for col in &chunk.columns { + let ColumnKind::Symbol { + codes, + dict_offsets, + dict_offsets_len, + dict_bytes, + dict_bytes_len, + } = col.kind + else { + per_column.push(None); + continue; + }; + let dict_len = dict_offsets_len - 1; + // SAFETY: pointers were validated to be in-bounds at append time. + let offsets = unsafe { slice::from_raw_parts(dict_offsets, dict_offsets_len) }; + let dict_bytes_slice = unsafe { slice::from_raw_parts(dict_bytes, dict_bytes_len) }; + // Pass 1: mark referenced dict slots + count non-null rows. + let mut referenced = vec![false; dict_len]; + let mut non_null_count = 0usize; + for i in 0..row_count { + if !is_valid_row(col.validity.as_ref(), i) { + continue; + } + // SAFETY: codes ptr was validated to have row_count elements. + let slot = unsafe { codes.read_i64(i) } as usize; + referenced[slot] = true; + non_null_count += 1; + } + // Pass 2: intern referenced slots, build local_to_global. The + // encoder reads `codes` directly at emit time — no separate + // compact-codes pass / allocation needed (~400 KB saved on a + // 100k-row chunk). 
+ let mut local_to_global = vec![u64::MAX; dict_len]; + for (slot, mark) in referenced.iter().enumerate() { + if !*mark { + continue; + } + let start = offsets[slot] as usize; + let end = offsets[slot + 1] as usize; + let entry_bytes = &dict_bytes_slice[start..end]; + let (gid, is_new) = symbol_dict.intern(entry_bytes); + if is_new { + new_symbols.push(entry_bytes.to_vec()); + } + local_to_global[slot] = gid; + } + per_column.push(Some(ResolvedSymbolColumn { + local_to_global, + non_null_count, + })); + } + Ok(SymbolResolution { + delta_start, + new_symbols, + per_column, + }) +} + +// =========================================================================== +// Column encoders +// =========================================================================== + +/// Encode column `col` into `out`. SAFETY: caller buffers referenced by +/// `col` must still be alive (see `Chunk` lifetime contract). +unsafe fn encode_column( + out: &mut Vec, + col: &ColumnDescriptor, + row_count: usize, + col_idx: usize, + resolution: &SymbolResolution, +) -> Result<()> { + let validity = col.validity.as_ref(); + match col.kind { + ColumnKind::Byte { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I8_NULL, |v| [v as u8]) + }, + ColumnKind::Short { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I16_NULL, i16::to_le_bytes) + }, + ColumnKind::Int { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I32_NULL, i32::to_le_bytes) + }, + ColumnKind::Long { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, I64_NULL, i64::to_le_bytes) + }, + ColumnKind::Float { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, F32_NULL, f32::to_le_bytes) + }, + ColumnKind::Double { data } => unsafe { + encode_sentinel_le::(out, data, row_count, validity, F64_NULL, f64::to_le_bytes) + }, + ColumnKind::Bool { bits } => unsafe { + encode_bool(out, bits, row_count, validity); + }, + 
ColumnKind::Ipv4 { data } => unsafe { + encode_bitmap_le::(out, data, row_count, validity, u32::to_le_bytes); + }, + ColumnKind::TsNanos { data } + | ColumnKind::TsMicros { data } + | ColumnKind::DateMillis { data } => unsafe { + encode_bitmap_le::(out, data, row_count, validity, i64::to_le_bytes); + }, + ColumnKind::Uuid { data } => unsafe { + encode_fixed_width_bitmap::<16>(out, data as *const u8, row_count, validity); + }, + ColumnKind::Long256 { data } => unsafe { + encode_fixed_width_bitmap::<32>(out, data as *const u8, row_count, validity); + }, + ColumnKind::Varchar { + offsets, + offsets_len, + bytes, + bytes_len, + } => unsafe { + encode_varchar( + out, + offsets, + offsets_len, + bytes, + bytes_len, + row_count, + validity, + ); + }, + ColumnKind::Symbol { codes, .. } => { + let resolved = resolution.per_column[col_idx] + .as_ref() + .expect("symbol resolution missing for symbol column"); + unsafe { + encode_symbol(out, codes, resolved, row_count, validity); + } + } + } + Ok(()) +} + +/// Sentinel-null path: no validity bitmap, single null_flag byte + dense +/// data. `T` is read directly from caller memory and converted to LE +/// bytes; nulls are sentinel-encoded with `null_value`. +unsafe fn encode_sentinel_le( + out: &mut Vec, + data: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + null_value: T, + to_le: impl Fn(T) -> [u8; N], +) where + T: Copy, +{ + out.push(0); // null_flag = 0x00 (sentinel encoding) + out.reserve(N * row_count); + match validity { + None => { + // Hot path: contiguous typed buffer → bulk memcpy via byte + // reinterpret. POD numerics, any byte pattern is sound. 
+ let bytes = unsafe { slice::from_raw_parts(data as *const u8, row_count * N) }; + out.extend_from_slice(bytes); + } + Some(v) => { + for i in 0..row_count { + let value = if unsafe { v.is_valid(i) } { + unsafe { *data.add(i) } + } else { + null_value + }; + out.extend_from_slice(&to_le(value)); + } + } + } +} + +/// Bitmap-style fixed-width path: null_flag + optional QWP bitmap + +/// dense values for non-null rows only. +unsafe fn encode_bitmap_le( + out: &mut Vec, + data: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + to_le: impl Fn(T) -> [u8; N], +) where + T: Copy, +{ + match validity { + None => { + out.push(0); + out.reserve(N * row_count); + let bytes = unsafe { slice::from_raw_parts(data as *const u8, row_count * N) }; + out.extend_from_slice(bytes); + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + out.reserve(N * v.non_null_count); + for i in 0..row_count { + if unsafe { v.is_valid(i) } { + let value = unsafe { *data.add(i) }; + out.extend_from_slice(&to_le(value)); + } + } + } + } +} + +/// Bitmap-style fixed-width binary column (UUID, LONG256). `data` +/// points at row 0 of an `[u8; N]` block. 
+unsafe fn encode_fixed_width_bitmap( + out: &mut Vec, + data: *const u8, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + match validity { + None => { + out.push(0); + out.reserve(N * row_count); + let bytes = unsafe { slice::from_raw_parts(data, N * row_count) }; + out.extend_from_slice(bytes); + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + out.reserve(N * v.non_null_count); + for i in 0..row_count { + if unsafe { v.is_valid(i) } { + let row_start = unsafe { data.add(i * N) }; + let row = unsafe { slice::from_raw_parts(row_start, N) }; + out.extend_from_slice(row); + } + } + } + } +} + +unsafe fn encode_bool( + out: &mut Vec, + bits: *const u8, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + out.push(0); // bool always sentinel-encoded + let mut packed = 0u8; + let mut bit_idx = 0u8; + for i in 0..row_count { + let byte_idx = i / 8; + let bit_off = i % 8; + let bit = (unsafe { *bits.add(byte_idx) } >> bit_off) & 1; + let valid = validity.is_none_or(|v| unsafe { v.is_valid(i) }); + if bit == 1 && valid { + packed |= 1u8 << bit_idx; + } + bit_idx += 1; + if bit_idx == 8 { + out.push(packed); + packed = 0; + bit_idx = 0; + } + } + if bit_idx != 0 { + out.push(packed); + } +} + +unsafe fn encode_varchar( + out: &mut Vec, + offsets: *const i32, + offsets_len: usize, + bytes: *const u8, + bytes_len: usize, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + let offsets_slice = unsafe { slice::from_raw_parts(offsets, offsets_len) }; + let bytes_slice = unsafe { slice::from_raw_parts(bytes, bytes_len) }; + + match validity { + None => { + out.push(0); // null_flag + out.reserve(4 * (row_count + 1) + bytes_len); + let base = offsets_slice[0]; + if base == 0 { + // Hot path: offset table is bit-identical to LE u32 for + // non-negative i32; memcpy both halves. 
+ let offset_bytes = unsafe { + slice::from_raw_parts( + offsets as *const u8, + offsets_len * std::mem::size_of::(), + ) + }; + out.extend_from_slice(offset_bytes); + let used = offsets_slice[row_count] as usize; + out.extend_from_slice(&bytes_slice[..used]); + } else { + for &off in offsets_slice { + let normalized = (off - base) as u32; + out.extend_from_slice(&normalized.to_le_bytes()); + } + let start = base as usize; + let end = offsets_slice[row_count] as usize; + out.extend_from_slice(&bytes_slice[start..end]); + } + } + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + let non_null = v.non_null_count; + let offsets_start = out.len(); + out.resize(offsets_start + 4 * (non_null + 1), 0); + out[offsets_start..offsets_start + 4].copy_from_slice(&0u32.to_le_bytes()); + let mut cumulative: u32 = 0; + let mut next_offset_idx = 1usize; + let bytes_anchor = out.len(); + for i in 0..row_count { + if !unsafe { v.is_valid(i) } { + continue; + } + let start = offsets_slice[i] as usize; + let end = offsets_slice[i + 1] as usize; + let len = end - start; + out.extend_from_slice(&bytes_slice[start..end]); + cumulative = cumulative.saturating_add(len as u32); + let off = offsets_start + 4 * next_offset_idx; + out[off..off + 4].copy_from_slice(&cumulative.to_le_bytes()); + next_offset_idx += 1; + } + debug_assert_eq!(next_offset_idx - 1, non_null); + debug_assert_eq!(out.len() - bytes_anchor, cumulative as usize); + } + } +} + +unsafe fn encode_symbol( + out: &mut Vec, + codes: SymbolCodesPtr, + resolved: &ResolvedSymbolColumn, + row_count: usize, + validity: Option<&ValidityDescriptor>, +) { + match validity { + None => out.push(0), + Some(v) => { + out.push(1); + unsafe { write_qwp_bitmap_from_validity(out, v) }; + } + } + out.reserve(resolved.non_null_count * 4); + // Specialise on the code's bit width so the per-row loop is a + // straight read + table lookup + varint write (~1 ns/row). 
The + // dispatch overhead is amortised across the whole column. + match codes { + SymbolCodesPtr::I8(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + SymbolCodesPtr::I16(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + SymbolCodesPtr::I32(p) => unsafe { + emit_symbol_rows(out, p, row_count, validity, &resolved.local_to_global); + }, + } +} + +unsafe fn emit_symbol_rows( + out: &mut Vec, + codes: *const T, + row_count: usize, + validity: Option<&ValidityDescriptor>, + local_to_global: &[u64], +) where + T: Copy + Into, +{ + for i in 0..row_count { + let valid = validity.is_none_or(|v| unsafe { v.is_valid(i) }); + if !valid { + continue; + } + let slot = unsafe { (*codes.add(i)).into() } as usize; + let gid = local_to_global[slot]; + debug_assert_ne!(gid, u64::MAX, "referenced symbol slot has no global id"); + write_qwp_varint(out, gid); + } +} + +fn encode_designated_ts(out: &mut Vec, ts: &DesignatedTsDescriptor, row_count: usize) { + out.push(0); // designated_ts is always non-null + out.reserve(8 * row_count); + // SAFETY: caller buffer lifetime is the chunk's `'a`. + let bytes = unsafe { + slice::from_raw_parts(ts.data as *const u8, row_count * std::mem::size_of::()) + }; + out.extend_from_slice(bytes); +} + +// =========================================================================== +// Helpers +// =========================================================================== + +/// Write `validity` as a QWP-shape (bit = 1 NULL) bitmap appended to +/// `out`. The high bits past `bit_len` in the last byte are masked. 
+unsafe fn write_qwp_bitmap_from_validity(out: &mut Vec, v: &ValidityDescriptor) { + let full_bytes = v.bit_len / 8; + let trailing_bits = v.bit_len % 8; + let src = unsafe { slice::from_raw_parts(v.bits, v.byte_len()) }; + for &byte in &src[..full_bytes] { + out.push(!byte); + } + if trailing_bits != 0 { + let mask = (1u8 << trailing_bits) - 1; + out.push((!src[full_bytes]) & mask); + } +} + +#[inline] +fn is_valid_row(validity: Option<&ValidityDescriptor>, i: usize) -> bool { + match validity { + None => true, + // SAFETY: bit_len was checked == row_count at append time, so + // `i < row_count` ⇒ `i < bit_len`. + Some(v) => unsafe { v.is_valid(i) }, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ingress::column_sender::Validity; + + fn make_chunk_i64(name: &str, data: &[i64]) -> Vec { + let mut chunk = Chunk::new("trades"); + chunk.column_i64(name, data, None).unwrap(); + chunk.designated_timestamp_nanos(data).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); + out + } + + #[test] + fn empty_chunk_encodes_to_14_bytes() { + let chunk = Chunk::new("trades"); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); + assert_eq!(out.len(), 14); + assert_eq!(&out[0..4], b"QWP1"); + assert_eq!(out[5], QWP_FLAG_DELTA_SYMBOL_DICT); + assert_eq!(u16::from_le_bytes([out[6], out[7]]), 0); + } + + #[test] + fn defer_commit_flag_is_set_when_requested() { + let chunk = Chunk::new("trades"); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, true).unwrap(); + assert_eq!(out[5] & QWP_FLAG_DEFER_COMMIT, QWP_FLAG_DEFER_COMMIT); + assert_eq!( + out[5] & QWP_FLAG_DELTA_SYMBOL_DICT, 
+ QWP_FLAG_DELTA_SYMBOL_DICT + ); + } + + #[test] + fn non_empty_chunk_without_designated_ts_errors() { + let mut chunk = Chunk::new("trades"); + let data = [1i64, 2, 3]; + chunk.column_i64("a", &data, None).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let err = encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + assert!(err.msg().contains("designated")); + } + + #[test] + fn second_encode_with_same_schema_uses_reference() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + + let p1 = [1i64, 2]; + let mut c1 = Chunk::new("trades"); + c1.column_i64("price", &p1, None).unwrap(); + c1.designated_timestamp_nanos(&p1).unwrap(); + let mut out1 = Vec::new(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict, false).unwrap(); + + let p2 = [3i64, 4]; + let mut c2 = Chunk::new("trades"); + c2.column_i64("price", &p2, None).unwrap(); + c2.designated_timestamp_nanos(&p2).unwrap(); + let mut out2 = Vec::new(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict, false).unwrap(); + + assert!(out2.len() < out1.len()); + assert_eq!(reg.len(), 1, "schema signature interned once"); + + let schema_mode_offset = 12 + 1 + 1 + 1 + "trades".len() + 1 + 1; + assert_eq!(out1[schema_mode_offset], QWP_SCHEMA_MODE_FULL); + assert_eq!(out2[schema_mode_offset], QWP_SCHEMA_MODE_REFERENCE); + } + + #[test] + fn distinct_schemas_get_distinct_ids() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let x = [1i64]; + let mut a = Chunk::new("a"); + a.column_i64("x", &x, None).unwrap(); + a.designated_timestamp_nanos(&x).unwrap(); + let mut oa = Vec::new(); + encode_chunk_into(&mut oa, &a, &mut reg, &mut dict, false).unwrap(); + + let y = [1.0f64]; + let ts = [1i64]; + let mut b = Chunk::new("b"); + b.column_f64("y", &y, None).unwrap(); + 
b.designated_timestamp_nanos(&ts).unwrap(); + let mut ob = Vec::new(); + encode_chunk_into(&mut ob, &b, &mut reg, &mut dict, false).unwrap(); + + assert_eq!(reg.len(), 2); + } + + #[test] + fn frame_size_grows_with_column_payloads() { + let p = [1i64, 2, 3, 4]; + let bits = [0xFFu8]; + let v = Validity::from_bitmap(&bits, 4).unwrap(); + let mut chunk = Chunk::new("trades"); + chunk.column_i64("price", &p, Some(&v)).unwrap(); + chunk.designated_timestamp_nanos(&p).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); + assert!(out.len() > 32); + } + + #[test] + fn symbol_dict_emits_only_referenced_entries() { + let codes = [0i32, 2, 0, 2]; + let dict_offsets = [0i32, 5, 9, 14]; + let dict_bytes = b"alphabetagamma"; + let ts = [1i64, 2, 3, 4]; + let mut chunk = Chunk::new("trades"); + chunk + .symbol_dict_i32("sym", &codes, &dict_offsets, dict_bytes, None) + .unwrap(); + chunk.designated_timestamp_nanos(&ts).unwrap(); + let mut out = Vec::new(); + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + encode_chunk_into(&mut out, &chunk, &mut reg, &mut dict, false).unwrap(); + assert_eq!(dict.next_id(), 2, "alpha + gamma only, beta unsent"); + } + + #[test] + fn symbol_dict_second_frame_resends_only_new_entries() { + let mut reg = SchemaRegistry::new(); + let mut dict = SymbolGlobalDict::new(); + let dict_offsets = [0i32, 5, 9, 14]; + let dict_bytes = b"alphabetagamma"; + + let codes1 = [0i32, 1]; + let ts1 = [1i64, 2]; + let mut c1 = Chunk::new("trades"); + c1.symbol_dict_i32("sym", &codes1, &dict_offsets, dict_bytes, None) + .unwrap(); + c1.designated_timestamp_nanos(&ts1).unwrap(); + let mut out1 = Vec::new(); + encode_chunk_into(&mut out1, &c1, &mut reg, &mut dict, false).unwrap(); + assert_eq!(dict.next_id(), 2); + + let codes2 = [0i32, 2]; + let ts2 = [3i64, 4]; + let mut c2 = Chunk::new("trades"); 
+ c2.symbol_dict_i32("sym", &codes2, &dict_offsets, dict_bytes, None) + .unwrap(); + c2.designated_timestamp_nanos(&ts2).unwrap(); + let mut out2 = Vec::new(); + encode_chunk_into(&mut out2, &c2, &mut reg, &mut dict, false).unwrap(); + assert_eq!(dict.next_id(), 3, "gamma added on second frame"); + } + + #[test] + fn i64_no_null_round_trip_wire_bytes() { + let bytes = make_chunk_i64("price", &[10, 20, 30]); + // Frame contains: header(12) + delta_dict(2) + table_block + schema + + // column data + designated_ts data. The exact byte layout is asserted + // implicitly via the other tests; here we just ensure the payload_len + // patched correctly. + let payload_len = u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize; + assert_eq!(12 + payload_len, bytes.len()); + } +} diff --git a/questdb-rs/src/ingress/column_sender/mod.rs b/questdb-rs/src/ingress/column_sender/mod.rs new file mode 100644 index 00000000..130daac8 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/mod.rs @@ -0,0 +1,110 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + ******************************************************************************/ + +//! Column-major sender for QWP/WebSocket. +//! +//! This is a separate API surface from [`crate::ingress::Sender`] / [`crate::ingress::Buffer`]. +//! It exists to ingest **Pandas/Polars DataFrames into QuestDB at the maximum +//! throughput the QWP/WebSocket wire allows**. See `doc/COLUMN_SENDER_PLAN.md` +//! for the design rationale. +//! +//! The user model is `DataFrame → Table`: +//! +//! - Open a connection pool with [`QuestDb::connect`]. +//! - Borrow a sender with [`QuestDb::borrow_sender`]. +//! - Build a [`Chunk`] of column buffers for one table, then pin a +//! designated timestamp on it. +//! - Flush chunks to publish them without waiting for ACKs, then call +//! [`ColumnSender::sync`] to commit and wait at the requested [`AckLevel`]. +//! - Drop the [`BorrowedSender`] to return its connection to the pool. + +mod chunk; +mod conf; +mod conn; +mod db; +mod encoder; +mod sender; +mod validity; +mod wire; + +pub use chunk::Chunk; +pub use db::{BorrowedSender, QuestDb}; +pub use sender::{AckLevel, ColumnSender}; +pub use validity::Validity; + +#[doc(hidden)] +pub use db::OwnedSender; + +/// Internals exposed for criterion benchmarks under +/// `questdb-rs/benches/`. Not part of the public API; bumped freely +/// without semver concerns. +#[doc(hidden)] +pub mod _bench_internals { + use crate::Result; + use crate::ingress::buffer::SymbolGlobalDict; + + use super::chunk::Chunk; + use super::encoder::{SchemaRegistry, encode_chunk_into}; + + /// Opaque holder for the connection-scoped state the encoder needs. + /// Lets benches reuse the encoder across iterations without + /// promoting [`SchemaRegistry`] / [`SymbolGlobalDict`] to the + /// public API. 
+ pub struct BenchEncoderState { + schema_registry: SchemaRegistry, + symbol_dict: SymbolGlobalDict, + } + + impl Default for BenchEncoderState { + fn default() -> Self { + Self::new() + } + } + + impl BenchEncoderState { + pub fn new() -> Self { + Self { + schema_registry: SchemaRegistry::new(), + symbol_dict: SymbolGlobalDict::new(), + } + } + } + + /// Encode `chunk` into `out`. Mirrors [`encode_chunk_into`] but hides + /// the internal-state types so the bench module never has to touch + /// them. + pub fn bench_encode_chunk_into( + out: &mut Vec<u8>, + chunk: &Chunk<'_>, + state: &mut BenchEncoderState, + ) -> Result<()> { + encode_chunk_into( + out, + chunk, + &mut state.schema_registry, + &mut state.symbol_dict, + false, + ) + } +} diff --git a/questdb-rs/src/ingress/column_sender/sender.rs b/questdb-rs/src/ingress/column_sender/sender.rs new file mode 100644 index 00000000..ecf7f166 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/sender.rs @@ -0,0 +1,172 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//!
Borrowed-handle types for the column-major sender. +//! +//! A [`ColumnSender`] owns one pipelined QWP/WebSocket connection +//! ([`super::conn::ColumnConn`]), a connection-scoped +//! [`SchemaRegistry`](super::encoder::SchemaRegistry), and a +//! connection-scoped [`SymbolGlobalDict`]: all three travel back into the +//! pool together when the [`super::BorrowedSender`] is dropped. + +use std::fmt::{self, Debug, Formatter}; + +use crate::ingress::buffer::SymbolGlobalDict; +use crate::{Result, error}; + +use super::chunk::Chunk; +use super::conn::ColumnConn; +use super::encoder::{self, SchemaRegistry}; + +/// Acknowledgement level for [`ColumnSender::sync`]. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub enum AckLevel { + /// Wait for the server's WAL-commit ACK (spec status `0x00`). Always + /// available. + #[default] + Ok, + /// Wait for the server's object-store durability ACK (spec status + /// `0x02`). Enterprise feature; requires `request_durable_ack=on` in + /// the connect string. + Durable, +} + +/// One [`ColumnConn`] in the pool, wrapped in the column-sender API. +pub struct ColumnSender { + pub(crate) conn: ColumnConn, + pub(crate) schema_registry: SchemaRegistry, + pub(crate) symbol_dict: SymbolGlobalDict, + /// The first frame is sent without `FLAG_DEFER_COMMIT` so the server + /// commits it immediately. This lets the WAL segment roll and update + /// `initialSymbolCount`, warming the server's `ClientSymbolCache` for + /// all subsequent deferred frames. 
+ first_frame_sent: bool, +} + +impl Debug for ColumnSender { + fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { + f.debug_struct("ColumnSender") + .field("must_close", &self.conn.must_close()) + .field("in_flight", &self.conn.in_flight()) + .finish() + } +} + +impl ColumnSender { + pub(crate) fn new( + conn: ColumnConn, + schema_registry: SchemaRegistry, + symbol_dict: SymbolGlobalDict, + ) -> Self { + Self { + conn, + schema_registry, + symbol_dict, + first_frame_sent: false, + } + } + + /// `true` once the underlying QWP/WS connection has latched into a + /// permanently-unusable state. On return to the pool such senders + /// are dropped rather than recycled. + #[must_use] + pub fn must_close(&self) -> bool { + self.conn.must_close() + } + + /// Encode `chunk` into a QWP/WebSocket frame, write it to the + /// socket, and return — **without** waiting for the server's ack. + /// + /// The first frame is sent as an immediate commit so the server can + /// warm its symbol cache. Later frames are sent with + /// `FLAG_DEFER_COMMIT`: the server appends rows to WAL but skips the + /// commit. Call [`sync`](Self::sync) to trigger the commit for all + /// accumulated rows. + /// + /// Ready acks are drained non-blocking before the write. Deferred + /// flushes reserve one in-flight slot for the later + /// commit-triggering sync frame; when that reserve would be consumed, + /// this call returns [`ErrorCode::InvalidApiCall`](crate::ErrorCode::InvalidApiCall) + /// and the caller must call [`sync`](Self::sync) before flushing more + /// chunks. + /// + /// On success, `chunk` is cleared (its retained descriptor capacity + /// is preserved) and the caller's buffers are released. + /// + /// On failure, the error is returned and `chunk` is left untouched. + /// Transport and server failures latch the connection as terminal; + /// validation and capacity failures leave it usable. 
+ pub fn flush(&mut self, chunk: &mut Chunk<'_>) -> Result<()> { + let defer = self.first_frame_sent; + self.flush_inner(chunk, defer)?; + self.first_frame_sent = true; + Ok(()) + } + + /// Block until all in-flight frames are acknowledged at the + /// requested [`AckLevel`]. + /// + /// Sends a commit-triggering frame (without `FLAG_DEFER_COMMIT`) + /// so the server commits all rows accumulated from preceding + /// deferred flushes, then drains all acks. + /// + /// `AckLevel::Ok` waits for every in-flight frame's WAL-commit ack. + /// `AckLevel::Durable` additionally waits for the server's + /// object-store durability watermarks to reach every frame's + /// seq_txn (requires `request_durable_ack=on` at connect). + pub fn sync(&mut self, ack_level: AckLevel) -> Result<()> { + self.conn.validate_ack_level(ack_level)?; + + // Send a commit-triggering empty frame (no FLAG_DEFER_COMMIT). + let mut commit_chunk = Chunk::new(""); + self.flush_inner(&mut commit_chunk, /* defer_commit = */ false)?; + self.conn.sync_all_acks(ack_level) + } + + fn flush_inner(&mut self, chunk: &mut Chunk<'_>, defer_commit: bool) -> Result<()> { + self.conn.try_drain_acks()?; + + if defer_commit && !self.conn.has_sync_commit_slot() { + return Err(error::fmt!( + InvalidApiCall, + "column sender deferred flush capacity exhausted; call sync() \ + before flushing more chunks." 
+ )); + } + + if self.conn.at_in_flight_cap() { + self.conn.drain_one_ack_blocking()?; + } + + let schema = &mut self.schema_registry; + let dict = &mut self.symbol_dict; + let published = self.conn.publish_qwp(|out| { + encoder::encode_chunk_into(out, chunk, schema, dict, defer_commit) + })?; + + self.conn.push_pending(published.fsn); + chunk.clear(); + Ok(()) + } +} diff --git a/questdb-rs/src/ingress/column_sender/validity.rs b/questdb-rs/src/ingress/column_sender/validity.rs new file mode 100644 index 00000000..0bdcf124 --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/validity.rs @@ -0,0 +1,141 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Validity bitmap helpers for the column-major sender. +//! +//! Users pass validity in **Arrow shape**: bit = 1 means valid, LSB-first +//! inside each byte. The QWP wire shape is the inverse: bit = 1 means +//! NULL. The conversion happens here; helpers below also count non-null +//! rows and stream Arrow-bit-set positions for the gather path. 
+ +use crate::{Result, error}; + +/// Public validity bitmap. See `doc/COLUMN_SENDER_FFI_ABI.md` §2.4 for the +/// Arrow semantics the API accepts. +#[derive(Debug)] +pub struct Validity<'a> { + pub(crate) bits: &'a [u8], + pub(crate) bit_len: usize, +} + +impl<'a> Validity<'a> { + /// Borrow `bits` as a validity bitmap of length `bit_len` rows. + /// + /// `bits.len()` must be at least `ceil(bit_len / 8)`. Bits past + /// `bit_len` are ignored by the encoder, so callers do not need to + /// zero them. + pub fn from_bitmap(bits: &'a [u8], bit_len: usize) -> Result<Self> { + let required_bytes = bit_len.div_ceil(8); + if bits.len() < required_bytes { + return Err(error::fmt!( + InvalidApiCall, + "validity bitmap too short: {} bytes for {} bits (need at least {})", + bits.len(), + bit_len, + required_bytes + )); + } + Ok(Self { bits, bit_len }) + } + + /// Logical length in bits / rows. + pub fn bit_len(&self) -> usize { + self.bit_len + } + + /// `true` iff bit `idx` is set (row `idx` is **valid**, Arrow shape). + #[inline] + pub(crate) fn is_valid(&self, idx: usize) -> bool { + debug_assert!(idx < self.bit_len); + let byte = self.bits[idx / 8]; + (byte >> (idx % 8)) & 1 == 1 + } + + /// Count non-null (i.e. valid) rows. + pub(crate) fn non_null_count(&self) -> usize { + let full_bytes = self.bit_len / 8; + let trailing_bits = self.bit_len % 8; + let mut count: usize = 0; + for &byte in &self.bits[..full_bytes] { + count += byte.count_ones() as usize; + } + if trailing_bits != 0 { + let mask = (1u8 << trailing_bits) - 1; + count += (self.bits[full_bytes] & mask).count_ones() as usize; + } + count + } +} + +/// Validate that a caller-supplied `data` length matches a chunk's locked +/// row count and any validity bitmap. Returns the row count to use.
+pub(crate) fn check_row_count( + locked: Option<usize>, + data_len: usize, + validity: Option<&Validity<'_>>, +) -> Result<usize> { + let row_count = data_len; + if let Some(existing) = locked + && existing != row_count + { + return Err(error::fmt!( + InvalidApiCall, + "Column length mismatch: chunk row_count is {} but this column has {} rows", + existing, + row_count + )); + } + if let Some(v) = validity + && v.bit_len != row_count + { + return Err(error::fmt!( + InvalidApiCall, + "Validity bitmap length ({} bits) does not match column data length ({} rows)", + v.bit_len, + row_count + )); + } + Ok(row_count) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn non_null_count_handles_trailing_bits() { + // 9 bits: 0b1010_1010, 0b0000_0001 — bits 1,3,5,7 valid in byte 0; + // bit 8 (== row 8) valid in byte 1. Trailing bits past row 8 must + // be masked. + let bits = [0b1010_1010, 0xFFu8]; // second byte has every bit set + let v = Validity::from_bitmap(&bits, 9).unwrap(); + assert_eq!(v.non_null_count(), 4 + 1); + } + + #[test] + fn from_bitmap_rejects_short_buffer() { + let err = Validity::from_bitmap(&[0u8], 9).unwrap_err(); + assert_eq!(err.code(), crate::ErrorCode::InvalidApiCall); + } +} diff --git a/questdb-rs/src/ingress/column_sender/wire.rs b/questdb-rs/src/ingress/column_sender/wire.rs new file mode 100644 index 00000000..c62d2a4e --- /dev/null +++ b/questdb-rs/src/ingress/column_sender/wire.rs @@ -0,0 +1,117 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Wire-format helpers for the column-major sender encoder. +//! +//! These are intentionally duplicated from the row-API encoder +//! (`buffer/qwp.rs`): the row helpers are private to that module and the +//! plan calls out the wire surface as a place where we accept the ~100 +//! lines of duplication to keep the column-sender hot path free of +//! cross-module hops. See `doc/COLUMN_SENDER_PLAN.md` §2.1. + +/// QWP message header magic. +pub(crate) const QWP_MAGIC: [u8; 4] = *b"QWP1"; +pub(crate) const QWP_VERSION_1: u8 = 1; +/// Wire-spec commit-deferral flag: set on deferred flushes only, not on every +/// frame (first frame and `sync()` frame omit it). Cf. `QwpBuffer::encode_ws_message`. +pub(crate) const QWP_FLAG_DEFER_COMMIT: u8 = 0x01; +pub(crate) const QWP_FLAG_DELTA_SYMBOL_DICT: u8 = 0x08; +pub(crate) const QWP_HEADER_LEN: usize = 12; + +/// Full schema mode emits the column-definition signature inline. +pub(crate) const QWP_SCHEMA_MODE_FULL: u8 = 0x00; +/// Reference schema mode reuses a previously-FULL signature by id. +pub(crate) const QWP_SCHEMA_MODE_REFERENCE: u8 = 0x01; + +// Wire type codes — duplicated from `buffer/qwp.rs`. See the QWP v1 spec +// (`questdb/documentation/connect/wire-protocols/qwp-ingress-websocket.md`) +// §Type byte table for the canonical list.
+pub(crate) const QWP_TYPE_BOOLEAN: u8 = 0x01; +pub(crate) const QWP_TYPE_BYTE: u8 = 0x02; +pub(crate) const QWP_TYPE_SHORT: u8 = 0x03; +pub(crate) const QWP_TYPE_INT: u8 = 0x04; +pub(crate) const QWP_TYPE_LONG: u8 = 0x05; +pub(crate) const QWP_TYPE_FLOAT: u8 = 0x06; +pub(crate) const QWP_TYPE_DOUBLE: u8 = 0x07; +pub(crate) const QWP_TYPE_TIMESTAMP: u8 = 0x0A; +pub(crate) const QWP_TYPE_DATE: u8 = 0x0B; +pub(crate) const QWP_TYPE_UUID: u8 = 0x0C; +pub(crate) const QWP_TYPE_LONG256: u8 = 0x0D; +pub(crate) const QWP_TYPE_TIMESTAMP_NANOS: u8 = 0x10; +pub(crate) const QWP_TYPE_IPV4: u8 = 0x18; +pub(crate) const QWP_TYPE_VARCHAR: u8 = 0x0F; +pub(crate) const QWP_TYPE_SYMBOL: u8 = 0x09; + +/// Maximum bytes a UTF-8 column or table name is allowed to occupy on the +/// wire. Matches the row-API + Java client cap. +pub(crate) const MAX_NAME_LEN: usize = 127; + +/// Wire-shape sentinels QuestDB treats as NULL for each fixed-width +/// non-bitmap-capable type. The row-API encoder writes these for missing +/// values; the column-sender mirrors them on the nullable path so the +/// wire bytes are byte-compatible with the row encoder. +pub(crate) const I8_NULL: i8 = 0; +pub(crate) const I16_NULL: i16 = 0; +pub(crate) const I32_NULL: i32 = i32::MIN; +pub(crate) const I64_NULL: i64 = i64::MIN; +pub(crate) const F32_NULL: f32 = f32::NAN; +pub(crate) const F64_NULL: f64 = f64::NAN; + +/// Append `value` to `out` as an unsigned QWP varint (LEB128). +#[inline] +pub(crate) fn write_qwp_varint(out: &mut Vec<u8>, mut value: u64) { + while value > 0x7F { + out.push(((value & 0x7F) as u8) | 0x80); + value >>= 7; + } + out.push(value as u8); +} + +/// Append a length-prefixed byte string: `varint(len) + bytes`. +#[inline] +pub(crate) fn write_qwp_bytes(out: &mut Vec<u8>, bytes: &[u8]) { + write_qwp_varint(out, bytes.len() as u64); + out.extend_from_slice(bytes); +} + +/// Validate a UTF-8 name against the QWP/Java client length cap.
+pub(crate) fn validate_name(kind: &'static str, name: &str) -> crate::Result<()> { + if name.is_empty() { + return Err(crate::error::fmt!( + InvalidName, + "{} name must not be empty", + kind + )); + } + if name.len() > MAX_NAME_LEN { + return Err(crate::error::fmt!( + InvalidName, + "{} name is too long: {} bytes (max {})", + kind, + name.len(), + MAX_NAME_LEN + )); + } + Ok(()) +} diff --git a/questdb-rs/src/ingress/sender.rs b/questdb-rs/src/ingress/sender.rs index 257989e2..e5a351c9 100644 --- a/questdb-rs/src/ingress/sender.rs +++ b/questdb-rs/src/ingress/sender.rs @@ -83,7 +83,7 @@ pub(crate) use qwp_ws_ownership::QwpWsRoleReject; pub use qwp_ws_ownership::*; #[cfg(feature = "sync-sender-qwp-ws")] -mod qwp_ws; +pub(crate) mod qwp_ws; #[cfg(feature = "sync-sender-qwp-ws")] pub(crate) use qwp_ws::*; diff --git a/questdb-rs/src/ingress/sender/qwp_ws.rs b/questdb-rs/src/ingress/sender/qwp_ws.rs index 8f272a68..f077e746 100644 --- a/questdb-rs/src/ingress/sender/qwp_ws.rs +++ b/questdb-rs/src/ingress/sender/qwp_ws.rs @@ -2112,6 +2112,9 @@ fn connect_tcp_to_any_addr( tcp.set_nodelay(true).ok(); tcp.set_read_timeout(Some(request_timeout)).ok(); tcp.set_write_timeout(Some(request_timeout)).ok(); + let sock = socket2::SockRef::from(&tcp); + sock.set_send_buffer_size(4 * 1024 * 1024).ok(); + sock.set_recv_buffer_size(4 * 1024 * 1024).ok(); return Ok(tcp); } Err(io) => failures.push(format!("{addr}: {io}")), diff --git a/questdb-rs/src/tests.rs b/questdb-rs/src/tests.rs index e5f060a3..8c28c42b 100644 --- a/questdb-rs/src/tests.rs +++ b/questdb-rs/src/tests.rs @@ -54,6 +54,9 @@ mod qwp_ws_publication_probe; #[cfg(feature = "sync-sender-qwp-ws")] mod qwp_ws_java_golden; +#[cfg(feature = "sync-sender-qwp-ws")] +mod column_sender_pool; + mod sender; mod decimal; diff --git a/questdb-rs/src/tests/column_sender_pool.rs b/questdb-rs/src/tests/column_sender_pool.rs new file mode 100644 index 00000000..65cfa606 --- /dev/null +++ b/questdb-rs/src/tests/column_sender_pool.rs @@ 
-0,0 +1,661 @@ +/******************************************************************************* + * ___ _ ____ ____ + * / _ \ _ _ ___ ___| |_| _ \| __ ) + * | | | | | | |/ _ \/ __| __| | | | _ \ + * | |_| | |_| | __/\__ \ |_| |_| | |_) | + * \__\_\\__,_|\___||___/\__|____/|____/ + * + * Copyright (c) 2014-2019 Appsicle + * Copyright (c) 2019-2025 QuestDB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + ******************************************************************************/ + +//! Column-sender pool + flush integration tests (WS-0 through WS-2). +//! +//! - WS-0: eager-open, borrow/return, multi-thread concurrent borrows, +//! fail-fast at `pool_max`, idle reaper. +//! - WS-1: synchronous `flush` round-trip for empty chunks; `AckLevel::Durable` +//! opt-in guard. +//! - WS-2: numeric / fixed-width column round-trip with a designated +//! timestamp; schema reuse across repeated flushes. +//! +//! Pool slots are real [`crate::ingress::Sender`] instances. The mock server +//! defined here accepts the HTTP→WebSocket upgrade so `Sender::build()` +//! succeeds, then either parks on the connection or reads each QWP frame +//! and replies with an OK ack (status 0x00). 
+ +use std::io::Read; +use std::net::TcpListener; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::mpsc; +use std::thread; +use std::time::{Duration, Instant}; + +use crate::ErrorCode; +use crate::ingress::column_sender::{AckLevel, Chunk, QuestDb}; +use crate::tests::qwp_ws::{perform_server_upgrade, read_frame, write_qwp_ok_response}; + +#[derive(Clone, Copy, Debug)] +enum MockMode { + /// Park the connection after upgrade — used by pool-only tests. + Park, + /// Read every QWP frame the client sends and reply with an OK ack. + AckEachFrame, +} + +/// Mock server that performs the WS upgrade for up to `max_accepts` connections, +/// then serves each per its [`MockMode`]: park (drain until EOF) or ack each frame. +/// Dropping the guard signals the accept loop to stop and joins its thread. +struct MockServer { + port: u16, + stop: Arc<AtomicBool>, + accepted: Arc<AtomicUsize>, + join: Option<thread::JoinHandle<()>>, +} + +impl MockServer { + fn spawn(max_accepts: usize) -> Self { + Self::spawn_with_mode(max_accepts, MockMode::Park) + } + + fn spawn_acking(max_accepts: usize) -> Self { + Self::spawn_with_mode(max_accepts, MockMode::AckEachFrame) + } + + fn spawn_with_mode(max_accepts: usize, mode: MockMode) -> Self { + let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1"); + listener + .set_nonblocking(true) + .expect("set_nonblocking on listener"); + let port = listener.local_addr().expect("local_addr").port(); + + let stop = Arc::new(AtomicBool::new(false)); + let accepted = Arc::new(AtomicUsize::new(0)); + let stop_clone = Arc::clone(&stop); + let accepted_clone = Arc::clone(&accepted); + + let join = thread::Builder::new() + .name("column-sender-pool-mock-server".to_string()) + .spawn(move || { + let mut handles = Vec::new(); + while !stop_clone.load(Ordering::SeqCst) { + match listener.accept() { + Ok((mut stream, _)) => { + if accepted_clone.fetch_add(1, Ordering::SeqCst) >= max_accepts { + // Past the budget — drop without upgrade so + // the client sees a failed
connect. + continue; + } + stream + .set_nonblocking(false) + .expect("set_nonblocking(false)"); + let stop = Arc::clone(&stop_clone); + let h = thread::spawn(move || { + if perform_server_upgrade(&mut stream).is_ok() { + match mode { + MockMode::Park => park_connection(&mut stream, &stop), + MockMode::AckEachFrame => { + ack_each_frame(&mut stream, &stop) + } + } + } + }); + handles.push(h); + } + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => { + thread::sleep(Duration::from_millis(10)); + } + Err(_) => break, + } + } + for h in handles { + let _ = h.join(); + } + }) + .expect("spawn mock server"); + + Self { + port, + stop, + accepted, + join: Some(join), + } + } + + fn port(&self) -> u16 { + self.port + } + + fn accepted(&self) -> usize { + self.accepted.load(Ordering::SeqCst) + } +} + +impl Drop for MockServer { + fn drop(&mut self) { + self.stop.store(true, Ordering::SeqCst); + if let Some(h) = self.join.take() { + let _ = h.join(); + } + } +} + +fn park_connection(stream: &mut std::net::TcpStream, stop: &AtomicBool) { + let _ = stream.set_read_timeout(Some(Duration::from_millis(100))); + let mut buf = [0u8; 1024]; + while !stop.load(Ordering::SeqCst) { + match stream.read(&mut buf) { + Ok(0) => break, + Ok(_) => {} + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + continue; + } + Err(_) => break, + } + } +} + +/// Read each WebSocket binary frame the client sends and reply with a QWP +/// OK ack, incrementing the wire sequence per frame. Control frames are +/// ignored. Exits on EOF or `stop`. +fn ack_each_frame(stream: &mut std::net::TcpStream, stop: &AtomicBool) { + let _ = stream.set_read_timeout(Some(Duration::from_millis(50))); + let mut next_wire_seq: u64 = 0; + while !stop.load(Ordering::SeqCst) { + match read_frame(stream) { + Ok((_fin, opcode, _payload)) => { + // Opcode 0x2 = binary; 0x8 = close; everything else is ignored. 
+ if opcode == 0x8 { + break; + } + if opcode != 0x2 { + continue; + } + if write_qwp_ok_response(stream, next_wire_seq).is_err() { + break; + } + next_wire_seq += 1; + } + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + continue; + } + Err(_) => break, + } + } +} + +fn conf_for(port: u16, extras: &str) -> String { + format!( + "qwpws::addr=127.0.0.1:{port};auth_timeout=2000;reconnect_max_duration_millis=1000;{extras}" + ) +} + +#[test] +fn refuses_non_qwp_ws_schema() { + let err = QuestDb::connect("http::addr=localhost:9000;").unwrap_err(); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!(err.msg().contains("QWP/WebSocket")); +} + +#[test] +fn refuses_sf_dir() { + let err = QuestDb::connect("qwpws::addr=localhost:9000;sf_dir=/tmp/sf;").unwrap_err(); + assert_eq!(err.code(), ErrorCode::ConfigError); + assert!( + err.msg().contains("store-and-forward") && err.msg().contains("sf_dir"), + "msg: {}", + err.msg() + ); +} + +#[test] +fn eager_open_opens_pool_size_connections() { + let server = MockServer::spawn(8); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=3;pool_max=4;")).unwrap(); + assert_eq!(db.free_count(), 3); + assert_eq!(db.in_use_count(), 0); + // Give the server thread time to register the accepts (the upgrades + // complete before `connect` returns, but the AtomicUsize is incremented + // before `perform_server_upgrade`). + assert!(wait_until(Duration::from_secs(2), || server.accepted() == 3)); + drop(db); +} + +#[test] +fn borrow_and_return_reuses_connection() { + let server = MockServer::spawn(2); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=2;")).unwrap(); + assert_eq!(db.free_count(), 1); + { + let _borrow = db.borrow_sender().expect("borrow"); + assert_eq!(db.free_count(), 0); + assert_eq!(db.in_use_count(), 1); + } + // Drop returns the sender to the pool.
+ assert_eq!(db.free_count(), 1); + assert_eq!(db.in_use_count(), 0); + // Same physical connection — server only ever accepted one. + assert_eq!(server.accepted(), 1); + drop(db); +} + +#[test] +fn auto_grow_opens_new_connection_until_pool_max() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=3;")).unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2 (auto-grow)"); + let b3 = db.borrow_sender().expect("b3 (auto-grow)"); + assert_eq!(db.free_count(), 0); + assert_eq!(db.in_use_count(), 3); + assert!(wait_until(Duration::from_secs(2), || server.accepted() == 3)); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + drop(db); +} + +#[test] +fn fail_fast_at_pool_max() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=2;")).unwrap(); + let _b1 = db.borrow_sender().expect("b1"); + let _b2 = db.borrow_sender().expect("b2"); + let err = db.borrow_sender().expect_err("must fail-fast at cap"); + assert_eq!(err.code(), ErrorCode::InvalidApiCall); + assert!(err.msg().contains("pool_max"), "msg: {}", err.msg()); +} + +#[test] +fn concurrent_borrow_and_return_does_not_deadlock_or_leak() { + let server = MockServer::spawn(16); + let db = + Arc::new(QuestDb::connect(&conf_for(server.port(), "pool_size=1;pool_max=8;")).unwrap()); + let mut handles = Vec::new(); + for _ in 0..8 { + let db = Arc::clone(&db); + handles.push(thread::spawn(move || { + for _ in 0..16 { + let borrow = db.borrow_sender().expect("borrow_sender under contention"); + // Tiny critical section to encourage contention. + std::hint::black_box(&borrow); + thread::yield_now(); + } + })); + } + for h in handles { + h.join().expect("worker thread"); + } + // After all workers finish: every borrow returned.
+ assert_eq!(db.in_use_count(), 0); + assert!(db.free_count() >= 1); +} + +#[test] +fn manual_reap_closes_excess_idle_connections() { + let server = MockServer::spawn(4); + let db = QuestDb::connect(&conf_for( + server.port(), + "pool_size=1;pool_max=3;pool_idle_timeout_ms=50;pool_reap=manual;", + )) + .unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2 (grow)"); + let b3 = db.borrow_sender().expect("b3 (grow)"); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + + // Reap before the idle timeout — nothing should be closed. + let immediate = db.reap_idle(); + assert_eq!(immediate, 0); + assert_eq!(db.free_count(), 3); + + // Wait past the idle timeout, then reap. Must keep `pool_size` warm. + thread::sleep(Duration::from_millis(120)); + let closed = db.reap_idle(); + assert_eq!(closed, 2, "should reap the two excess-over-pool_size slots"); + assert_eq!(db.free_count(), 1, "pool_size warm slot must stay"); + drop(db); +} + +#[test] +fn auto_reaper_closes_excess_idle_connections() { + let server = MockServer::spawn(4); + // tick = max(5s, timeout/12); the 100 ms timeout used here sits far below + // the 5 s floor, so the reaper wakes on its 5 s ticker. + let db = QuestDb::connect(&conf_for( + server.port(), + "pool_size=1;pool_max=3;pool_idle_timeout_ms=100;pool_reap=auto;", + )) + .unwrap(); + let b1 = db.borrow_sender().expect("b1"); + let b2 = db.borrow_sender().expect("b2"); + let b3 = db.borrow_sender().expect("b3"); + drop(b1); + drop(b2); + drop(b3); + assert_eq!(db.free_count(), 3); + + // Auto reaper wakes on a `max(5s, timeout/12)` ticker. With timeout=100ms, + // the floor of 5s applies. Wait > 5s for the first wake-up.
+    let reaped = wait_until(Duration::from_secs(8), || db.free_count() == 1);
+    assert!(
+        reaped,
+        "auto reaper failed to drain excess; free={}",
+        db.free_count()
+    );
+    drop(db);
+}
+
+// ---------- WS-1: flush round-trip ----------
+
+/// `sync(AckLevel::Durable)` on a sender whose connect string carries no
+/// extra settings must fail with `ErrorCode::InvalidApiCall`, and the error
+/// message must mention `request_durable_ack`.
+#[test]
+fn refuses_durable_ack_without_opt_in() {
+    let server = MockServer::spawn(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let err = sender
+        .sync(AckLevel::Durable)
+        .expect_err("durable without opt-in must fail");
+    assert_eq!(err.code(), ErrorCode::InvalidApiCall);
+    assert!(
+        err.msg().contains("request_durable_ack"),
+        "msg: {}",
+        err.msg()
+    );
+}
+
+/// Same rejection as `refuses_durable_ack_without_opt_in`, but observed from
+/// the server side as well.
+///
+/// A hand-rolled listener replaces `MockServer` so the server thread can
+/// report what it saw: after the upgrade it waits up to 200 ms for one frame
+/// and sends `Some(opcode)` if a frame arrived, or `None` if the read timed
+/// out (`WouldBlock` / `TimedOut`). The test requires `None`, i.e. nothing
+/// was written to the socket by the rejected `sync`.
+#[test]
+fn durable_ack_without_opt_in_does_not_publish_commit_frame() {
+    let listener = TcpListener::bind("127.0.0.1:0").expect("bind 127.0.0.1");
+    let port = listener.local_addr().expect("local_addr").port();
+    let (tx, rx) = mpsc::channel();
+
+    let handle = thread::spawn(move || {
+        let (mut stream, _) = listener.accept().expect("accept");
+        perform_server_upgrade(&mut stream).expect("upgrade");
+        stream
+            .set_read_timeout(Some(Duration::from_millis(200)))
+            .expect("set read timeout");
+        let frame = match read_frame(&mut stream) {
+            Ok((_fin, opcode, _payload)) => Some(opcode),
+            Err(e)
+                if e.kind() == std::io::ErrorKind::WouldBlock
+                    || e.kind() == std::io::ErrorKind::TimedOut =>
+            {
+                None
+            }
+            Err(e) => panic!("unexpected server read error: {e}"),
+        };
+        tx.send(frame).expect("send frame observation");
+    });
+
+    let db = QuestDb::connect(&conf_for(port, "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let err = sender
+        .sync(AckLevel::Durable)
+        .expect_err("durable without opt-in must fail before publish");
+    assert_eq!(err.code(), ErrorCode::InvalidApiCall);
+    assert!(
+        err.msg().contains("request_durable_ack"),
+        "msg: {}",
+        err.msg()
+    );
+    assert_eq!(
+        rx.recv_timeout(Duration::from_secs(2))
+            .expect("server observation"),
+        None,
+        "sync must reject durable ACK before sending a commit frame"
+    );
+
+    drop(sender);
+    drop(db);
+    handle.join().expect("server thread");
+}
+
+/// Flushing a chunk with zero rows and then calling `sync(AckLevel::Ok)`
+/// must succeed; the chunk reports zero rows before and after the flush.
+#[test]
+fn empty_chunk_flush_round_trips() {
+    let server = MockServer::spawn_acking(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let mut chunk = Chunk::new("trades");
+    assert_eq!(chunk.row_count(), 0);
+    sender.flush(&mut chunk).unwrap();
+    sender
+        .sync(AckLevel::Ok)
+        .expect("empty-chunk flush must round-trip");
+    // Flush clears the chunk.
+    assert_eq!(chunk.row_count(), 0);
+}
+
+/// After 127 flushes with no intervening `sync`, the next flush (of a
+/// one-row chunk) must fail with `InvalidApiCall`, point the caller at
+/// `sync()`, and leave the chunk's single row in place.
+///
+/// NOTE(review): 127 presumably equals the sender's in-flight limit minus the
+/// slot reserved for the sync commit — confirm against the sender's window
+/// size if that limit ever changes.
+#[test]
+fn deferred_flush_reserves_slot_for_sync_commit() {
+    let server = MockServer::spawn(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "close_flush_timeout_millis=50;")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let mut chunk = Chunk::new("trades");
+
+    for _ in 0..127 {
+        sender.flush(&mut chunk).expect("flush below reserve");
+    }
+
+    chunk.column_i64("qty", &[42], None).expect("column_i64");
+    chunk
+        .designated_timestamp_nanos(&[1_700_000_000_000_000_000])
+        .expect("designated timestamp");
+    let err = sender
+        .flush(&mut chunk)
+        .expect_err("deferred flush must preserve the sync commit slot");
+    assert_eq!(err.code(), ErrorCode::InvalidApiCall);
+    assert!(err.msg().contains("sync()"), "msg: {}", err.msg());
+    assert_eq!(
+        chunk.row_count(),
+        1,
+        "capacity failure must leave the caller's chunk untouched"
+    );
+}
+
+/// Runs flush + `sync(AckLevel::Ok)` three times in a row on the same chunk
+/// object; every iteration must succeed.
+#[test]
+fn flush_clears_chunk_for_reuse_and_can_repeat() {
+    let server = MockServer::spawn_acking(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let mut chunk = Chunk::new("trades");
+    for _ in 0..3 {
+        sender.flush(&mut chunk).unwrap();
+        sender.sync(AckLevel::Ok).expect("repeated empty flush");
+    }
+}
+
+/// A chunk that has rows but no designated timestamp must be rejected by
+/// `flush` with `InvalidApiCall` and a message mentioning "designated"; the
+/// three rows must still be in the chunk afterwards.
+#[test]
+fn flush_rejects_chunk_with_no_designated_timestamp() {
+    let server = MockServer::spawn(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+    let mut chunk = Chunk::new("trades");
+    chunk
+        .column_i64("price", &[1, 2, 3], None)
+        .expect("column_i64");
+    let err = sender
+        .flush(&mut chunk)
+        .expect_err("non-empty chunk without designated_ts must error");
+    assert_eq!(err.code(), ErrorCode::InvalidApiCall);
+    assert!(err.msg().contains("designated"), "msg: {}", err.msg());
+    // Chunk is left untouched on failure.
+    assert_eq!(chunk.row_count(), 3);
+}
+
+/// Round-trips a three-row chunk with an `i64` column, an `f64` column, a
+/// UUID column carrying a validity bitmap, and a designated timestamp, then
+/// flushes a second two-row chunk with the same column names and types.
+/// Both flush + sync pairs must succeed and the first flush must leave the
+/// chunk empty.
+#[test]
+fn non_empty_chunk_with_numeric_columns_round_trips() {
+    use crate::ingress::column_sender::Validity;
+
+    let server = MockServer::spawn_acking(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+
+    let mut chunk = Chunk::new("trades");
+    chunk.column_i64("qty", &[10, 20, 30], None).unwrap();
+    chunk.column_f64("price", &[1.1, 2.2, 3.3], None).unwrap();
+    // Nullable column: bit 1 (row 1) is null.
+    let bits = [0b0000_0101];
+    let v = Validity::from_bitmap(&bits, 3).unwrap();
+    chunk
+        .column_uuid("id", &[[0x10; 16], [0; 16], [0x20; 16]], Some(&v))
+        .unwrap();
+    chunk
+        .designated_timestamp_nanos(&[
+            1_700_000_000_000_000_000,
+            1_700_000_000_000_001_000,
+            1_700_000_000_000_002_000,
+        ])
+        .unwrap();
+    assert_eq!(chunk.row_count(), 3);
+
+    sender.flush(&mut chunk).unwrap();
+    sender.sync(AckLevel::Ok).expect("numeric chunk flush");
+    assert!(chunk.is_empty(), "flush must clear the chunk");
+
+    // Second flush with the SAME schema exercises the SchemaRegistry's
+    // REFERENCE-mode shortcut: it must still round-trip cleanly.
+    chunk.column_i64("qty", &[40, 50], None).unwrap();
+    chunk.column_f64("price", &[4.4, 5.5], None).unwrap();
+    chunk
+        .column_uuid("id", &[[0x30; 16], [0x40; 16]], None)
+        .unwrap();
+    chunk
+        .designated_timestamp_nanos(&[1_700_000_000_000_003_000, 1_700_000_000_000_004_000])
+        .unwrap();
+    sender.flush(&mut chunk).unwrap();
+    sender
+        .sync(AckLevel::Ok)
+        .expect("second flush (schema reuse)");
+}
+
+/// Round-trips a four-row chunk holding a varchar column (offsets + bytes,
+/// one null row, one multi-byte UTF-8 value), an `i64` column, and a
+/// designated timestamp; the chunk must be empty after the flush.
+#[test]
+fn varchar_chunk_round_trips() {
+    use crate::ingress::column_sender::Validity;
+
+    let server = MockServer::spawn_acking(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+
+    let mut chunk = Chunk::new("logs");
+    // 4 rows: "alpha", null, "gamma", "δ" (multi-byte UTF-8).
+    let bytes = b"alphagamma\xCE\xB4";
+    // Offsets length must be row_count + 1 = 5. The null row reuses the
+    // same offset on both sides per the plan's "skip slicing for null
+    // rows" rule.
+    let offsets: [i32; 5] = [0, 5, 5, 10, 12];
+    let bits = [0b0000_1101]; // 0,2,3 valid; 1 null
+    let v = Validity::from_bitmap(&bits, 4).unwrap();
+    chunk
+        .column_varchar("msg", &offsets, bytes, Some(&v))
+        .unwrap();
+    chunk
+        .column_i64("seq", &[100, 101, 102, 103], None)
+        .unwrap();
+    chunk
+        .designated_timestamp_nanos(&[
+            1_700_000_000_000_000_000,
+            1_700_000_000_000_001_000,
+            1_700_000_000_000_002_000,
+            1_700_000_000_000_003_000,
+        ])
+        .unwrap();
+    assert_eq!(chunk.row_count(), 4);
+    sender.flush(&mut chunk).unwrap();
+    sender.sync(AckLevel::Ok).expect("varchar flush");
+    assert!(chunk.is_empty());
+}
+
+/// Flushes two four-row chunks whose symbol column indexes into the same
+/// caller-side three-entry dictionary: the first uses entries 0 and 2, the
+/// second entries 1 and 0. Both flush + sync pairs must succeed.
+#[test]
+fn symbol_chunk_round_trips_and_reuses_global_dict() {
+    let server = MockServer::spawn_acking(2);
+    let db = QuestDb::connect(&conf_for(server.port(), "")).unwrap();
+    let mut sender = db.borrow_sender().expect("borrow");
+
+    // Caller has a 3-entry dict; first chunk only references entries 0 and 2,
+    // so the wire's delta-symbol-dict prefix carries those two new symbols.
+    let dict_bytes = b"alphabetagamma";
+    let dict_offsets: [i32; 4] = [0, 5, 9, 14];
+
+    let mut chunk = Chunk::new("trades");
+    chunk
+        .symbol_dict_i32("sym", &[0, 2, 0, 2], &dict_offsets, dict_bytes, None)
+        .expect("symbol_dict_i32 first flush");
+    chunk.designated_timestamp_nanos(&[1, 2, 3, 4]).unwrap();
+    sender.flush(&mut chunk).unwrap();
+    sender.sync(AckLevel::Ok).expect("symbol flush 1");
+
+    // Second flush re-uses entry 0 ("alpha", already in the global dict)
+    // and adds entry 1 ("beta"). With the connection-scoped dict the
+    // wire prefix only resends "beta"; the round-trip must still succeed.
+    chunk
+        .symbol_dict_i32("sym", &[1, 0, 1, 0], &dict_offsets, dict_bytes, None)
+        .expect("symbol_dict_i32 second flush");
+    chunk.designated_timestamp_nanos(&[5, 6, 7, 8]).unwrap();
+    sender.flush(&mut chunk).unwrap();
+    sender.sync(AckLevel::Ok).expect("symbol flush 2");
+}
+
+/// With `pool_reap=auto` configured, `QuestDb::close` must return in under
+/// 10 seconds after a sender has been borrowed and handed back.
+#[test]
+fn close_joins_reaper_cleanly() {
+    let server = MockServer::spawn(2);
+    let db = QuestDb::connect(&conf_for(
+        server.port(),
+        // close_flush_timeout_millis bounds the per-Sender close drain, which
+        // otherwise can wait up to 5s for the mock server's (absent) WS close
+        // handshake. We only care here that the reaper thread joins.
+        "pool_size=1;pool_max=2;pool_idle_timeout_ms=500;pool_reap=auto;close_flush_timeout_millis=200;",
+    ))
+    .unwrap();
+    // Borrow + return so we have something to reap eventually.
+    let _ = db.borrow_sender().expect("borrow").must_close();
+    // close() must return promptly (no hang) — the join is the test.
+    let start = Instant::now();
+    db.close();
+    // The bar is "does not hang indefinitely", not strict latency. The
+    // mock server never replies to a WS close frame, so Sender::drop waits
+    // out the (200 ms) close-flush timeout; 10 s is plenty of headroom on
+    // a CI runner under load.
+    assert!(
+        start.elapsed() < Duration::from_secs(10),
+        "close() must not hang on the reaper (took {:?})",
+        start.elapsed()
+    );
+}
+
+/// Polls `predicate` every 50 ms until it returns `true` or `timeout` has
+/// elapsed.
+///
+/// Returns `true` as soon as the predicate holds and `false` once the
+/// deadline has passed without it holding. The predicate is evaluated before
+/// the deadline is checked, so it always runs at least once, even with a
+/// zero timeout.
+fn wait_until<F: FnMut() -> bool>(timeout: Duration, mut predicate: F) -> bool {
+    let deadline = Instant::now() + timeout;
+    loop {
+        if predicate() {
+            return true;
+        }
+        if Instant::now() >= deadline {
+            return false;
+        }
+        thread::sleep(Duration::from_millis(50));
+    }
+}
diff --git a/questdb-rs/src/tests/qwp_ws.rs b/questdb-rs/src/tests/qwp_ws.rs index cbd824fa..9e50a040 100644 --- a/questdb-rs/src/tests/qwp_ws.rs +++ b/questdb-rs/src/tests/qwp_ws.rs @@ -41,7 +41,7 @@ use crate::ingress::{ QwpWsProgress, SenderBuilder, SymbolGlobalDict, TableName, TimestampNanos, }; -const WS_GUID: &str = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"; +pub(crate) const WS_GUID: &str = "258EAFA5-E914-47DA-95CA-C5AB0DC85B11"; const FIRST_WIRE_SEQUENCE: u64 = 0; const QWP_STATUS_OK: u8 = 0x00; const QWP_STATUS_DURABLE_ACK: u8 = 0x02; @@ -94,7 +94,7 @@ struct MockResult { received_frames: Vec>, } -fn read_request_until_blank(stream: &mut R) -> std::io::Result> { +pub(crate) fn read_request_until_blank(stream: &mut R) -> std::io::Result> { let mut buf = Vec::new(); let mut tmp = [0u8; 256]; loop { @@ -110,7 +110,7 @@ fn read_request_until_blank(stream: &mut R) -> std::io::Result> Ok(buf) } -fn parse_header(req: &str, name: &str) -> Option { +pub(crate) fn parse_header(req: &str, name: &str) -> Option { for line in req.split("\r\n").skip(1) { if let Some((k, v)) = line.split_once(':') && k.trim().eq_ignore_ascii_case(name) @@ -121,7 +121,7 @@ fn parse_header(req: &str, name: &str) -> Option { None } -fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { +pub(crate) fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { let mut hdr = [0u8; 2]; stream.read_exact(&mut hdr)?; let fin = (hdr[0] & 0x80) != 0; @@ -155,7 +155,10 @@ fn read_frame(stream: &mut TcpStream) -> std::io::Result<(bool, u8, Vec)> { Ok((fin, opcode, payload)) }
-fn write_server_binary_frame(stream: &mut TcpStream, payload: &[u8]) -> std::io::Result<()> { +pub(crate) fn write_server_binary_frame( + stream: &mut TcpStream, + payload: &[u8], +) -> std::io::Result<()> { // FIN | binary, no mask (server→client). let mut frame = vec![0x82]; let plen = payload.len(); @@ -172,7 +175,7 @@ fn write_server_binary_frame(stream: &mut TcpStream, payload: &[u8]) -> std::io: stream.write_all(&frame) } -fn perform_server_upgrade(stream: &mut TcpStream) -> std::io::Result> { +pub(crate) fn perform_server_upgrade(stream: &mut TcpStream) -> std::io::Result> { stream.set_read_timeout(Some(Duration::from_secs(5)))?; stream.set_write_timeout(Some(Duration::from_secs(5)))?; @@ -272,7 +275,7 @@ fn write_raw_ws_frame(stream: &mut TcpStream, byte0: u8, payload: &[u8]) -> std: stream.write_all(&frame) } -fn write_qwp_ok_response(stream: &mut TcpStream, wire_seq: u64) -> std::io::Result<()> { +pub(crate) fn write_qwp_ok_response(stream: &mut TcpStream, wire_seq: u64) -> std::io::Result<()> { let mut ok = Vec::new(); ok.push(QWP_STATUS_OK); ok.extend_from_slice(&wire_seq.to_le_bytes()); @@ -325,7 +328,7 @@ fn write_qwp_error_response( write_server_binary_frame(stream, &err) } -fn compute_accept(key_b64: &str) -> String { +pub(crate) fn compute_accept(key_b64: &str) -> String { use base64ct::{Base64, Encoding}; let combined = format!("{key_b64}{WS_GUID}"); let digest = sha1(combined.as_bytes()); @@ -407,7 +410,7 @@ fn upgrade_mock_stream_without_upgrade_header(stream: &mut TcpStream) { // Mirror of the production SHA-1 used by the sender, reproduced here to // validate the upgrade handshake from the server side without poking at // internals. ~50 lines is cheaper than another dependency. -fn sha1(input: &[u8]) -> [u8; 20] { +pub(crate) fn sha1(input: &[u8]) -> [u8; 20] { let (mut h0, mut h1, mut h2, mut h3, mut h4) = ( 0x67452301u32, 0xEFCDAB89,