From ca2ffaf6f2a62f3303448389aad3803f7835a98f Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Mon, 23 Feb 2026 17:17:52 +0100 Subject: [PATCH 01/86] add ClickHouse destination support (wip) Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- Cargo.lock | 1746 ++++++++------- Cargo.toml | 1 + etl-destinations/Cargo.toml | 8 + etl-destinations/src/clickhouse/core.rs | 992 +++++++++++ etl-destinations/src/clickhouse/metrics.rs | 32 + etl-destinations/src/clickhouse/mod.rs | 5 + etl-destinations/src/clickhouse/schema.rs | 292 ++++ etl-destinations/src/lib.rs | 2 + etl-examples/Cargo.toml | 3 +- etl-examples/README.md | 40 + 10 files changed, 2063 insertions(+), 1058 deletions(-) create mode 100644 etl-destinations/src/clickhouse/core.rs create mode 100644 etl-destinations/src/clickhouse/metrics.rs create mode 100644 etl-destinations/src/clickhouse/mod.rs create mode 100644 etl-destinations/src/clickhouse/schema.rs diff --git a/Cargo.lock b/Cargo.lock index 27a513d10..8a126e9ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,9 +21,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.12.0" +version = "3.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f860ee6746d0c5b682147b2f7f8ef036d4f92fe518251a3a35ffa3650eafdf0e" +checksum = "7926860314cbe2fb5d1f13731e387ab43bd32bca224e82e6e2db85de0a3dba49" dependencies = [ "actix-codec", "actix-rt", @@ -58,14 +58,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "actix-router" -version = "0.5.4" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14f8c75c51892f18d9c46150c5ac7beb81c95f78c8b83a634d49f4ca32551fe7" +checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" dependencies = [ "bytestring", "cfg-if", @@ -124,9 +124,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.13.0" +version = "4.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff87453bc3b56e9b2b23c1cc0b1be8797184accf51d2abe0f8a33ec275d316bf" +checksum = "1654a77ba142e37f049637a3e5685f864514af11fcbc51cb51eb6596afe5b8d6" dependencies = [ "actix-codec", "actix-http", @@ -157,7 +157,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "smallvec", - "socket2 0.6.3", + "socket2 0.6.1", "time", "tracing", "url", @@ -172,7 +172,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -286,9 +286,9 @@ dependencies = [ [[package]] name = "anstream" -version = "1.0.0" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -301,15 +301,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.14" +version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" -version = "1.0.0" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] @@ -320,7 +320,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -331,14 +331,14 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "anyhow" -version = "1.0.102" +version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "apache-avro" @@ -358,9 +358,9 @@ dependencies = [ "serde", "serde_bytes", "serde_json", - "strum 0.27.2", - "strum_macros 0.27.2", - "thiserror 2.0.18", + "strum", + "strum_macros", + "thiserror 2.0.17", "uuid", "zstd", ] @@ -376,9 +376,9 @@ dependencies = [ [[package]] name = "arc-swap" -version = "1.9.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +checksum = "51d03449bb8ca2cc2ef70869af31463d1ae5ccc8fa3e334b307203fbf815207e" dependencies = [ "rustversion", ] @@ -403,96 +403,46 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" -dependencies = [ - "arrow-arith 57.3.0", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-data 57.3.0", - "arrow-ord 57.3.0", - "arrow-row 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "arrow-string 57.3.0", -] - -[[package]] -name = "arrow" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602268ce9f569f282cedb9a9f6bac569b680af47b9b077d515900c03c5d190da" -dependencies = [ - "arrow-arith 58.0.0", - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-cast 58.0.0", - "arrow-data 58.0.0", - "arrow-ord 58.0.0", - "arrow-row 58.0.0", - "arrow-schema 58.0.0", - "arrow-select 58.0.0", - "arrow-string 58.0.0", -] - -[[package]] -name = "arrow-arith" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +checksum = "2a2b10dcb159faf30d3f81f6d56c1211a5bea2ca424eabe477648a44b993320e" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "chrono", - "num-traits", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", ] [[package]] name = "arrow-arith" -version = "58.0.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd53c6bf277dea91f136ae8e3a5d7041b44b5e489e244e637d00ae302051f56f" +checksum = "288015089e7931843c80ed4032c5274f02b37bcb720c4a42096d50b390e70372" dependencies = [ - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 
58.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "num-traits", ] [[package]] name = "arrow-array" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +checksum = "65ca404ea6191e06bf30956394173337fa9c35f445bd447fe6c21ab944e1a23c" dependencies = [ "ahash 0.8.12", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "chrono", - "half", - "hashbrown 0.16.1", - "num-complex", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-array" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e53796e07a6525edaf7dc28b540d477a934aff14af97967ad1d5550878969b9e" -dependencies = [ - "ahash 0.8.12", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 58.0.0", + "arrow-buffer", + "arrow-data", + "arrow-schema", "chrono", "half", "hashbrown 0.16.1", @@ -503,21 +453,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" -dependencies = [ - "bytes", - "half", - "num-bigint", - "num-traits", -] - -[[package]] -name = "arrow-buffer" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2c1a85bb2e94ee10b76531d8bc3ce9b7b4c0d508cabfb17d477f63f2617bd20" +checksum = "36356383099be0151dacc4245309895f16ba7917d79bdb71a7148659c9206c56" dependencies = [ "bytes", "half", @@ -527,16 +465,16 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +checksum = "9c8e372ed52bd4ee88cc1e6c3859aa7ecea204158ac640b10e187936e7e87074" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-ord 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", "atoi", "base64", "chrono", @@ -546,49 +484,14 @@ dependencies = [ "ryu", ] -[[package]] -name = "arrow-cast" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89fb245db6b0e234ed8e15b644edb8664673fefe630575e94e62cd9d489a8a26" -dependencies = [ - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-ord 58.0.0", - "arrow-schema 58.0.0", - "arrow-select 58.0.0", - "atoi", - "base64", - "chrono", - "comfy-table", - "half", - "lexical-core", - "num-traits", - "ryu", -] - [[package]] name = "arrow-data" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +checksum = "bf87f4ff5fc13290aa47e499a8b669a82c5977c6a1fedce22c7f542c1fd5a597" dependencies = [ - "arrow-buffer 57.3.0", - "arrow-schema 57.3.0", - "half", - "num-integer", - "num-traits", -] - -[[package]] -name = "arrow-data" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "189d210bc4244c715fa3ed9e6e22864673cccb73d5da28c2723fb2e527329b33" -dependencies = [ - "arrow-buffer 58.0.0", - "arrow-schema 58.0.0", + "arrow-buffer", + "arrow-schema", "half", "num-integer", "num-traits", @@ -596,141 
+499,75 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +checksum = "eb3ca63edd2073fcb42ba112f8ae165df1de935627ead6e203d07c99445f2081" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "flatbuffers", ] [[package]] name = "arrow-ord" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" -dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", -] - -[[package]] -name = "arrow-ord" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "211136cb253577ee1a6665f741a13136d4e563f64f5093ffd6fb837af90b9495" -dependencies = [ - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 58.0.0", - "arrow-select 58.0.0", -] - -[[package]] -name = "arrow-row" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +checksum = "13c4e0530272ca755d6814218dffd04425c5b7854b87fa741d5ff848bf50aa39" dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "half", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", ] [[package]] name = "arrow-row" -version = "58.0.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e0f20145f9f5ea3fe383e2ba7a7487bf19be36aa9dbf5dd6a1f92f657179663" +checksum = "b07f52788744cc71c4628567ad834cadbaeb9f09026ff1d7a4120f69edf7abd3" dependencies = [ - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 58.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "half", ] [[package]] name = "arrow-schema" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" - -[[package]] -name = "arrow-schema" -version = "58.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b47e0ca91cc438d2c7879fe95e0bca5329fff28649e30a88c6f760b1faeddcb" -dependencies = [ - "bitflags", -] - -[[package]] -name = "arrow-select" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" -dependencies = [ - "ahash 0.8.12", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "num-traits", -] +checksum = "6bb63203e8e0e54b288d0d8043ca8fa1013820822a27692ef1b78a977d879f2c" [[package]] name = "arrow-select" -version = "58.0.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750a7d1dda177735f5e82a314485b6915c7cccdbb278262ac44090f4aba4a325" +checksum = "c96d8a1c180b44ecf2e66c9a2f2bbcb8b1b6f14e165ce46ac8bde211a363411b" dependencies = [ "ahash 0.8.12", - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 
58.0.0", - "num-traits", -] - -[[package]] -name = "arrow-string" -version = "57.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" -dependencies = [ - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-data 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "memchr", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", "num-traits", - "regex", - "regex-syntax", ] [[package]] name = "arrow-string" -version = "58.0.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1eab1208bc4fe55d768cdc9b9f3d9df5a794cdb3ee2586bf89f9b30dc31ad8c" +checksum = "a8ad6a81add9d3ea30bf8374ee8329992c7fd246ffd8b7e2f48a3cea5aa0cc9a" dependencies = [ - "arrow-array 58.0.0", - "arrow-buffer 58.0.0", - "arrow-data 58.0.0", - "arrow-schema 58.0.0", - "arrow-select 58.0.0", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", "memchr", "num-traits", "regex", @@ -785,7 +622,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -796,7 +633,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -822,9 +659,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-lc-rs" -version = "1.16.2" +version = "1.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" dependencies = [ "aws-lc-sys", "zeroize", @@ -832,9 +669,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.39.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" dependencies = [ "cc", "cmake", @@ -908,9 +745,9 @@ checksum = "230c5f1ca6a325a32553f8640d31ac9b49f2411e901e427570154868b46da4f7" [[package]] name = "bitflags" -version = "2.11.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" dependencies = [ "serde_core", ] @@ -945,11 +782,17 @@ dependencies = [ "objc2", ] +[[package]] +name = "bnum" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "119771309b95163ec7aaf79810da82f7cd0599c19722d48b9c03894dca833966" + [[package]] name = "bon" -version = "3.9.1" +version = "3.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +checksum = "234655ec178edd82b891e262ea7cf71f6584bcd09eff94db786be23f1821825c" dependencies = [ "bon-macros", "rustversion", @@ -957,9 +800,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.9.1" +version = "3.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +checksum = 
"89ec27229c38ed0eb3c0feee3d2c1d6a4379ae44f418a29a658890e062d8f365" dependencies = [ "darling 0.23.0", "ident_case", @@ -967,31 +810,30 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "borsh" -version = "1.6.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" +checksum = "d1da5ab77c1437701eeff7c88d968729e7766172279eab0676857b3d63af7a6f" dependencies = [ "borsh-derive", - "bytes", "cfg_aliases", ] [[package]] name = "borsh-derive" -version = "1.6.1" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfcfdc083699101d5a7965e49925975f2f55060f94f9a05e7187be95d530ca59" +checksum = "0686c856aa6aac0c4498f936d7d6a02df690f614c03e4d906d1018062b5c5e2c" dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1015,11 +857,20 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", +] + [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" [[package]] name = "bytecheck" @@ -1045,9 +896,9 @@ dependencies = [ [[package]] name = "bytemuck" -version = "1.25.0" +version = "1.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" [[package]] name = "byteorder" @@ -1060,6 +911,9 @@ name = "bytes" version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +dependencies = [ + "serde", +] [[package]] name = "bytestring" @@ -1070,17 +924,11 @@ dependencies = [ "bytes", ] -[[package]] -name = "cast" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" - [[package]] name = "cc" -version = "1.2.57" +version = "1.2.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" +checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" dependencies = [ "find-msvc-tools", "jobserver", @@ -1102,9 +950,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.44" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1116,9 +964,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = 
"c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" dependencies = [ "clap_builder", "clap_derive", @@ -1126,9 +974,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.6.0" +version = "4.5.54" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" dependencies = [ "anstream", "anstyle", @@ -1138,48 +986,85 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.6.0" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "clap_lex" -version = "1.1.0" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" [[package]] -name = "cmake" -version = "0.1.57" +name = "clickhouse" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +checksum = "d975a05171c6f8a453f60ec6287c0018c90911d5a8a46d9b6abe386ea359fab3" dependencies = [ - "cc", + "bnum", + "bstr", + "bytes", + "clickhouse-macros", + "clickhouse-types", + "futures-channel", + "futures-util", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "polonius-the-crab", + "quanta", + "rustls", + "serde", + "thiserror 2.0.17", + "tokio", + "url", ] [[package]] -name = "colorchoice" -version = "1.0.5" +name = "clickhouse-macros" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +checksum = "ff6669899e23cb87b43daf7996f0ea3b9c07d0fb933d745bb7b815b052515ae3" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 2.0.114", +] + +[[package]] +name = "clickhouse-types" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "358fbfd439fb0bed02a3e2ecc5131f6a9d039ba5639aed650cf0e845f6ebfc16" +dependencies = [ + "bytes", + "thiserror 2.0.17", +] [[package]] -name = "comfy-table" -version = "7.1.2" +name = "cmake" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" dependencies = [ - "strum 0.26.3", - "strum_macros 0.26.4", - "unicode-width", + "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + [[package]] name = "concurrent-queue" version = "2.5.0" @@ -1204,9 +1089,9 @@ dependencies = [ [[package]] name = "configcat" -version = "0.1.5" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9836fc676f74106765176c8fd0a3295e473fdb3bbeb97b3ab68575b1e5173543" +checksum = "07c24f431a8fe2bc8e7b1ede60acd0a57df8512b2a80a1cbc7ee349961974fc4" dependencies = [ "arc-swap", "base16ct", @@ -1392,6 +1277,16 @@ 
dependencies = [ "darling_macro 0.20.11", ] +[[package]] +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core 0.21.3", + "darling_macro 0.21.3", +] + [[package]] name = "darling" version = "0.23.0" @@ -1413,7 +1308,21 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.114", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.114", ] [[package]] @@ -1426,7 +1335,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1437,7 +1346,18 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.117", + "syn 2.0.114", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core 0.21.3", + "quote", + "syn 2.0.114", ] [[package]] @@ -1448,9 +1368,27 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.117", + "syn 2.0.114", +] + +[[package]] +name = "deadpool" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0be2b1d1d6ec8d846f05e137292d0b89133caf95ef33695424c09568bdd39b1b" +dependencies = [ + "deadpool-runtime", + "lazy_static", + "num_cpus", + "tokio", ] +[[package]] +name = "deadpool-runtime" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" + [[package]] name = "debugid" version = "0.8.0" @@ -1470,17 +1408,7 @@ dependencies = [ "const-oid", "der_derive", "flagset", - "pem-rfc7468 0.7.0", - "zeroize", -] - -[[package]] -name = "der" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" -dependencies = [ - "pem-rfc7468 1.0.0", + "pem-rfc7468", "zeroize", ] @@ -1492,14 +1420,14 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "deranged" -version = "0.5.8" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", "serde_core", @@ -1513,7 +1441,7 @@ checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1534,7 +1462,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1544,7 +1472,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.117", + "syn 
2.0.114", ] [[package]] @@ -1566,7 +1494,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.117", + "syn 2.0.114", "unicode-xid", ] @@ -1584,9 +1512,9 @@ dependencies = [ [[package]] name = "dispatch2" -version = "0.3.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" +checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec" dependencies = [ "bitflags", "objc2", @@ -1600,14 +1528,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "dissimilar" -version = "1.0.11" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aeda16ab4059c5fd2a83f2b9c9e9c981327b18aa8e3b313f7e6563799d4f093e" +checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921" [[package]] name = "dlv-list" @@ -1624,24 +1552,6 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" -[[package]] -name = "duckdb" -version = "1.10501.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f13bc6d6487032fc2825a62ef8b4924b2378a2eb3166e132e5f3141ae9dd633f" -dependencies = [ - "arrow 58.0.0", - "cast", - "fallible-iterator 0.3.0", - "fallible-streaming-iterator", - "hashlink 0.10.0", - "libduckdb-sys", - "num-integer", - "r2d2", - "rust_decimal", - "strum 0.27.2", -] - [[package]] name = "dunce" version = "1.0.5" @@ -1663,7 +1573,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1701,7 +1611,7 @@ checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1717,7 +1627,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -1751,7 +1661,6 @@ dependencies = [ "postgres-replication", "rand 0.9.2", "rustls", - "serde", "serde_json", "sqlx", "sysinfo", @@ -1796,11 +1705,10 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tracing", "tracing-actix-web", - "url", "utoipa", "utoipa-swagger-ui", "uuid", @@ -1830,11 +1738,11 @@ dependencies = [ "secrecy", "serde", "serde_json", + "serde_yaml", "sqlx", "tempfile", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio-postgres", - "url", "utoipa", ] @@ -1842,11 +1750,11 @@ dependencies = [ name = "etl-destinations" version = "0.1.0" dependencies = [ - "arrow 57.3.0", + "arrow", "async-trait", "base64", "chrono", - "duckdb", + "clickhouse", "etl", "etl-telemetry", "futures", @@ -1854,22 +1762,16 @@ dependencies = [ "iceberg", "iceberg-catalog-rest", "metrics", - "parking_lot", "parquet", - "pg_escape", "prost", - "r2d2", "rand 0.9.2", "reqwest", "rustls", "serde", "serde_json", - "tempfile", "tokio", - "tokio-postgres", "tonic", "tracing", - "url", "uuid", ] @@ -1879,14 +1781,12 @@ version = "0.1.0" dependencies = [ "clap", "etl", - "etl-config", "etl-destinations", - "etl-telemetry", "rustls", + "sysinfo", "tokio", "tracing", "tracing-subscriber", - "url", ] [[package]] @@ -1900,7 +1800,7 @@ dependencies = [ 
"serde", "serde_json", "sqlx", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tokio-postgres", "tracing", @@ -1927,7 +1827,6 @@ dependencies = [ "tikv-jemallocator", "tokio", "tracing", - "url", ] [[package]] @@ -1936,9 +1835,8 @@ version = "0.1.0" dependencies = [ "etl-config", "metrics-exporter-prometheus", - "serde", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tracing", "tracing-appender", @@ -1994,40 +1892,17 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" -[[package]] -name = "fallible-iterator" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" - -[[package]] -name = "fallible-streaming-iterator" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" - [[package]] name = "fastrand" version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" -[[package]] -name = "filetime" -version = "0.2.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" -dependencies = [ - "cfg-if", - "libc", - "libredox", -] - [[package]] name = "find-msvc-tools" -version = "0.1.9" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" [[package]] name = "findshlibs" @@ -2065,9 +1940,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.9" +version = "1.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" dependencies = [ "crc32fast", "miniz_oxide", @@ -2141,9 +2016,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "futures" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -2156,9 +2031,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -2166,15 +2041,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -2194,38 +2069,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "futures-sink" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -2235,16 +2110,18 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", + "pin-utils", "slab", ] [[package]] name = "gcp-bigquery-client" version = "0.28.0" -source = "git+https://github.com/iambriccardo/gcp-bigquery-client?rev=c4fc59e338ca181d29b0dd53cac786fbe8513633#c4fc59e338ca181d29b0dd53cac786fbe8513633" +source = "git+https://github.com/iambriccardo/gcp-bigquery-client?rev=81ea3352af2e5fcbf04cd0ae47572d5ae97f992a#81ea3352af2e5fcbf04cd0ae47572d5ae97f992a" dependencies = [ "async-stream", "async-trait", + "deadpool", "dyn-clone", "futures", "hyper-util", @@ -2256,7 +2133,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", "time", "tokio", "tokio-stream", @@ -2301,24 +2178,11 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi 5.3.0", + "r-efi", "wasip2", "wasm-bindgen", ] -[[package]] -name = "getrandom" -version = "0.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" -dependencies = [ - "cfg-if", - "libc", - "r-efi 6.0.0", - "wasip2", - "wasip3", -] - [[package]] name = "gimli" version = "0.32.3" @@ -2474,12 +2338,29 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + [[package]] name = "hex" version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "higher-kinded-types" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e690f8474c6c5d8ff99656fcbc195a215acc3949481a8b0b3351c838972dc776" +dependencies = [ + "macro_rules_attribute", + "never-say-never", + "paste", +] + [[package]] name = "hkdf" version = "0.12.4" @@ -2633,7 +2514,7 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.6", + "webpki-roots 1.0.5", ] [[package]] @@ -2667,13 +2548,14 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.20" +version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" dependencies = [ "base64", "bytes", "futures-channel", + "futures-core", "futures-util", "http 1.4.0", "http-body", @@ -2682,7 +2564,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2 0.6.1", "system-configuration", "tokio", "tower-service", @@ -2692,9 +2574,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.65" +version = "0.1.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -2723,14 +2605,14 @@ dependencies = [ "anyhow", "apache-avro", "array-init", - "arrow-arith 57.3.0", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-ord 57.3.0", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", - "arrow-string 57.3.0", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "arrow-string", "as-any", "async-trait", "backon", @@ -2743,7 +2625,7 @@ dependencies = [ "flate2", "fnv", "futures", - "itertools 0.13.0", + "itertools", "moka", "murmur3", "num-bigint", @@ -2762,7 +2644,7 @@ dependencies = [ "serde_json", "serde_repr", "serde_with", - "strum 0.27.2", + "strum", "tokio", "typed-builder", "url", @@ -2780,7 +2662,7 @@ dependencies = [ "chrono", "http 1.4.0", "iceberg", - "itertools 0.13.0", + "itertools", "reqwest", "serde", "serde_derive", @@ -2872,12 +2754,6 @@ dependencies = [ "zerovec", ] -[[package]] -name = "id-arena" -version = "2.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" - [[package]] name = "ident_case" version = "1.0.1" @@ -2936,9 +2812,9 @@ dependencies = [ [[package]] name = "insta" -version = "1.46.3" +version = "1.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82db8c87c7f1ccecb34ce0c24399b8a73081427f3c7c50a5d597925356115e4" +checksum = "1b66886d14d18d420ab5052cbff544fc5d34d0b2cdd35eb5976aaa10a4a472e5" dependencies = [ "once_cell", "pest", @@ -2956,15 +2832,15 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.12.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" -version = 
"0.7.11" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" dependencies = [ "memchr", "serde", @@ -2985,26 +2861,17 @@ dependencies = [ "either", ] -[[package]] -name = "itertools" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" -dependencies = [ - "either", -] - [[package]] name = "itoa" -version = "1.0.18" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jiff" -version = "0.2.23" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" dependencies = [ "jiff-static", "jiff-tzdb-platform", @@ -3012,25 +2879,25 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] name = "jiff-static" -version = "0.2.23" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +checksum = "e0c84ee7f197eca9a86c6fd6cb771e55eb991632f15f2bc3ca6ec838929e6e78" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "jiff-tzdb" -version = "0.1.6" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" +checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" [[package]] name = "jiff-tzdb-platform" @@ -3053,9 +2920,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -3083,7 +2950,7 @@ dependencies = [ "pest_derive", "regex", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", ] [[package]] @@ -3150,7 +3017,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tokio-util", "tower", @@ -3174,7 +3041,7 @@ dependencies = [ "serde", "serde-value", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", ] [[package]] @@ -3188,7 +3055,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3212,7 +3079,7 @@ dependencies = [ "pin-project", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tokio-util", "tracing", @@ -3233,12 +3100,6 @@ dependencies = [ "spin", ] -[[package]] -name = "leb128fmt" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" - [[package]] name = "lexical-core" version = "1.0.6" @@ -3298,43 +3159,25 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.183" +version = "0.2.180" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" - -[[package]] -name = "libduckdb-sys" -version = "1.10501.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12096c1694924782b3fe21e790630b77bacb4fcb7ad9d7ee0fec626f985bf248" -dependencies = [ - "cc", - "flate2", - "pkg-config", - "reqwest", - "serde", - "serde_json", - "tar", - "vcpkg", - "zip 6.0.0", -] +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libm" -version = "0.2.16" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" [[package]] name = "libredox" -version = "0.1.15" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" +checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ "bitflags", "libc", - "plain", - "redox_syscall 0.7.3", + "redox_syscall 0.7.0", ] [[package]] @@ -3349,9 +3192,9 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.12.1" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" @@ -3388,13 +3231,29 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.12.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" dependencies = [ "twox-hash", ] +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + [[package]] name = "matchers" version = "0.2.0" @@ -3416,9 +3275,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.8.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "metrics" @@ -3445,7 +3304,7 @@ dependencies = [ "metrics", "metrics-util", "quanta", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tracing", ] @@ -3512,9 +3371,9 @@ dependencies = [ [[package]] name = "moka" -version = "0.12.15" +version = "0.12.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" +checksum = "a3dec6bd31b08944e08b58fd99373893a6c17054d6f3ea5006cc894f4f4eee2a" dependencies = [ "async-lock", "crossbeam-channel", @@ -3550,21 +3409,27 @@ checksum = 
"e94e1e6445d314f972ff7395df2de295fe51b71821694f0b0e1e79c4f12c8577" [[package]] name = "native-tls" -version = "0.2.18" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" dependencies = [ "libc", "log", "openssl", - "openssl-probe 0.2.1", + "openssl-probe 0.1.6", "openssl-sys", "schannel", - "security-framework 3.7.0", + "security-framework 2.11.1", "security-framework-sys", "tempfile", ] +[[package]] +name = "never-say-never" +version = "6.6.666" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf5a574dadd7941adeaa71823ecba5e28331b8313fb2e1c6a5c7e5981ea53ad6" + [[package]] name = "nix" version = "0.30.1" @@ -3602,7 +3467,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3643,9 +3508,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-integer" @@ -3677,6 +3542,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "num_threads" version = "0.1.7" @@ -3688,18 +3563,18 @@ dependencies = [ [[package]] name = "objc2" -version = "0.6.4" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05" dependencies = [ "objc2-encode", ] [[package]] name = "objc2-cloud-kit" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c" +checksum = "17614fdcd9b411e6ff1117dfb1d0150f908ba83a7df81b1f118005fe0a8ea15d" dependencies = [ "bitflags", "objc2", @@ -3708,9 +3583,9 @@ dependencies = [ [[package]] name = "objc2-core-data" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa" +checksum = "291fbbf7d29287518e8686417cf7239c74700fd4b607623140a7d4a3c834329d" dependencies = [ "objc2", "objc2-foundation", @@ -3718,9 +3593,9 @@ dependencies = [ [[package]] name = "objc2-core-foundation" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" dependencies = [ "bitflags", "dispatch2", @@ -3729,9 +3604,9 @@ dependencies = [ [[package]] name = "objc2-core-graphics" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" +checksum = 
"989c6c68c13021b5c2d6b71456ebb0f9dc78d752e86a98da7c716f4f9470f5a4" dependencies = [ "bitflags", "dispatch2", @@ -3742,9 +3617,9 @@ dependencies = [ [[package]] name = "objc2-core-image" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006" +checksum = "79b3dc0cc4386b6ccf21c157591b34a7f44c8e75b064f85502901ab2188c007e" dependencies = [ "objc2", "objc2-foundation", @@ -3752,26 +3627,14 @@ dependencies = [ [[package]] name = "objc2-core-location" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca347214e24bc973fc025fd0d36ebb179ff30536ed1f80252706db19ee452009" +checksum = "ac0f75792558aa9d618443bbb5db7426a7a0b6fddf96903f86ef9ad02e135740" dependencies = [ "objc2", "objc2-foundation", ] -[[package]] -name = "objc2-core-text" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d" -dependencies = [ - "bitflags", - "objc2", - "objc2-core-foundation", - "objc2-core-graphics", -] - [[package]] name = "objc2-encode" version = "4.1.0" @@ -3780,9 +3643,9 @@ checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" [[package]] name = "objc2-foundation" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" +checksum = "900831247d2fe1a09a683278e5384cfb8c80c79fe6b166f9d14bfdde0ea1b03c" dependencies = [ "bitflags", "block2", @@ -3793,9 +3656,9 @@ dependencies = [ [[package]] name = "objc2-io-kit" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" +checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" dependencies = [ "libc", "objc2-core-foundation", @@ -3803,9 +3666,9 @@ dependencies = [ [[package]] name = "objc2-io-surface" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" +checksum = "7282e9ac92529fa3457ce90ebb15f4ecbc383e8338060960760fa2cf75420c3c" dependencies = [ "bitflags", "objc2", @@ -3814,9 +3677,9 @@ dependencies = [ [[package]] name = "objc2-quartz-core" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f" +checksum = "90ffb6a0cd5f182dc964334388560b12a57f7b74b3e2dec5e2722aa2dfb2ccd5" dependencies = [ "bitflags", "objc2", @@ -3826,9 +3689,9 @@ dependencies = [ [[package]] name = "objc2-ui-kit" -version = "0.3.2" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87d638e33c06f577498cbcc50491496a3ed4246998a7fbba7ccb98b1e7eab22" +checksum = "25b1312ad7bc8a0e92adae17aa10f90aae1fb618832f9b993b022b591027daed" dependencies = [ "bitflags", "block2", @@ -3839,7 +3702,6 @@ dependencies = [ "objc2-core-graphics", "objc2-core-image", "objc2-core-location", - "objc2-core-text", "objc2-foundation", "objc2-quartz-core", "objc2-user-notifications", @@ -3847,9 +3709,9 @@ dependencies = [ [[package]] name = "objc2-user-notifications" -version = "0.3.2" +version = "0.3.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9df9128cbbfef73cda168416ccf7f837b62737d748333bfe9ab71c245d76613e" +checksum = "2a3f5ec77a81d9e0c5a0b32159b0cb143d7086165e79708351e02bf37dfc65cd" dependencies = [ "objc2", "objc2-foundation", @@ -3866,9 +3728,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.4" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" @@ -3907,9 +3769,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.76" +version = "0.10.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" dependencies = [ "bitflags", "cfg-if", @@ -3928,7 +3790,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3939,15 +3801,15 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-probe" -version = "0.2.1" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" +checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" [[package]] name = "openssl-sys" -version = "0.9.112" +version = "0.9.111" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" dependencies = [ "cc", "libc", @@ -4030,18 +3892,18 @@ dependencies = [ [[package]] name = "parquet" -version = "57.3.0" +version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" +checksum = "5f6a2926a30477c0b95fea6c28c3072712b139337a242c2cc64817bdc20a8854" dependencies = [ "ahash 0.8.12", - "arrow-array 57.3.0", - "arrow-buffer 57.3.0", - "arrow-cast 57.3.0", - "arrow-data 57.3.0", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", "arrow-ipc", - "arrow-schema 57.3.0", - "arrow-select 57.3.0", + "arrow-schema", + "arrow-select", "base64", "brotli", "bytes", @@ -4095,15 +3957,6 @@ dependencies = [ "base64ct", ] -[[package]] -name = "pem-rfc7468" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" -dependencies = [ - "base64ct", -] - [[package]] name = "percent-encoding" version = "2.3.2" @@ -4112,9 +3965,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pest" -version = "2.8.6" +version = "2.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662" +checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" dependencies = [ "memchr", "ucd-trie", @@ -4122,9 +3975,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.8.6" +version = "2.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77" +checksum = "68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" dependencies = [ "pest", "pest_generator", @@ -4132,22 +3985,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.8.6" +version = "2.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f" +checksum = "3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "pest_meta" -version = "2.8.6" +version = "2.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" +checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" dependencies = [ "pest", "sha2", @@ -4203,7 +4056,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4217,29 +4070,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.11" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.11" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "pin-project-lite" -version = "0.2.17" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -4253,7 +4106,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" dependencies = [ - "der 0.7.10", + "der", "pkcs8", "spki", ] @@ -4264,7 +4117,7 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ - "der 0.7.10", + "der", "spki", ] @@ -4275,22 +4128,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" [[package]] -name = "plain" -version = "0.2.3" +name = "polonius-the-crab" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" +checksum = "ec242d7eccbb2fd8b3b5b6e3cf89f94a91a800f469005b44d154359609f8af72" +dependencies = [ + "higher-kinded-types", + "never-say-never", +] [[package]] name = "portable-atomic" -version = "1.13.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" [[package]] name = "portable-atomic-util" -version = 
"0.2.6" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" dependencies = [ "portable-atomic", ] @@ -4303,7 +4160,7 @@ dependencies = [ "base64", "byteorder", "bytes", - "fallible-iterator 0.2.0", + "fallible-iterator", "hmac", "md-5", "memchr", @@ -4334,7 +4191,7 @@ source = "git+https://github.com/MaterializeInc/rust-postgres?rev=c4b473b478b3ad dependencies = [ "bytes", "chrono", - "fallible-iterator 0.2.0", + "fallible-iterator", "postgres-protocol", "serde", "serde_json", @@ -4372,23 +4229,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "proc-macro-crate" -version = "3.5.0" +version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" +checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" dependencies = [ "toml_edit", ] [[package]] name = "proc-macro2" -version = "1.0.106" +version = "1.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" dependencies = [ "unicode-ident", ] @@ -4410,7 +4267,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.14.0", + "itertools", "log", "multimap", "petgraph", @@ -4420,7 +4277,7 @@ dependencies = [ "pulldown-cmark", "pulldown-cmark-to-cmark", "regex", - "syn 2.0.117", + "syn 2.0.114", "tempfile", ] @@ -4431,10 +4288,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4468,9 +4325,9 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.13.3" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" dependencies = [ "bitflags", "memchr", @@ -4540,8 +4397,8 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.3", - "thiserror 2.0.18", + "socket2 0.6.1", + "thiserror 2.0.17", "tokio", "tracing", "web-time", @@ -4549,9 +4406,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.14" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", "getrandom 0.3.4", @@ -4562,7 +4419,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.18", + "thiserror 2.0.17", "tinyvec", "tracing", "web-time", @@ -4577,16 +4434,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2 0.6.1", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" 
-version = "1.0.45" +version = "1.0.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a" dependencies = [ "proc-macro2", ] @@ -4597,23 +4454,6 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" -[[package]] -name = "r-efi" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" - -[[package]] -name = "r2d2" -version = "0.8.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" -dependencies = [ - "log", - "parking_lot", - "scheduled-thread-pool", -] - [[package]] name = "radium" version = "0.7.0" @@ -4708,9 +4548,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" dependencies = [ "bitflags", ] @@ -4732,14 +4572,14 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "regex" -version = "1.12.3" +version = "1.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" dependencies = [ "aho-corasick", "memchr", @@ -4749,9 +4589,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.14" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" dependencies = [ "aho-corasick", "memchr", @@ -4760,15 +4600,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.9" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" +checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "rend" @@ -4853,7 +4693,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.6", + "webpki-roots 1.0.5", ] [[package]] @@ -4949,7 +4789,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.117", + "syn 2.0.114", "walkdir", ] @@ -4991,9 +4831,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.27" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" [[package]] name = 
"rustc-hash" @@ -5012,22 +4852,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.4" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ "bitflags", "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", "log", @@ -5058,10 +4898,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe 0.2.1", + "openssl-probe 0.2.0", "rustls-pki-types", "schannel", - "security-framework 3.7.0", + "security-framework 3.5.1", ] [[package]] @@ -5075,9 +4915,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.14.0" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" dependencies = [ "web-time", "zeroize", @@ -5085,9 +4925,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ "aws-lc-rs", "ring", @@ -5103,9 +4943,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.23" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" [[package]] name = "same-file" @@ -5118,22 +4958,13 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.29" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ "windows-sys 0.61.2", ] -[[package]] -name = "scheduled-thread-pool" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" -dependencies = [ - "parking_lot", -] - [[package]] name = "schemars" version = "0.8.22" @@ -5160,9 +4991,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.2.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +checksum = "54e910108742c57a770f492731f99be216a52fadd361b06c8fb59d74ccc267d2" dependencies = [ "dyn-clone", "ref-cast", @@ -5179,7 +5010,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5219,9 +5050,9 @@ dependencies = [ [[package]] 
name = "security-framework" -version = "3.7.0" +version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags", "core-foundation 0.10.1", @@ -5232,9 +5063,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.17.0" +version = "2.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" dependencies = [ "core-foundation-sys", "libc", @@ -5361,7 +5192,7 @@ dependencies = [ "rand 0.9.2", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", "time", "url", "uuid", @@ -5420,7 +5251,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5431,7 +5262,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5455,7 +5286,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5472,9 +5303,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.18.0" +version = "3.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" +checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" dependencies = [ "base64", "chrono", @@ -5482,7 +5313,7 @@ dependencies = [ "indexmap 1.9.3", "indexmap 2.13.0", "schemars 0.9.0", - "schemars 1.2.1", + "schemars 1.2.0", "serde_core", "serde_json", "serde_with_macros", @@ -5491,14 +5322,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.18.0" +version = "3.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" +checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" dependencies = [ - "darling 0.23.0", + "darling 0.21.3", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5591,21 +5422,21 @@ checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "siphasher" -version = "1.0.2" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "sketches-ddsketch" -version = "0.3.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" +checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" [[package]] name = "slab" -version = "0.4.12" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "smallvec" @@ -5634,9 +5465,9 @@ dependencies = [ [[package]] name = "socket2" 
-version = "0.6.3" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" dependencies = [ "libc", "windows-sys 0.60.2", @@ -5658,7 +5489,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" dependencies = [ "base64ct", - "der 0.7.10", + "der", ] [[package]] @@ -5702,7 +5533,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.18", + "thiserror 2.0.17", "tokio", "tokio-stream", "tracing", @@ -5720,7 +5551,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5743,7 +5574,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.117", + "syn 2.0.114", "tokio", "url", ] @@ -5785,7 +5616,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.18", + "thiserror 2.0.17", "tracing", "whoami", ] @@ -5822,7 +5653,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.18", + "thiserror 2.0.17", "tracing", "whoami", ] @@ -5846,7 +5677,7 @@ dependencies = [ "serde", "serde_urlencoded", "sqlx-core", - "thiserror 2.0.18", + "thiserror 2.0.17", "tracing", "url", ] @@ -5880,32 +5711,13 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" -[[package]] -name = "strum" -version = "0.26.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" - [[package]] name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ - "strum_macros 0.27.2", -] - -[[package]] -name = "strum_macros" -version = "0.26.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "rustversion", - "syn 2.0.117", + "strum_macros", ] [[package]] @@ -5917,7 +5729,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5939,9 +5751,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -5965,14 +5777,14 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "sysinfo" -version = "0.38.4" +version = "0.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" +checksum = "1efc19935b4b66baa6f654ac7924c192f55b175c00a7ab72410fc24284dacda8" dependencies = [ "libc", "memchr", @@ -5984,9 +5796,9 @@ dependencies = [ [[package]] name = "system-configuration" -version = "0.7.0" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" dependencies = [ "bitflags", "core-foundation 0.9.4", @@ -6015,28 +5827,17 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" -[[package]] -name = "tar" -version = "0.4.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" -dependencies = [ - "filetime", - "libc", - "xattr", -] - [[package]] name = "tempfile" -version = "3.27.0" +version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.3.4", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -6050,11 +5851,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.18" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.18", + "thiserror-impl 2.0.17", ] [[package]] @@ -6065,18 +5866,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "thiserror-impl" -version = "2.0.18" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6184,9 +5985,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.11.0" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" dependencies = [ "tinyvec_macros", ] @@ -6199,9 +6000,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ "bytes", "libc", @@ -6209,20 +6010,20 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.3", + "socket2 0.6.1", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6243,7 +6044,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "fallible-iterator 0.2.0", + "fallible-iterator", 
"futures-channel", "futures-util", "log", @@ -6298,18 +6099,18 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.1.0+spec-1.1.0" +version = "0.7.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97251a7c317e03ad83774a8752a7e81fb6067740609f75ea2b585b569a59198f" +checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.8+spec-1.1.0" +version = "0.23.10+spec-1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16bff38f1d86c47f9ff0647e6838d7bb362522bdf44006c7068c2b1e606f1f3c" +checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" dependencies = [ "indexmap 2.13.0", "toml_datetime", @@ -6319,18 +6120,18 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.1.0+spec-1.1.0" +version = "1.0.6+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2334f11ee363607eb04df9b8fc8a13ca1715a72ba8662a26ac285c98aabb4011" +checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" dependencies = [ "winnow", ] [[package]] name = "tonic" -version = "0.14.5" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" +checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" dependencies = [ "async-trait", "base64", @@ -6358,21 +6159,21 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.14.5" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1882ac3bf5ef12877d7ed57aad87e75154c11931c2ba7e6cde5e22d63522c734" +checksum = "4c40aaccc9f9eccf2cd82ebc111adc13030d23e887244bc9cfa5d1d636049de3" dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "tonic-prost" -version = "0.14.5" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" +checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" dependencies = [ "bytes", "prost", @@ -6381,16 +6182,16 @@ dependencies = [ [[package]] name = "tonic-prost-build" -version = "0.14.5" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a" +checksum = "b4a16cba4043dc3ff43fcb3f96b4c5c154c64cbd18ca8dce2ab2c6a451d058a2" dependencies = [ "prettyplease", "proc-macro2", "prost-build", "prost-types", "quote", - "syn 2.0.117", + "syn 2.0.114", "tempfile", "tonic-build", ] @@ -6479,7 +6280,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "786d480bce6247ab75f005b14ae1624ad978d3029d9113f0a22fa1ac773faeaf" dependencies = [ "crossbeam-channel", - "thiserror 2.0.18", + "thiserror 2.0.17", "time", "tracing-subscriber", ] @@ -6492,7 +6293,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6528,9 +6329,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.23" +version = "0.3.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" +checksum = 
"2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" dependencies = [ "matchers", "nu-ansi-term", @@ -6576,7 +6377,7 @@ checksum = "3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6614,9 +6415,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.24" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-normalization" @@ -6635,15 +6436,9 @@ checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" -version = "1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" - -[[package]] -name = "unicode-width" -version = "0.2.2" +version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-xid" @@ -6665,26 +6460,26 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.3.0" +version = "3.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" +checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a" dependencies = [ "base64", - "der 0.8.0", + "der", "log", "native-tls", "percent-encoding", "rustls-pki-types", "ureq-proto", - "utf8-zero", + "utf-8", "webpki-root-certs", ] [[package]] name = "ureq-proto" -version = "0.6.0" +version = "0.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" +checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" dependencies = [ "base64", "http 1.4.0", @@ -6706,10 +6501,10 @@ dependencies = [ ] [[package]] -name = "utf8-zero" -version = "0.8.1" +name = "utf-8" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" [[package]] name = "utf8_iter" @@ -6744,7 +6539,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6762,7 +6557,7 @@ dependencies = [ "serde_json", "utoipa", "utoipa-swagger-ui-vendored", - "zip 3.0.0", + "zip", ] [[package]] @@ -6773,11 +6568,11 @@ checksum = "e2eebbbfe4093922c2b6734d7c679ebfebd704a0d7e56dfcb0d05818ce28977d" [[package]] name = "uuid" -version = "1.22.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" dependencies = [ - "getrandom 0.4.2", + "getrandom 0.3.4", "js-sys", "serde_core", "wasm-bindgen", @@ -6828,18 +6623,9 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = 
"1.0.2+wasi-0.2.9" +version = "1.0.1+wasi-0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" -dependencies = [ - "wit-bindgen", -] - -[[package]] -name = "wasip3" -version = "0.4.0+wasi-0.3.0-rc-2026-01-06" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" dependencies = [ "wit-bindgen", ] @@ -6852,9 +6638,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -6865,9 +6651,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.64" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", "futures-util", @@ -6879,9 +6665,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6889,48 +6675,26 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] -[[package]] -name = "wasm-encoder" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" -dependencies = [ - "leb128fmt", - "wasmparser", -] - -[[package]] -name = "wasm-metadata" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" -dependencies = [ - "anyhow", - "indexmap 2.13.0", - "wasm-encoder", - "wasmparser", -] - [[package]] name = "wasm-streams" version = "0.4.2" @@ -6944,23 +6708,11 @@ dependencies = [ "web-sys", ] -[[package]] -name = "wasmparser" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" -dependencies = [ - "bitflags", - "hashbrown 0.15.5", - "indexmap 2.13.0", - "semver", -] - 
[[package]] name = "web-sys" -version = "0.3.91" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -6978,9 +6730,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.6" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" +checksum = "36a29fc0408b113f68cf32637857ab740edfafdf460c326cd2afaa2d84cc05dc" dependencies = [ "rustls-pki-types", ] @@ -6991,14 +6743,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.6", + "webpki-roots 1.0.5", ] [[package]] name = "webpki-roots" -version = "1.0.6" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" dependencies = [ "rustls-pki-types", ] @@ -7036,7 +6788,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] @@ -7098,7 +6850,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -7109,7 +6861,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -7390,100 +7142,18 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "1.0.0" +version = "0.7.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" +checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" dependencies = [ "memchr", ] [[package]] name = "wit-bindgen" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" -dependencies = [ - "wit-bindgen-rust-macro", -] - -[[package]] -name = "wit-bindgen-core" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" -dependencies = [ - "anyhow", - "heck", - "wit-parser", -] - -[[package]] -name = "wit-bindgen-rust" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" -dependencies = [ - "anyhow", - "heck", - "indexmap 2.13.0", - "prettyplease", - "syn 2.0.117", - "wasm-metadata", - "wit-bindgen-core", - "wit-component", -] - -[[package]] -name = "wit-bindgen-rust-macro" -version = "0.51.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" -dependencies = [ - "anyhow", - "prettyplease", - "proc-macro2", - "quote", - "syn 2.0.117", - 
"wit-bindgen-core", - "wit-bindgen-rust", -] - -[[package]] -name = "wit-component" -version = "0.244.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" -dependencies = [ - "anyhow", - "bitflags", - "indexmap 2.13.0", - "log", - "serde", - "serde_derive", - "serde_json", - "wasm-encoder", - "wasm-metadata", - "wasmparser", - "wit-parser", -] - -[[package]] -name = "wit-parser" -version = "0.244.0" +version = "0.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" -dependencies = [ - "anyhow", - "id-arena", - "indexmap 2.13.0", - "log", - "semver", - "serde", - "serde_derive", - "serde_json", - "unicode-xid", - "wasmparser", -] +checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" [[package]] name = "writeable" @@ -7507,34 +7177,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" dependencies = [ "const-oid", - "der 0.7.10", + "der", "spki", ] -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix", -] - -[[package]] -name = "xtask" -version = "0.1.0" -dependencies = [ - "anyhow", - "clap", - "k8s-openapi", - "kube", - "schemars 0.8.22", - "serde", - "serde_json", - "tokio", -] - [[package]] name = "yaml-rust2" version = "0.8.1" @@ -7565,7 +7211,7 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "synstructure", ] @@ -7588,7 +7234,7 @@ dependencies = [ "seahash", "serde", "serde_json", - "thiserror 2.0.18", + "thiserror 2.0.17", "time", "tokio", "url", @@ -7596,22 +7242,22 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.47" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" +checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.47" +version = "0.8.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" +checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -7631,7 +7277,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "synstructure", ] @@ -7671,7 +7317,7 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -7688,31 +7334,17 @@ dependencies = [ "zopfli", ] -[[package]] -name = "zip" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b" -dependencies = [ - "arbitrary", - "crc32fast", - "flate2", - "indexmap 2.13.0", - "memchr", - "zopfli", -] - [[package]] name = "zlib-rs" -version = "0.6.3" +version = "0.5.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" +checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.21" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" [[package]] name = "zopfli" diff --git a/Cargo.toml b/Cargo.toml index 357678954..2637972a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,7 @@ base64 = { version = "0.22.1", default-features = false } byteorder = { version = "1.5.0", default-features = false } bytes = { version = "1.10.1" } chrono = { version = "0.4.41", default-features = false } +clickhouse = { version = "0.14", default-features = false } clap = { version = "4.5.42", default-features = false } config = { version = "0.14", default-features = false } configcat = { version = "0.1.3", default-features = false } diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml index 5f8b284f7..506688cc1 100644 --- a/etl-destinations/Cargo.toml +++ b/etl-destinations/Cargo.toml @@ -44,6 +44,13 @@ iceberg = [ "dep:serde", "dep:serde_json", ] +clickhouse = [ + "dep:clickhouse", + "dep:metrics", + "dep:tracing", + "dep:tokio", + "dep:serde", +] egress = ["etl/egress"] # We assume that `test-utils` is always used in conjunction with `bigquery` or `iceberg` thus we only # put here the extra dependencies needed. @@ -56,6 +63,7 @@ arrow = { workspace = true, optional = true } async-trait = { workspace = true, optional = true } base64 = { workspace = true, optional = true } chrono = { workspace = true } +clickhouse = { workspace = true, optional = true, features = ["inserter", "rustls-tls"] } duckdb = { workspace = true, optional = true, features = ["bundled", "r2d2"] } gcp-bigquery-client = { workspace = true, optional = true, features = ["rust-tls", "aws-lc-rs"] } iceberg = { workspace = true, optional = true } diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs new file mode 100644 index 000000000..ba106f358 --- /dev/null +++ b/etl-destinations/src/clickhouse/core.rs @@ -0,0 +1,992 @@ +use std::{ + collections::{HashMap, HashSet}, + fmt, + sync::{Arc, RwLock}, +}; + +use crate::clickhouse::metrics::{ + ETL_CH_DDL_DURATION_SECONDS, ETL_CH_INSERT_DURATION_SECONDS, register_metrics, +}; +use crate::clickhouse::schema::{build_create_table_sql, table_name_to_clickhouse_table_name}; +use chrono::NaiveDate; +use clickhouse::Client; +use etl::error::{ErrorKind, EtlResult}; +use etl::store::schema::SchemaStore; +use etl::store::state::StateStore; +use etl::types::{ArrayCell, Cell, Event, TableId, TableRow}; +use etl::{bail, etl_error}; +use etl::{destination::Destination, types::PgLsn}; +use std::time::Instant; +use tokio::task::JoinSet; +use tracing::{debug, info}; + +// ── CDC operation type ──────────────────────────────────────────────────────── + +#[derive(Copy, Clone)] +enum CdcOperation { + Insert, + Update, + Delete, +} + +impl CdcOperation { + fn as_str(self) -> &'static str { + match self { + CdcOperation::Insert => "INSERT", + CdcOperation::Update => "UPDATE", + CdcOperation::Delete => "DELETE", + } + } +} + +/// A single row pending insertion, carrying the CDC metadata alongside the cell data. 
+struct PendingRow { + operation: CdcOperation, + lsn: PgLsn, + cells: Vec<Cell>, +} + +// ── Unix epoch constant for Date conversion ─────────────────────────────────── + +fn unix_epoch() -> NaiveDate { + NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid date") +} + +// ── Inserter configuration ──────────────────────────────────────────────────── + +/// Controls intermediate flushing inside a single `write_table_rows` / `write_events` call. +/// +/// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is called; +/// these limits prevent unbounded memory use for very large batches (e.g. initial copy). +pub struct ClickHouseInserterConfig { + /// Start a new INSERT after this many rows (default: 100_000). + pub max_rows_per_insert: u64, + /// Start a new INSERT after this many uncompressed bytes. + /// + /// Derive this from `BatchConfig::memory_budget_ratio × total_memory / max_table_sync_workers` + /// (the same formula used by `BatchBudget::ideal_batch_size_bytes`). + pub max_bytes_per_insert: u64, +} + +impl Default for ClickHouseInserterConfig { + fn default() -> Self { + Self { + max_rows_per_insert: 100_000, + max_bytes_per_insert: 256 * 1024 * 1024, // 256 MiB + } + } +} + +// ── ClickHouseValue ─────────────────────────────────────────────────────────── + +/// Owned ClickHouse-compatible value, moved (not cloned) from a [`Cell`]. +enum ClickHouseValue { + Null, + Bool(bool), + Int16(i16), + Int32(i32), + Int64(i64), + UInt32(u32), + Float32(f32), + Float64(f64), + /// TEXT, NUMERIC (string), TIME (string), JSON, BYTEA (hex-encoded) + String(String), + /// Days since Unix epoch (ClickHouse `Date` on wire = UInt16 LE) + Date(u16), + /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire = Int64 LE) + DateTime64(i64), + /// UUID in standard 16-byte big-endian order (converted to ClickHouse wire format on encode) + Uuid([u8; 16]), + Array(Vec<ClickHouseValue>), +} + +// ── RowBinary encoding ────────────────────────────────────────────────────────── +// +// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes directly +// via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT RowBinary")`. +// +// This avoids two fatal issues with the `Inserter` path: +// +// 1. `Insert::new` always calls `join_column_names::<T>().expect(…)`, which panics +// when `COLUMN_NAMES = &[]` regardless of whether validation is enabled. +// +// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut` at +// every `serialize_some` call, telescoping the type to `&mut &mut … BytesMut` for +// nullable array elements and overflowing the compiler's recursion limit. +// +// Direct binary encoding has neither problem: it is a simple recursive function that +// writes bytes to a `Vec<u8>` with no generics and no type-level recursion. + +/// Encodes a variable-length integer (LEB128) used by ClickHouse for string/array lengths. +fn rb_varint(mut v: usize, buf: &mut Vec<u8>) { + loop { + let byte = (v & 0x7f) as u8; + v >>= 7; + if v == 0 { + buf.push(byte); + return; + } + buf.push(byte | 0x80); + } +} + +/// Encodes a value for a `Nullable(T)` column (1-byte null indicator + value if present). +fn rb_encode_nullable(val: &ClickHouseValue, buf: &mut Vec<u8>) { + match val { + ClickHouseValue::Null => buf.push(1), + v => { + buf.push(0); + rb_encode_value(v, buf); + } + } +} + +/// Encodes a value for a non-nullable column (no null indicator byte).
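+/// +/// A worked sketch of the resulting byte layout (the values below are illustrative, +/// not taken from the patch): +/// +/// ```ignore +/// let mut buf = Vec::new(); +/// rb_encode_value(&ClickHouseValue::Int32(7), &mut buf); +/// assert_eq!(buf, [0x07, 0x00, 0x00, 0x00]); // fixed-width ints are little-endian +/// rb_encode_value(&ClickHouseValue::String("ab".into()), &mut buf); +/// assert_eq!(&buf[4..], [0x02, b'a', b'b']); // varint length, then raw UTF-8 bytes +/// ```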
+fn rb_encode_value(val: &ClickHouseValue, buf: &mut Vec<u8>) { + match val { + ClickHouseValue::Null => { + // A non-nullable column unexpectedly received NULL (a data quality issue on + // the Postgres side). Write a zero-length string as the least-harmful + // fallback. Note this only keeps the stream aligned for string-typed + // columns; a fixed-width column would desync, but non-nullable NULLs + // should never occur for a correctly replicated schema. + buf.push(0); // varint 0 = empty string + } + ClickHouseValue::Bool(b) => buf.push(*b as u8), + ClickHouseValue::Int16(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::Int32(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::Int64(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::UInt32(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::Float32(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::Float64(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::String(s) => { + rb_varint(s.len(), buf); + buf.extend_from_slice(s.as_bytes()); + } + ClickHouseValue::Date(days) => buf.extend_from_slice(&days.to_le_bytes()), + ClickHouseValue::DateTime64(micros) => buf.extend_from_slice(&micros.to_le_bytes()), + ClickHouseValue::Uuid(bytes) => { + // ClickHouse RowBinary UUID = two little-endian u64 (high bits then low bits). + // Our bytes are in standard UUID big-endian order, so we split into two u64 + // and write each in little-endian. + let high = u64::from_be_bytes(bytes[0..8].try_into().unwrap()); + let low = u64::from_be_bytes(bytes[8..16].try_into().unwrap()); + buf.extend_from_slice(&high.to_le_bytes()); + buf.extend_from_slice(&low.to_le_bytes()); + } + // Array elements are always Nullable in ClickHouse: Array(Nullable(T)). + ClickHouseValue::Array(items) => { + rb_varint(items.len(), buf); + for item in items { + rb_encode_nullable(item, buf); + } + } + } +} + +/// Encodes a complete row into `buf`, selecting nullable vs non-nullable encoding per column. +fn rb_encode_row(values: &[ClickHouseValue], nullable_flags: &[bool], buf: &mut Vec<u8>) { + for (val, &is_nullable) in values.iter().zip(nullable_flags.iter()) { + if is_nullable { + rb_encode_nullable(val, buf); + } else { + rb_encode_value(val, buf); + } + } +} + +// ── Cell → ClickHouseValue conversion ──────────────────────────────────────── + +/// Converts a [`Cell`] to a [`ClickHouseValue`], consuming it (no clone).
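+/// +/// Worked examples of the mapping (illustrative, derived from the arms below): +/// `Cell::I32(5)` → `Int32(5)`; `Cell::Date(1970-01-02)` → `Date(1)` (days since the +/// Unix epoch); `Cell::TimestampTz(1970-01-01T00:00:01Z)` → `DateTime64(1_000_000)` +/// (microseconds); `Cell::Bytes(vec![0xde, 0xad])` → `String("dead")` (lowercase hex).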
+fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue { + match cell { + Cell::Null => ClickHouseValue::Null, + Cell::Bool(b) => ClickHouseValue::Bool(b), + Cell::I16(v) => ClickHouseValue::Int16(v), + Cell::I32(v) => ClickHouseValue::Int32(v), + Cell::I64(v) => ClickHouseValue::Int64(v), + Cell::U32(v) => ClickHouseValue::UInt32(v), + Cell::F32(v) => ClickHouseValue::Float32(v), + Cell::F64(v) => ClickHouseValue::Float64(v), + Cell::Numeric(n) => ClickHouseValue::String(n.to_string()), + Cell::Date(d) => { + let days = d + .signed_duration_since(unix_epoch()) + .num_days() + .clamp(0, i64::from(u16::MAX)) as u16; + ClickHouseValue::Date(days) + } + Cell::Time(t) => ClickHouseValue::String(t.to_string()), + Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()), + Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()), + Cell::Uuid(u) => ClickHouseValue::Uuid(*u.as_bytes()), + Cell::Json(j) => ClickHouseValue::String(j.to_string()), + Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(&b)), + Cell::String(s) => ClickHouseValue::String(s), + Cell::Array(array_cell) => { + ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell)) + } + } +} + +fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> Vec<ClickHouseValue> { + match array_cell { + ArrayCell::Bool(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Bool)) + .collect(), + ArrayCell::String(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::String)) + .collect(), + ArrayCell::I16(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int16)) + .collect(), + ArrayCell::I32(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int32)) + .collect(), + ArrayCell::I64(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int64)) + .collect(), + ArrayCell::U32(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::UInt32)) + .collect(), + ArrayCell::F32(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float32)) + .collect(), + ArrayCell::F64(v) => v + .into_iter() + .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float64)) + .collect(), + ArrayCell::Numeric(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |n| { + ClickHouseValue::String(n.to_string()) + }) + }) + .collect(), + ArrayCell::Date(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |d| { + let days = d + .signed_duration_since(unix_epoch()) + .num_days() + .clamp(0, i64::from(u16::MAX)) as u16; + ClickHouseValue::Date(days) + }) + }) + .collect(), + ArrayCell::Time(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |t| { + ClickHouseValue::String(t.to_string()) + }) + }) + .collect(), + ArrayCell::Timestamp(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |dt| { + ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()) + }) + }) + .collect(), + ArrayCell::TimestampTz(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |dt| { + ClickHouseValue::DateTime64(dt.timestamp_micros()) + }) + }) + .collect(), + ArrayCell::Uuid(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |u| { + ClickHouseValue::Uuid(*u.as_bytes()) + }) + }) + .collect(), + ArrayCell::Json(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |j| { + ClickHouseValue::String(j.to_string()) + }) + })
.collect(), + ArrayCell::Bytes(v) => v + .into_iter() + .map(|o| { + o.map_or(ClickHouseValue::Null, |b| { + ClickHouseValue::String(bytes_to_hex(&b)) + }) + }) + .collect(), + } +} + +fn bytes_to_hex(bytes: &[u8]) -> String { + let mut s = String::with_capacity(bytes.len() * 2); + for b in bytes { + use fmt::Write; + let _ = write!(s, "{b:02x}"); + } + s +} + +// ── Destination struct ──────────────────────────────────────────────────────── + +/// CDC-capable ClickHouse destination that replicates Postgres tables. +/// +/// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`, `cdc_lsn`) +/// appended to each row. Rows are encoded as RowBinary and sent via +/// `INSERT INTO "table" FORMAT RowBinary` — no column-name header required. +/// +/// The struct is cheaply cloneable: `client` has an internal `Arc`, and `table_cache` +/// is wrapped in `Arc<RwLock<…>>`. +#[derive(Clone)] +pub struct ClickHouseDestination<S> { + client: Client, + inserter_config: Arc<ClickHouseInserterConfig>, + store: S, + /// Cache: ClickHouse table name → `Arc<[bool]>` (nullable flags per column, + /// including the two trailing CDC columns which are always `false`). + /// + /// `std::sync::RwLock` is appropriate here: both reads (hot path) and writes (rare, + /// only on first encounter of a new table) are brief in-memory operations. The lock + /// is always released before any `.await` point (DDL is executed with no lock held), + /// so the async `tokio::sync::RwLock` would be unnecessary overhead. + table_cache: Arc<RwLock<HashMap<String, Arc<[bool]>>>>, +} + +impl<S> ClickHouseDestination<S> +where + S: StateStore + SchemaStore + Send + Sync, +{ + /// Creates a new `ClickHouseDestination`. + /// + /// When using an `https://` URL, TLS is handled automatically by the `rustls-tls` + /// feature using webpki root certificates. + pub fn new( + url: impl Into<String>, + user: impl Into<String>, + password: Option<String>, + database: impl Into<String>, + inserter_config: ClickHouseInserterConfig, + store: S, + ) -> EtlResult<Self> { + register_metrics(); + let client = build_client(url.into(), user.into(), password, database.into()); + Ok(Self { + client, + inserter_config: Arc::new(inserter_config), + store, + table_cache: Arc::new(RwLock::new(HashMap::new())), + }) + } + + /// Creates a new `ClickHouseDestination` with TLS using a custom CA certificate. + /// + /// Note: The `clickhouse` crate v0.14 does not expose a public API for configuring + /// custom CA certificates. For HTTPS with standard TLS (webpki roots), use + /// [`ClickHouseDestination::new`] with an `https://` URL. Custom CA certificate + /// support is planned for a future update when the crate exposes the necessary API. + #[allow(dead_code)] + pub fn new_with_tls( + _url: impl Into<String>, + _user: impl Into<String>, + _password: Option<String>, + _database: impl Into<String>, + _ca_cert_pem: String, + _inserter_config: ClickHouseInserterConfig, + _store: S, + ) -> EtlResult<Self> { + bail!( + ErrorKind::Unknown, + "Custom CA certificates not supported", + "The clickhouse crate v0.14 does not expose an API for custom CA certificates. \ Use ClickHouseDestination::new() with an https:// URL for standard TLS \ (webpki root certificates are used for server verification)." + ) + } + + /// Ensures the ClickHouse table for `table_id` exists, returning + /// `(ch_table_name, nullable_flags)`. + /// + /// Uses a two-phase locking strategy: + /// 1. Fast-path read (no await) → return cached entry if present. + /// 2. Slow-path: compute DDL, run `CREATE TABLE IF NOT EXISTS` (await, no lock held), + /// then write-lock to insert (using `or_insert` for the concurrent first-writer race).
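+ /// + /// If two tasks miss the cache at the same time, both execute the idempotent + /// `CREATE TABLE IF NOT EXISTS`; the write lock then arbitrates, so every caller + /// ends up holding the same cached `Arc<[bool]>` for the table.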
+    async fn ensure_table_exists(&self, table_id: TableId) -> EtlResult<(String, Arc<[bool]>)> {
+        // 1. Get table schema from store.
+        let table_schema = self
+            .store
+            .get_table_schema(&table_id)
+            .await?
+            .ok_or_else(|| {
+                etl_error!(
+                    ErrorKind::MissingTableSchema,
+                    "Table schema not found",
+                    format!("No schema found for table {table_id}")
+                )
+            })?;
+
+        // 2. Determine / persist ClickHouse table name.
+        let ch_table_name = {
+            if let Some(name) = self.store.get_table_mapping(&table_id).await? {
+                name
+            } else {
+                let name = table_name_to_clickhouse_table_name(
+                    &table_schema.name.schema,
+                    &table_schema.name.name,
+                );
+                self.store
+                    .store_table_mapping(table_id, name.clone())
+                    .await?;
+                name
+            }
+        };
+
+        // 3. Fast-path cache check (no await).
+        {
+            let guard = self.table_cache.read().unwrap();
+            if let Some(flags) = guard.get(&ch_table_name) {
+                return Ok((ch_table_name, Arc::clone(flags)));
+            }
+        }
+
+        // 4. Compute nullable flags (user columns + 2 CDC columns always non-nullable).
+        let column_schemas = &table_schema.column_schemas;
+        let mut nullable_flags_vec: Vec<bool> = column_schemas.iter().map(|c| c.nullable).collect();
+        nullable_flags_vec.push(false); // cdc_operation
+        nullable_flags_vec.push(false); // cdc_lsn
+        let nullable_flags: Arc<[bool]> = nullable_flags_vec.into();
+
+        // 5. Build and execute DDL (no lock held during this await).
+        let ddl = build_create_table_sql(&ch_table_name, column_schemas);
+        let ddl_start = Instant::now();
+        self.client.query(&ddl).execute().await.map_err(|e| {
+            etl_error!(
+                ErrorKind::Unknown,
+                "ClickHouse DDL failed",
+                format!("Failed to create table '{ch_table_name}': {e}")
+            )
+        })?;
+        metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone())
+            .record(ddl_start.elapsed().as_secs_f64());
+
+        // 6. Write-lock: insert, using or_insert to handle concurrent first-writer race.
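+        // If another task populated the entry while the DDL above was awaited, keep
+        // its value: both tasks derived the flags from the same stored schema, so the
+        // contents are identical either way.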
+        let stored_flags = {
+            let mut guard = self.table_cache.write().unwrap();
+            Arc::clone(
+                guard
+                    .entry(ch_table_name.clone())
+                    .or_insert_with(|| Arc::clone(&nullable_flags)),
+            )
+        };
+
+        Ok((ch_table_name, stored_flags))
+    }
+
+    async fn truncate_table_inner(&self, table_id: TableId) -> EtlResult<()> {
+        let (ch_table_name, _) = self.ensure_table_exists(table_id).await?;
+        self.client
+            .query(&format!("TRUNCATE TABLE IF EXISTS \"{ch_table_name}\""))
+            .execute()
+            .await
+            .map_err(|e| {
+                etl_error!(
+                    ErrorKind::Unknown,
+                    "ClickHouse truncate failed",
+                    format!("Failed to truncate table '{ch_table_name}': {e}")
+                )
+            })
+    }
+
+    async fn write_table_rows_inner(
+        &self,
+        table_id: TableId,
+        table_rows: Vec<TableRow>,
+    ) -> EtlResult<()> {
+        let (ch_table_name, nullable_flags) = self.ensure_table_exists(table_id).await?;
+        let sql = format!("INSERT INTO \"{ch_table_name}\" FORMAT RowBinary");
+        let max_rows = self.inserter_config.max_rows_per_insert;
+        let max_bytes = self.inserter_config.max_bytes_per_insert;
+
+        let mut insert = self
+            .client
+            .insert_formatted_with(sql.clone())
+            .buffered_with_capacity(256 * 1024);
+        let mut rows = 0u64;
+        let mut bytes = 0u64;
+        let mut row_buf = Vec::new();
+        let mut insert_start = Instant::now();
+
+        for table_row in table_rows {
+            row_buf.clear();
+            let mut values: Vec<ClickHouseValue> = table_row
+                .into_values()
+                .into_iter()
+                .map(cell_to_clickhouse_value)
+                .collect();
+            values.push(ClickHouseValue::String("INSERT".to_string()));
+            values.push(ClickHouseValue::Int64(0));
+            rb_encode_row(&values, &nullable_flags, &mut row_buf);
+
+            insert.write_buffered(&row_buf);
+            rows += 1;
+            bytes += row_buf.len() as u64;
+
+            if rows >= max_rows || bytes >= max_bytes {
+                insert.end().await.map_err(|e| {
+                    etl_error!(
+                        ErrorKind::Unknown,
+                        "ClickHouse insert flush failed",
+                        format!("Failed to flush INSERT for '{ch_table_name}': {e}")
+                    )
+                })?;
+                metrics::histogram!(
+                    ETL_CH_INSERT_DURATION_SECONDS,
+                    "table" => ch_table_name.clone(),
+                    "source" => "copy"
+                )
+                .record(insert_start.elapsed().as_secs_f64());
+                insert = self
+                    .client
+                    .insert_formatted_with(sql.clone())
+                    .buffered_with_capacity(256 * 1024);
+                insert_start = Instant::now();
+                rows = 0;
+                bytes = 0;
+            }
+        }
+
+        insert.end().await.map_err(|e| {
+            etl_error!(
+                ErrorKind::Unknown,
+                "ClickHouse insert flush failed",
+                format!("Failed to flush INSERT for '{ch_table_name}': {e}")
+            )
+        })?;
+        metrics::histogram!(
+            ETL_CH_INSERT_DURATION_SECONDS,
+            "table" => ch_table_name.clone(),
+            "source" => "copy"
+        )
+        .record(insert_start.elapsed().as_secs_f64());
+        Ok(())
+    }
+
+    /// Processes events in passes driven by an outer loop that runs until the iterator
+    /// is exhausted. Each pass:
+    /// 1. Accumulates Insert/Update/Delete rows per table until a Truncate (or end).
+    /// 2. Writes those rows concurrently.
+    /// 3. Drains consecutive Truncate events (deduplicated) and executes them.
+    ///
+    /// Breaking at a Truncate never skips events — the outer loop resumes from that
+    /// position, so rows accumulated before the Truncate are flushed first, then the
+    /// Truncate fires, then subsequent events (including inserts on the same table)
+    /// are processed in the next pass.
+    async fn write_events_inner(&self, events: Vec<Event>) -> EtlResult<()> {
+        let mut event_iter = events.into_iter().peekable();
+
+        while event_iter.peek().is_some() {
+            // Accumulate non-truncate events grouped by table_id.
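+            // Worked example (hypothetical stream): for
+            // [Insert(t1), Update(t1), Truncate(t1), Insert(t1)] this pass flushes
+            // the insert and update, then truncates t1; the trailing insert is
+            // written by the next pass, so it survives the truncate.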
+            let mut table_id_to_rows: HashMap<TableId, Vec<PendingRow>> = HashMap::new();
+
+            while let Some(event) = event_iter.peek() {
+                if matches!(event, Event::Truncate(_)) {
+                    break;
+                }
+
+                let event = event_iter.next().unwrap();
+                match event {
+                    Event::Insert(insert) => {
+                        table_id_to_rows
+                            .entry(insert.table_id)
+                            .or_default()
+                            .push(PendingRow {
+                                operation: CdcOperation::Insert,
+                                lsn: insert.commit_lsn,
+                                cells: insert.table_row.into_values(),
+                            });
+                    }
+                    Event::Update(update) => {
+                        table_id_to_rows
+                            .entry(update.table_id)
+                            .or_default()
+                            .push(PendingRow {
+                                operation: CdcOperation::Update,
+                                lsn: update.commit_lsn,
+                                cells: update.table_row.into_values(),
+                            });
+                    }
+                    Event::Delete(delete) => {
+                        let Some((_, old_row)) = delete.old_table_row else {
+                            info!("delete event has no row data, skipping");
+                            continue;
+                        };
+                        table_id_to_rows
+                            .entry(delete.table_id)
+                            .or_default()
+                            .push(PendingRow {
+                                operation: CdcOperation::Delete,
+                                lsn: delete.commit_lsn,
+                                cells: old_row.into_values(),
+                            });
+                    }
+                    event => {
+                        debug!(
+                            event_type = %event.event_type(),
+                            "skipping unsupported event type"
+                        );
+                    }
+                }
+            }
+
+            // Write accumulated rows concurrently, one JoinSet task per table.
+            if !table_id_to_rows.is_empty() {
+                // Phase 1: ensure all tables exist (must happen outside JoinSet spawns
+                // since ensure_table_exists borrows &self, which is not 'static).
+                let mut table_meta: HashMap<TableId, (String, Arc<[bool]>)> = HashMap::new();
+                for &table_id in table_id_to_rows.keys() {
+                    let (name, flags) = self.ensure_table_exists(table_id).await?;
+                    table_meta.insert(table_id, (name, flags));
+                }
+
+                // Phase 2: spawn concurrent writers with pre-resolved metadata.
+                // Only the ClickHouse Client (cheaply cloneable, 'static) goes into spawn.
+                let mut join_set: JoinSet<EtlResult<()>> = JoinSet::new();
+                for (table_id, row_data) in table_id_to_rows {
+                    let (ch_table_name, nullable_flags) = table_meta.remove(&table_id).unwrap();
+                    let client = self.client.clone();
+                    let max_rows = self.inserter_config.max_rows_per_insert;
+                    let max_bytes = self.inserter_config.max_bytes_per_insert;
+
+                    join_set.spawn(async move {
+                        let sql = format!("INSERT INTO \"{ch_table_name}\" FORMAT RowBinary");
+                        let mut insert = client
+                            .insert_formatted_with(sql.clone())
+                            .buffered_with_capacity(256 * 1024);
+                        let mut rows = 0u64;
+                        let mut bytes = 0u64;
+                        let mut row_buf = Vec::new();
+                        let mut insert_start = Instant::now();
+
+                        for PendingRow {
+                            operation,
+                            lsn,
+                            cells,
+                        } in row_data
+                        {
+                            row_buf.clear();
+                            let mut values: Vec<ClickHouseValue> =
+                                cells.into_iter().map(cell_to_clickhouse_value).collect();
+                            values.push(ClickHouseValue::String(operation.as_str().to_string()));
+                            values.push(ClickHouseValue::Int64(
+                                i64::try_from(u64::from(lsn))
+                                    .inspect_err(|error| {
+                                        tracing::error!(
+                                            ?error,
+                                            "cannot convert u64 LSN to i64 for the clickhouse destination, falling back to i64::MAX"
+                                        );
+                                    })
+                                    .unwrap_or(i64::MAX),
+                            ));
+                            rb_encode_row(&values, &nullable_flags, &mut row_buf);
+
+                            insert.write_buffered(&row_buf);
+                            rows += 1;
+                            bytes += row_buf.len() as u64;
+
+                            if rows >= max_rows || bytes >= max_bytes {
+                                insert.end().await.map_err(|e| {
+                                    etl_error!(
+                                        ErrorKind::Unknown,
+                                        "ClickHouse insert flush failed",
+                                        format!(
+                                            "Failed to flush INSERT for '{ch_table_name}': {e}"
+                                        )
+                                    )
+                                })?;
+                                metrics::histogram!(
+                                    ETL_CH_INSERT_DURATION_SECONDS,
+                                    "table" => ch_table_name.clone(),
+                                    "source" => "streaming"
+                                )
+                                .record(insert_start.elapsed().as_secs_f64());
+                                insert = client
+                                    .insert_formatted_with(sql.clone())
+                                    .buffered_with_capacity(256 * 1024);
+                                insert_start = Instant::now();
+                                rows = 0;
+                                bytes = 0;
+                            }
+                        }
+
+                        insert.end().await.map_err(|e| {
+                            etl_error!(
+                                ErrorKind::Unknown,
+                                "ClickHouse insert flush failed",
+                                format!("Failed to flush INSERT for '{ch_table_name}': {e}")
+                            )
+                        })?;
+                        metrics::histogram!(
+                            ETL_CH_INSERT_DURATION_SECONDS,
+                            "table" => ch_table_name.clone(),
+                            "source" => "streaming"
+                        )
+                        .record(insert_start.elapsed().as_secs_f64());
+
+                        Ok(())
+                    });
+                }
+
+                while let Some(result) = join_set.join_next().await {
+                    result
+                        .map_err(|_| etl_error!(ErrorKind::Unknown, "Failed to join future"))??;
+                }
+            }
+
+            // Collect and deduplicate truncate events.
+            let mut truncate_table_ids = HashSet::new();
+            while let Some(Event::Truncate(_)) = event_iter.peek() {
+                if let Some(Event::Truncate(truncate_event)) = event_iter.next() {
+                    for table_id in truncate_event.rel_ids {
+                        truncate_table_ids.insert(TableId::new(table_id));
+                    }
+                }
+            }
+
+            for table_id in truncate_table_ids {
+                self.truncate_table_inner(table_id).await?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+impl<S> Destination for ClickHouseDestination<S>
+where
+    S: StateStore + SchemaStore + Send + Sync,
+{
+    fn name() -> &'static str {
+        "clickhouse"
+    }
+
+    async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> {
+        self.truncate_table_inner(table_id).await
+    }
+
+    async fn write_table_rows(
+        &self,
+        table_id: TableId,
+        table_rows: Vec<TableRow>,
+    ) -> EtlResult<()> {
+        self.write_table_rows_inner(table_id, table_rows).await
+    }
+
+    async fn write_events(&self, events: Vec<Event>) -> EtlResult<()> {
+        self.write_events_inner(events).await
+    }
+}
+
+// ── Client builder ────────────────────────────────────────────────────────────
+
+fn build_client(url: String, user: String, password: Option<String>, database: String) -> Client {
+    let mut client = Client::default()
+        .with_url(url)
+        .with_user(user)
+        .with_database(database);
+
+    if let Some(pw) = password {
+        client = client.with_password(pw);
+    }
+
+    client
+}
+
+// ── Unit tests ────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::NaiveDate;
+    use uuid::Uuid;
+
+    #[test]
+    fn test_cell_to_clickhouse_value_null() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Null),
+            ClickHouseValue::Null
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_bool() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Bool(true)),
+            ClickHouseValue::Bool(true)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_i32() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::I32(42)),
+            ClickHouseValue::Int32(42)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_string() {
+        if let ClickHouseValue::String(s) =
+            cell_to_clickhouse_value(Cell::String("hello".to_string()))
+        {
+            assert_eq!(s, "hello");
+        } else {
+            panic!("expected String variant");
+        }
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_date() {
+        // 1970-01-01 = day 0
+        let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(epoch)),
+            ClickHouseValue::Date(0)
+        ));
+
+        // 1970-01-02 = day 1
+        let day1 = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(day1)),
+            ClickHouseValue::Date(1)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_timestamp() {
+        let epoch = chrono::DateTime::from_timestamp(0, 0).unwrap().naive_utc();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Timestamp(epoch)),
+            ClickHouseValue::DateTime64(0)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_uuid() {
+        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
+        let expected_bytes = *u.as_bytes();
+        if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) {
+            assert_eq!(bytes, expected_bytes);
+        } else {
+            panic!("expected Uuid variant");
+        }
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_bytes_hex() {
+        let bytes = vec![0xde, 0xad, 0xbe, 0xef];
+        if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)) {
+            assert_eq!(s, "deadbeef");
+        } else {
+            panic!("expected String variant");
+        }
+    }
+
+    #[test]
+    fn test_rb_encode_value_scalars() {
+        let mut buf = Vec::new();
+
+        buf.clear();
+        rb_encode_value(&ClickHouseValue::Bool(true), &mut buf);
+        assert_eq!(buf, [1u8]);
+
+        buf.clear();
+        rb_encode_value(&ClickHouseValue::Int32(-1), &mut buf);
+        assert_eq!(buf, (-1i32).to_le_bytes());
+
+        buf.clear();
+        rb_encode_value(&ClickHouseValue::String("hi".to_string()), &mut buf);
+        assert_eq!(buf, [2, b'h', b'i']); // varint(2) + bytes
+
+        buf.clear();
+        rb_encode_value(&ClickHouseValue::Date(1), &mut buf);
+        assert_eq!(buf, 1u16.to_le_bytes());
+    }
+
+    #[test]
+    fn test_rb_encode_uuid_wire_format() {
+        // UUID 550e8400-e29b-41d4-a716-446655440000
+        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
+        let val = ClickHouseValue::Uuid(*u.as_bytes());
+        let mut buf = Vec::new();
+        rb_encode_value(&val, &mut buf);
+
+        assert_eq!(buf.len(), 16);
+        // high u64 from bytes 0-7, written LE
+        let bytes = u.as_bytes();
+        let high = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
+        let low = u64::from_be_bytes(bytes[8..16].try_into().unwrap());
+        let mut expected = high.to_le_bytes().to_vec();
+        expected.extend_from_slice(&low.to_le_bytes());
+        assert_eq!(buf, expected);
+    }
+
+    #[test]
+    fn test_rb_encode_nullable() {
+        let mut buf = Vec::new();
+
+        // Null → just the 1 byte
+        rb_encode_nullable(&ClickHouseValue::Null, &mut buf);
+        assert_eq!(buf, [1u8]);
+
+        buf.clear();
+        rb_encode_nullable(&ClickHouseValue::Int32(42), &mut buf);
+        let mut expected = vec![0u8]; // not-null indicator
+        expected.extend_from_slice(&42i32.to_le_bytes());
+        assert_eq!(buf, expected);
+    }
+
+    #[test]
+    fn test_rb_varint() {
+        let mut buf = Vec::new();
+        rb_varint(0, &mut buf);
+        assert_eq!(buf, [0x00]);
+
+        buf.clear();
+        rb_varint(127, &mut buf);
+        assert_eq!(buf, [0x7f]);
+
+        buf.clear();
+        rb_varint(128, &mut buf);
+        assert_eq!(buf, [0x80, 0x01]);
+
+        buf.clear();
+        rb_varint(300, &mut buf);
+        assert_eq!(buf, [0xac, 0x02]); // 300 = 0b100101100 → [0x2c | 0x80, 0x02]
+    }
+
+    #[test]
+    fn test_nullable_flags_includes_cdc() {
+        let mut all_flags: Vec<bool> = vec![true, false];
+        all_flags.push(false); // cdc_operation
+        all_flags.push(false); // cdc_lsn
+
+        assert_eq!(all_flags.len(), 4);
+        assert!(all_flags[0]);
+        assert!(!all_flags[1]);
+        assert!(!all_flags[2]);
+        assert!(!all_flags[3]);
+    }
+
+    #[test]
+    fn test_bytes_to_hex() {
+        assert_eq!(bytes_to_hex(&[]), "");
+        assert_eq!(bytes_to_hex(&[0x00]), "00");
+        assert_eq!(bytes_to_hex(&[0xff]), "ff");
+        assert_eq!(bytes_to_hex(&[0xde, 0xad, 0xbe, 0xef]), "deadbeef");
+    }
+}
diff --git a/etl-destinations/src/clickhouse/metrics.rs b/etl-destinations/src/clickhouse/metrics.rs
new file mode 100644
index 000000000..6b12a1bf3
--- /dev/null
+++ b/etl-destinations/src/clickhouse/metrics.rs
@@ -0,0 +1,32 @@
+use std::sync::Once;
+
+use metrics::{Unit, describe_histogram};
+
+static REGISTER_METRICS: Once = Once::new();
+
+/// Duration of `CREATE TABLE IF NOT EXISTS` DDL operations sent to ClickHouse.
+/// Labels: `table`.
+pub const ETL_CH_DDL_DURATION_SECONDS: &str = "etl_ch_ddl_duration_seconds";
+
+/// Duration of a single RowBinary INSERT statement from first write to server acknowledgement.
+/// Labels: `table`, `source` (`copy` = initial table sync, `streaming` = CDC events).
+pub const ETL_CH_INSERT_DURATION_SECONDS: &str = "etl_ch_insert_duration_seconds";
+
+/// Register ClickHouse-specific metrics.
+///
+/// Safe to call multiple times — registration happens only once.
+pub fn register_metrics() {
+    REGISTER_METRICS.call_once(|| {
+        describe_histogram!(
+            ETL_CH_DDL_DURATION_SECONDS,
+            Unit::Seconds,
+            "Duration of CREATE TABLE IF NOT EXISTS DDL operations sent to ClickHouse, labeled by table"
+        );
+
+        describe_histogram!(
+            ETL_CH_INSERT_DURATION_SECONDS,
+            Unit::Seconds,
+            "Duration of RowBinary INSERT statements from first write to server acknowledgement, labeled by table and source"
+        );
+    });
+}
diff --git a/etl-destinations/src/clickhouse/mod.rs b/etl-destinations/src/clickhouse/mod.rs
new file mode 100644
index 000000000..584d0ab26
--- /dev/null
+++ b/etl-destinations/src/clickhouse/mod.rs
@@ -0,0 +1,5 @@
+mod core;
+mod metrics;
+mod schema;
+
+pub use core::{ClickHouseDestination, ClickHouseInserterConfig};
diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs
new file mode 100644
index 000000000..c62dddbdb
--- /dev/null
+++ b/etl-destinations/src/clickhouse/schema.rs
@@ -0,0 +1,292 @@
+use etl::types::{ColumnSchema, Type, is_array_type};
+
+/// Returns the base ClickHouse type string for a Postgres scalar type.
+///
+/// The returned string does not include `Nullable(...)` wrapping — callers are
+/// responsible for applying that when the column is nullable. Arrays always use
+/// `Array(Nullable(T))` since Postgres array elements are nullable.
+pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str {
+    match typ {
+        &Type::BOOL => "Boolean",
+        &Type::CHAR | &Type::BPCHAR | &Type::VARCHAR | &Type::NAME | &Type::TEXT => "String",
+        &Type::INT2 => "Int16",
+        &Type::INT4 => "Int32",
+        &Type::INT8 => "Int64",
+        &Type::FLOAT4 => "Float32",
+        &Type::FLOAT8 => "Float64",
+        &Type::NUMERIC => "String",
+        &Type::DATE => "Date",
+        &Type::TIME => "String",
+        &Type::TIMESTAMP => "DateTime64(6)",
+        &Type::TIMESTAMPTZ => "DateTime64(6, 'UTC')",
+        &Type::UUID => "UUID",
+        &Type::JSON | &Type::JSONB => "String",
+        &Type::BYTEA => "String",
+        &Type::OID => "UInt32",
+        _ => "String",
+    }
+}
+
+/// Returns the ClickHouse array element type for a Postgres array type.
+///
+/// Used by [`build_create_table_sql`] to construct `Array(Nullable(T))` columns.
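+/// For example, `Type::TEXT_ARRAY` maps to `"String"`, which the DDL builder then
+/// wraps into the column type `Array(Nullable(String))`.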
+fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str {
+    match typ {
+        &Type::BOOL_ARRAY => "Boolean",
+        &Type::CHAR_ARRAY
+        | &Type::BPCHAR_ARRAY
+        | &Type::VARCHAR_ARRAY
+        | &Type::NAME_ARRAY
+        | &Type::TEXT_ARRAY => "String",
+        &Type::INT2_ARRAY => "Int16",
+        &Type::INT4_ARRAY => "Int32",
+        &Type::INT8_ARRAY => "Int64",
+        &Type::FLOAT4_ARRAY => "Float32",
+        &Type::FLOAT8_ARRAY => "Float64",
+        &Type::NUMERIC_ARRAY => "String",
+        &Type::DATE_ARRAY => "Date",
+        &Type::TIME_ARRAY => "String",
+        &Type::TIMESTAMP_ARRAY => "DateTime64(6)",
+        &Type::TIMESTAMPTZ_ARRAY => "DateTime64(6, 'UTC')",
+        &Type::UUID_ARRAY => "UUID",
+        &Type::JSON_ARRAY | &Type::JSONB_ARRAY => "String",
+        &Type::BYTEA_ARRAY => "String",
+        &Type::OID_ARRAY => "UInt32",
+        _ => "String",
+    }
+}
+
+/// Converts a Postgres `public.my_table` style table name into a ClickHouse table
+/// name using the same double-underscore escaping convention used by DuckLake/Iceberg.
+///
+/// - Schema and table are joined with `_`
+/// - Any literal `_` in the schema or table name is escaped to `__`
+///
+/// Examples:
+/// - `public.orders` → `public_orders`
+/// - `my_schema.t` → `my__schema_t`
+pub fn table_name_to_clickhouse_table_name(schema: &str, table: &str) -> String {
+    let escaped_schema = schema.replace('_', "__");
+    let escaped_table = table.replace('_', "__");
+    format!("{escaped_schema}_{escaped_table}")
+}
+
+/// Generates a `CREATE TABLE IF NOT EXISTS` SQL statement for the given columns.
+///
+/// - Non-nullable columns use the bare ClickHouse type (`Int32`, `String`, …).
+/// - Nullable columns use `Nullable(T)`.
+/// - Array columns always use `Array(Nullable(T))` (Postgres array elements are nullable).
+/// - Two CDC trailing columns are always appended as non-nullable:
+///   `cdc_operation String, cdc_lsn Int64`
+/// - The table uses `MergeTree()` with `ORDER BY tuple()` (pure append order).
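+///
+/// For example (derived from the rules above), a non-nullable `id INT4` column plus
+/// a nullable `name TEXT` column on `public.users` produce roughly:
+///
+/// ```sql
+/// CREATE TABLE IF NOT EXISTS "public_users" (
+///     "id" Int32,
+///     "name" Nullable(String),
+///     "cdc_operation" String,
+///     "cdc_lsn" Int64
+/// ) ENGINE = MergeTree()
+/// ORDER BY tuple()
+/// ```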
+pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String {
+    let mut cols = Vec::with_capacity(column_schemas.len() + 2);
+
+    for col in column_schemas {
+        let col_type = if is_array_type(&col.typ) {
+            let elem = postgres_array_element_clickhouse_sql(&col.typ);
+            format!("Array(Nullable({elem}))")
+        } else {
+            let base = postgres_column_type_to_clickhouse_sql(&col.typ);
+            if col.nullable {
+                format!("Nullable({base})")
+            } else {
+                base.to_string()
+            }
+        };
+        cols.push(format!("    \"{}\" {}", col.name, col_type));
+    }
+
+    // CDC columns — always non-nullable
+    cols.push("    \"cdc_operation\" String".to_string());
+    cols.push("    \"cdc_lsn\" Int64".to_string());
+
+    let col_defs = cols.join(",\n");
+    format!(
+        "CREATE TABLE IF NOT EXISTS \"{table_name}\" (\n{col_defs}\n) ENGINE = MergeTree()\nORDER BY tuple()"
+    )
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_table_name_escaping() {
+        assert_eq!(
+            table_name_to_clickhouse_table_name("public", "orders"),
+            "public_orders"
+        );
+        assert_eq!(
+            table_name_to_clickhouse_table_name("my_schema", "my_table"),
+            "my__schema_my__table"
+        );
+        assert_eq!(
+            table_name_to_clickhouse_table_name("public", "my__table"),
+            "public_my____table"
+        );
+    }
+
+    #[test]
+    fn test_scalar_type_mapping() {
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::BOOL), "Boolean");
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::CHAR),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::BPCHAR),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::VARCHAR),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::NAME),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::TEXT),
+            "String"
+        );
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::INT2), "Int16");
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::INT4), "Int32");
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::INT8), "Int64");
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::FLOAT4),
+            "Float32"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::FLOAT8),
+            "Float64"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::NUMERIC),
+            "String"
+        );
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::DATE), "Date");
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::TIME),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::TIMESTAMP),
+            "DateTime64(6)"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::TIMESTAMPTZ),
+            "DateTime64(6, 'UTC')"
+        );
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::UUID), "UUID");
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::JSON),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::JSONB),
+            "String"
+        );
+        assert_eq!(
+            postgres_column_type_to_clickhouse_sql(&Type::BYTEA),
+            "String"
+        );
+        assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::OID), "UInt32");
+    }
+
+    #[test]
+    fn test_array_type_mapping() {
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::BOOL_ARRAY),
+            "Boolean"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::TEXT_ARRAY),
+            "String"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::INT4_ARRAY),
+            "Int32"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::INT8_ARRAY),
+            "Int64"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::FLOAT8_ARRAY),
+            "Float64"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::UUID_ARRAY),
+            "UUID"
+        );
+        assert_eq!(
+            postgres_array_element_clickhouse_sql(&Type::JSONB_ARRAY),
+            "String"
+        );
+    }
+
+    #[test]
+    fn test_build_create_table_sql_nullable() {
+        let schemas = vec![
+            ColumnSchema {
+                name: "id".to_string(),
+                typ: Type::INT4,
+                modifier: -1,
+                nullable: false,
+                primary: true,
+            },
+            ColumnSchema {
+                name: "name".to_string(),
+                typ: Type::TEXT,
+                modifier: -1,
+                nullable: true,
+                primary: false,
+            },
+        ];
+        let sql = build_create_table_sql("public_users", &schemas);
+        assert!(sql.contains("\"id\" Int32"), "id should be non-nullable Int32");
+        assert!(
+            sql.contains("\"name\" Nullable(String)"),
+            "name should be Nullable(String)"
+        );
+    }
+
+    #[test]
+    fn test_build_create_table_sql_cdc_columns() {
+        let schemas = vec![ColumnSchema {
+            name: "id".to_string(),
+            typ: Type::INT4,
+            modifier: -1,
+            nullable: false,
+            primary: true,
+        }];
+        let sql = build_create_table_sql("public_t", &schemas);
+        assert!(
+            sql.contains("\"cdc_operation\" String"),
+            "cdc_operation should be non-nullable"
+        );
+        assert!(
+            sql.contains("\"cdc_lsn\" Int64"),
+            "cdc_lsn should be non-nullable Int64"
+        );
+        assert!(sql.contains("ENGINE = MergeTree()"));
+        assert!(sql.contains("ORDER BY tuple()"));
+    }
+
+    #[test]
+    fn test_build_create_table_sql_array_columns() {
+        let schemas = vec![ColumnSchema {
+            name: "tags".to_string(),
+            typ: Type::TEXT_ARRAY,
+            modifier: -1,
+            nullable: false,
+            primary: false,
+        }];
+        let sql = build_create_table_sql("public_t", &schemas);
+        assert!(
+            sql.contains("\"tags\" Array(Nullable(String))"),
+            "array columns should always be Array(Nullable(T))"
+        );
+    }
+}
diff --git a/etl-destinations/src/lib.rs b/etl-destinations/src/lib.rs
index 721d63c79..aaa9127b7 100644
--- a/etl-destinations/src/lib.rs
+++ b/etl-destinations/src/lib.rs
@@ -9,6 +9,8 @@ mod table_name;
 
 #[cfg(feature = "bigquery")]
 pub mod bigquery;
+#[cfg(feature = "clickhouse")]
+pub mod clickhouse;
 #[cfg(feature = "ducklake")]
 pub mod ducklake;
 #[cfg(feature = "egress")]
diff --git a/etl-examples/Cargo.toml b/etl-examples/Cargo.toml
index b52558ad1..b8a256f5f 100644
--- a/etl-examples/Cargo.toml
+++ b/etl-examples/Cargo.toml
@@ -19,11 +19,12 @@ path = "src/bin/ducklake.rs"
 
 [dependencies]
 etl = { workspace = true }
 etl-config = { workspace = true }
-etl-destinations = { workspace = true, features = ["bigquery", "ducklake"] }
+etl-destinations = { workspace = true, features = ["bigquery", "clickhouse", "ducklake"] }
 etl-telemetry = { workspace = true }
 
 clap = { workspace = true, default-features = true, features = ["std", "derive"] }
 rustls = { workspace = true, features = ["aws-lc-rs", "logging"] }
+sysinfo = { workspace = true, features = ["system"] }
 tokio = { workspace = true, features = ["macros", "signal"] }
 tracing = { workspace = true, default-features = true }
 tracing-subscriber = { workspace = true, default-features = true, features = ["env-filter"] }
diff --git a/etl-examples/README.md b/etl-examples/README.md
index f6711e068..aa72955f9 100644
--- a/etl-examples/README.md
+++ b/etl-examples/README.md
@@ -8,6 +8,7 @@ Postgres to various destinations using the ETL pipeline.
 | Example | Binary | Destination |
 |---------|--------|-------------|
 | [BigQuery](#bigquery) | `bigquery` | Google BigQuery (cloud data warehouse) |
+| [ClickHouse](#clickhouse-setup) | `clickhouse` | ClickHouse (column-oriented OLAP database) |
 | [DuckLake](#ducklake) | `ducklake` | DuckLake (open data lake format) |
 
 ---
@@ -95,6 +96,45 @@ cargo run --bin ducklake -p etl-examples -- \
 
 The CLI also accepts plain local paths such as `./lake_data/` and normalizes them
 to absolute `file://` URLs before constructing the destination.
 
+## ClickHouse Setup
+
+To run the ClickHouse example, you'll need a running ClickHouse instance accessible over HTTP(S).
+
+Create a publication in Postgres:
+
+```sql
+create publication my_pub
+for table table1, table2;
+```
+
+Then run the ClickHouse example:
+
+```bash
+cargo run -p etl-examples --bin clickhouse -- \
+  --db-host localhost \
+  --db-port 5432 \
+  --db-name postgres \
+  --db-username postgres \
+  --db-password password \
+  --ch-url http://localhost:8123 \
+  --ch-user default \
+  --ch-database default \
+  --publication my_pub
+```
+
+Each Postgres table is replicated as an append-only `MergeTree` table. Two CDC metadata
+columns are appended to every row:
+
+- `cdc_operation`: `INSERT`, `UPDATE`, or `DELETE`
+- `cdc_lsn`: the Postgres LSN at the time of the change
+
+Table names are derived from the Postgres schema and table name using double-underscore
+escaping (e.g. `public.orders` → `public_orders`, `my_schema.t` → `my__schema_t`).
+
+For HTTPS connections, provide an `https://` URL — TLS is handled automatically using
+webpki root certificates. Use `--ch-password` if your ClickHouse instance requires
+authentication.
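+
+To spot-check replicated data, the CDC columns can be queried directly. An
+illustrative query (assumes a replicated `public.orders` table):
+
+```sql
+SELECT *
+FROM "public_orders"
+ORDER BY cdc_lsn DESC
+LIMIT 10;
+```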
+
 ### Example configuration
 
 This is a fuller local example that also enables a dedicated DuckDB log dump on

From 01da5178674285565f3282d6cfc7b957e033a26b Mon Sep 17 00:00:00 2001
From: Benjamin <5719034+bnjjj@users.noreply.github.com>
Date: Tue, 24 Feb 2026 17:29:05 +0100
Subject: [PATCH 02/86] add support for clickhouse destination in ETL and API

Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com>
---
 Cargo.lock                                  |   2 +
 etl-api/Cargo.toml                          |   2 +-
 etl-api/src/configs/destination.rs          | 100 +++
 etl-api/src/k8s/base.rs                     |  17 +
 etl-api/src/k8s/cache.rs                    |  12 +
 etl-api/src/k8s/core.rs                     |  22 +
 etl-api/src/k8s/http.rs                     |  94 +++
 etl-api/src/validation/validators.rs        |  68 ++
 etl-api/tests/support/k8s_client.rs         |  12 +
 etl-config/src/shared/destination.rs        |  28 +
 etl-destinations/Cargo.toml                 |   4 +
 etl-destinations/src/clickhouse/client.rs   | 172 +++++
 etl-destinations/src/clickhouse/core.rs     | 761 +++-----------------
 etl-destinations/src/clickhouse/encoding.rs | 454 ++++++++++++
 etl-destinations/src/clickhouse/mod.rs      |   3 +
 etl-replicator/Cargo.toml                   |   5 +-
 etl-replicator/src/core.rs                  | 147 +++-
 etl/src/concurrency/memory_monitor.rs       |   8 +-
 etl/src/lib.rs                              |   6 +-
 19 files changed, 1234 insertions(+), 683 deletions(-)
 create mode 100644 etl-destinations/src/clickhouse/client.rs
 create mode 100644 etl-destinations/src/clickhouse/encoding.rs

diff --git a/Cargo.lock b/Cargo.lock
index 8a126e9ff..8ae34be4a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1762,6 +1762,7 @@ dependencies = [
  "iceberg",
  "iceberg-catalog-rest",
  "metrics",
+ "parking_lot",
  "parquet",
  "prost",
  "rand 0.9.2",
@@ -1823,6 +1824,7 @@ dependencies = [
  "serde",
  "serde_json",
  "sqlx",
+ "sysinfo",
  "tikv-jemalloc-ctl",
  "tikv-jemallocator",
  "tokio",
diff --git a/etl-api/Cargo.toml b/etl-api/Cargo.toml
index 0efee9b4f..66936d449 100644
--- a/etl-api/Cargo.toml
+++ b/etl-api/Cargo.toml
@@ -18,7 +18,7 @@ name = "etl-api"
 
 [dependencies]
 etl = { workspace = true }
 etl-config = { workspace = true, features = ["utoipa", "supabase"] }
-etl-destinations = { workspace = true, features = ["bigquery", "iceberg", "ducklake"] }
+etl-destinations = { workspace = true, features = ["bigquery", "clickhouse", "ducklake", "iceberg"] }
 etl-postgres = { workspace = true, features = ["replication"] }
 etl-telemetry = { workspace = true }
diff --git a/etl-api/src/configs/destination.rs b/etl-api/src/configs/destination.rs
index 708f42474..8826ba772 100644
--- a/etl-api/src/configs/destination.rs
+++ b/etl-api/src/configs/destination.rs
@@ -39,6 +39,22 @@ pub enum FullApiDestinationConfig {
         #[serde(skip_serializing_if = "Option::is_none")]
         connection_pool_size: Option<usize>,
     },
+    ClickHouse {
+        /// ClickHouse HTTP(S) endpoint URL
+        #[schema(example = "http://test:8123")]
+        #[serde(deserialize_with = "crate::utils::trim_string")]
+        url: String, //TODO: use url type instead
+        /// ClickHouse user name
+        #[schema(example = "foo")]
+        #[serde(deserialize_with = "crate::utils::trim_string")]
+        user: String,
+        /// ClickHouse password (omit for passwordless access)
+        password: Option<SerializableSecretString>,
+        /// ClickHouse target database
+        #[schema(example = "my_db")]
+        #[serde(deserialize_with = "crate::utils::trim_string")]
+        database: String,
+    },
     Iceberg {
         #[serde(flatten)]
         config: FullApiIcebergConfig,
@@ -117,6 +133,17 @@ impl From<StoredDestinationConfig> for FullApiDestinationConfig {
                 max_staleness_mins,
                 connection_pool_size: Some(connection_pool_size),
             },
+            StoredDestinationConfig::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => Self::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            },
             StoredDestinationConfig::Iceberg { config } => match config {
                 StoredIcebergConfig::Supabase {
                     project_ref,
@@ -191,6 +218,12 @@ pub enum StoredDestinationConfig {
         max_staleness_mins: Option<u16>,
         connection_pool_size: usize,
     },
+    ClickHouse {
+        url: String, //TODO: use url type instead
+        user: String,
+        password: Option<SerializableSecretString>,
+        database: String,
+    },
     Iceberg {
         config: StoredIcebergConfig,
     },
@@ -224,6 +257,17 @@ impl StoredDestinationConfig {
                 max_staleness_mins,
                 connection_pool_size,
             },
+            Self::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => DestinationConfig::ClickHouse {
+                url,
+                user,
+                password: password.map(|s| s.into()),
+                database,
+            },
             Self::Iceberg { config } => match config {
                 StoredIcebergConfig::Supabase {
                     project_ref,
@@ -306,6 +350,17 @@ impl From<FullApiDestinationConfig> for StoredDestinationConfig {
                 connection_pool_size: connection_pool_size
                     .unwrap_or(DestinationConfig::DEFAULT_CONNECTION_POOL_SIZE),
             },
+            FullApiDestinationConfig::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => Self::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            },
             FullApiDestinationConfig::Iceberg { config } => match config {
                 FullApiIcebergConfig::Supabase {
                     project_ref,
@@ -397,6 +452,24 @@ impl Encrypt for StoredDestinationConfig {
                     connection_pool_size,
                 })
             }
+            Self::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => {
+                let encrypted_password = match password {
+                    Some(p) => Some(encrypt_text(p.expose_secret().to_owned(), encryption_key)?),
+                    None => None,
+                };
+
+                Ok(EncryptedStoredDestinationConfig::ClickHouse {
+                    url,
+                    user,
+                    password: encrypted_password,
+                    database,
+                })
+            }
             Self::Iceberg { config } => match config {
                 StoredIcebergConfig::Supabase {
                     project_ref,
@@ -500,6 +573,12 @@ pub enum EncryptedStoredDestinationConfig {
         #[serde(default = "default_connection_pool_size")]
         connection_pool_size: usize,
     },
+    ClickHouse {
+        url: String, //TODO: use url type instead
+        user: String,
+        password: Option<String>,
+        database: String,
+    },
     Iceberg {
         #[serde(flatten)]
         config: EncryptedStoredIcebergConfig,
     },
@@ -614,6 +693,27 @@ impl Decrypt for EncryptedStoredDestinationConfig {
                 })
             }
         },
+            EncryptedStoredDestinationConfig::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => {
+                let password = match password {
+                    Some(p) => Some(SerializableSecretString::from(decrypt_text(
+                        p,
+                        encryption_key,
+                    )?)),
+                    None => None,
+                };
+
+                Ok(StoredDestinationConfig::ClickHouse {
+                    url,
+                    user,
+                    password,
+                    database,
+                })
+            }
             Self::Ducklake {
                 catalog_url,
                 data_path,
diff --git a/etl-api/src/k8s/base.rs b/etl-api/src/k8s/base.rs
index 4064d7d49..4c4e4f77a 100644
--- a/etl-api/src/k8s/base.rs
+++ b/etl-api/src/k8s/base.rs
@@ -40,6 +40,8 @@ pub enum DestinationType {
     BigQuery,
     /// Apache Iceberg destination.
     Iceberg,
+    /// ClickHouse destination.
+    ClickHouse,
     /// DuckLake destination.
     Ducklake,
 }
@@ -50,6 +52,7 @@ impl From<&StoredDestinationConfig> for DestinationType {
         match value {
             StoredDestinationConfig::BigQuery { .. } => DestinationType::BigQuery,
             StoredDestinationConfig::Iceberg { .. } => DestinationType::Iceberg,
+            StoredDestinationConfig::ClickHouse { .. } => DestinationType::ClickHouse,
             StoredDestinationConfig::Ducklake { .. } => DestinationType::Ducklake,
         }
     }
@@ -132,6 +135,15 @@ pub trait K8sClient: Send + Sync {
         bq_service_account_key: &str,
     ) -> Result<(), K8sError>;
 
+    /// Creates or updates the ClickHouse password for a replicator.
+    ///
+    /// The secret name is derived from `prefix` and stored in the data-plane namespace.
+    async fn create_or_update_clickhouse_secret(
+        &self,
+        prefix: &str,
+        password: Option<&str>,
+    ) -> Result<(), K8sError>;
+
     /// Creates or updates the Iceberg credentials secret for a replicator.
     ///
     /// The secret contains the catalog token, S3 access key ID, and S3 secret access key.
@@ -159,6 +171,11 @@ pub trait K8sClient: Send + Sync {
     /// Does nothing if the secret does not exist.
     async fn delete_postgres_secret(&self, prefix: &str) -> Result<(), K8sError>;
 
+    /// Deletes the ClickHouse credentials for a replicator.
+    ///
+    /// Does nothing if the secret does not exist.
+    async fn delete_clickhouse_secret(&self, prefix: &str) -> Result<(), K8sError>;
+
     /// Deletes the BigQuery service account secret for a replicator.
     ///
     /// Does nothing if the secret does not exist.
diff --git a/etl-api/src/k8s/cache.rs b/etl-api/src/k8s/cache.rs
index 7aed05965..80f436739 100644
--- a/etl-api/src/k8s/cache.rs
+++ b/etl-api/src/k8s/cache.rs
@@ -174,6 +174,14 @@ mod tests {
             Ok(())
         }
 
+        async fn create_or_update_clickhouse_secret(
+            &self,
+            _prefix: &str,
+            _password: Option<&str>,
+        ) -> Result<(), K8sError> {
+            Ok(())
+        }
+
         async fn create_or_update_ducklake_secret(
             &self,
             _prefix: &str,
@@ -187,6 +195,10 @@ mod tests {
             Ok(())
         }
 
+        async fn delete_clickhouse_secret(&self, _prefix: &str) -> Result<(), K8sError> {
+            Ok(())
+        }
+
         async fn delete_bigquery_secret(&self, _prefix: &str) -> Result<(), K8sError> {
             Ok(())
         }
diff --git a/etl-api/src/k8s/core.rs b/etl-api/src/k8s/core.rs
index 04b1d6a42..8bb0a3552 100644
--- a/etl-api/src/k8s/core.rs
+++ b/etl-api/src/k8s/core.rs
@@ -29,6 +29,13 @@ pub enum Secrets {
         /// Google Cloud service account key JSON for BigQuery authentication.
         big_query_service_account_key: String,
     },
+    /// Credentials for ClickHouse destinations.
+    ClickHouse {
+        /// PostgreSQL source database password.
+        postgres_password: String,
+        /// ClickHouse password.
+        password: Option<String>,
+    },
     /// Credentials for Iceberg destinations.
     Iceberg {
         /// PostgreSQL source database password.
@@ -181,6 +188,10 @@ fn build_secrets_from_configs(
             s3_access_key_id: s3_access_key_id.expose_secret().to_string(),
             s3_secret_access_key: s3_secret_access_key.expose_secret().to_string(),
         },
+        StoredDestinationConfig::ClickHouse { password, .. } => Secrets::ClickHouse {
+            postgres_password,
+            password: password.as_ref().map(|p| p.expose_secret().to_string()),
+        },
         StoredDestinationConfig::Ducklake {
             s3_access_key_id,
             s3_secret_access_key,
@@ -271,6 +282,17 @@ async fn create_or_update_dynamic_replicator_secrets(
                 )
                 .await?;
         }
+        Secrets::ClickHouse {
+            postgres_password,
+            password,
+        } => {
+            k8s_client
+                .create_or_update_postgres_secret(prefix, &postgres_password)
+                .await?;
+            k8s_client
+                .create_or_update_clickhouse_secret(prefix, password.as_deref())
+                .await?;
+        }
         Secrets::Ducklake {
             postgres_password,
             s3_access_key_id,
diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs
index bc36fd3d2..7f1570c00 100644
--- a/etl-api/src/k8s/http.rs
+++ b/etl-api/src/k8s/http.rs
@@ -18,6 +18,10 @@ use tracing::debug;
 
 /// Secret name suffix for the BigQuery service account key.
 const BQ_SECRET_NAME_SUFFIX: &str = "bq-service-account-key";
+/// Secret name suffix for the ClickHouse password.
+const CLICKHOUSE_SECRET_NAME_SUFFIX: &str = "ch-password";
+/// Name of the password in the ClickHouse secret and its reference.
+const CLICKHOUSE_PASSWORD_NAME: &str = "ch-password";
 /// Name of the service account key in the BigQuery secret and its reference.
 const BQ_SERVICE_ACCOUNT_KEY_NAME: &str = "service-account-key";
 /// Secret name suffix for iceberg secrets (includes catalog token,
@@ -314,6 +318,36 @@ impl K8sClient for HttpK8sClient {
         Ok(())
     }
 
+    async fn create_or_update_clickhouse_secret(
+        &self,
+        prefix: &str,
+        password: Option<&str>,
+    ) -> Result<(), K8sError> {
+        debug!("patching clickhouse secret");
+
+        if let Some(password) = password {
+            let encoded_ch_password = BASE64_STANDARD.encode(password);
+            let clickhouse_secret_name = create_clickhouse_secret_name(prefix);
+            let replicator_app_name = create_replicator_app_name(prefix);
+            let clickhouse_secret_json = create_clickhouse_password_secret_json(
+                &clickhouse_secret_name,
+                &replicator_app_name,
+                &encoded_ch_password,
+            );
+            let secret: Secret = serde_json::from_value(clickhouse_secret_json)?;
+
+            // We are forcing the update since we are the field manager that should own the fields. If
+            // there is an override (likely during an incident or SRE intervention), we want to override
+            // their changes. The API database is the source of truth for credentials.
+            let pp = PatchParams::apply(&clickhouse_secret_name).force();
+            self.secrets_api
+                .patch(&clickhouse_secret_name, &pp, &Patch::Apply(secret))
+                .await?;
+        }
+
+        Ok(())
+    }
+
     async fn create_or_update_iceberg_secret(
         &self,
         prefix: &str,
@@ -390,6 +424,16 @@ impl K8sClient for HttpK8sClient {
         Ok(())
     }
 
+    async fn delete_clickhouse_secret(&self, prefix: &str) -> Result<(), K8sError> {
+        debug!("deleting clickhouse secret");
+
+        let ch_secret_name = create_clickhouse_secret_name(prefix);
+        let dp = DeleteParams::default();
+        Self::handle_delete_with_404_ignore(self.secrets_api.delete(&ch_secret_name, &dp).await)?;
+
+        Ok(())
+    }
+
     async fn delete_bigquery_secret(&self, prefix: &str) -> Result<(), K8sError> {
         debug!("deleting bq secret");
 
@@ -599,6 +643,10 @@ fn create_iceberg_secret_name(prefix: &str) -> String {
     format!("{prefix}-{ICEBERG_SECRET_NAME_SUFFIX}")
 }
 
+fn create_clickhouse_secret_name(prefix: &str) -> String {
+    format!("{prefix}-{CLICKHOUSE_SECRET_NAME_SUFFIX}")
+}
+
 fn create_ducklake_secret_name(prefix: &str) -> String {
     format!("{prefix}-{DUCKLAKE_SECRET_NAME_SUFFIX}")
 }
@@ -650,6 +698,29 @@ fn create_postgres_secret_json(
     })
 }
 
+fn create_clickhouse_password_secret_json(
+    secret_name: &str,
+    replicator_app_name: &str,
+    encoded_ch_password: &str,
+) -> serde_json::Value {
+    json!({
+        "apiVersion": "v1",
+        "kind": "Secret",
+        "metadata": {
+            "name": secret_name,
+            "namespace": DATA_PLANE_NAMESPACE,
+            "labels": {
+                "etl.supabase.com/app-name": replicator_app_name,
+                "etl.supabase.com/app-type": REPLICATOR_APP_LABEL,
+            }
+        },
+        "type": "Opaque",
+        "data": {
+            CLICKHOUSE_PASSWORD_NAME: encoded_ch_password,
+        }
+    })
+}
+
 fn create_bq_service_account_key_secret_json(
     secret_name: &str,
     replicator_app_name: &str,
@@ -829,6 +900,17 @@ fn create_container_environment_json(
             let bq_secret_env_var_json = create_bq_secret_env_var_json(&bq_secret_name);
             container_environment.push(bq_secret_env_var_json);
         }
+        DestinationType::ClickHouse => {
+            let postgres_secret_name = create_postgres_secret_name(prefix);
+            let postgres_secret_env_var_json =
+                create_postgres_secret_env_var_json(&postgres_secret_name);
+            container_environment.push(postgres_secret_env_var_json);
+
+            let clickhouse_secret_name = create_clickhouse_secret_name(prefix);
+            let clickhouse_secret_env_var_json =
+                create_clickhouse_secret_env_var_json(&clickhouse_secret_name);
+            container_environment.push(clickhouse_secret_env_var_json);
+        }
         DestinationType::Iceberg => {
             let postgres_secret_name = create_postgres_secret_name(prefix);
             let postgres_secret_env_var_json =
@@ -1010,6 +1092,18 @@ fn create_bq_secret_env_var_json(bq_secret_name: &str) -> serde_json::Value {
     })
 }
 
+fn create_clickhouse_secret_env_var_json(clickhouse_secret_name: &str) -> serde_json::Value {
+    json!({
+        "name": "APP_DESTINATION__CLICKHOUSE__PASSWORD",
+        "valueFrom": {
+            "secretKeyRef": {
+                "name": clickhouse_secret_name,
+                "key": CLICKHOUSE_PASSWORD_NAME
+            }
+        }
+    })
+}
+
 fn create_iceberg_catlog_token_env_var_json(iceberg_secret_name: &str) -> serde_json::Value {
     json!({
         "name": "APP_DESTINATION__ICEBERG__SUPABASE__CATALOG_TOKEN",
diff --git a/etl-api/src/validation/validators.rs b/etl-api/src/validation/validators.rs
index 0d74de2eb..31a178d83 100644
--- a/etl-api/src/validation/validators.rs
+++ b/etl-api/src/validation/validators.rs
@@ -6,6 +6,7 @@ use async_trait::async_trait;
 use etl::store::both::memory::MemoryStore;
 use etl_config::parse_ducklake_url;
 use etl_destinations::bigquery::BigQueryClient;
+use etl_destinations::clickhouse::ClickHouseClient;
 use etl_destinations::ducklake::{DuckLakeDestination, S3Config as DucklakeS3Config};
 use etl_destinations::iceberg::{
     IcebergClient, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_SECRET_ACCESS_KEY,
@@ -652,6 +653,59 @@ impl Validator for BigQueryValidator {
     }
 }
 
+/// Validates ClickHouse destination connectivity and database accessibility.
+#[derive(Debug)]
+struct ClickHouseValidator {
+    url: String, //TODO: use url type instead
+    user: String,
+    password: Option<String>,
+    database: String,
+}
+
+impl ClickHouseValidator {
+    fn new(
+        url: String, //TODO: use url type instead
+        user: String,
+        password: Option<String>,
+        database: String,
+    ) -> Self {
+        Self {
+            url,
+            user,
+            password,
+            database,
+        }
+    }
+}
+
+#[async_trait]
+impl Validator for ClickHouseValidator {
+    async fn validate(
+        &self,
+        _ctx: &ValidationContext,
+    ) -> Result<Vec<ValidationFailure>, ValidationError> {
+        let client = ClickHouseClient::new(
+            self.url.clone(),
+            self.user.clone(),
+            self.password.clone(),
+            self.database.clone(),
+        );
+        match client.ping().await {
+            Ok(_) => Ok(Vec::new()),
+            Err(_) => Ok(vec![ValidationFailure::critical(
+                "ClickHouse Connection Failed",
+                "Unable to connect to ClickHouse.\n\n\
+                 Please verify:\n\
+                 (1) The URL is valid and accessible\n\
+                 (2) The username is correct\n\
+                 (3) The password is correct\n\
+                 (4) The database name is correct",
+            )]),
+        }
+    }
+}
+
 /// Validates Iceberg destination connectivity.
 #[derive(Debug)]
 struct IcebergValidator {
@@ -904,6 +958,20 @@ impl Validator for DestinationValidator {
                 );
                 validator.validate(ctx).await
             }
+            FullApiDestinationConfig::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => {
+                let validator = ClickHouseValidator::new(
+                    url.clone(),
+                    user.clone(),
+                    password.as_ref().map(|p| p.expose_secret().to_string()),
+                    database.clone(),
+                );
+                validator.validate(ctx).await
+            }
             FullApiDestinationConfig::Iceberg { config } => {
                 let validator = IcebergValidator::new(config.clone());
                 validator.validate(ctx).await
diff --git a/etl-api/tests/support/k8s_client.rs b/etl-api/tests/support/k8s_client.rs
index 0ad6e8293..35b697323 100644
--- a/etl-api/tests/support/k8s_client.rs
+++ b/etl-api/tests/support/k8s_client.rs
@@ -71,6 +71,14 @@ impl K8sClient for MockK8sClient {
         Ok(())
     }
 
+    async fn create_or_update_clickhouse_secret(
+        &self,
+        _prefix: &str,
+        _password: Option<&str>,
+    ) -> Result<(), K8sError> {
+        Ok(())
+    }
+
     async fn create_or_update_iceberg_secret(
         &self,
         _prefix: &str,
@@ -95,6 +103,10 @@ impl K8sClient for MockK8sClient {
         Ok(())
     }
 
+    async fn delete_clickhouse_secret(&self, _prefix: &str) -> Result<(), K8sError> {
+        Ok(())
+    }
+
     async fn delete_bigquery_secret(&self, _prefix: &str) -> Result<(), K8sError> {
         Ok(())
     }
diff --git a/etl-config/src/shared/destination.rs b/etl-config/src/shared/destination.rs
index c5379c683..c06b52dbc 100644
--- a/etl-config/src/shared/destination.rs
+++ b/etl-config/src/shared/destination.rs
@@ -46,6 +46,16 @@ pub enum DestinationConfig {
         #[serde(default = "default_connection_pool_size")]
         connection_pool_size: usize,
     },
+    ClickHouse {
+        /// ClickHouse HTTP(S) endpoint URL
+        url: String, //TODO: use url instead
+        /// ClickHouse user name
+        user: String,
+        /// ClickHouse password (omit for passwordless access)
+        password: Option<SerializableSecretString>,
+        /// ClickHouse target database
+        database: String,
+    },
     Iceberg {
         #[serde(flatten)]
         config: IcebergConfig,
@@ -224,6 +234,14 @@ pub enum DestinationConfigWithoutSecrets {
         #[serde(default = "default_connection_pool_size")]
         connection_pool_size: usize,
     },
+    ClickHouse {
+        /// ClickHouse HTTP(S) endpoint URL
+        url: String, //TODO: use url instead
+        /// ClickHouse user name
+        user: String,
+        /// ClickHouse target database
+        database: String,
+    },
     Iceberg {
         #[serde(flatten)]
         config: IcebergConfigWithoutSecrets,
@@ -264,6 +282,16 @@ impl From<DestinationConfig> for DestinationConfigWithoutSecrets {
             max_staleness_mins,
             connection_pool_size,
         },
+        DestinationConfig::ClickHouse {
+            url,
+            user,
+            database,
+            ..
+        } => DestinationConfigWithoutSecrets::ClickHouse {
+            url,
+            user,
+            database,
+        },
         DestinationConfig::Iceberg { config } => DestinationConfigWithoutSecrets::Iceberg {
             config: config.into(),
         },
diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml
index 506688cc1..511e705f3 100644
--- a/etl-destinations/Cargo.toml
+++ b/etl-destinations/Cargo.toml
@@ -50,6 +50,8 @@ clickhouse = [
     "dep:tracing",
     "dep:tokio",
     "dep:serde",
+    "dep:futures",
+    "dep:parking_lot"
 ]
 egress = ["etl/egress"]
 # We assume that `test-utils` is always used in conjunction with `bigquery` or `iceberg` thus we only
@@ -65,6 +67,7 @@ base64 = { workspace = true, optional = true }
 chrono = { workspace = true }
 clickhouse = { workspace = true, optional = true, features = ["inserter", "rustls-tls"] }
 duckdb = { workspace = true, optional = true, features = ["bundled", "r2d2"] }
+futures = { optional = true, workspace = true }
 gcp-bigquery-client = { workspace = true, optional = true, features = ["rust-tls", "aws-lc-rs"] }
 iceberg = { workspace = true, optional = true }
 iceberg-catalog-rest = { workspace = true, optional = true }
@@ -84,6 +87,7 @@ tonic = { workspace = true, optional = true }
 tracing = { workspace = true, optional = true, default-features = true }
 url = { workspace = true, optional = true }
 uuid = { workspace = true, optional = true, features = ["v4"] }
+parking_lot = { workspace = true, optional = true }
 
 [dev-dependencies]
 duckdb = { workspace = true, features = ["bundled"] }
diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
new file mode 100644
index 000000000..37b00cb4a
--- /dev/null
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -0,0 +1,172 @@
+use std::sync::Arc;
+use std::time::Instant;
+
+use clickhouse::Client;
+use etl::error::{ErrorKind, EtlResult};
+use etl::etl_error;
+
+use crate::clickhouse::encoding::{ClickHouseValue, rb_encode_row};
+use crate::clickhouse::metrics::ETL_CH_INSERT_DURATION_SECONDS;
+
+/// Capacity of the internal write buffer used per INSERT statement.
+///
+/// When this many bytes have been written to the buffer it is flushed to the
+/// network (but the INSERT statement itself is not closed — that only happens
+/// when `end()` is called or the `max_bytes_per_insert` limit is reached).
+const BUFFERED_CAPACITY: usize = 256 * 1024;
+
+/// High-level ClickHouse client used by [`super::core::ClickHouseDestination`].
+///
+/// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL, truncation,
+/// and RowBinary bulk inserts. Cheaply cloneable — the inner client holds an `Arc`
+/// internally, and the outer `Arc` here ensures a single shared instance.
+#[derive(Clone)]
+pub struct ClickHouseClient {
+    inner: Arc<Client>,
+}
+
+impl ClickHouseClient {
+    /// Creates a new [`ClickHouseClient`].
+    ///
+    /// When `url` starts with `https://`, TLS is handled automatically by the
+    /// `rustls-tls` feature using webpki root certificates.
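+    ///
+    /// Usage sketch (connection values are placeholders, not defaults shipped by
+    /// this crate):
+    ///
+    /// ```ignore
+    /// let client = ClickHouseClient::new("http://localhost:8123", "default", None, "default");
+    /// client.ping().await?;
+    /// ```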
+    pub fn new(
+        url: impl Into<String>,
+        user: impl Into<String>,
+        password: Option<String>,
+        database: impl Into<String>,
+    ) -> Self {
+        let mut client = Client::default()
+            .with_url(url)
+            .with_user(user)
+            .with_database(database);
+
+        if let Some(pw) = password {
+            client = client.with_password(pw);
+        }
+
+        Self {
+            inner: Arc::new(client),
+        }
+    }
+
+    pub async fn ping(&self) -> EtlResult<()> {
+        self.inner
+            .query("SELECT 1")
+            .fetch_one::<u8>()
+            .await
+            .map(|_| ())
+            .map_err(|e| {
+                etl_error!(
+                    ErrorKind::Unknown,
+                    "ClickHouse connectivity check failed",
+                    e
+                )
+            })
+    }
+
+    /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`).
+    pub(crate) async fn execute_ddl(&self, sql: &str) -> EtlResult<()> {
+        self.inner.query(sql).execute().await.map_err(|e| {
+            etl_error!(
+                ErrorKind::Unknown,
+                "ClickHouse DDL failed",
+                format!("DDL execution failed: {e}")
+            )
+        })
+    }
+
+    /// Executes `TRUNCATE TABLE IF EXISTS "<table_name>"`.
+    pub(crate) async fn truncate_table(&self, table_name: &str) -> EtlResult<()> {
+        self.inner
+            .query(&format!("TRUNCATE TABLE IF EXISTS \"{table_name}\""))
+            .execute()
+            .await
+            .map_err(|e| {
+                etl_error!(
+                    ErrorKind::Unknown,
+                    "ClickHouse truncate failed",
+                    format!("Failed to truncate table '{table_name}': {e}")
+                )
+            })
+    }
+
+    /// Inserts `rows` into `table_name` using the RowBinary format.
+    ///
+    /// Each element of `rows` is a complete, already-encoded row of
+    /// [`ClickHouseValue`]s in column order (user columns + CDC columns).
+    /// `nullable_flags` must have the same length as each row.
+    ///
+    /// When the accumulated uncompressed byte count reaches `max_bytes_per_insert`
+    /// the current INSERT statement is committed and a new one is opened, keeping
+    /// peak memory usage bounded for large initial copies.
+    ///
+    /// The `source` label (`"copy"` or `"streaming"`) is attached to the
+    /// `etl_ch_insert_duration_seconds` histogram recorded after each committed
+    /// INSERT statement.
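+    ///
+    /// Call sketch (illustrative values; `rows` are produced by the encoding
+    /// module's `cell_to_clickhouse_value`):
+    ///
+    /// ```ignore
+    /// client
+    ///     .insert_rows("public_users", rows, &nullable_flags, 256 * 1024 * 1024, "copy")
+    ///     .await?;
+    /// ```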
+    pub(crate) async fn insert_rows(
+        &self,
+        table_name: &str,
+        rows: Vec<Vec<ClickHouseValue>>,
+        nullable_flags: &[bool],
+        max_bytes_per_insert: u64,
+        source: &'static str,
+    ) -> EtlResult<()> {
+        let sql = format!("INSERT INTO \"{table_name}\" FORMAT RowBinary");
+
+        let mut insert = self
+            .inner
+            .insert_formatted_with(sql.clone())
+            .buffered_with_capacity(BUFFERED_CAPACITY);
+        let mut bytes = 0u64;
+        let mut row_buf = Vec::new();
+        let mut insert_start = Instant::now();
+
+        for row in rows {
+            row_buf.clear();
+            rb_encode_row(row, nullable_flags, &mut row_buf)?;
+
+            insert.write_buffered(&row_buf);
+            bytes += row_buf.len() as u64;
+
+            if bytes >= max_bytes_per_insert {
+                insert.end().await.map_err(|e| {
+                    etl_error!(
+                        ErrorKind::Unknown,
+                        "ClickHouse insert flush failed",
+                        format!("Failed to flush INSERT for '{table_name}': {e}")
+                    )
+                })?;
+                metrics::histogram!(
+                    ETL_CH_INSERT_DURATION_SECONDS,
+                    "table" => table_name.to_string(),
+                    "source" => source
+                )
+                .record(insert_start.elapsed().as_secs_f64());
+
+                insert = self
+                    .inner
+                    .insert_formatted_with(sql.clone())
+                    .buffered_with_capacity(BUFFERED_CAPACITY);
+                insert_start = Instant::now();
+                bytes = 0;
+            }
+        }
+
+        insert.end().await.map_err(|e| {
+            etl_error!(
+                ErrorKind::Unknown,
+                "ClickHouse insert flush failed",
+                format!("Failed to flush INSERT for '{table_name}': {e}")
+            )
+        })?;
+        metrics::histogram!(
+            ETL_CH_INSERT_DURATION_SECONDS,
+            "table" => table_name.to_string(),
+            "source" => source
+        )
+        .record(insert_start.elapsed().as_secs_f64());
+
+        Ok(())
+    }
+}
diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index ba106f358..3784234c7 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -1,21 +1,19 @@
 use std::{
     collections::{HashMap, HashSet},
-    fmt,
-    sync::{Arc, RwLock},
+    sync::Arc,
 };
 
-use crate::clickhouse::metrics::{
-    ETL_CH_DDL_DURATION_SECONDS, ETL_CH_INSERT_DURATION_SECONDS, register_metrics,
-};
+use crate::clickhouse::client::ClickHouseClient;
+use crate::clickhouse::encoding::{ClickHouseValue, cell_to_clickhouse_value};
+use crate::clickhouse::metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics};
 use crate::clickhouse::schema::{build_create_table_sql, table_name_to_clickhouse_table_name};
-use chrono::NaiveDate;
-use clickhouse::Client;
 use etl::error::{ErrorKind, EtlResult};
+use etl::etl_error;
 use etl::store::schema::SchemaStore;
 use etl::store::state::StateStore;
-use etl::types::{ArrayCell, Cell, Event, TableId, TableRow};
-use etl::{bail, etl_error};
+use etl::types::{Cell, Event, TableId, TableRow};
 use etl::{destination::Destination, types::PgLsn};
+use parking_lot::RwLock;
 use std::time::Instant;
 use tokio::task::JoinSet;
 use tracing::{debug, info};
@@ -29,12 +27,12 @@ enum CdcOperation {
     Delete,
 }
 
-impl CdcOperation {
-    fn as_str(self) -> &'static str {
+impl std::fmt::Display for CdcOperation {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            CdcOperation::Insert => "INSERT",
-            CdcOperation::Update => "UPDATE",
-            CdcOperation::Delete => "DELETE",
+            CdcOperation::Insert => write!(f, "INSERT"),
+            CdcOperation::Update => write!(f, "UPDATE"),
+            CdcOperation::Delete => write!(f, "DELETE"),
         }
     }
 }
@@ -46,12 +44,6 @@ struct PendingRow {
     cells: Vec<Cell>,
 }
 
-// ── Unix epoch constant for Date conversion ───────────────────────────────────
-
-fn unix_epoch() -> NaiveDate {
-    NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid date")
-}
-
 // ── Inserter configuration ────────────────────────────────────────────────────
 
 /// Controls intermediate flushing inside a single `write_table_rows` / `write_events` call.
@@ -59,8 +51,6 @@ fn unix_epoch() -> NaiveDate {
 /// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is called;
 /// these limits prevent unbounded memory use for very large batches (e.g. initial copy).
 pub struct ClickHouseInserterConfig {
-    /// Start a new INSERT after this many rows (default: 100_000).
-    pub max_rows_per_insert: u64,
     /// Start a new INSERT after this many uncompressed bytes.
     ///
     /// Derive this from `BatchConfig::memory_budget_ratio × total_memory / max_table_sync_workers`
     /// in the replicator (see `etl-replicator`).
     pub max_bytes_per_insert: u64,
 }
 
-impl Default for ClickHouseInserterConfig {
-    fn default() -> Self {
-        Self {
-            max_rows_per_insert: 100_000,
-            max_bytes_per_insert: 256 * 1024 * 1024, // 256 MiB
-        }
-    }
-}
-
-// ── ClickHouseValue ───────────────────────────────────────────────────────────
-
-/// Owned ClickHouse-compatible value, moved (not cloned) from a [`Cell`].
-enum ClickHouseValue {
-    Null,
-    Bool(bool),
-    Int16(i16),
-    Int32(i32),
-    Int64(i64),
-    UInt32(u32),
-    Float32(f32),
-    Float64(f64),
-    /// TEXT, NUMERIC (string), TIME (string), JSON, BYTEA (hex-encoded)
-    String(String),
-    /// Days since Unix epoch (ClickHouse `Date` on wire = UInt16 LE)
-    Date(u16),
-    /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire = Int64 LE)
-    DateTime64(i64),
-    /// UUID in standard 16-byte big-endian order (converted to ClickHouse wire format on encode)
-    Uuid([u8; 16]),
-    Array(Vec<ClickHouseValue>),
-}
-
-// ── RowBinary encoding ────────────────────────────────────────────────────────
-//
-// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes directly
-// via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT RowBinary")`.
-//
-// This avoids two fatal issues with the `Inserter` path:
-//
-// 1. `Insert::new` always calls `join_column_names::().expect(…)`, which panics
-//    when `COLUMN_NAMES = &[]` regardless of whether validation is enabled.
-//
-// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut` at
-//    every `serialize_some` call, telescoping the type to `&mut &mut … BytesMut` for
-//    nullable array elements and overflowing the compiler's recursion limit.
-//
-// Direct binary encoding has neither problem: it is a simple recursive function that
-// writes bytes to a `Vec<u8>` with no generics and no type-level recursion.
-
-/// Encodes a variable-length integer (LEB128) used by ClickHouse for string/array lengths.
-fn rb_varint(mut v: usize, buf: &mut Vec<u8>) {
-    loop {
-        let byte = (v & 0x7f) as u8;
-        v >>= 7;
-        if v == 0 {
-            buf.push(byte);
-            return;
-        }
-        buf.push(byte | 0x80);
-    }
-}
-
-/// Encodes a value for a `Nullable(T)` column (1-byte null indicator + value if present).
-fn rb_encode_nullable(val: &ClickHouseValue, buf: &mut Vec<u8>) {
-    match val {
-        ClickHouseValue::Null => buf.push(1),
-        v => {
-            buf.push(0);
-            rb_encode_value(v, buf);
-        }
-    }
-}
-
-/// Encodes a value for a non-nullable column (no null indicator byte).
-fn rb_encode_value(val: &ClickHouseValue, buf: &mut Vec<u8>) {
-    match val {
-        ClickHouseValue::Null => {
-            // A non-nullable column unexpectedly received NULL (data quality issue from
-            // Postgres). Write a zero-length string as the least-harmful fallback.
-            buf.push(0); // varint 0 = empty string
-        }
-        ClickHouseValue::Bool(b) => buf.push(*b as u8),
-        ClickHouseValue::Int16(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::Int32(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::Int64(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::UInt32(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::Float32(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::Float64(v) => buf.extend_from_slice(&v.to_le_bytes()),
-        ClickHouseValue::String(s) => {
-            rb_varint(s.len(), buf);
-            buf.extend_from_slice(s.as_bytes());
-        }
-        ClickHouseValue::Date(days) => buf.extend_from_slice(&days.to_le_bytes()),
-        ClickHouseValue::DateTime64(micros) => buf.extend_from_slice(&micros.to_le_bytes()),
-        ClickHouseValue::Uuid(bytes) => {
-            // ClickHouse RowBinary UUID = two little-endian u64 (high bits then low bits).
-            // Our bytes are in standard UUID big-endian order, so we split into two u64
-            // and write each in little-endian.
-            let high = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
-            let low = u64::from_be_bytes(bytes[8..16].try_into().unwrap());
-            buf.extend_from_slice(&high.to_le_bytes());
-            buf.extend_from_slice(&low.to_le_bytes());
-        }
-        // Array elements are always Nullable in ClickHouse: Array(Nullable(T)).
-        ClickHouseValue::Array(items) => {
-            rb_varint(items.len(), buf);
-            for item in items {
-                rb_encode_nullable(item, buf);
-            }
-        }
-    }
-}
-
-/// Encodes a complete row into `buf`, selecting nullable vs non-nullable encoding per column.
-fn rb_encode_row(values: &[ClickHouseValue], nullable_flags: &[bool], buf: &mut Vec<u8>) {
-    for (val, &is_nullable) in values.iter().zip(nullable_flags.iter()) {
-        if is_nullable {
-            rb_encode_nullable(val, buf);
-        } else {
-            rb_encode_value(val, buf);
-        }
-    }
-}
-
-// ── Cell → ClickHouseValue conversion ────────────────────────────────────────
-
-/// Converts a [`Cell`] to a [`ClickHouseValue`], consuming it (no clone).
-fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue {
-    match cell {
-        Cell::Null => ClickHouseValue::Null,
-        Cell::Bool(b) => ClickHouseValue::Bool(b),
-        Cell::I16(v) => ClickHouseValue::Int16(v),
-        Cell::I32(v) => ClickHouseValue::Int32(v),
-        Cell::I64(v) => ClickHouseValue::Int64(v),
-        Cell::U32(v) => ClickHouseValue::UInt32(v),
-        Cell::F32(v) => ClickHouseValue::Float32(v),
-        Cell::F64(v) => ClickHouseValue::Float64(v),
-        Cell::Numeric(n) => ClickHouseValue::String(n.to_string()),
-        Cell::Date(d) => {
-            let days = d
-                .signed_duration_since(unix_epoch())
-                .num_days()
-                .clamp(0, i64::from(u16::MAX)) as u16;
-            ClickHouseValue::Date(days)
-        }
-        Cell::Time(t) => ClickHouseValue::String(t.to_string()),
-        Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()),
-        Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()),
-        Cell::Uuid(u) => ClickHouseValue::Uuid(*u.as_bytes()),
-        Cell::Json(j) => ClickHouseValue::String(j.to_string()),
-        Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(&b)),
-        Cell::String(s) => ClickHouseValue::String(s),
-        Cell::Array(array_cell) => {
-            ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell))
-        }
-    }
-}
-
-fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> Vec<ClickHouseValue> {
-    match array_cell {
-        ArrayCell::Bool(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Bool))
-            .collect(),
-        ArrayCell::String(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::String))
-            .collect(),
-        ArrayCell::I16(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int16))
-            .collect(),
-        ArrayCell::I32(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int32))
-            .collect(),
-        ArrayCell::I64(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int64))
-            .collect(),
-        ArrayCell::U32(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::UInt32))
-            .collect(),
-        ArrayCell::F32(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float32))
-            .collect(),
-        ArrayCell::F64(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float64))
-            .collect(),
-        ArrayCell::Numeric(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |n| {
-                    ClickHouseValue::String(n.to_string())
-                })
-            })
-            .collect(),
-        ArrayCell::Date(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |d| {
-                    let days = d
-                        .signed_duration_since(unix_epoch())
-                        .num_days()
-                        .clamp(0, i64::from(u16::MAX)) as u16;
-                    ClickHouseValue::Date(days)
-                })
-            })
-            .collect(),
-        ArrayCell::Time(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |t| {
-                    ClickHouseValue::String(t.to_string())
-                })
-            })
-            .collect(),
-        ArrayCell::Timestamp(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |dt| {
-                    ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros())
-                })
-            })
-            .collect(),
-        ArrayCell::TimestampTz(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |dt| {
-                    ClickHouseValue::DateTime64(dt.timestamp_micros())
-                })
-            })
-            .collect(),
-        ArrayCell::Uuid(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |u| {
-                    ClickHouseValue::Uuid(*u.as_bytes())
-                })
-            })
-            .collect(),
-        ArrayCell::Json(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |j| {
-                    ClickHouseValue::String(j.to_string())
-                })
-            })
-            .collect(),
-        ArrayCell::Bytes(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |b| {
-                    ClickHouseValue::String(bytes_to_hex(&b))
-                })
-            })
-            .collect(),
-    }
-}
-
-fn bytes_to_hex(bytes: &[u8]) -> String {
-    let mut s = String::with_capacity(bytes.len() * 2);
-    for b in bytes {
-        use fmt::Write;
-        let _ = write!(s, "{b:02x}");
-    }
-    s
-}
-
 // ── Destination struct ────────────────────────────────────────────────────────
 
 /// CDC-capable ClickHouse destination that replicates Postgres tables.
@@ -348,13 +66,13 @@ fn bytes_to_hex(bytes: &[u8]) -> String {
 /// appended to each row. Rows are encoded as RowBinary and sent via
 /// `INSERT INTO "table" FORMAT RowBinary` — no column-name header required.
 ///
-/// The struct is cheaply cloneable: `client` has an internal `Arc`, and `table_cache`
-/// is wrapped in `Arc<RwLock<...>>`.
+/// The struct is cheaply cloneable: `client` wraps an `Arc` internally, and
+/// `table_cache` is wrapped in `Arc<RwLock<...>>`.
 #[derive(Clone)]
 pub struct ClickHouseDestination<S> {
-    client: Client,
+    client: ClickHouseClient,
     inserter_config: Arc<ClickHouseInserterConfig>,
-    store: S,
+    store: Arc<S>,
     /// Cache: ClickHouse table name → `Arc<[bool]>` (nullable flags per column,
     /// including the two trailing CDC columns which are always `false`).
     ///
@@ -382,40 +100,14 @@ where
         store: S,
     ) -> EtlResult<Self> {
         register_metrics();
-        let client = build_client(url.into(), user.into(), password, database.into());
 
         Ok(Self {
-            client,
+            client: ClickHouseClient::new(url, user, password, database),
             inserter_config: Arc::new(inserter_config),
-            store,
+            store: Arc::new(store),
             table_cache: Arc::new(RwLock::new(HashMap::new())),
         })
     }
 
-    /// Creates a new `ClickHouseDestination` with TLS using a custom CA certificate.
-    ///
-    /// Note: The `clickhouse` crate v0.14 does not expose a public API for configuring
-    /// custom CA certificates. For HTTPS with standard TLS (webpki roots), use
-    /// [`ClickHouseDestination::new`] with an `https://` URL. Custom CA certificate
-    /// support is planned for a future update when the crate exposes the necessary API.
-    #[allow(dead_code)]
-    pub fn new_with_tls(
-        _url: impl Into<String>,
-        _user: impl Into<String>,
-        _password: Option<String>,
-        _database: impl Into<String>,
-        _ca_cert_pem: String,
-        _inserter_config: ClickHouseInserterConfig,
-        _store: S,
-    ) -> EtlResult<Self> {
-        bail!(
-            ErrorKind::Unknown,
-            "Custom CA certificates not supported",
-            "The clickhouse crate v0.14 does not expose an API for custom CA certificates. \
-             Use ClickHouseDestination::new() with an https:// URL for standard TLS \
-             (webpki root certificates are used for server verification)."
-        )
-    }
-
     /// Ensures the ClickHouse table for `table_id` exists, returning
     /// `(ch_table_name, nullable_flags)`.
     ///
@@ -424,7 +116,6 @@ where
     /// 2. Slow-path: compute DDL, run `CREATE TABLE IF NOT EXISTS` (await, no lock held),
    ///    then write-lock to insert (using `or_insert` for the concurrent first-writer race).
     async fn ensure_table_exists(&self, table_id: TableId) -> EtlResult<(String, Arc<[bool]>)> {
-        // 1. Get table schema from store.
         let table_schema = self
             .store
             .get_table_schema(&table_id)
             .await?
@@ -437,7 +128,6 @@ where
             )
         })?;
 
-        // 2. Determine / persist ClickHouse table name.
         let ch_table_name = {
             if let Some(name) = self.store.get_table_mapping(&table_id).await? {
                 name
@@ -453,37 +143,31 @@ where
             }
         };
 
-        // 3. Fast-path cache check (no await).
         {
-            let guard = self.table_cache.read().unwrap();
+            let guard = self.table_cache.read();
             if let Some(flags) = guard.get(&ch_table_name) {
                 return Ok((ch_table_name, Arc::clone(flags)));
             }
         }
-        // 4. Compute nullable flags (user columns + 2 CDC columns always non-nullable).
+        // Compute nullable flags (user columns + 2 CDC columns always non-nullable).
         let column_schemas = &table_schema.column_schemas;
-        let mut nullable_flags_vec: Vec<bool> = column_schemas.iter().map(|c| c.nullable).collect();
+        let mut nullable_flags_vec: Vec<bool> =
+            column_schemas.iter().map(|c| c.nullable).collect();
         nullable_flags_vec.push(false); // cdc_operation
         nullable_flags_vec.push(false); // cdc_lsn
         let nullable_flags: Arc<[bool]> = nullable_flags_vec.into();
 
-        // 5. Build and execute DDL (no lock held during this await).
+        // Execute DDL (no lock held during this await).
         let ddl = build_create_table_sql(&ch_table_name, column_schemas);
         let ddl_start = Instant::now();
-        self.client.query(&ddl).execute().await.map_err(|e| {
-            etl_error!(
-                ErrorKind::Unknown,
-                "ClickHouse DDL failed",
-                format!("Failed to create table '{ch_table_name}': {e}")
-            )
-        })?;
+        self.client.execute_ddl(&ddl).await?;
         metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone())
             .record(ddl_start.elapsed().as_secs_f64());
 
-        // 6. Write-lock: insert, using or_insert to handle concurrent first-writer race.
+        // Write-lock: insert, using or_insert to handle concurrent first-writer race.
         let stored_flags = {
-            let mut guard = self.table_cache.write().unwrap();
+            let mut guard = self.table_cache.write();
             Arc::clone(
                 guard
                     .entry(ch_table_name.clone())
@@ -496,17 +180,7 @@ where
 
     async fn truncate_table_inner(&self, table_id: TableId) -> EtlResult<()> {
         let (ch_table_name, _) = self.ensure_table_exists(table_id).await?;
-        self.client
-            .query(&format!("TRUNCATE TABLE IF EXISTS \"{ch_table_name}\""))
-            .execute()
-            .await
-            .map_err(|e| {
-                etl_error!(
-                    ErrorKind::Unknown,
-                    "ClickHouse truncate failed",
-                    format!("Failed to truncate table '{ch_table_name}': {e}")
-                )
-            })
+        self.client.truncate_table(&ch_table_name).await
     }
 
     async fn write_table_rows_inner(
         &self,
         table_id: TableId,
         table_rows: Vec<TableRow>,
     ) -> EtlResult<()> {
         let (ch_table_name, nullable_flags) = self.ensure_table_exists(table_id).await?;
-        let sql = format!("INSERT INTO \"{ch_table_name}\" FORMAT RowBinary");
-        let max_rows = self.inserter_config.max_rows_per_insert;
-        let max_bytes = self.inserter_config.max_bytes_per_insert;
-
-        let mut insert = self
-            .client
-            .insert_formatted_with(sql.clone())
-            .buffered_with_capacity(256 * 1024);
-        let mut rows = 0u64;
-        let mut bytes = 0u64;
-        let mut row_buf = Vec::new();
-        let mut insert_start = Instant::now();
-
-        for table_row in table_rows {
-            row_buf.clear();
-            let mut values: Vec<ClickHouseValue> = table_row
-                .into_values()
-                .into_iter()
-                .map(cell_to_clickhouse_value)
-                .collect();
-            values.push(ClickHouseValue::String("INSERT".to_string()));
-            values.push(ClickHouseValue::Int64(0));
-            rb_encode_row(&values, &nullable_flags, &mut row_buf);
-
-            insert.write_buffered(&row_buf);
-            rows += 1;
-            bytes += row_buf.len() as u64;
-
-            if rows >= max_rows || bytes >= max_bytes {
-                insert.end().await.map_err(|e| {
-                    etl_error!(
-                        ErrorKind::Unknown,
-                        "ClickHouse insert flush failed",
-                        format!("Failed to flush INSERT for '{ch_table_name}': {e}")
-                    )
-                })?;
-                metrics::histogram!(
-                    ETL_CH_INSERT_DURATION_SECONDS,
-                    "table" => ch_table_name.clone(),
-                    "source" => "copy"
-                )
-                .record(insert_start.elapsed().as_secs_f64());
-                insert = self
-                    .client
-                    .insert_formatted_with(sql.clone())
-                    .buffered_with_capacity(256 * 1024);
-                insert_start = Instant::now();
-                rows = 0;
-                bytes = 0;
-            }
-        }
-        insert.end().await.map_err(|e| {
-            etl_error!(
-                ErrorKind::Unknown,
-                "ClickHouse insert flush failed",
-                format!("Failed to flush INSERT for '{ch_table_name}': {e}")
+        let rows: Vec<Vec<ClickHouseValue>> = table_rows
+            .into_iter()
+            .map(|table_row| {
+                let mut values: Vec<ClickHouseValue> = table_row
+                    .into_values()
+                    .into_iter()
+                    .map(cell_to_clickhouse_value)
+                    .collect();
+                values.push(ClickHouseValue::String(String::from("INSERT")));
+                values.push(ClickHouseValue::Int64(0));
+                values
+            })
+            .collect();
+
+        self.client
+            .insert_rows(
+                &ch_table_name,
+                rows,
+                &nullable_flags,
+                self.inserter_config.max_bytes_per_insert,
+                "copy",
             )
-        })?;
-        metrics::histogram!(
-            ETL_CH_INSERT_DURATION_SECONDS,
-            "table" => ch_table_name.clone(),
-            "source" => "copy"
-        )
-        .record(insert_start.elapsed().as_secs_f64());
-        Ok(())
+            .await
     }
 
     /// Processes events in passes driven by an outer loop that runs until the iterator
@@ -605,7 +237,9 @@ where
                 break;
             }
 
-            let event = event_iter.next().unwrap();
+            let event = event_iter
+                .next()
+                .expect("event iterator should not be empty, we peeked at the next event; qed");
 
             match event {
                 Event::Insert(insert) => {
                     table_id_to_rows
@@ -661,83 +295,41 @@ where
         }
 
         // Phase 2: spawn concurrent writers with pre-resolved metadata.
-        // Only the ClickHouse Client (cheaply cloneable, 'static) goes into spawn.
+        // Only the ClickHouseClient (cheaply cloneable, 'static) goes into spawn.
         let mut join_set: JoinSet<EtlResult<()>> = JoinSet::new();
 
         for (table_id, row_data) in table_id_to_rows {
-            let (ch_table_name, nullable_flags) = table_meta.remove(&table_id).unwrap();
-            let client = self.client.clone();
-            let max_rows = self.inserter_config.max_rows_per_insert;
-            let max_bytes = self.inserter_config.max_bytes_per_insert;
-
-            join_set.spawn(async move {
-                let sql = format!("INSERT INTO \"{ch_table_name}\" FORMAT RowBinary");
-                let mut insert = client
-                    .insert_formatted_with(sql.clone())
-                    .buffered_with_capacity(256 * 1024);
-                let mut rows = 0u64;
-                let mut bytes = 0u64;
-                let mut row_buf = Vec::new();
-                let mut insert_start = Instant::now();
-
-                for PendingRow {
-                    operation,
-                    lsn,
-                    cells,
-                } in row_data
-                {
-                    row_buf.clear();
-                    let mut values: Vec<ClickHouseValue> =
-                        cells.into_iter().map(cell_to_clickhouse_value).collect();
-                    values.push(ClickHouseValue::String(operation.as_str().to_string()));
-                    values.push(ClickHouseValue::Int64(i64::try_from(u64::from(lsn)).inspect_err(|error| {
-                        tracing::error!(?error, "cannot convert u64 value to i64 for clickhouse destination, fallback to max i64");
-                    }).unwrap_or(i64::MAX)));
-                    rb_encode_row(&values, &nullable_flags, &mut row_buf);
-
-                    insert.write_buffered(&row_buf);
-                    rows += 1;
-                    bytes += row_buf.len() as u64;
-
-                    if rows >= max_rows || bytes >= max_bytes {
-                        insert.end().await.map_err(|e| {
-                            etl_error!(
-                                ErrorKind::Unknown,
-                                "ClickHouse insert flush failed",
-                                format!(
-                                    "Failed to flush INSERT for '{ch_table_name}': {e}"
-                                )
-                            )
-                        })?;
-                        metrics::histogram!(
-                            ETL_CH_INSERT_DURATION_SECONDS,
-                            "table" => ch_table_name.clone(),
-                            "source" => "streaming"
-                        )
-                        .record(insert_start.elapsed().as_secs_f64());
-                        insert = client
-                            .insert_formatted_with(sql.clone())
-                            .buffered_with_capacity(256 * 1024);
-                        insert_start = Instant::now();
-                        rows = 0;
-                        bytes = 0;
-                    }
-                }
-
-                insert.end().await.map_err(|e| {
+            let (ch_table_name, nullable_flags) =
+                table_meta.remove(&table_id).ok_or_else(|| {
                     etl_error!(
                         ErrorKind::Unknown,
-                        "ClickHouse insert flush failed",
-                        format!("Failed to flush INSERT for '{ch_table_name}': {e}")
+                        "ClickHouse insert failed",
+                        format!("Failed to remove metadata for table ID {table_id}")
                     )
                 })?;
-                metrics::histogram!(
-                    ETL_CH_INSERT_DURATION_SECONDS,
-                    "table" => ch_table_name.clone(),
-                    "source" => "streaming"
-                )
-                .record(insert_start.elapsed().as_secs_f64());
-
-                Ok(())
+            let client = self.client.clone();
+            let max_bytes = self.inserter_config.max_bytes_per_insert;
+
+            join_set.spawn(async move {
+                let rows: Vec<Vec<ClickHouseValue>> = row_data
+                    .into_iter()
+                    .map(|PendingRow { operation, lsn, cells }| {
+                        let mut values: Vec<ClickHouseValue> =
+                            cells.into_iter().map(cell_to_clickhouse_value).collect();
+                        values.push(ClickHouseValue::String(operation.to_string()));
+                        values.push(ClickHouseValue::Int64(
+                            i64::try_from(u64::from(lsn))
+                                .inspect_err(|error| {
+                                    tracing::error!(?error, "cannot convert u64 LSN to i64, falling back to i64::MAX");
+                                })
+                                .unwrap_or(i64::MAX),
+                        ));
+                        values
+                    })
+                    .collect();
+
+                client
+                    .insert_rows(&ch_table_name, rows, &nullable_flags, max_bytes, "streaming")
+                    .await
             });
         }
 
@@ -757,9 +349,12 @@ where
             }
         }
 
-            for table_id in truncate_table_ids {
-                self.truncate_table_inner(table_id).await?;
-            }
+            futures::future::try_join_all(
+                truncate_table_ids
+                    .into_iter()
+                    .map(|table_id| self.truncate_table_inner(table_id)),
+            )
+            .await?;
         }
 
         Ok(())
@@ -791,184 +386,10 @@ where
     }
 }
 
-// ── Client builder ────────────────────────────────────────────────────────────
-
-fn build_client(url: String, user: String, password: Option<String>, database: String) -> Client {
-    let mut client = Client::default()
-        .with_url(url)
-        .with_user(user)
-        .with_database(database);
-
-    if let Some(pw) = password {
-        client = client.with_password(pw);
-    }
-
-    client
-}
-
 // ── Unit tests ────────────────────────────────────────────────────────────────
 
 #[cfg(test)]
 mod tests {
-    use super::*;
-    use chrono::NaiveDate;
-    use uuid::Uuid;
-
-    #[test]
-    fn test_cell_to_clickhouse_value_null() {
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::Null),
-            ClickHouseValue::Null
-        ));
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_bool() {
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::Bool(true)),
-            ClickHouseValue::Bool(true)
-        ));
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_i32() {
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::I32(42)),
-            ClickHouseValue::Int32(42)
-        ));
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_string() {
-        if let ClickHouseValue::String(s) =
-            cell_to_clickhouse_value(Cell::String("hello".to_string()))
-        {
-            assert_eq!(s, "hello");
-        } else {
-            panic!("expected String variant");
-        }
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_date() {
-        // 1970-01-01 = day 0
-        let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::Date(epoch)),
-            ClickHouseValue::Date(0)
-        ));
-
-        // 1970-01-02 = day 1
-        let day1 = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap();
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::Date(day1)),
-            ClickHouseValue::Date(1)
-        ));
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_timestamp() {
-        let epoch = chrono::DateTime::from_timestamp(0, 0).unwrap().naive_utc();
-        assert!(matches!(
-            cell_to_clickhouse_value(Cell::Timestamp(epoch)),
-            ClickHouseValue::DateTime64(0)
-        ));
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_uuid() {
-        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
-        let expected_bytes = *u.as_bytes();
-        if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) {
-            assert_eq!(bytes, expected_bytes);
-        } else {
-            panic!("expected Uuid variant");
-        }
-    }
-
-    #[test]
-    fn test_cell_to_clickhouse_value_bytes_hex() {
-        let bytes = vec![0xde, 0xad, 0xbe, 0xef];
-        if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)) {
-            assert_eq!(s, "deadbeef");
-        } else {
-            panic!("expected String variant");
-        }
-    }
-
-    #[test]
-    fn test_rb_encode_value_scalars() {
-        let mut buf = Vec::new();
-
-        buf.clear();
-        rb_encode_value(&ClickHouseValue::Bool(true), &mut buf);
-        assert_eq!(buf, [1u8]);
-
-        buf.clear();
-        rb_encode_value(&ClickHouseValue::Int32(-1), &mut buf);
-        assert_eq!(buf, (-1i32).to_le_bytes());
-
-        buf.clear();
-        rb_encode_value(&ClickHouseValue::String("hi".to_string()), &mut buf);
-        assert_eq!(buf, [2, b'h', b'i']); // varint(2) + bytes
-
-        buf.clear();
-        rb_encode_value(&ClickHouseValue::Date(1), &mut buf);
-        assert_eq!(buf, 1u16.to_le_bytes());
-    }
-
-    #[test]
-    fn test_rb_encode_uuid_wire_format() {
-        // UUID 550e8400-e29b-41d4-a716-446655440000
-        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
-        let val = ClickHouseValue::Uuid(*u.as_bytes());
-        let mut buf = Vec::new();
-        rb_encode_value(&val, &mut buf);
-
-        assert_eq!(buf.len(), 16);
-        // high u64 from bytes 0-7, written LE
-        let bytes = u.as_bytes();
-        let high = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
-        let low = u64::from_be_bytes(bytes[8..16].try_into().unwrap());
-        let mut expected = high.to_le_bytes().to_vec();
-        expected.extend_from_slice(&low.to_le_bytes());
-        assert_eq!(buf, expected);
-    }
-
-    #[test]
-    fn test_rb_encode_nullable() {
-        let mut buf = Vec::new();
-
-        // Null → just the 1 byte
-        rb_encode_nullable(&ClickHouseValue::Null, &mut buf);
-        assert_eq!(buf, [1u8]);
-
-        buf.clear();
-        rb_encode_nullable(&ClickHouseValue::Int32(42), &mut buf);
-        let mut expected = vec![0u8]; // not-null indicator
-        expected.extend_from_slice(&42i32.to_le_bytes());
-        assert_eq!(buf, expected);
-    }
-
-    #[test]
-    fn test_rb_varint() {
-        let mut buf = Vec::new();
-        rb_varint(0, &mut buf);
-        assert_eq!(buf, [0x00]);
-
-        buf.clear();
-        rb_varint(127, &mut buf);
-        assert_eq!(buf, [0x7f]);
-
-        buf.clear();
-        rb_varint(128, &mut buf);
-        assert_eq!(buf, [0x80, 0x01]);
-
-        buf.clear();
-        rb_varint(300, &mut buf);
-        assert_eq!(buf, [0xac, 0x02]); // 300 = 0b100101100 → [0x2c | 0x80, 0x02]
-    }
-
     #[test]
     fn test_nullable_flags_includes_cdc() {
         let mut all_flags: Vec<bool> = vec![true, false];
@@ -981,12 +402,4 @@ mod tests {
         assert!(!all_flags[2]);
         assert!(!all_flags[3]);
     }
-
-    #[test]
-    fn test_bytes_to_hex() {
-        assert_eq!(bytes_to_hex(&[]), "");
-        assert_eq!(bytes_to_hex(&[0x00]), "00");
-        assert_eq!(bytes_to_hex(&[0xff]), "ff");
-        assert_eq!(bytes_to_hex(&[0xde, 0xad, 0xbe, 0xef]), "deadbeef");
-    }
 }
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
new file mode 100644
index 000000000..afcf167cd
--- /dev/null
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -0,0 +1,454 @@
+use std::fmt;
+
+use chrono::NaiveDate;
+use etl::error::{ErrorKind, EtlResult};
+use etl::etl_error;
+use etl::types::{ArrayCell, Cell};
+
+// ── RowBinary encoding ────────────────────────────────────────────────────────
+//
+// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes directly
+// via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT RowBinary")`.
+//
+// This avoids two fatal issues with the `Inserter` path:
+//
+// 1. `Insert::new` always calls `join_column_names::<T>().expect(…)`, which panics
+//    when `COLUMN_NAMES = &[]` regardless of whether validation is enabled.
+//
+// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut` at
+//    every `serialize_some` call, telescoping the type to `&mut &mut … BytesMut` for
+//    nullable array elements and overflowing the compiler's recursion limit.
+//
+// Direct binary encoding has neither problem: it is a simple recursive function that
+// writes bytes to a `Vec<u8>` with no generics and no type-level recursion.
+
+// ── ClickHouseValue ───────────────────────────────────────────────────────────
+
+/// Owned ClickHouse-compatible value, moved (not cloned) from a [`Cell`].
+pub(crate) enum ClickHouseValue {
+    Null,
+    Bool(bool),
+    Int16(i16),
+    Int32(i32),
+    Int64(i64),
+    UInt32(u32),
+    Float32(f32),
+    Float64(f64),
+    /// TEXT, NUMERIC (string), TIME (string), JSON, BYTEA (hex-encoded)
+    String(String),
+    /// Days since Unix epoch (ClickHouse `Date` on wire = UInt16 LE)
+    Date(u16),
+    /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire = Int64 LE)
+    DateTime64(i64),
+    /// UUID in standard 16-byte big-endian order (converted to ClickHouse wire format on encode)
+    Uuid([u8; 16]),
+    Array(Vec<ClickHouseValue>),
+}
+
+// ── Cell → ClickHouseValue conversion ────────────────────────────────────────
+
+/// Converts a [`Cell`] to a [`ClickHouseValue`], consuming it (no clone).
+pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue {
+    match cell {
+        Cell::Null => ClickHouseValue::Null,
+        Cell::Bool(b) => ClickHouseValue::Bool(b),
+        Cell::I16(v) => ClickHouseValue::Int16(v),
+        Cell::I32(v) => ClickHouseValue::Int32(v),
+        Cell::I64(v) => ClickHouseValue::Int64(v),
+        Cell::U32(v) => ClickHouseValue::UInt32(v),
+        Cell::F32(v) => ClickHouseValue::Float32(v),
+        Cell::F64(v) => ClickHouseValue::Float64(v),
+        Cell::Numeric(n) => ClickHouseValue::String(n.to_string()),
+        Cell::Date(d) => {
+            let days = d
+                .signed_duration_since(unix_epoch())
+                .num_days()
+                .clamp(0, i64::from(u16::MAX)) as u16;
+            ClickHouseValue::Date(days)
+        }
+        Cell::Time(t) => ClickHouseValue::String(t.to_string()),
+        Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()),
+        Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()),
+        Cell::Uuid(u) => ClickHouseValue::Uuid(*u.as_bytes()),
+        Cell::Json(j) => ClickHouseValue::String(j.to_string()),
+        Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(b)),
+        Cell::String(s) => ClickHouseValue::String(s),
+        Cell::Array(array_cell) => {
+            ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell))
+        }
+    }
+}
+
+fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> Vec<ClickHouseValue> {
+    match array_cell {
+        ArrayCell::Bool(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Bool))
+            .collect(),
+        ArrayCell::String(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::String))
+            .collect(),
+        ArrayCell::I16(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int16))
+            .collect(),
+        ArrayCell::I32(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int32))
+            .collect(),
+        ArrayCell::I64(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int64))
+            .collect(),
+        ArrayCell::U32(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::UInt32))
+            .collect(),
+        ArrayCell::F32(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float32))
+            .collect(),
+        ArrayCell::F64(v) => v
+            .into_iter()
+            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float64))
+            .collect(),
+        ArrayCell::Numeric(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |n| {
+                    ClickHouseValue::String(n.to_string())
+                })
+            })
+            .collect(),
+        ArrayCell::Date(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |d| {
+                    let days = d
+                        .signed_duration_since(unix_epoch())
+                        .num_days()
+                        .clamp(0, i64::from(u16::MAX)) as u16;
+                    ClickHouseValue::Date(days)
+                })
+            })
+            .collect(),
+        ArrayCell::Time(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |t| {
+                    ClickHouseValue::String(t.to_string())
+                })
+            })
+            .collect(),
+        ArrayCell::Timestamp(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |dt| {
+                    ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros())
+                })
+            })
+            .collect(),
+        ArrayCell::TimestampTz(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |dt| {
+                    ClickHouseValue::DateTime64(dt.timestamp_micros())
+                })
+            })
+            .collect(),
+        ArrayCell::Uuid(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |u| {
+                    ClickHouseValue::Uuid(*u.as_bytes())
+                })
+            })
+            .collect(),
+        ArrayCell::Json(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |j| {
+                    ClickHouseValue::String(j.to_string())
+                })
+            })
+            .collect(),
+        ArrayCell::Bytes(v) => v
+            .into_iter()
+            .map(|o| {
+                o.map_or(ClickHouseValue::Null, |b| {
+                    ClickHouseValue::String(bytes_to_hex(b))
+                })
+            })
+            .collect(),
+    }
+}
+
+fn unix_epoch() -> NaiveDate {
+    NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid date")
+}
+
+fn bytes_to_hex(bytes: Vec<u8>) -> String {
+    let mut s = String::with_capacity(bytes.len() * 2);
+    for b in bytes {
+        use fmt::Write;
+        let _ = write!(s, "{b:02x}");
+    }
+    s
+}
+
+// ── RowBinary wire encoding ───────────────────────────────────────────────────
+
+/// Encodes a variable-length integer (LEB128) used by ClickHouse for string/array lengths.
+pub(crate) fn rb_varint(mut v: usize, buf: &mut Vec<u8>) {
+    loop {
+        let byte = (v & 0x7f) as u8;
+        v >>= 7;
+        if v == 0 {
+            buf.push(byte);
+            return;
+        }
+        buf.push(byte | 0x80);
+    }
+}
+
+/// Encodes a value for a `Nullable(T)` column (1-byte null indicator + value if present).
+pub(crate) fn rb_encode_nullable(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlResult<()> {
+    match val {
+        ClickHouseValue::Null => buf.push(1),
+        v => {
+            buf.push(0);
+            rb_encode_value(v, buf)?;
+        }
+    }
+    Ok(())
+}
+
+/// Encodes a value for a non-nullable column (no null indicator byte).
+pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlResult<()> {
+    match val {
+        ClickHouseValue::Null => {
+            // A non-nullable column unexpectedly received NULL (data quality issue from
+            // Postgres). Write a zero-length string as the least-harmful fallback.
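+            // Caveat: this byte sequence only lines up with String columns on the wire;
+            // a fixed-width column (e.g. Int32) expects its full width, so such a row
+            // would fail to parse rather than silently store a wrong value.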
+            buf.push(0); // varint 0 = empty string
+        }
+        ClickHouseValue::Bool(b) => buf.push(b as u8),
+        ClickHouseValue::Int16(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::Int32(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::Int64(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::UInt32(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::Float32(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::Float64(v) => buf.extend_from_slice(&v.to_le_bytes()),
+        ClickHouseValue::String(s) => {
+            rb_varint(s.len(), buf);
+            buf.extend_from_slice(s.as_bytes());
+        }
+        ClickHouseValue::Date(days) => buf.extend_from_slice(&days.to_le_bytes()),
+        ClickHouseValue::DateTime64(micros) => buf.extend_from_slice(&micros.to_le_bytes()),
+        ClickHouseValue::Uuid(bytes) => {
+            // ClickHouse RowBinary UUID = two little-endian u64 (high bits then low bits).
+            // Our bytes are in standard UUID big-endian order, so we split into two u64
+            // and write each in little-endian.
+            let high = u64::from_be_bytes(bytes[0..8].try_into().map_err(|e: std::array::TryFromSliceError| {
+                etl_error!(ErrorKind::ConversionError, "UUID high-half conversion failed", e)
+            })?);
+            let low = u64::from_be_bytes(bytes[8..16].try_into().map_err(|e: std::array::TryFromSliceError| {
+                etl_error!(ErrorKind::ConversionError, "UUID low-half conversion failed", e)
+            })?);
+            buf.extend_from_slice(&high.to_le_bytes());
+            buf.extend_from_slice(&low.to_le_bytes());
+        }
+        // Array elements are always Nullable in ClickHouse: Array(Nullable(T)).
+        ClickHouseValue::Array(items) => {
+            rb_varint(items.len(), buf);
+            for item in items {
+                rb_encode_nullable(item, buf)?;
+            }
+        }
+    }
+    Ok(())
+}
+
+/// Encodes a complete row into `buf`, selecting nullable vs non-nullable encoding per column.
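+///
+/// A minimal sketch of the produced layout (hypothetical two-column row,
+/// `Nullable(Int32)` followed by a non-nullable `String`):
+///
+/// ```ignore
+/// let mut buf = Vec::new();
+/// let row = vec![ClickHouseValue::Int32(7), ClickHouseValue::String("a".into())];
+/// rb_encode_row(row, &[true, false], &mut buf)?;
+/// // null indicator, i32 LE, varint length, one string byte
+/// assert_eq!(buf, [0x00, 0x07, 0x00, 0x00, 0x00, 0x01, b'a']);
+/// ```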
+pub(crate) fn rb_encode_row(
+    values: Vec<ClickHouseValue>,
+    nullable_flags: &[bool],
+    buf: &mut Vec<u8>,
+) -> EtlResult<()> {
+    for (val, &is_nullable) in values.into_iter().zip(nullable_flags.iter()) {
+        if is_nullable {
+            rb_encode_nullable(val, buf)?;
+        } else {
+            rb_encode_value(val, buf)?;
+        }
+    }
+    Ok(())
+}
+
+// ── Unit tests ────────────────────────────────────────────────────────────────
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use chrono::NaiveDate;
+    use etl::types::Cell;
+    use uuid::Uuid;
+
+    #[test]
+    fn test_cell_to_clickhouse_value_null() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Null),
+            ClickHouseValue::Null
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_bool() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Bool(true)),
+            ClickHouseValue::Bool(true)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_i32() {
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::I32(42)),
+            ClickHouseValue::Int32(42)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_string() {
+        if let ClickHouseValue::String(s) =
+            cell_to_clickhouse_value(Cell::String("hello".to_string()))
+        {
+            assert_eq!(s, "hello");
+        } else {
+            panic!("expected String variant");
+        }
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_date() {
+        let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(epoch)),
+            ClickHouseValue::Date(0)
+        ));
+
+        let day1 = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(day1)),
+            ClickHouseValue::Date(1)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_timestamp() {
+        let epoch = chrono::DateTime::from_timestamp(0, 0).unwrap().naive_utc();
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Timestamp(epoch)),
+            ClickHouseValue::DateTime64(0)
+        ));
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_uuid() {
+        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
+        let expected_bytes = *u.as_bytes();
+        if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) {
+            assert_eq!(bytes, expected_bytes);
+        } else {
+            panic!("expected Uuid variant");
+        }
+    }
+
+    #[test]
+    fn test_cell_to_clickhouse_value_bytes_hex() {
+        let bytes = vec![0xde, 0xad, 0xbe, 0xef];
+        if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)) {
+            assert_eq!(s, "deadbeef");
+        } else {
+            panic!("expected String variant");
+        }
+    }
+
+    #[test]
+    fn test_rb_encode_value_scalars() {
+        let mut buf = Vec::new();
+
+        buf.clear();
+        rb_encode_value(ClickHouseValue::Bool(true), &mut buf).unwrap();
+        assert_eq!(buf, [1u8]);
+
+        buf.clear();
+        rb_encode_value(ClickHouseValue::Int32(-1), &mut buf).unwrap();
+        assert_eq!(buf, (-1i32).to_le_bytes());
+
+        buf.clear();
+        rb_encode_value(ClickHouseValue::String("hi".to_string()), &mut buf).unwrap();
+        assert_eq!(buf, [2, b'h', b'i']); // varint(2) + bytes
+
+        buf.clear();
+        rb_encode_value(ClickHouseValue::Date(1), &mut buf).unwrap();
+        assert_eq!(buf, 1u16.to_le_bytes());
+    }
+
+    #[test]
+    fn test_rb_encode_uuid_wire_format() {
+        let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap();
+        let val = ClickHouseValue::Uuid(*u.as_bytes());
+        let mut buf = Vec::new();
+        rb_encode_value(val, &mut buf).unwrap();
+
+        assert_eq!(buf.len(), 16);
+        let bytes = u.as_bytes();
+        let high = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
+        let low = u64::from_be_bytes(bytes[8..16].try_into().unwrap());
+        let mut expected = high.to_le_bytes().to_vec();
expected.extend_from_slice(&low.to_le_bytes()); + assert_eq!(buf, expected); + } + + #[test] + fn test_rb_encode_nullable() { + let mut buf = Vec::new(); + + rb_encode_nullable(ClickHouseValue::Null, &mut buf).unwrap(); + assert_eq!(buf, [1u8]); + + buf.clear(); + rb_encode_nullable(ClickHouseValue::Int32(42), &mut buf).unwrap(); + let mut expected = vec![0u8]; + expected.extend_from_slice(&42i32.to_le_bytes()); + assert_eq!(buf, expected); + } + + #[test] + fn test_rb_varint() { + let mut buf = Vec::new(); + rb_varint(0, &mut buf); + assert_eq!(buf, [0x00]); + + buf.clear(); + rb_varint(127, &mut buf); + assert_eq!(buf, [0x7f]); + + buf.clear(); + rb_varint(128, &mut buf); + assert_eq!(buf, [0x80, 0x01]); + + buf.clear(); + rb_varint(300, &mut buf); + assert_eq!(buf, [0xac, 0x02]); + } + + #[test] + fn test_bytes_to_hex() { + assert_eq!(bytes_to_hex([].to_vec()), ""); + assert_eq!(bytes_to_hex([0x00].to_vec()), "00"); + assert_eq!(bytes_to_hex([0xff].to_vec()), "ff"); + assert_eq!(bytes_to_hex([0xde, 0xad, 0xbe, 0xef].to_vec()), "deadbeef"); + } +} diff --git a/etl-destinations/src/clickhouse/mod.rs b/etl-destinations/src/clickhouse/mod.rs index 584d0ab26..692519872 100644 --- a/etl-destinations/src/clickhouse/mod.rs +++ b/etl-destinations/src/clickhouse/mod.rs @@ -1,5 +1,8 @@ +pub mod client; mod core; +mod encoding; mod metrics; mod schema; +pub use client::ClickHouseClient; pub use core::{ClickHouseDestination, ClickHouseInserterConfig}; diff --git a/etl-replicator/Cargo.toml b/etl-replicator/Cargo.toml index 76262e2de..834669f06 100644 --- a/etl-replicator/Cargo.toml +++ b/etl-replicator/Cargo.toml @@ -14,7 +14,7 @@ egress = ["etl/egress", "etl-destinations/egress"] [dependencies] etl = { workspace = true } etl-config = { workspace = true, features = ["supabase"] } -etl-destinations = { workspace = true, features = ["bigquery", "iceberg", "ducklake"] } +etl-destinations = { workspace = true, features = ["bigquery", "clickhouse", "ducklake", "iceberg"] } etl-telemetry = { workspace = true } configcat = { workspace = true } @@ -25,7 +25,8 @@ secrecy = { workspace = true } sentry = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } -sqlx = { workspace = true, features = ["runtime-tokio-rustls", "postgres"] } +sqlx = { workspace = true, features = ["runtime-tokio-rustls", "postgres", "migrate"] } +sysinfo = { workspace = true, features = ["system"] } tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } tracing = { workspace = true, default-features = true } diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 0caa98e9f..3bc55c443 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -4,7 +4,10 @@ use crate::error::{ReplicatorError, ReplicatorResult}; use crate::error_notification::ErrorNotificationClient; use crate::error_reporting::ErrorReportingStateStore; use crate::metrics; +use crate::migrations::migrate_state_store; use crate::sentry::set_destination_tag; +use etl::concurrency::memory_monitor::MemorySnapshot; +use etl::config::MemoryBackpressureConfig; use etl::pipeline::Pipeline; use etl::store::both::postgres::PostgresStore; use etl::store::cleanup::CleanupStore; @@ -12,8 +15,11 @@ use etl::store::schema::SchemaStore; use etl::store::state::StateStore; use etl::types::PipelineId; use etl::{config::IcebergConfig, destination::Destination}; -use etl_config::shared::{DestinationConfig, PgConnectionConfig, ReplicatorConfig}; +use etl_config::shared::{ + 
    BatchConfig, DestinationConfig, PgConnectionConfig, PipelineConfig, ReplicatorConfig,
+};
 use etl_config::{Environment, parse_ducklake_url};
+use etl_destinations::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
 use etl_destinations::iceberg::{
     DestinationNamespace, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_SECRET_ACCESS_KEY,
 };
@@ -23,6 +29,7 @@ use etl_destinations::{
     iceberg::{IcebergClient, IcebergDestination},
 };
 use secrecy::ExposeSecret;
+use sysinfo::{MemoryRefreshKind, RefreshKind, System};
 use tokio::signal::unix::{SignalKind, signal};
-use tracing::{error, info, warn};
+use tracing::{debug, error, info, warn};
@@ -179,6 +186,42 @@ pub async fn start_replicator_with_config(
             )
             .await?;
 
+            let pipeline = Pipeline::new(replicator_config.pipeline, state_store, destination);
+            start_pipeline(pipeline).await?;
+        }
+        DestinationConfig::ClickHouse {
+            url,
+            user,
+            password,
+            database,
+        } => {
+            let mut sys = System::new_with_specifics(
+                RefreshKind::nothing().with_memory(MemoryRefreshKind::everything()),
+            );
+            let total_memory_bytes = MemorySnapshot::from_system(&mut sys).total();
+            let max_bytes_per_insert = (total_memory_bytes as f64
+                * replicator_config
+                    .pipeline
+                    .memory_backpressure
+                    .as_ref()
+                    .map(|config| config.activate_threshold)
+                    .unwrap_or(MemoryBackpressureConfig::default().activate_threshold)
+                    as f64
+                / replicator_config.pipeline.max_table_sync_workers as f64)
+                as u64;
+
+            let inserter_config = ClickHouseInserterConfig {
+                max_bytes_per_insert,
+            };
+            let destination = ClickHouseDestination::new(
+                url,
+                user,
+                password.as_ref().map(|p| p.expose_secret().to_string()),
+                database,
+                inserter_config,
+                state_store.clone(),
+            )?;
+
         let pipeline = Pipeline::new(replicator_config.pipeline, state_store, destination);
         start_pipeline(pipeline).await?;
     }
@@ -206,6 +249,107 @@ pub fn create_props(
     props
 }
 
+fn log_config(config: &ReplicatorConfig) {
+    log_destination_config(&config.destination);
+    log_pipeline_config(&config.pipeline);
+}
+
+fn log_destination_config(config: &DestinationConfig) {
+    match config {
+        DestinationConfig::BigQuery {
+            project_id,
+            dataset_id,
+            service_account_key: _,
+            max_staleness_mins,
+            connection_pool_size,
+        } => {
+            debug!(
+                project_id,
+                dataset_id,
+                max_staleness_mins,
+                connection_pool_size,
+                "using bigquery destination config"
+            )
+        }
+        DestinationConfig::Iceberg {
+            config:
+                IcebergConfig::Supabase {
+                    namespace,
+                    project_ref,
+                    catalog_token: _,
+                    warehouse_name,
+                    s3_access_key_id: _,
+                    s3_secret_access_key: _,
+                    s3_region,
+                },
+        } => {
+            debug!(
+                namespace,
+                project_ref, warehouse_name, s3_region, "using supabase iceberg destination config"
+            )
+        }
+        DestinationConfig::Iceberg {
+            config:
+                IcebergConfig::Rest {
+                    catalog_uri,
+                    warehouse_name,
+                    namespace,
+                    s3_access_key_id: _,
+                    s3_secret_access_key: _,
+                    s3_endpoint,
+                },
+        } => {
+            debug!(
+                catalog_uri,
+                warehouse_name,
+                namespace,
+                s3_endpoint,
+                "using generic rest iceberg destination config"
+            )
+        }
+        DestinationConfig::ClickHouse {
+            url,
+            user,
+            database,
+            password: _,
+        } => debug!(url, user, database, "using clickhouse destination config"),
+        DestinationConfig::Ducklake { catalog_url, data_path, ..
} => { + debug!(catalog_url, data_path, "using ducklake destination config") + } + } +} + +fn log_pipeline_config(config: &PipelineConfig) { + debug!( + pipeline_id = config.id, + publication_name = config.publication_name, + table_error_retry_delay_ms = config.table_error_retry_delay_ms, + max_table_sync_workers = config.max_table_sync_workers, + "pipeline config" + ); + log_pg_connection_config(&config.pg_connection); + log_batch_config(&config.batch); +} + +fn log_pg_connection_config(config: &PgConnectionConfig) { + debug!( + host = config.host, + port = config.port, + dbname = config.name, + username = config.username, + tls_enabled = config.tls.enabled, + "source postgres connection config", + ); +} + +fn log_batch_config(config: &BatchConfig) { + debug!( + max_fill_ms = config.max_fill_ms, + memory_budget_ratio = config.memory_budget_ratio, + "batch config" + ); +} + /// Initializes the state store. /// /// Creates a [`PostgresStore`] instance for the given pipeline and connection diff --git a/etl/src/concurrency/memory_monitor.rs b/etl/src/concurrency/memory_monitor.rs index f258ca3ab..d72fb3590 100644 --- a/etl/src/concurrency/memory_monitor.rs +++ b/etl/src/concurrency/memory_monitor.rs @@ -22,14 +22,14 @@ use crate::types::PipelineId; /// Represents a memory snapshot. #[derive(Debug, Clone, Copy)] -struct MemorySnapshot { +pub struct MemorySnapshot { used: u64, total: u64, } impl MemorySnapshot { /// Refreshes memory readings from the operating system. - fn from_system(system: &mut sysinfo::System) -> Self { + pub fn from_system(system: &mut sysinfo::System) -> Self { system.refresh_memory_specifics(sysinfo::MemoryRefreshKind::nothing().with_ram()); match system.cgroup_limits() { @@ -53,6 +53,10 @@ impl MemorySnapshot { used_percent.clamp(0.0, 1.0) } + + pub fn total(&self) -> u64 { + self.total + } } /// Internal shared state for memory backpressure. diff --git a/etl/src/lib.rs b/etl/src/lib.rs index 6fe69ef02..25a5f91d4 100644 --- a/etl/src/lib.rs +++ b/etl/src/lib.rs @@ -113,7 +113,7 @@ //! // Create and start the pipeline //! let mut pipeline = Pipeline::new(config, store, destination); //! pipeline.start().await?; -//! +//! //! // Pipeline will run until stopped //! pipeline.wait().await?; //! @@ -123,10 +123,10 @@ //! //! # Feature Flags //! -//! - `test-utils`: Enable testing utilities and mock implementations +//! - `test-utils`: Enable testing utilities and mock implementations //! - `failpoints`: Enable fault injection for testing error scenarios -mod concurrency; +pub mod concurrency; pub mod config; mod conversions; pub mod destination; From feb8d65c30b63f880fc8b0fa573681bbf129783a Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Thu, 26 Feb 2026 15:50:45 +0100 Subject: [PATCH 03/86] Add example binary Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- etl-examples/src/bin/clickhouse.rs | 257 +++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 etl-examples/src/bin/clickhouse.rs diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs new file mode 100644 index 000000000..94d14e3db --- /dev/null +++ b/etl-examples/src/bin/clickhouse.rs @@ -0,0 +1,257 @@ +/* + +ClickHouse Example + +This example demonstrates how to use the pipeline to stream +data from Postgres to ClickHouse using change data capture (CDC). + +Each Postgres table is replicated as an append-only MergeTree table. 
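+
+As a rough illustration only (the exact DDL is generated by the destination from
+the Postgres schema, and the ORDER BY key shown here is just a placeholder), a
+replicated table looks something like:
+
+    CREATE TABLE "public_orders" (
+        -- user columns, nullable ones mapped to Nullable(T)
+        cdc_operation String,
+        cdc_lsn Int64
+    ) ENGINE = MergeTree ORDER BY tuple();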
+Two CDC metadata columns are appended to every row:
+  - `cdc_operation`: `INSERT`, `UPDATE`, or `DELETE`
+  - `cdc_lsn`: the Postgres LSN at the time of the change
+
+Table names are derived from the Postgres schema and table name: underscores in
+each identifier are escaped by doubling them, and the two parts are then joined
+with a single underscore (e.g. `public.orders` → `public_orders`).
+
+Prerequisites:
+1. Postgres server with logical replication enabled (wal_level = logical)
+2. A publication created in Postgres (CREATE PUBLICATION my_pub FOR ALL TABLES;)
+3. A running ClickHouse instance accessible over HTTP(S)
+
+Usage:
+  cargo run -p etl-examples --bin clickhouse -- \
+    --db-host localhost \
+    --db-port 5432 \
+    --db-name postgres \
+    --db-username postgres \
+    --db-password password \
+    --ch-url http://localhost:8123 \
+    --ch-user default \
+    --ch-database default \
+    --publication my_pub
+
+For HTTPS connections, provide an `https://` URL — TLS is handled automatically
+using webpki root certificates. Use `--ch-password` if your ClickHouse instance
+requires authentication.
+
+*/
+
+use clap::{Args, Parser};
+use etl::concurrency::memory_monitor::MemorySnapshot;
+use etl::config::{
+    BatchConfig, InvalidatedSlotBehavior, MemoryBackpressureConfig, PgConnectionConfig,
+    PipelineConfig, TableSyncCopyConfig, TcpKeepaliveConfig, TlsConfig,
+};
+use etl::pipeline::Pipeline;
+use etl::store::both::memory::MemoryStore;
+use etl_destinations::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
+use std::error::Error;
+use std::sync::Once;
+use tokio::signal;
+use tracing::{error, info};
+use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+
+/// Ensures crypto provider is only initialized once.
+static INIT_CRYPTO: Once = Once::new();
+
+/// Installs the default cryptographic provider for rustls.
+fn install_crypto_provider() {
+    INIT_CRYPTO.call_once(|| {
+        rustls::crypto::aws_lc_rs::default_provider()
+            .install_default()
+            .expect("failed to install default crypto provider");
+    });
+}
+
+/// Main application arguments combining database and ClickHouse configurations.
+#[derive(Debug, Parser)]
+#[command(name = "clickhouse", version, about, arg_required_else_help = true)]
+struct AppArgs {
+    /// Postgres connection parameters
+    #[clap(flatten)]
+    db_args: DbArgs,
+    /// ClickHouse destination parameters
+    #[clap(flatten)]
+    ch_args: ChArgs,
+    /// Postgres publication name (must be created beforehand with CREATE PUBLICATION)
+    #[arg(long)]
+    publication: String,
+}
+
+/// Postgres database connection configuration.
+#[derive(Debug, Args)]
+struct DbArgs {
+    /// Host on which Postgres is running (e.g., localhost or IP address)
+    #[arg(long)]
+    db_host: String,
+    /// Port on which Postgres is running (default: 5432)
+    #[arg(long, default_value = "5432")]
+    db_port: u16,
+    /// Postgres database name to connect to
+    #[arg(long)]
+    db_name: String,
+    /// Postgres database user name (must have REPLICATION privileges)
+    #[arg(long)]
+    db_username: String,
+    /// Postgres database user password (optional if using trust authentication)
+    #[arg(long)]
+    db_password: Option<String>,
+}
+
+/// ClickHouse destination configuration.
+#[derive(Debug, Args)]
+struct ChArgs {
+    /// ClickHouse HTTP(S) endpoint (e.g. http://localhost:8123 or https://host:8443)
+    #[arg(long)]
+    ch_url: String,
+    /// ClickHouse user name
+    #[arg(long)]
+    ch_user: String,
+    /// ClickHouse user password (optional)
+    #[arg(long)]
+    ch_password: Option<String>,
+    /// ClickHouse target database
+    #[arg(long)]
+    ch_database: String,
+    /// Maximum time to wait for a batch to fill in milliseconds (lower values = lower latency, less throughput)
+    #[arg(long, default_value = "5000")]
+    max_batch_fill_duration_ms: u64,
+    /// Maximum number of concurrent table sync workers (higher values = faster initial sync, more resource usage)
+    #[arg(long, default_value = "4")]
+    max_table_sync_workers: u16,
+}
+
+/// Entry point — handles error reporting and process exit.
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn Error>> {
+    if let Err(e) = main_impl().await {
+        error!("{e}");
+        std::process::exit(1);
+    }
+
+    Ok(())
+}
+
+/// Initialize structured logging with configurable log levels via RUST_LOG environment variable.
+fn init_tracing() {
+    tracing_subscriber::registry()
+        .with(
+            tracing_subscriber::EnvFilter::try_from_default_env()
+                .unwrap_or_else(|_| "clickhouse=info".into()),
+        )
+        .with(tracing_subscriber::fmt::layer())
+        .init();
+}
+
+/// Set default log level if RUST_LOG environment variable is not set.
+fn set_log_level() {
+    if std::env::var("RUST_LOG").is_err() {
+        unsafe {
+            std::env::set_var("RUST_LOG", "info");
+        }
+    }
+}
+
+/// Main implementation containing all pipeline setup and execution logic.
+async fn main_impl() -> Result<(), Box<dyn Error>> {
+    set_log_level();
+    init_tracing();
+
+    // Install required crypto provider for TLS (used when ch_url is https://)
+    install_crypto_provider();
+
+    let args = AppArgs::parse();
+
+    // Configure Postgres connection settings
+    // Note: TLS is disabled in this example — enable for production use
+    let pg_connection_config = PgConnectionConfig {
+        host: args.db_args.db_host,
+        port: args.db_args.db_port,
+        name: args.db_args.db_name,
+        username: args.db_args.db_username,
+        password: args.db_args.db_password.map(Into::into),
+        tls: TlsConfig {
+            trusted_root_certs: String::new(),
+            enabled: false, // Set to true and provide certs for production
+        },
+        keepalive: TcpKeepaliveConfig::default(),
+    };
+
+    // Create in-memory store for tracking table replication states and schemas.
+    // In production, you might want to use a persistent store like PostgresStore.
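+    // A persistent setup would look roughly like the line below (call shape is
+    // illustrative only; see etl::store::both::postgres::PostgresStore for the real API):
+    //   let store = PostgresStore::new(pipeline_id, pg_connection_config).await;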
+ let store = MemoryStore::new(); + + let pipeline_config = PipelineConfig { + id: 1, + publication_name: args.publication, + pg_connection: pg_connection_config, + batch: BatchConfig { + max_fill_ms: args.ch_args.max_batch_fill_duration_ms, + memory_budget_ratio: BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO, + }, + table_error_retry_delay_ms: 10000, + table_error_retry_max_attempts: 5, + max_table_sync_workers: args.ch_args.max_table_sync_workers, + memory_refresh_interval_ms: 100, + memory_backpressure: Some(MemoryBackpressureConfig::default()), + table_sync_copy: TableSyncCopyConfig::default(), + invalidated_slot_behavior: InvalidatedSlotBehavior::default(), + max_copy_connections_per_table: PipelineConfig::DEFAULT_MAX_COPY_CONNECTIONS_PER_TABLE, + }; + + // Compute max_bytes_per_insert using the same formula as BatchBudget::ideal_batch_size_bytes: + // total_memory * memory_budget_ratio / max_table_sync_workers + let max_bytes_per_insert = { + let total_memory = MemorySnapshot::from_system(&mut sysinfo::System::new()).total(); + let budget = (total_memory as f64 * f64::from(BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO)) + as u64; + (budget / u64::from(args.ch_args.max_table_sync_workers)).max(1) + }; + + // Initialize the ClickHouse destination. + // Tables are created automatically as append-only MergeTree tables. + let clickhouse_destination = ClickHouseDestination::new( + args.ch_args.ch_url, + args.ch_args.ch_user, + args.ch_args.ch_password, + args.ch_args.ch_database, + ClickHouseInserterConfig { max_bytes_per_insert }, + store.clone(), + )?; + + let mut pipeline = Pipeline::new(pipeline_config, store, clickhouse_destination); + + info!( + "Starting ClickHouse CDC pipeline - connecting to Postgres and initializing replication..." + ); + + // Start the pipeline — this will: + // 1. Connect to Postgres + // 2. Initialize table states based on the publication + // 3. Start apply and table sync workers + // 4. Begin streaming replication data + pipeline.start().await?; + + info!("pipeline started, data replication is now active, press ctrl+c to stop"); + + let shutdown_signal = async { + signal::ctrl_c() + .await + .expect("Failed to install Ctrl+C handler"); + info!("received ctrl+c signal, initiating graceful shutdown"); + }; + + tokio::select! 
{ + result = pipeline.wait() => { + info!("pipeline completed normally (this usually indicates an error condition)"); + result?; + } + _ = shutdown_signal => { + info!("gracefully shutting down pipeline and cleaning up resources"); + } + } + + info!("pipeline stopped, all resources cleaned up"); + + Ok(()) +} From d6bdb141ff226af737923da5c0f5f3968511d54b Mon Sep 17 00:00:00 2001 From: Benjamin <5719034+bnjjj@users.noreply.github.com> Date: Mon, 2 Mar 2026 10:58:03 +0100 Subject: [PATCH 04/86] fixes Signed-off-by: Benjamin <5719034+bnjjj@users.noreply.github.com> --- etl-api/src/k8s/http.rs | 2 +- etl-destinations/Cargo.toml | 3 +- etl-destinations/src/clickhouse/core.rs | 15 +- etl-destinations/src/clickhouse/mod.rs | 2 + etl-destinations/src/clickhouse/test_utils.rs | 197 ++++++++++++ etl-destinations/tests/clickhouse_pipeline.rs | 304 ++++++++++++++++++ etl-destinations/tests/support/clickhouse.rs | 78 +++++ etl-destinations/tests/support/mod.rs | 2 + etl-replicator/configuration/base.yaml | 21 ++ etl-replicator/configuration/dev.yaml | 21 ++ etl-replicator/configuration/prod.yaml | 21 ++ 11 files changed, 661 insertions(+), 5 deletions(-) create mode 100644 etl-destinations/src/clickhouse/test_utils.rs create mode 100644 etl-destinations/tests/clickhouse_pipeline.rs create mode 100644 etl-destinations/tests/support/clickhouse.rs create mode 100644 etl-replicator/configuration/base.yaml create mode 100644 etl-replicator/configuration/dev.yaml create mode 100644 etl-replicator/configuration/prod.yaml diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs index 7f1570c00..8bb4ea565 100644 --- a/etl-api/src/k8s/http.rs +++ b/etl-api/src/k8s/http.rs @@ -1094,7 +1094,7 @@ fn create_bq_secret_env_var_json(bq_secret_name: &str) -> serde_json::Value { fn create_clickhouse_secret_env_var_json(clickouse_secret_name: &str) -> serde_json::Value { json!({ - "name": "APP_DESTINATION__CLICKHOUSE__PASSWORD", + "name": "APP_DESTINATION__CLICK_HOUSE__PASSWORD", "valueFrom": { "secretKeyRef": { "name": clickouse_secret_name, diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml index 511e705f3..86e7a5bac 100644 --- a/etl-destinations/Cargo.toml +++ b/etl-destinations/Cargo.toml @@ -96,10 +96,11 @@ etl-telemetry = { workspace = true } tempfile = { workspace = true } chrono = { workspace = true } +clickhouse = { workspace = true, features = ["inserter", "rustls-tls"] } futures = { workspace = true } rand = { workspace = true, features = ["thread_rng"] } rustls = { workspace = true, features = ["aws-lc-rs", "logging"] } -serde = { workspace = true } +serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } tokio = { workspace = true, features = ["full"] } uuid = { workspace = true, features = ["v4"] } diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 3784234c7..425db8a12 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -11,7 +11,7 @@ use etl::error::{ErrorKind, EtlResult}; use etl::etl_error; use etl::store::schema::SchemaStore; use etl::store::state::StateStore; -use etl::types::{Cell, Event, TableId, TableRow}; +use etl::types::{Cell, Event, TableId, TableRow, is_array_type}; use etl::{destination::Destination, types::PgLsn}; use parking_lot::RwLock; use std::time::Instant; @@ -151,9 +151,18 @@ where } // Compute nullable flags (user columns + 2 CDC columns always non-nullable). 
+    //
+    // Array columns are NEVER marked nullable here, even if the Postgres column is nullable.
+    // The DDL always emits `Array(Nullable(T))` (no outer `Nullable` wrapper), so ClickHouse
+    // does not expect a null-indicator byte before the array. If we mistakenly set
+    // `nullable_flags[i] = true` for an array column, `rb_encode_nullable` would prepend a
+    // spurious `0x00` byte that ClickHouse reads as `varint(0)` (empty array), causing every
+    // subsequent column to be read from the wrong offset and ultimately "Cannot read all data".
         let column_schemas = &table_schema.column_schemas;
-        let mut nullable_flags_vec: Vec<bool> =
-            column_schemas.iter().map(|c| c.nullable).collect();
+        let mut nullable_flags_vec: Vec<bool> = column_schemas
+            .iter()
+            .map(|c| c.nullable && !is_array_type(&c.typ))
+            .collect();
         nullable_flags_vec.push(false); // cdc_operation
         nullable_flags_vec.push(false); // cdc_lsn
         let nullable_flags: Arc<[bool]> = nullable_flags_vec.into();
diff --git a/etl-destinations/src/clickhouse/mod.rs b/etl-destinations/src/clickhouse/mod.rs
index 692519872..c293da0d8 100644
--- a/etl-destinations/src/clickhouse/mod.rs
+++ b/etl-destinations/src/clickhouse/mod.rs
@@ -3,6 +3,8 @@ mod core;
 mod encoding;
 mod metrics;
 mod schema;
+#[cfg(feature = "test-utils")]
+pub mod test_utils;
 
 pub use client::ClickHouseClient;
 pub use core::{ClickHouseDestination, ClickHouseInserterConfig};
diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs
new file mode 100644
index 000000000..6251a0749
--- /dev/null
+++ b/etl-destinations/src/clickhouse/test_utils.rs
@@ -0,0 +1,197 @@
+//! Test utilities for ClickHouse destinations.
+
+use clickhouse::Client;
+use etl::store::schema::SchemaStore;
+use etl::store::state::StateStore;
+use etl::types::PipelineId;
+use tokio::runtime::Handle;
+use uuid::Uuid;
+
+use crate::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
+
+/// ClickHouse HTTP URL (e.g. `http://localhost:8123`).
+pub const CLICKHOUSE_URL_ENV: &str = "TESTS_CLICKHOUSE_URL";
+/// ClickHouse user name (required).
+pub const CLICKHOUSE_USER_ENV: &str = "TESTS_CLICKHOUSE_USER";
+/// ClickHouse password (optional — omit or leave empty for passwordless access).
+pub const CLICKHOUSE_PASSWORD_ENV: &str = "TESTS_CLICKHOUSE_PASSWORD";
+
+/// Returns whether ClickHouse integration tests should be skipped.
+///
+/// Prints a warning and returns `true` when any required env var is missing.
+/// Required: [`CLICKHOUSE_URL_ENV`], [`CLICKHOUSE_USER_ENV`].
+/// Optional: [`CLICKHOUSE_PASSWORD_ENV`].
+pub fn skip_if_missing_clickhouse_env_vars() -> bool {
+    let missing: Vec<&str> = [CLICKHOUSE_URL_ENV, CLICKHOUSE_USER_ENV]
+        .iter()
+        .copied()
+        .filter(|var| std::env::var_os(var).is_none())
+        .collect();
+
+    if missing.is_empty() {
+        return false;
+    }
+
+    eprintln!(
+        "skipping clickhouse integration test: missing {}",
+        missing.join(", ")
+    );
+    true
+}
+
+/// Returns the ClickHouse HTTP URL from the environment.
+///
+/// # Panics
+///
+/// Panics if [`CLICKHOUSE_URL_ENV`] is not set.
+pub fn get_clickhouse_url() -> String {
+    std::env::var(CLICKHOUSE_URL_ENV)
+        .unwrap_or_else(|_| panic!("{CLICKHOUSE_URL_ENV} must be set"))
+}
+
+/// Returns the ClickHouse user name from the environment.
+///
+/// # Panics
+///
+/// Panics if [`CLICKHOUSE_USER_ENV`] is not set.
+pub fn get_clickhouse_user() -> String {
+    std::env::var(CLICKHOUSE_USER_ENV)
+        .unwrap_or_else(|_| panic!("{CLICKHOUSE_USER_ENV} must be set"))
+}
+
+/// Returns the ClickHouse password from the environment, or `None` if unset.
+pub fn get_clickhouse_password() -> Option<String> {
+    std::env::var(CLICKHOUSE_PASSWORD_ENV).ok().filter(|s| !s.is_empty())
+}
+
+/// Generates a unique database name for test isolation.
+pub fn random_database_name() -> String {
+    format!("etl_tests_{}", Uuid::new_v4().simple())
+}
+
+/// ClickHouse connection for testing.
+///
+/// Wraps a [`Client`] and automatically drops the test database on [`Drop`].
+pub struct ClickHouseTestDatabase {
+    /// Root client (no database selected) used for CREATE/DROP DATABASE.
+    root_client: Client,
+    /// Client scoped to the test database for queries.
+    db_client: Client,
+    url: String,
+    user: String,
+    password: Option<String>,
+    database: String,
+}
+
+impl ClickHouseTestDatabase {
+    fn new(url: String, user: String, password: Option<String>, database: String) -> Self {
+        let build_client = |db: Option<&str>| {
+            let mut c = Client::default().with_url(&url).with_user(&user);
+            if let Some(db) = db {
+                c = c.with_database(db);
+            }
+            if let Some(pw) = &password {
+                c = c.with_password(pw);
+            }
+            c
+        };
+
+        Self {
+            root_client: build_client(None),
+            db_client: build_client(Some(&database)),
+            url,
+            user,
+            password,
+            database,
+        }
+    }
+
+    /// Creates the test database in ClickHouse.
+    pub async fn create_database(&self) {
+        self.root_client
+            .query(&format!(
+                "CREATE DATABASE IF NOT EXISTS `{}`",
+                self.database
+            ))
+            .execute()
+            .await
+            .expect("Failed to create test ClickHouse database");
+    }
+
+    /// Drops the test database from ClickHouse.
+    pub async fn drop_database(&self) {
+        self.root_client
+            .query(&format!(
+                "DROP DATABASE IF EXISTS `{}`",
+                self.database
+            ))
+            .execute()
+            .await
+            .expect("Failed to drop test ClickHouse database");
+    }
+
+    /// Builds a [`ClickHouseDestination`] scoped to this test database.
+    pub fn build_destination<S>(
+        &self,
+        _pipeline_id: PipelineId,
+        store: S,
+    ) -> ClickHouseDestination<S>
+    where
+        S: StateStore + SchemaStore + Send + Sync,
+    {
+        ClickHouseDestination::new(
+            &self.url,
+            &self.user,
+            self.password.clone(),
+            &self.database,
+            ClickHouseInserterConfig {
+                // 100 MiB — large enough that tests never hit an intermediate flush.
+                max_bytes_per_insert: 100 * 1024 * 1024,
+            },
+            store,
+        )
+        .expect("Failed to create ClickHouseDestination for test")
+    }
+
+    /// Fetches all rows from a ClickHouse table using the given SQL query.
+    ///
+    /// `T` must be an owned row type (i.e. `Value<'a> = Self`) and implement
+    /// [`serde::de::DeserializeOwned`]. The caller is responsible for writing a
+    /// SELECT whose columns match `T`'s fields in the correct order.
+    pub async fn query<T>(&self, sql: &str) -> Vec<T>
+    where
+        T: for<'a> clickhouse::Row<Value<'a> = T> + serde::de::DeserializeOwned + 'static,
+    {
+        self.db_client
+            .query(sql)
+            .fetch_all::<T>()
+            .await
+            .expect("ClickHouse query failed")
+    }
+}
+
+impl Drop for ClickHouseTestDatabase {
+    fn drop(&mut self) {
+        if let Ok(handle) = Handle::try_current() {
+            handle.block_on(self.drop_database());
+        }
+    }
+}
+
+/// Creates a fresh, isolated ClickHouse database for a single test.
+///
+/// Reads connection parameters from environment variables:
+/// - [`CLICKHOUSE_URL_ENV`] — required
+/// - [`CLICKHOUSE_USER_ENV`] — required
+/// - [`CLICKHOUSE_PASSWORD_ENV`] — optional
+///
+/// The database is dropped automatically when the returned handle is dropped.
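+///
+/// # Example
+///
+/// A minimal usage sketch (`MyRow` is a hypothetical row type deriving
+/// `clickhouse::Row` and `serde::Deserialize`; assumes a Tokio test runtime):
+///
+/// ```no_run
+/// # use etl_destinations::clickhouse::test_utils::setup_clickhouse_database;
+/// # async fn demo() {
+/// let ch_db = setup_clickhouse_database().await;
+/// // let rows: Vec<MyRow> = ch_db.query("SELECT id, value FROM my_table ORDER BY id").await;
+/// // The test database is dropped again when `ch_db` goes out of scope.
+/// # }
+/// ```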
+pub async fn setup_clickhouse_database() -> ClickHouseTestDatabase {
+    let url = get_clickhouse_url();
+    let user = get_clickhouse_user();
+    let password = get_clickhouse_password();
+    let database = random_database_name();
+    let db = ClickHouseTestDatabase::new(url, user, password, database);
+    db.create_database().await;
+    db
+}
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
new file mode 100644
index 000000000..32acebce2
--- /dev/null
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -0,0 +1,304 @@
+#![cfg(all(feature = "clickhouse", feature = "test-utils"))]
+
+use etl::state::table::TableReplicationPhaseType;
+use etl::test_utils::database::{spawn_source_database, test_table_name};
+use etl::test_utils::notifying_store::NotifyingStore;
+use etl::test_utils::pipeline::create_pipeline;
+use etl::types::PipelineId;
+use etl_destinations::clickhouse::test_utils::{
+    setup_clickhouse_database, skip_if_missing_clickhouse_env_vars,
+};
+use etl_telemetry::tracing::init_test_tracing;
+use rand::random;
+use std::sync::Once;
+
+use crate::support::clickhouse::AllTypesRow;
+
+mod support;
+
+/// Ensures the rustls crypto provider is only installed once across all tests.
+static INIT_CRYPTO: Once = Once::new();
+
+fn install_crypto_provider() {
+    INIT_CRYPTO.call_once(|| {
+        rustls::crypto::aws_lc_rs::default_provider()
+            .install_default()
+            .expect("failed to install default crypto provider");
+    });
+}
+
+/// ClickHouse table name for `test.all_types_encoding`.
+///
+/// Derived from `table_name_to_clickhouse_table_name("test", "all_types_encoding")`:
+/// - "test" → "test" (no underscores)
+/// - "all_types_encoding" → "all__types__encoding" (underscores escaped to __)
+const ALL_TYPES_CH_TABLE: &str = "test_all__types__encoding";
+
+/// SELECT query that fetches all verified columns from the ClickHouse table.
+///
+/// `uuid_col` is projected via `toString()` because the ClickHouse UUID RowBinary
+/// wire format does not directly map to a Rust `String`; `toString()` gives us the
+/// canonical `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` string form.
+///
+/// All other columns are read with their native ClickHouse types:
+/// - `Date` → u16 (days since 1970-01-01)
+/// - `DateTime64(6)` → i64 (microseconds since epoch)
+/// - `Array(Nullable(T))` → `Vec<Option<T>>`
+const ALL_TYPES_SELECT: &str = concat!(
+    "SELECT ",
+    "id, smallint_col, integer_col, bigint_col, real_col, double_col, ",
+    "numeric_col, boolean_col, text_col, varchar_col, ",
+    "date_col, timestamp_col, timestamptz_col, time_col, interval_col, ",
+    "jsonb_col, json_col, integer_array_col, text_array_col, ",
+    "bytea_col, inet_col, cidr_col, macaddr_col, ",
+    "toString(uuid_col) AS uuid_col, ",
+    "cdc_operation ",
+    "FROM \"test_all__types__encoding\" ",
+    "ORDER BY id",
+);
+
+/// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip).
+///
+/// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737
+const DATE_2024_01_15_DAYS: u16 = 19737;
+
+/// Microseconds from epoch for `2024-01-15 12:00:00 UTC`.
+///
+/// Python: `int(datetime(2024, 1, 15, 12, 0, 0, tzinfo=timezone.utc).timestamp() * 1_000_000)`
+/// = 1705320000000000
+const TS_2024_01_15_12_00_US: i64 = 1_705_320_000_000_000;
+
+/// Tests that all Postgres column types (including nullable arrays) round-trip
+/// correctly through the ClickHouse RowBinary encoding.
+/// +/// # Regression test +/// +/// This test specifically catches the nullable-array encoding bug where +/// `nullable_flags[i] = true` for array columns caused `rb_encode_nullable` to +/// prepend an extra null-indicator byte. ClickHouse read that byte as `varint(0)` +/// (empty array) and then parsed the actual element bytes as subsequent column +/// data, ultimately failing with "Cannot read all data" at row 2. +/// +/// The fix: array columns always use `nullable_flags[i] = false` because the DDL +/// emits `Array(Nullable(T))` without an outer `Nullable` wrapper. +/// +/// Row 1 has **empty** arrays (accidentally passed with the old code because +/// `0x00` null-indicator == `varint(0)` = empty array). +/// Row 2 has **non-empty** arrays (fails with the old code, passes with the fix). +#[tokio::test(flavor = "multi_thread")] +async fn all_types_table_copy() { + if skip_if_missing_clickhouse_env_vars() { + return; + } + + init_test_tracing(); + install_crypto_provider(); + + // ── Postgres source ─────────────────────────────────────────────────────── + let database = spawn_source_database().await; + let table_name = test_table_name("all_types_encoding"); + + let table_id = database + .create_table( + table_name.clone(), + true, // add serial primary key + &[ + // Scalar types + ("smallint_col", "smallint not null"), + ("integer_col", "integer not null"), + ("bigint_col", "bigint not null"), + ("real_col", "real not null"), + ("double_col", "double precision not null"), + ("numeric_col", "numeric(10,2) not null"), + ("boolean_col", "boolean not null"), + ("text_col", "text not null"), + ("varchar_col", "varchar(100) not null"), + ("date_col", "date not null"), + ("timestamp_col", "timestamp not null"), + ("timestamptz_col", "timestamptz not null"), + ("time_col", "time not null"), + ("interval_col", "interval not null"), + ("jsonb_col", "jsonb not null"), + ("json_col", "json not null"), + // Nullable array columns (key for the regression test). + // These are intentionally nullable so that nullable_flags[i] would + // have been set to `true` before the fix, triggering the bug. + ("integer_array_col", "integer[]"), + ("text_array_col", "text[]"), + // Other types + ("bytea_col", "bytea not null"), + ("inet_col", "inet not null"), + ("cidr_col", "cidr not null"), + ("macaddr_col", "macaddr not null"), + ("uuid_col", "uuid not null"), + ], + ) + .await + .expect("Failed to create test table"); + + let publication_name = "test_pub_clickhouse"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create publication"); + + // Insert rows BEFORE starting the pipeline — they will be captured by the + // initial table-copy phase (write_table_rows path). + // + // Row 1: empty arrays. With the old encoding bug, this accidentally + // produced valid RowBinary because `0x00` (null-indicator) == + // varint(0) == empty array. 
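+    //
+    // Byte-level illustration (RowBinary, for a hypothetical `Array(Nullable(Int32))`
+    // column holding `[1]`; shown only to make the bug concrete, not produced verbatim
+    // by this test):
+    //
+    //   correct: 0x01 (varint len=1), 0x00 (element not NULL), 0x01 0x00 0x00 0x00 (Int32 LE)
+    //   buggy:   0x00 (spurious outer null-indicator, read back as varint(0) = empty array),
+    //            then the real array bytes get misread as the following columns' data.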
+    database
+        .run_sql(&format!(
+            r#"INSERT INTO {table} (
+                smallint_col, integer_col, bigint_col,
+                real_col, double_col, numeric_col, boolean_col,
+                text_col, varchar_col,
+                date_col, timestamp_col, timestamptz_col,
+                time_col, interval_col, jsonb_col, json_col,
+                integer_array_col, text_array_col,
+                bytea_col, inet_col, cidr_col, macaddr_col, uuid_col
+            ) VALUES (
+                42, 1000, 9999999,
+                1.5, 2.5, 12345.67, true,
+                'hello text', 'hello varchar',
+                '2024-01-15', '2024-01-15 12:00:00', '2024-01-15 12:00:00+00',
+                '14:30:00', '1 day',
+                '{{"key":"value"}}', '{{"simple":42}}',
+                ARRAY[]::integer[], ARRAY[]::text[],
+                '\xdeadbeef',
+                '192.168.1.1', '192.168.0.0/16', 'aa:bb:cc:dd:ee:ff',
+                'f47ac10b-58cc-4372-a567-0e02b2c3d479'
+            )"#,
+            table = table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert row 1");
+
+    // Row 2: NON-EMPTY arrays. With the old encoding bug, this row caused
+    // ClickHouse to fail with "Cannot read all data" because the extra
+    // null-indicator byte caused the entire RowBinary stream to be
+    // mis-aligned after the array column.
+    database
+        .run_sql(&format!(
+            r#"INSERT INTO {table} (
+                smallint_col, integer_col, bigint_col,
+                real_col, double_col, numeric_col, boolean_col,
+                text_col, varchar_col,
+                date_col, timestamp_col, timestamptz_col,
+                time_col, interval_col, jsonb_col, json_col,
+                integer_array_col, text_array_col,
+                bytea_col, inet_col, cidr_col, macaddr_col, uuid_col
+            ) VALUES (
+                -32768, -2147483648, -9223372036854775808,
+                -1.5, -2.5, -99999.99, false,
+                'world text', 'world varchar',
+                '2024-01-15', '2024-01-15 12:00:00', '2024-01-15 12:00:00+00',
+                '00:00:01', '30 days 23 hours',
+                '{{"arr":[1,2,3]}}', '{{"n":0}}',
+                ARRAY[1, 2, 3]::integer[], ARRAY['alpha', 'beta']::text[],
+                '\xcafebabe',
+                '10.0.0.1', '10.0.0.0/8', 'ff:ee:dd:cc:bb:aa',
+                'a1b2c3d4-e5f6-7890-abcd-ef1234567890'
+            )"#,
+            table = table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert row 2");
+
+    // ── ClickHouse destination ───────────────────────────────────────────────
+    let ch_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = ch_db.build_destination(pipeline_id, store.clone());
+
+    let table_ready = store
+        .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready)
+        .await;
+
+    let mut pipeline = create_pipeline(
+        &database.config,
+        pipeline_id,
+        publication_name.to_owned(),
+        store.clone(),
+        destination,
+    );
+
+    pipeline.start().await.unwrap();
+    table_ready.notified().await;
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // ── Verify ClickHouse data ───────────────────────────────────────────────
+    let rows: Vec<AllTypesRow> = ch_db.query(ALL_TYPES_SELECT).await;
+
+    assert_eq!(rows.len(), 2, "expected 2 rows in ClickHouse");
+
+    // ── Row 1 assertions ─────────────────────────────────────────────────────
+    let r1 = &rows[0];
+    assert_eq!(r1.id, 1);
+    assert_eq!(r1.smallint_col, 42);
+    assert_eq!(r1.integer_col, 1000);
+    assert_eq!(r1.bigint_col, 9_999_999);
+    assert!((r1.real_col - 1.5_f32).abs() < 1e-3, "real_col mismatch");
+    assert!((r1.double_col - 2.5_f64).abs() < 1e-6, "double_col mismatch");
+    assert_eq!(r1.numeric_col, "12345.67");
+    assert!(r1.boolean_col);
+    assert_eq!(r1.text_col, "hello text");
+    assert_eq!(r1.varchar_col, "hello varchar");
+    assert_eq!(r1.date_col, DATE_2024_01_15_DAYS, "date round-trip failed");
+    assert_eq!(
+        r1.timestamp_col, TS_2024_01_15_12_00_US,
+        "timestamp round-trip failed"
+    );
+    assert_eq!(
+        r1.timestamptz_col, TS_2024_01_15_12_00_US,
+        "timestamptz round-trip failed"
+    );
+    assert_eq!(r1.time_col, "14:30:00");
+    assert_eq!(r1.bytea_col, "deadbeef");
+    assert_eq!(r1.inet_col, "192.168.1.1");
+    assert_eq!(r1.cidr_col, "192.168.0.0/16");
+    assert_eq!(r1.macaddr_col, "aa:bb:cc:dd:ee:ff");
+    assert_eq!(
+        r1.uuid_col.to_lowercase(),
+        "f47ac10b-58cc-4372-a567-0e02b2c3d479"
+    );
+    assert_eq!(r1.cdc_operation, "INSERT");
+    // Empty arrays — the regression case that accidentally worked before the fix.
+    assert_eq!(
+        r1.integer_array_col,
+        Vec::<Option<i32>>::new(),
+        "row 1 integer_array_col should be empty"
+    );
+    assert_eq!(
+        r1.text_array_col,
+        Vec::<Option<String>>::new(),
+        "row 1 text_array_col should be empty"
+    );
+
+    // ── Row 2 assertions ─────────────────────────────────────────────────────
+    let r2 = &rows[1];
+    assert_eq!(r2.id, 2);
+    assert_eq!(r2.smallint_col, -32768);
+    assert_eq!(r2.integer_col, -2_147_483_648);
+    assert_eq!(r2.bigint_col, i64::MIN);
+    assert!(!r2.boolean_col);
+    assert_eq!(r2.numeric_col, "-99999.99");
+    assert_eq!(r2.bytea_col, "cafebabe");
+    assert_eq!(
+        r2.uuid_col.to_lowercase(),
+        "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
+    );
+    assert_eq!(r2.cdc_operation, "INSERT");
+    // Non-empty arrays — the regression case that triggered the bug before the fix.
+    assert_eq!(
+        r2.integer_array_col,
+        vec![Some(1), Some(2), Some(3)],
+        "row 2 integer_array_col mismatch — nullable-array encoding bug likely present"
+    );
+    assert_eq!(
+        r2.text_array_col,
+        vec![Some("alpha".to_string()), Some("beta".to_string())],
+        "row 2 text_array_col mismatch — nullable-array encoding bug likely present"
+    );
+}
diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs
new file mode 100644
index 000000000..ff310ffc0
--- /dev/null
+++ b/etl-destinations/tests/support/clickhouse.rs
@@ -0,0 +1,78 @@
+#![allow(dead_code)]
+#![cfg(all(feature = "clickhouse", feature = "test-utils"))]
+
+/// A row read back from the ClickHouse `all_types_encoding` test table.
+///
+/// Column-to-type mapping:
+/// - `Date` → `u16` (days since 1970-01-01 in RowBinary)
+/// - `DateTime64(6)` → `i64` (microseconds since epoch in RowBinary)
+/// - `UUID` → `String` (via `toString()` in the SELECT query)
+/// - `Array(Nullable(T))` → `Vec<Option<T>>`
+///
+/// Fields must match the SELECT column list in the test query exactly.
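+///
+/// Usage sketch (as in `clickhouse_pipeline.rs`; `ch_db` is the
+/// `ClickHouseTestDatabase` test-utility handle and `ALL_TYPES_SELECT` is the
+/// query constant defined there):
+///
+/// ```ignore
+/// let rows: Vec<AllTypesRow> = ch_db.query(ALL_TYPES_SELECT).await;
+/// assert_eq!(rows.len(), 2, "expected 2 rows in ClickHouse");
+/// ```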
+#[derive(clickhouse::Row, serde::Deserialize, Debug, Clone)]
+pub struct AllTypesRow {
+    pub id: i32,
+    pub smallint_col: i16,
+    pub integer_col: i32,
+    pub bigint_col: i64,
+    pub real_col: f32,
+    pub double_col: f64,
+    pub numeric_col: String,
+    pub boolean_col: bool,
+    pub text_col: String,
+    pub varchar_col: String,
+    pub date_col: u16,        // Date → days since epoch
+    pub timestamp_col: i64,   // DateTime64(6) → microseconds
+    pub timestamptz_col: i64, // DateTime64(6,'UTC') → microseconds
+    pub time_col: String,
+    pub interval_col: String,
+    pub jsonb_col: String,
+    pub json_col: String,
+    pub integer_array_col: Vec<Option<i32>>,
+    pub text_array_col: Vec<Option<String>>,
+    pub bytea_col: String, // hex-encoded
+    pub inet_col: String,
+    pub cidr_col: String,
+    pub macaddr_col: String,
+    pub uuid_col: String, // via toString() in SELECT
+    pub cdc_operation: String,
+}
+
+impl PartialEq for AllTypesRow {
+    fn eq(&self, other: &Self) -> bool {
+        fn f32_eq(a: f32, b: f32) -> bool {
+            (a - b).abs() < 1e-3
+        }
+        fn f64_eq(a: f64, b: f64) -> bool {
+            (a - b).abs() < 1e-6
+        }
+
+        self.id == other.id
+            && self.smallint_col == other.smallint_col
+            && self.integer_col == other.integer_col
+            && self.bigint_col == other.bigint_col
+            && f32_eq(self.real_col, other.real_col)
+            && f64_eq(self.double_col, other.double_col)
+            && self.numeric_col == other.numeric_col
+            && self.boolean_col == other.boolean_col
+            && self.text_col == other.text_col
+            && self.varchar_col == other.varchar_col
+            && self.date_col == other.date_col
+            && self.timestamp_col == other.timestamp_col
+            && self.timestamptz_col == other.timestamptz_col
+            && self.time_col == other.time_col
+            && self.jsonb_col == other.jsonb_col
+            && self.json_col == other.json_col
+            && self.integer_array_col == other.integer_array_col
+            && self.text_array_col == other.text_array_col
+            && self.bytea_col == other.bytea_col
+            && self.inet_col == other.inet_col
+            && self.cidr_col == other.cidr_col
+            && self.macaddr_col == other.macaddr_col
+            && self.uuid_col.to_lowercase() == other.uuid_col.to_lowercase()
+            && self.cdc_operation == other.cdc_operation
+    }
+}
+
+impl Eq for AllTypesRow {}
diff --git a/etl-destinations/tests/support/mod.rs b/etl-destinations/tests/support/mod.rs
index f149f0ba5..60f305046 100644
--- a/etl-destinations/tests/support/mod.rs
+++ b/etl-destinations/tests/support/mod.rs
@@ -1,4 +1,6 @@
 pub mod bigquery;
+#[cfg(all(feature = "clickhouse", feature = "test-utils"))]
+pub mod clickhouse;
 #[cfg(feature = "ducklake")]
 pub mod ducklake;
 pub mod iceberg;
diff --git a/etl-replicator/configuration/base.yaml b/etl-replicator/configuration/base.yaml
new file mode 100644
index 000000000..14fec2eea
--- /dev/null
+++ b/etl-replicator/configuration/base.yaml
@@ -0,0 +1,21 @@
+application:
+  host: "[::]"
+  port: 8080
+destination:
+  click_house:
+    url: http://clickhouse.etl-data-plane.svc.cluster.local:8123
+    user: default
+    password: password
+    database: mydb
+pipeline:
+  id: 42
+  publication_name: my_pub
+  pg_connection:
+    host: postgres.etl-data-plane.svc.cluster.local
+    port: 5432
+    name: mydb
+    username: postgres
+    password: password
+    tls:
+      trusted_root_certs: ""
+      enabled: false
diff --git a/etl-replicator/configuration/dev.yaml b/etl-replicator/configuration/dev.yaml
new file mode 100644
index 000000000..14fec2eea
--- /dev/null
+++ b/etl-replicator/configuration/dev.yaml
@@ -0,0 +1,21 @@
+application:
+  host: "[::]"
+  port: 8080
+destination:
+  click_house:
+    url: http://clickhouse.etl-data-plane.svc.cluster.local:8123
+    user: default
+    password: password
+    database: mydb
+pipeline:
+  id: 42
+  publication_name: my_pub
+  pg_connection:
+    host: postgres.etl-data-plane.svc.cluster.local
+    port: 5432
+    name: mydb
+    username: postgres
+    password: password
+    tls:
+      trusted_root_certs: ""
+      enabled: false
diff --git a/etl-replicator/configuration/prod.yaml b/etl-replicator/configuration/prod.yaml
new file mode 100644
index 000000000..14fec2eea
--- /dev/null
+++ b/etl-replicator/configuration/prod.yaml
@@ -0,0 +1,21 @@
+application:
+  host: "[::]"
+  port: 8080
+destination:
+  click_house:
+    url: http://clickhouse.etl-data-plane.svc.cluster.local:8123
+    user: default
+    password: password
+    database: mydb
+pipeline:
+  id: 42
+  publication_name: my_pub
+  pg_connection:
+    host: postgres.etl-data-plane.svc.cluster.local
+    port: 5432
+    name: mydb
+    username: postgres
+    password: password
+    tls:
+      trusted_root_certs: ""
+      enabled: false

From df9f0a9203f7b9660851eaa8c5ca657d803e46d9 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 6 Apr 2026 03:10:19 +0900
Subject: [PATCH 05/86] Add local ClickHouse test helper

---
 DEVELOPMENT.md              | 42 +++++++++++++++++---
 scripts/docker-compose.yaml | 21 ++++++++++
 scripts/init.sh             | 62 +++++++++++++++++++++++------
 scripts/test-clickhouse.sh  | 77 +++++++++++++++++++++++++++++++++++++
 4 files changed, 186 insertions(+), 16 deletions(-)
 create mode 100755 scripts/test-clickhouse.sh

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 0b385c59e..aa98f0b93 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -50,10 +50,10 @@ The fastest way to get started is using the setup script:
 ```
 
 This script will:
-1. Start PostgreSQL via Docker Compose
-2. Run etl-api migrations
-3. Seed the default replicator image
-4. Configure the Kubernetes environment (OrbStack)
+1. Start PostgreSQL, ClickHouse, and the local Iceberg dependencies via Docker Compose.
+2. Run etl-api migrations.
+3. Seed the default replicator image.
+4. Configure the Kubernetes environment (OrbStack).
 
 ## Database Setup
@@ -84,12 +84,19 @@ POSTGRES_DATA_VOLUME=/path/to/data ./scripts/init.sh
 | `POSTGRES_DB` | `postgres` | Database name |
 | `POSTGRES_PORT` | `5430` | Database port |
 | `POSTGRES_HOST` | `localhost` | Database host |
+| `CLICKHOUSE_HTTP_PORT` | `8123` | ClickHouse HTTP port |
+| `CLICKHOUSE_NATIVE_PORT` | `9000` | ClickHouse native TCP port |
+| `CLICKHOUSE_USER` | `etl` | ClickHouse user for the local Docker Compose setup |
+| `CLICKHOUSE_PASSWORD` | `etl` | ClickHouse password for the local Docker Compose setup |
 | `SKIP_DOCKER` | (empty) | Skip Docker Compose if set |
-| `POSTGRES_DATA_VOLUME` | (empty) | Path for persistent storage |
+| `POSTGRES_DATA_VOLUME` | (empty) | Path for PostgreSQL persistent storage |
+| `CLICKHOUSE_DATA_VOLUME` | (empty) | Path for ClickHouse persistent storage |
 | `REPLICATOR_IMAGE` | `ramsup/etl-replicator:latest` | Default replicator image |
 
 PostgreSQL 18+ containers store data under `/var/lib/postgresql/<version>/data`, so the Docker Compose setup mounts the parent `/var/lib/postgresql` directory to keep upgrades compatible.
 
+The same Docker Compose stack also starts ClickHouse on `http://localhost:8123` by default, which is enough for local destination development and ClickHouse integration tests.
+
 ### Manual Setup
 
 If you prefer manual setup or have an existing PostgreSQL instance:
@@ -354,6 +361,18 @@ Iceberg destination tests use local MinIO and Lakekeeper instances. The followin
 
 **Note:** Iceberg tests are only run when the `iceberg` and `test-utils` features are enabled.
These use hardcoded local URLs and do not require environment variables. +#### ClickHouse Test Variables + +ClickHouse destination tests require a reachable ClickHouse HTTP endpoint: + +| Variable | Required | Description | +|----------|----------|-------------| +| `TESTS_CLICKHOUSE_URL` | **Yes** | ClickHouse HTTP URL (for example, `http://localhost:8123`) | +| `TESTS_CLICKHOUSE_USER` | **Yes** | ClickHouse user name (for the local Docker Compose setup, use `etl`) | +| `TESTS_CLICKHOUSE_PASSWORD` | No | ClickHouse password; for the local Docker Compose setup, use `etl` | + +**Note:** ClickHouse tests are only run when the `clickhouse` and `test-utils` features are enabled. Each test creates a unique database in ClickHouse and drops it automatically when the test finishes. The Docker Compose setup started by `./scripts/init.sh` is sufficient for these tests. + #### Test Output and Logging | Variable | Description | @@ -392,6 +411,11 @@ export TESTS_DATABASE_PASSWORD=postgres export TESTS_BIGQUERY_PROJECT_ID=your-gcp-project-id export TESTS_BIGQUERY_SA_KEY_PATH=/path/to/service-account-key.json +# ClickHouse test configuration (optional - only needed for ClickHouse tests) +export TESTS_CLICKHOUSE_URL=http://localhost:8123 +export TESTS_CLICKHOUSE_USER=etl +export TESTS_CLICKHOUSE_PASSWORD=etl + # Enable test output (optional) export ENABLE_TRACING=1 export RUST_LOG=info @@ -417,6 +441,11 @@ TESTS_DATABASE_PASSWORD=postgres TESTS_BIGQUERY_PROJECT_ID=your-gcp-project-id TESTS_BIGQUERY_SA_KEY_PATH=/path/to/service-account-key.json +# ClickHouse (optional - only for ClickHouse tests) +TESTS_CLICKHOUSE_URL=http://localhost:8123 +TESTS_CLICKHOUSE_USER=etl +TESTS_CLICKHOUSE_PASSWORD=etl + # Test output (optional) ENABLE_TRACING=1 RUST_LOG=info @@ -447,6 +476,9 @@ TESTS_DATABASE_HOST=localhost TESTS_DATABASE_PORT=5430 TESTS_DATABASE_USERNAME=p # Run tests with tracing output for debugging TESTS_DATABASE_HOST=localhost TESTS_DATABASE_PORT=5430 TESTS_DATABASE_USERNAME=postgres TESTS_DATABASE_PASSWORD=postgres ENABLE_TRACING=1 RUST_LOG=info cargo test -p etl-api --test tenants tenant_can_be_created -- --nocapture + +# Run the ClickHouse destination integration test against the local Docker Compose service +TESTS_DATABASE_HOST=localhost TESTS_DATABASE_PORT=5430 TESTS_DATABASE_USERNAME=postgres TESTS_DATABASE_PASSWORD=postgres TESTS_CLICKHOUSE_URL=http://localhost:8123 TESTS_CLICKHOUSE_USER=etl TESTS_CLICKHOUSE_PASSWORD=etl cargo test -p etl-destinations --features clickhouse,test-utils clickhouse_pipeline -- --nocapture ``` **Packages requiring `--features test-utils`:** diff --git a/scripts/docker-compose.yaml b/scripts/docker-compose.yaml index db20a610a..ad1a2d0f1 100644 --- a/scripts/docker-compose.yaml +++ b/scripts/docker-compose.yaml @@ -37,6 +37,25 @@ services: timeout: 5s retries: 5 + clickhouse: + image: clickhouse/clickhouse-server:latest + environment: + CLICKHOUSE_USER: ${CLICKHOUSE_USER:-etl} + CLICKHOUSE_PASSWORD: ${CLICKHOUSE_PASSWORD:-etl} + CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: 1 + ports: + - "${CLICKHOUSE_HTTP_PORT:-8123}:8123" + - "${CLICKHOUSE_NATIVE_PORT:-9000}:9000" + volumes: + - ${CLICKHOUSE_DATA_VOLUME:-clickhouse_data}:/var/lib/clickhouse + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "clickhouse-client --user \"$$CLICKHOUSE_USER\" --password \"$$CLICKHOUSE_PASSWORD\" --query \"SELECT 1\""] + interval: 5s + timeout: 5s + retries: 10 + start_period: 10s + catalog-postgres: image: postgres:${LAKEKEEPER_POSTGRES_VERSION:-18} environment: @@ -165,6 +184,8 @@ 
services: volumes: postgres_data: driver: local + clickhouse_data: + driver: local lakekeeper_postgres_data: driver: local minio_data: diff --git a/scripts/init.sh b/scripts/init.sh index fe11cbeda..22ef2196e 100755 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -27,10 +27,16 @@ DB_PASSWORD="${POSTGRES_PASSWORD:=postgres}" DB_NAME="${POSTGRES_DB:=postgres}" DB_PORT="${POSTGRES_PORT:=5430}" DB_HOST="${POSTGRES_HOST:=localhost}" +CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT:=8123}" +CLICKHOUSE_NATIVE_PORT="${CLICKHOUSE_NATIVE_PORT:=9000}" +CLICKHOUSE_USER="${CLICKHOUSE_USER:=etl}" +CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:=etl}" # Docker compose setup +USING_DOCKER_COMPOSE=0 if [[ -z "${SKIP_DOCKER}" ]] then + USING_DOCKER_COMPOSE=1 echo "🐳 Starting all services with Docker Compose..." # Export environment variables for docker-compose @@ -38,14 +44,26 @@ then export POSTGRES_PASSWORD="${DB_PASSWORD}" export POSTGRES_DB="${DB_NAME}" export POSTGRES_PORT="${DB_PORT}" + export CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT}" + export CLICKHOUSE_NATIVE_PORT="${CLICKHOUSE_NATIVE_PORT}" + export CLICKHOUSE_USER="${CLICKHOUSE_USER}" + export CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD}" # Handle persistent storage if [[ -n "${POSTGRES_DATA_VOLUME}" ]]; then - echo "📁 Setting up persistent storage at ${POSTGRES_DATA_VOLUME}" + echo "📁 Setting up Postgres persistent storage at ${POSTGRES_DATA_VOLUME}" mkdir -p "${POSTGRES_DATA_VOLUME}" export POSTGRES_DATA_VOLUME="${POSTGRES_DATA_VOLUME}" else - echo "📁 No storage path specified, using default Docker volume" + echo "📁 No Postgres storage path specified, using default Docker volume" + fi + + if [[ -n "${CLICKHOUSE_DATA_VOLUME}" ]]; then + echo "📁 Setting up ClickHouse persistent storage at ${CLICKHOUSE_DATA_VOLUME}" + mkdir -p "${CLICKHOUSE_DATA_VOLUME}" + export CLICKHOUSE_DATA_VOLUME="${CLICKHOUSE_DATA_VOLUME}" + else + echo "📁 No ClickHouse storage path specified, using default Docker volume" fi # Pull latest images before starting services @@ -57,17 +75,39 @@ then echo "✅ All services started" fi -# Wait for Postgres to be ready -echo "⏳ Waiting for Postgres to be ready..." -until docker-compose -f ./scripts/docker-compose.yaml exec -T source-postgres pg_isready -U postgres > /dev/null 2>&1; do - echo "Waiting for Postgres..." - sleep 1 -done - -echo "✅ Postgres is up and running on port ${DB_PORT}" - # Export DATABASE_URL for potential use by other scripts export DATABASE_URL=postgres://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME} + +# Wait for Postgres to be ready +if [[ "${USING_DOCKER_COMPOSE}" == "1" ]]; then + echo "⏳ Waiting for Postgres to be ready..." + until docker-compose -f ./scripts/docker-compose.yaml exec -T source-postgres pg_isready -U postgres > /dev/null 2>&1; do + echo "Waiting for Postgres..." + sleep 1 + done + + echo "✅ Postgres is up and running on port ${DB_PORT}" + + echo "⏳ Waiting for ClickHouse to be ready..." + until docker-compose -f ./scripts/docker-compose.yaml exec -T clickhouse clickhouse-client --user "$CLICKHOUSE_USER" --password "$CLICKHOUSE_PASSWORD" --query "SELECT 1" > /dev/null 2>&1; do + echo "Waiting for ClickHouse..." 
+ sleep 1 + done + + echo "✅ ClickHouse is up and running on port ${CLICKHOUSE_HTTP_PORT}" + echo "🔗 ClickHouse HTTP URL: http://localhost:${CLICKHOUSE_HTTP_PORT}" + echo "🧪 ClickHouse test env: TESTS_CLICKHOUSE_URL=http://localhost:${CLICKHOUSE_HTTP_PORT} TESTS_CLICKHOUSE_USER=${CLICKHOUSE_USER} TESTS_CLICKHOUSE_PASSWORD=${CLICKHOUSE_PASSWORD}" +else + echo "⏳ Waiting for Postgres to be ready..." + until psql "${DATABASE_URL}" -c "select 1" > /dev/null 2>&1; do + echo "Waiting for Postgres..." + sleep 1 + done + + echo "✅ Postgres is up and running on port ${DB_PORT}" + echo "ℹ️ SKIP_DOCKER is set; skipping ClickHouse readiness checks." +fi + echo "🔗 Database URL: ${DATABASE_URL}" # Run database migrations diff --git a/scripts/test-clickhouse.sh b/scripts/test-clickhouse.sh new file mode 100755 index 000000000..49fb63c98 --- /dev/null +++ b/scripts/test-clickhouse.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +COMPOSE_FILE="${COMPOSE_FILE:-./scripts/docker-compose.yaml}" +DOCKER_COMPOSE_BIN="${DOCKER_COMPOSE_BIN:-docker-compose}" +POSTGRES_SERVICE="${POSTGRES_SERVICE:-source-postgres}" +CLICKHOUSE_SERVICE="${CLICKHOUSE_SERVICE:-clickhouse}" +POSTGRES_PORT="${POSTGRES_PORT:-5430}" +POSTGRES_USER="${POSTGRES_USER:-postgres}" +POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-postgres}" +CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT:-8123}" +CLICKHOUSE_USER="${CLICKHOUSE_USER:-etl}" +CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-etl}" +CARGO_TOOLCHAIN="${CARGO_TOOLCHAIN:-}" +TEST_TARGET="${TEST_TARGET:-clickhouse_pipeline}" +TEST_NAME_FILTER="${TEST_NAME_FILTER:-}" +CARGO_PACKAGE="${CARGO_PACKAGE:-etl-destinations}" +FEATURES="${FEATURES:-clickhouse,test-utils}" + +require_cmd() { + if ! command -v "$1" >/dev/null 2>&1; then + echo >&2 "❌ Error: required command '$1' is not installed." + exit 1 + fi +} + +require_cmd "$DOCKER_COMPOSE_BIN" +require_cmd cargo + +if [[ -z "${SKIP_DOCKER:-}" ]]; then + echo "🐳 Starting local Postgres and ClickHouse services..." + "$DOCKER_COMPOSE_BIN" -f "$COMPOSE_FILE" up -d "$POSTGRES_SERVICE" "$CLICKHOUSE_SERVICE" + + echo "⏳ Waiting for Postgres to be ready..." + until "$DOCKER_COMPOSE_BIN" -f "$COMPOSE_FILE" exec -T "$POSTGRES_SERVICE" pg_isready -U "$POSTGRES_USER" >/dev/null 2>&1; do + echo "Waiting for Postgres..." + sleep 1 + done + + echo "⏳ Waiting for ClickHouse to be ready..." + until "$DOCKER_COMPOSE_BIN" -f "$COMPOSE_FILE" exec -T "$CLICKHOUSE_SERVICE" clickhouse-client --user "$CLICKHOUSE_USER" --password "$CLICKHOUSE_PASSWORD" --query "SELECT 1" >/dev/null 2>&1; do + echo "Waiting for ClickHouse..." 
+ sleep 1 + done +fi + +export TESTS_DATABASE_HOST="${TESTS_DATABASE_HOST:-localhost}" +export TESTS_DATABASE_PORT="${TESTS_DATABASE_PORT:-$POSTGRES_PORT}" +export TESTS_DATABASE_USERNAME="${TESTS_DATABASE_USERNAME:-$POSTGRES_USER}" +export TESTS_DATABASE_PASSWORD="${TESTS_DATABASE_PASSWORD:-$POSTGRES_PASSWORD}" +export TESTS_CLICKHOUSE_URL="${TESTS_CLICKHOUSE_URL:-http://localhost:$CLICKHOUSE_HTTP_PORT}" +export TESTS_CLICKHOUSE_USER="${TESTS_CLICKHOUSE_USER:-$CLICKHOUSE_USER}" +export TESTS_CLICKHOUSE_PASSWORD="${TESTS_CLICKHOUSE_PASSWORD:-$CLICKHOUSE_PASSWORD}" + +if [[ -n "$CARGO_TOOLCHAIN" ]]; then + CARGO_CMD=(cargo "+$CARGO_TOOLCHAIN") +else + CARGO_CMD=(cargo) +fi + +echo "🧪 Running ClickHouse destination test with:" +echo " TESTS_DATABASE_HOST=$TESTS_DATABASE_HOST" +echo " TESTS_DATABASE_PORT=$TESTS_DATABASE_PORT" +echo " TESTS_DATABASE_USERNAME=$TESTS_DATABASE_USERNAME" +echo " TESTS_CLICKHOUSE_URL=$TESTS_CLICKHOUSE_URL" +echo " TESTS_CLICKHOUSE_USER=$TESTS_CLICKHOUSE_USER" +echo " TESTS_CLICKHOUSE_PASSWORD=${TESTS_CLICKHOUSE_PASSWORD:+[set]}" +echo " cargo toolchain=${CARGO_TOOLCHAIN:-project default}" + +TEST_ARGS=(test -p "$CARGO_PACKAGE" --features "$FEATURES" --test "$TEST_TARGET") +if [[ -n "$TEST_NAME_FILTER" ]]; then + TEST_ARGS+=("$TEST_NAME_FILTER") +fi +TEST_ARGS+=(-- --nocapture) + +echo "🚀 ${CARGO_CMD[*]} ${TEST_ARGS[*]}" +"${CARGO_CMD[@]}" "${TEST_ARGS[@]}" From 03b6975ffff18e9f6cb3ba0988cf7fc9ab4531d9 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 6 Apr 2026 03:10:39 +0900 Subject: [PATCH 06/86] Fix ClickHouse test row id type --- etl-destinations/tests/support/clickhouse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs index ff310ffc0..7e6fb2599 100644 --- a/etl-destinations/tests/support/clickhouse.rs +++ b/etl-destinations/tests/support/clickhouse.rs @@ -12,7 +12,7 @@ /// Fields must match the SELECT column list in the test query exactly. 
#[derive(clickhouse::Row, serde::Deserialize, Debug, Clone)] pub struct AllTypesRow { - pub id: i32, + pub id: i64, pub smallint_col: i16, pub integer_col: i32, pub bigint_col: i64, From f4543bdc77b4ecdc599188d0ff6b061dab8a1c3b Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 6 Apr 2026 03:11:10 +0900 Subject: [PATCH 07/86] Avoid async cleanup panic in ClickHouse tests --- etl-destinations/src/clickhouse/test_utils.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index 6251a0749..9c7a6b445 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -172,9 +172,22 @@ impl ClickHouseTestDatabase { impl Drop for ClickHouseTestDatabase { fn drop(&mut self) { - if let Ok(handle) = Handle::try_current() { - handle.block_on(self.drop_database()); - } + let root_client = self.root_client.clone(); + let database = self.database.clone(); + + let _ = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + tokio::task::block_in_place(move || { + Handle::current().block_on(async move { + if let Err(error) = root_client + .query(&format!("DROP DATABASE IF EXISTS `{database}`")) + .execute() + .await + { + eprintln!("warning: failed to drop test ClickHouse database: {error}"); + } + }); + }); + })); } } From f2430af28ed40dcca838cfdede5423f75bd31638 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 6 Apr 2026 03:12:05 +0900 Subject: [PATCH 08/86] Fix ClickHouse UUID encoding --- etl-destinations/src/clickhouse/encoding.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index afcf167cd..99ffa6030 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -69,7 +69,7 @@ pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue { Cell::Time(t) => ClickHouseValue::String(t.to_string()), Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()), Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()), - Cell::Uuid(u) => ClickHouseValue::Uuid(u.to_bytes_le()), + Cell::Uuid(u) => ClickHouseValue::Uuid(*u.as_bytes()), Cell::Json(j) => ClickHouseValue::String(j.to_string()), Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(b)), Cell::String(s) => ClickHouseValue::String(s), @@ -356,7 +356,7 @@ mod tests { #[test] fn test_cell_to_clickhouse_value_uuid() { let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); - let expected_bytes = u.to_bytes_le(); + let expected_bytes = *u.as_bytes(); if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) { assert_eq!(bytes, expected_bytes); } else { From 41c84fcc7299e9783227ac598af012f6001f72a7 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 7 Apr 2026 14:05:05 +0900 Subject: [PATCH 09/86] Add ClickHouse update streaming integration test --- etl-destinations/tests/clickhouse_pipeline.rs | 134 ++++++++++++++++-- 1 file changed, 125 insertions(+), 9 deletions(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 32acebce2..8205c5022 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -6,11 +6,13 @@ use etl::test_utils::notifying_store::NotifyingStore; use 
etl::test_utils::pipeline::create_pipeline;
 use etl::types::PipelineId;
 use etl_destinations::clickhouse::test_utils::{
-    setup_clickhouse_database, skip_if_missing_clickhouse_env_vars,
+    ClickHouseTestDatabase, setup_clickhouse_database, skip_if_missing_clickhouse_env_vars,
 };
 use etl_telemetry::tracing::init_test_tracing;
 use rand::random;
 use std::sync::Once;
+use std::time::Duration;
+use tokio::time::sleep;
 
 use crate::support::clickhouse::AllTypesRow;
 
@@ -27,13 +29,6 @@ fn install_crypto_provider() {
     });
 }
 
-/// ClickHouse table name for `test.all_types_encoding`.
-///
-/// Derived from `table_name_to_clickhouse_table_name("test", "all_types_encoding")`:
-/// - "test" → "test" (no underscores)
-/// - "all_types_encoding" → "all__types__encoding" (underscores escaped to __)
-const ALL_TYPES_CH_TABLE: &str = "test_all__types__encoding";
-
 /// SELECT query that fetches all verified columns from the ClickHouse table.
 ///
 /// `uuid_col` is projected via `toString()` because the ClickHouse UUID RowBinary
@@ -57,6 +52,22 @@ const ALL_TYPES_SELECT: &str = concat!(
     "ORDER BY id",
 );
 
+/// A row read back from the ClickHouse `update_flow` test table.
+#[derive(clickhouse::Row, serde::Deserialize, Debug)]
+struct UpdateFlowRow {
+    id: i64,
+    value: String,
+    cdc_operation: String,
+    cdc_lsn: i64,
+}
+
+/// SELECT query used to verify the `update_flow` streaming test.
+const UPDATE_FLOW_SELECT: &str = concat!(
+    "SELECT id, value, cdc_operation, cdc_lsn ",
+    "FROM \"test_update__flow\" ",
+    "ORDER BY id, cdc_lsn",
+);
+
 /// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip).
 ///
 /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737
@@ -68,6 +79,22 @@ const DATE_2024_01_15_DAYS: u16 = 19737;
 /// = 1705320000000000
 const TS_2024_01_15_12_00_US: i64 = 1_705_320_000_000_000;
 
+/// Waits until ClickHouse returns at least `expected_rows` from `UPDATE_FLOW_SELECT`.
+async fn wait_for_update_flow_rows(
+    ch_db: &ClickHouseTestDatabase,
+    expected_rows: usize,
+) -> Vec<UpdateFlowRow> {
+    for _ in 0..50 {
+        let rows: Vec<UpdateFlowRow> = ch_db.query(UPDATE_FLOW_SELECT).await;
+        if rows.len() >= expected_rows {
+            return rows;
+        }
+        sleep(Duration::from_millis(100)).await;
+    }
+
+    panic!("timed out waiting for clickhouse update_flow rows");
+}
+
 /// Tests that all Postgres column types (including nullable arrays) round-trip
 /// correctly through the ClickHouse RowBinary encoding.
 ///
@@ -240,7 +267,10 @@ async fn all_types_table_copy() {
     assert_eq!(r1.integer_col, 1000);
     assert_eq!(r1.bigint_col, 9_999_999);
     assert!((r1.real_col - 1.5_f32).abs() < 1e-3, "real_col mismatch");
-    assert!((r1.double_col - 2.5_f64).abs() < 1e-6, "double_col mismatch");
+    assert!(
+        (r1.double_col - 2.5_f64).abs() < 1e-6,
+        "double_col mismatch"
+    );
     assert_eq!(r1.numeric_col, "12345.67");
     assert!(r1.boolean_col);
     assert_eq!(r1.text_col, "hello text");
@@ -302,3 +332,89 @@ async fn all_types_table_copy() {
     );
 }
+
+/// Tests that UPDATE events are streamed to ClickHouse after the initial table copy.
+///
+/// ClickHouse is append-only for CDC in this destination, so the original copied row
+/// remains present with `cdc_operation = "INSERT"` and the streamed change arrives as
+/// a second row with `cdc_operation = "UPDATE"` and a positive LSN.
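+///
+/// Downstream readers that want only the latest state per key can collapse the
+/// append-only CDC log on the ClickHouse side. A sketch (illustrative only; not
+/// used by this test, and it ignores DELETE handling; `argMax` picks the `value`
+/// belonging to the highest `cdc_lsn`):
+///
+/// ```ignore
+/// const LATEST_STATE_SQL: &str =
+///     "SELECT id, argMax(value, cdc_lsn) AS value FROM \"test_update__flow\" GROUP BY id";
+/// ```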
+#[tokio::test(flavor = "multi_thread")] +async fn updates_are_streamed_to_clickhouse() { + if skip_if_missing_clickhouse_env_vars() { + return; + } + + init_test_tracing(); + install_crypto_provider(); + + let database = spawn_source_database().await; + let table_name = test_table_name("update_flow"); + + let table_id = database + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create update_flow test table"); + + let publication_name = "test_pub_clickhouse_updates"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create update_flow publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('before')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert initial update_flow row"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + + database + .run_sql(&format!( + "UPDATE {} SET value = 'after' WHERE id = 1", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to update update_flow row"); + + let rows = wait_for_update_flow_rows(&ch_db, 2).await; + + pipeline.shutdown_and_wait().await.unwrap(); + + assert_eq!(rows.len(), 2, "expected copied row plus streamed update"); + + let insert_row = &rows[0]; + assert_eq!(insert_row.id, 1); + assert_eq!(insert_row.value, "before"); + assert_eq!(insert_row.cdc_operation, "INSERT"); + assert_eq!(insert_row.cdc_lsn, 0); + + let update_row = &rows[1]; + assert_eq!(update_row.id, 1); + assert_eq!(update_row.value, "after"); + assert_eq!(update_row.cdc_operation, "UPDATE"); + assert!( + update_row.cdc_lsn > insert_row.cdc_lsn, + "streamed update should have a positive LSN" + ); +} From bc9a5f330d6f93f3a5544d68d73e625c34d01791 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 7 Apr 2026 14:05:52 +0900 Subject: [PATCH 10/86] Format ClickHouse destination files --- Cargo.lock | 1758 +++++++++++------ etl-destinations/Cargo.toml | 1 - etl-destinations/src/clickhouse/encoding.rs | 24 +- etl-destinations/src/clickhouse/schema.rs | 10 +- etl-destinations/src/clickhouse/test_utils.rs | 12 +- etl-destinations/tests/support/clickhouse.rs | 4 +- etl-examples/src/bin/clickhouse.rs | 8 +- etl-replicator/src/core.rs | 1 - 8 files changed, 1143 insertions(+), 675 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8ae34be4a..94dea0e97 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -21,9 +21,9 @@ dependencies = [ [[package]] name = "actix-http" -version = "3.11.2" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7926860314cbe2fb5d1f13731e387ab43bd32bca224e82e6e2db85de0a3dba49" +checksum = "f860ee6746d0c5b682147b2f7f8ef036d4f92fe518251a3a35ffa3650eafdf0e" dependencies = [ "actix-codec", "actix-rt", @@ -58,14 +58,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "actix-router" -version = "0.5.3" 
+version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" +checksum = "14f8c75c51892f18d9c46150c5ac7beb81c95f78c8b83a634d49f4ca32551fe7" dependencies = [ "bytestring", "cfg-if", @@ -124,9 +124,9 @@ dependencies = [ [[package]] name = "actix-web" -version = "4.12.1" +version = "4.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1654a77ba142e37f049637a3e5685f864514af11fcbc51cb51eb6596afe5b8d6" +checksum = "ff87453bc3b56e9b2b23c1cc0b1be8797184accf51d2abe0f8a33ec275d316bf" dependencies = [ "actix-codec", "actix-http", @@ -157,7 +157,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "smallvec", - "socket2 0.6.1", + "socket2 0.6.3", "time", "tracing", "url", @@ -172,7 +172,7 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -286,9 +286,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.6.21" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", "anstyle-parse", @@ -301,15 +301,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" -version = "0.2.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" dependencies = [ "utf8parse", ] @@ -336,9 +336,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] name = "apache-avro" @@ -360,7 +360,7 @@ dependencies = [ "serde_json", "strum", "strum_macros", - "thiserror 2.0.17", + "thiserror 2.0.18", "uuid", "zstd", ] @@ -376,9 +376,9 @@ dependencies = [ [[package]] name = "arc-swap" -version = "1.8.0" +version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d03449bb8ca2cc2ef70869af31463d1ae5ccc8fa3e334b307203fbf815207e" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" dependencies = [ "rustversion", ] @@ -403,46 +403,78 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2b10dcb159faf30d3f81f6d56c1211a5bea2ca424eabe477648a44b993320e" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" dependencies = [ - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", - "arrow-ord", - "arrow-row", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + 
"arrow-row 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "arrow-string 57.3.0", +] + +[[package]] +name = "arrow" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" +dependencies = [ + "arrow-arith 58.1.0", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-cast 58.1.0", + "arrow-data 58.1.0", + "arrow-ord 58.1.0", + "arrow-row 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "arrow-string 58.1.0", ] [[package]] name = "arrow-arith" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "288015089e7931843c80ed4032c5274f02b37bcb720c4a42096d50b390e70372" +checksum = "f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-arith" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "chrono", "num-traits", ] [[package]] name = "arrow-array" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65ca404ea6191e06bf30956394173337fa9c35f445bd447fe6c21ab944e1a23c" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" dependencies = [ "ahash 0.8.12", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", "chrono", "half", "hashbrown 0.16.1", @@ -451,11 +483,41 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-array" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" +dependencies = [ + "ahash 0.8.12", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "chrono", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + [[package]] name = "arrow-buffer" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36356383099be0151dacc4245309895f16ba7917d79bdb71a7148659c9206c56" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" dependencies = [ "bytes", "half", @@ -465,16 +527,16 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8e372ed52bd4ee88cc1e6c3859aa7ecea204158ac640b10e187936e7e87074" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-ord", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + 
"arrow-select 57.3.0", "atoi", "base64", "chrono", @@ -484,14 +546,49 @@ dependencies = [ "ryu", ] +[[package]] +name = "arrow-cast" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-ord 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer 57.3.0", + "arrow-schema 57.3.0", + "half", + "num-integer", + "num-traits", +] + [[package]] name = "arrow-data" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf87f4ff5fc13290aa47e499a8b669a82c5977c6a1fedce22c7f542c1fd5a597" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" dependencies = [ - "arrow-buffer", - "arrow-schema", + "arrow-buffer 58.1.0", + "arrow-schema 58.1.0", "half", "num-integer", "num-traits", @@ -499,75 +596,141 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb3ca63edd2073fcb42ba112f8ae165df1de935627ead6e203d07c99445f2081" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", "flatbuffers", ] [[package]] name = "arrow-ord" -version = "57.2.0" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", +] + +[[package]] +name = "arrow-ord" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", +] + +[[package]] +name = "arrow-row" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c4e0530272ca755d6814218dffd04425c5b7854b87fa741d5ff848bf50aa39" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "half", ] [[package]] name = "arrow-row" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07f52788744cc71c4628567ad834cadbaeb9f09026ff1d7a4120f69edf7abd3" +checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + 
"arrow-schema 58.1.0", "half", ] [[package]] name = "arrow-schema" -version = "57.2.0" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" + +[[package]] +name = "arrow-schema" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" +dependencies = [ + "bitflags", +] + +[[package]] +name = "arrow-select" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bb63203e8e0e54b288d0d8043ca8fa1013820822a27692ef1b78a977d879f2c" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash 0.8.12", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "num-traits", +] [[package]] name = "arrow-select" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c96d8a1c180b44ecf2e66c9a2f2bbcb8b1b6f14e165ce46ac8bde211a363411b" +checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", "num-traits", ] [[package]] name = "arrow-string" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8ad6a81add9d3ea30bf8374ee8329992c7fd246ffd8b7e2f48a3cea5aa0cc9a" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" dependencies = [ - "arrow-array", - "arrow-buffer", - "arrow-data", - "arrow-schema", - "arrow-select", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-data 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "arrow-string" +version = "58.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" +dependencies = [ + "arrow-array 58.1.0", + "arrow-buffer 58.1.0", + "arrow-data 58.1.0", + "arrow-schema 58.1.0", + "arrow-select 58.1.0", "memchr", "num-traits", "regex", @@ -622,7 +785,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -633,7 +796,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -659,9 +822,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-lc-rs" -version = "1.15.3" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ "aws-lc-sys", "zeroize", @@ -669,9 +832,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.36.0" +version = "0.39.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" +checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" dependencies = [ 
"cc", "cmake", @@ -745,9 +908,9 @@ checksum = "230c5f1ca6a325a32553f8640d31ac9b49f2411e901e427570154868b46da4f7" [[package]] name = "bitflags" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" dependencies = [ "serde_core", ] @@ -790,9 +953,9 @@ checksum = "119771309b95163ec7aaf79810da82f7cd0599c19722d48b9c03894dca833966" [[package]] name = "bon" -version = "3.8.2" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234655ec178edd82b891e262ea7cf71f6584bcd09eff94db786be23f1821825c" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" dependencies = [ "bon-macros", "rustversion", @@ -800,9 +963,9 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.8.2" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ec27229c38ed0eb3c0feee3d2c1d6a4379ae44f418a29a658890e062d8f365" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" dependencies = [ "darling 0.23.0", "ident_case", @@ -810,30 +973,31 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "borsh" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1da5ab77c1437701eeff7c88d968729e7766172279eab0676857b3d63af7a6f" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" dependencies = [ "borsh-derive", + "bytes", "cfg_aliases", ] [[package]] name = "borsh-derive" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0686c856aa6aac0c4498f936d7d6a02df690f614c03e4d906d1018062b5c5e2c" +checksum = "bfcfdc083699101d5a7965e49925975f2f55060f94f9a05e7187be95d530ca59" dependencies = [ "once_cell", "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -868,9 +1032,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "bytecheck" @@ -896,9 +1060,9 @@ dependencies = [ [[package]] name = "bytemuck" -version = "1.24.0" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" [[package]] name = "byteorder" @@ -924,11 +1088,17 @@ dependencies = [ "bytes", ] +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cc" -version = "1.2.52" +version = "1.2.59" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "b7a4d3ec6524d28a329fc53654bbadc9bdd7b0431f5d65f1a56ffb28a1ee5283" dependencies = [ "find-msvc-tools", "jobserver", @@ -950,9 +1120,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = 
"chrono" -version = "0.4.43" +version = "0.4.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" dependencies = [ "iana-time-zone", "js-sys", @@ -964,9 +1134,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.54" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -974,9 +1144,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.54" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ "anstream", "anstyle", @@ -986,27 +1156,27 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.49" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "clap_lex" -version = "0.7.7" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "clickhouse" -version = "0.14.2" +version = "0.14.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d975a05171c6f8a453f60ec6287c0018c90911d5a8a46d9b6abe386ea359fab3" +checksum = "0bfb36e41b644dcd5be4ef54a3b7d2abc9bb07eda777ab3f90d1b0dbb97c940a" dependencies = [ "bnum", "bstr", @@ -1023,7 +1193,7 @@ dependencies = [ "quanta", "rustls", "serde", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "url", ] @@ -1037,33 +1207,43 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "clickhouse-types" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "358fbfd439fb0bed02a3e2ecc5131f6a9d039ba5639aed650cf0e845f6ebfc16" +checksum = "30a5efddc880ce9e2573bd867413d9056fa2bea0206af88dec21e72178b9dc74" dependencies = [ "bytes", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] name = "cmake" -version = "0.1.57" +version = "0.1.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" dependencies = [ "cc", ] [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" 
+dependencies = [ + "unicode-segmentation", + "unicode-width", +] [[package]] name = "concurrent-queue" @@ -1089,9 +1269,9 @@ dependencies = [ [[package]] name = "configcat" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07c24f431a8fe2bc8e7b1ede60acd0a57df8512b2a80a1cbc7ee349961974fc4" +checksum = "9836fc676f74106765176c8fd0a3295e473fdb3bbeb97b3ab68575b1e5173543" dependencies = [ "arc-swap", "base16ct", @@ -1277,16 +1457,6 @@ dependencies = [ "darling_macro 0.20.11", ] -[[package]] -name = "darling" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" -dependencies = [ - "darling_core 0.21.3", - "darling_macro 0.21.3", -] - [[package]] name = "darling" version = "0.23.0" @@ -1308,21 +1478,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.114", -] - -[[package]] -name = "darling_core" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1335,7 +1491,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1346,18 +1502,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.114", -] - -[[package]] -name = "darling_macro" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" -dependencies = [ - "darling_core 0.21.3", - "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1368,27 +1513,9 @@ checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ "darling_core 0.23.0", "quote", - "syn 2.0.114", -] - -[[package]] -name = "deadpool" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0be2b1d1d6ec8d846f05e137292d0b89133caf95ef33695424c09568bdd39b1b" -dependencies = [ - "deadpool-runtime", - "lazy_static", - "num_cpus", - "tokio", + "syn 2.0.117", ] -[[package]] -name = "deadpool-runtime" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" - [[package]] name = "debugid" version = "0.8.0" @@ -1408,7 +1535,17 @@ dependencies = [ "const-oid", "der_derive", "flagset", - "pem-rfc7468", + "pem-rfc7468 0.7.0", + "zeroize", +] + +[[package]] +name = "der" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "71fd89660b2dc699704064e59e9dba0147b903e85319429e131620d022be411b" +dependencies = [ + "pem-rfc7468 1.0.0", "zeroize", ] @@ -1420,14 +1557,14 @@ checksum = "8034092389675178f570469e6c3b0465d3d30b4505c294a6550db47f3c17ad18" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "deranged" -version = "0.5.5" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", "serde_core", @@ -1441,7 +1578,7 @@ checksum = 
"1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1462,7 +1599,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1472,7 +1609,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1494,7 +1631,7 @@ dependencies = [ "proc-macro2", "quote", "rustc_version", - "syn 2.0.114", + "syn 2.0.117", "unicode-xid", ] @@ -1512,9 +1649,9 @@ dependencies = [ [[package]] name = "dispatch2" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89a09f22a6c6069a18470eb92d2298acf25463f14256d24778e1230d789a2aec" +checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" dependencies = [ "bitflags", "objc2", @@ -1528,14 +1665,14 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "dissimilar" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8975ffdaa0ef3661bfe02dbdcc06c9f829dfafe6a3c474de366a8d5e44276921" +checksum = "aeda16ab4059c5fd2a83f2b9c9e9c981327b18aa8e3b313f7e6563799d4f093e" [[package]] name = "dlv-list" @@ -1552,6 +1689,24 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "duckdb" +version = "1.10501.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f13bc6d6487032fc2825a62ef8b4924b2378a2eb3166e132e5f3141ae9dd633f" +dependencies = [ + "arrow 58.1.0", + "cast", + "fallible-iterator 0.3.0", + "fallible-streaming-iterator", + "hashlink 0.10.0", + "libduckdb-sys", + "num-integer", + "r2d2", + "rust_decimal", + "strum", +] + [[package]] name = "dunce" version = "1.0.5" @@ -1573,7 +1728,7 @@ dependencies = [ "enum-ordinalize", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1611,7 +1766,7 @@ checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -1661,6 +1816,7 @@ dependencies = [ "postgres-replication", "rand 0.9.2", "rustls", + "serde", "serde_json", "sqlx", "sysinfo", @@ -1705,10 +1861,11 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "tracing-actix-web", + "url", "utoipa", "utoipa-swagger-ui", "uuid", @@ -1738,11 +1895,11 @@ dependencies = [ "secrecy", "serde", "serde_json", - "serde_yaml", "sqlx", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio-postgres", + "url", "utoipa", ] @@ -1750,11 +1907,12 @@ dependencies = [ name = "etl-destinations" version = "0.1.0" dependencies = [ - "arrow", + "arrow 57.3.0", "async-trait", "base64", "chrono", "clickhouse", + "duckdb", "etl", "etl-telemetry", "futures", @@ -1764,15 +1922,20 @@ dependencies = [ "metrics", "parking_lot", "parquet", + "pg_escape", "prost", + "r2d2", "rand 0.9.2", "reqwest", "rustls", "serde", "serde_json", + "tempfile", "tokio", + "tokio-postgres", "tonic", "tracing", + "url", "uuid", ] @@ -1782,12 +1945,15 @@ version = "0.1.0" 
dependencies = [ "clap", "etl", + "etl-config", "etl-destinations", + "etl-telemetry", "rustls", "sysinfo", "tokio", "tracing", "tracing-subscriber", + "url", ] [[package]] @@ -1801,7 +1967,7 @@ dependencies = [ "serde", "serde_json", "sqlx", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tokio-postgres", "tracing", @@ -1829,6 +1995,7 @@ dependencies = [ "tikv-jemallocator", "tokio", "tracing", + "url", ] [[package]] @@ -1837,8 +2004,9 @@ version = "0.1.0" dependencies = [ "etl-config", "metrics-exporter-prometheus", + "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "tracing-appender", @@ -1894,17 +2062,40 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + [[package]] name = "fastrand" -version = "2.3.0" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "filetime" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" +dependencies = [ + "cfg-if", + "libc", + "libredox", +] [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "findshlibs" @@ -1942,9 +2133,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", @@ -2018,9 +2209,9 @@ checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" [[package]] name = "futures" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" dependencies = [ "futures-channel", "futures-core", @@ -2033,9 +2224,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" dependencies = [ "futures-core", "futures-sink", @@ -2043,15 +2234,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" [[package]] name = "futures-executor" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" dependencies = [ "futures-core", "futures-task", @@ -2071,38 +2262,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" [[package]] name = "futures-macro" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "futures-sink" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" [[package]] name = "futures-task" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" [[package]] name = "futures-util" -version = "0.3.31" +version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ "futures-channel", "futures-core", @@ -2112,18 +2303,16 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", - "pin-utils", "slab", ] [[package]] name = "gcp-bigquery-client" version = "0.28.0" -source = "git+https://github.com/iambriccardo/gcp-bigquery-client?rev=81ea3352af2e5fcbf04cd0ae47572d5ae97f992a#81ea3352af2e5fcbf04cd0ae47572d5ae97f992a" +source = "git+https://github.com/iambriccardo/gcp-bigquery-client?rev=c4fc59e338ca181d29b0dd53cac786fbe8513633#c4fc59e338ca181d29b0dd53cac786fbe8513633" dependencies = [ "async-stream", "async-trait", - "deadpool", "dyn-clone", "futures", "hyper-util", @@ -2135,7 +2324,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "tokio", "tokio-stream", @@ -2180,11 +2369,24 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + [[package]] name = "gimli" version = "0.32.3" @@ -2215,7 +2417,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.13.0", + "indexmap 2.13.1", "slab", "tokio", "tokio-util", @@ -2234,7 +2436,7 @@ dependencies = [ "futures-core", 
"futures-sink", "http 1.4.0", - "indexmap 2.13.0", + "indexmap 2.13.1", "slab", "tokio", "tokio-util", @@ -2340,12 +2542,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - [[package]] name = "hex" version = "0.4.3" @@ -2459,9 +2655,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hyper" -version = "1.8.1" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +checksum = "6299f016b246a94207e63da54dbe807655bf9e00044f73ded42c3ac5305fbcca" dependencies = [ "atomic-waker", "bytes", @@ -2474,7 +2670,6 @@ dependencies = [ "httpdate", "itoa", "pin-project-lite", - "pin-utils", "smallvec", "tokio", "want", @@ -2516,7 +2711,7 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.5", + "webpki-roots 1.0.6", ] [[package]] @@ -2550,14 +2745,13 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64", "bytes", "futures-channel", - "futures-core", "futures-util", "http 1.4.0", "http-body", @@ -2566,7 +2760,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.1", + "socket2 0.6.3", "system-configuration", "tokio", "tower-service", @@ -2576,9 +2770,9 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.64" +version = "0.1.65" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33e57f83510bb73707521ebaffa789ec8caf86f9657cad665b092b581d40e9fb" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" dependencies = [ "android_system_properties", "core-foundation-sys", @@ -2607,14 +2801,14 @@ dependencies = [ "anyhow", "apache-avro", "array-init", - "arrow-arith", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-ord", - "arrow-schema", - "arrow-select", - "arrow-string", + "arrow-arith 57.3.0", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-ord 57.3.0", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", + "arrow-string 57.3.0", "as-any", "async-trait", "backon", @@ -2627,7 +2821,7 @@ dependencies = [ "flate2", "fnv", "futures", - "itertools", + "itertools 0.13.0", "moka", "murmur3", "num-bigint", @@ -2664,7 +2858,7 @@ dependencies = [ "chrono", "http 1.4.0", "iceberg", - "itertools", + "itertools 0.13.0", "reqwest", "serde", "serde_derive", @@ -2677,12 +2871,13 @@ dependencies = [ [[package]] name = "icu_collections" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "utf8_iter", "yoke", "zerofrom", "zerovec", @@ -2690,9 +2885,9 @@ dependencies = [ [[package]] name = "icu_locale_core" -version = "2.1.1" +version = "2.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", @@ -2703,9 +2898,9 @@ dependencies = [ [[package]] name = "icu_normalizer" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" dependencies = [ "icu_collections", "icu_normalizer_data", @@ -2717,15 +2912,15 @@ dependencies = [ [[package]] name = "icu_normalizer_data" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" [[package]] name = "icu_properties" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" dependencies = [ "icu_collections", "icu_locale_core", @@ -2737,15 +2932,15 @@ dependencies = [ [[package]] name = "icu_properties_data" -version = "2.1.2" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" [[package]] name = "icu_provider" -version = "2.1.1" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", @@ -2756,6 +2951,12 @@ dependencies = [ "zerovec", ] +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + [[package]] name = "ident_case" version = "1.0.1" @@ -2802,9 +3003,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.13.0" +version = "2.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +checksum = "45a8a2b9cb3e0b0c1803dbb0758ffac5de2f425b23c28f518faabd9d805342ff" dependencies = [ "equivalent", "hashbrown 0.16.1", @@ -2814,9 +3015,9 @@ dependencies = [ [[package]] name = "insta" -version = "1.46.0" +version = "1.47.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b66886d14d18d420ab5052cbff544fc5d34d0b2cdd35eb5976aaa10a4a472e5" +checksum = "7b4a6248eb93a4401ed2f37dfe8ea592d3cf05b7cf4f8efa867b6895af7e094e" dependencies = [ "once_cell", "pest", @@ -2834,15 +3035,15 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.11.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.12" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" dependencies = [ "memchr", "serde", @@ -2863,17 +3064,26 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" -version = "0.2.18" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" dependencies = [ "jiff-static", "jiff-tzdb-platform", @@ -2886,20 +3096,20 @@ dependencies = [ [[package]] name = "jiff-static" -version = "0.2.18" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c84ee7f197eca9a86c6fd6cb771e55eb991632f15f2bc3ca6ec838929e6e78" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "jiff-tzdb" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68971ebff725b9e2ca27a601c5eb38a4c5d64422c4cbab0c535f248087eda5c2" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" [[package]] name = "jiff-tzdb-platform" @@ -2922,10 +3132,12 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.85" +version = "0.3.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +checksum = "2e04e2ef80ce82e13552136fabeef8a5ed1f985a96805761cbb9a2c34e7664d9" dependencies = [ + "cfg-if", + "futures-util", "once_cell", "wasm-bindgen", ] @@ -2952,7 +3164,7 @@ dependencies = [ "pest_derive", "regex", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -3019,7 +3231,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tokio-util", "tower", @@ -3043,7 +3255,7 @@ dependencies = [ "serde", "serde-value", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -3057,7 +3269,7 @@ dependencies = [ "quote", "serde", "serde_json", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -3081,7 +3293,7 @@ dependencies = [ "pin-project", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tokio-util", "tracing", @@ -3102,6 +3314,12 @@ dependencies = [ "spin", ] +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + [[package]] name = "lexical-core" version = "1.0.6" @@ -3161,25 +3379,43 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.180" +version = "0.2.184" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" +checksum = "48f5d2a454e16a5ea0f4ced81bd44e4cfc7bd3a507b61887c99fd3538b28e4af" + +[[package]] +name = "libduckdb-sys" +version = "1.10501.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12096c1694924782b3fe21e790630b77bacb4fcb7ad9d7ee0fec626f985bf248" +dependencies = [ + "cc", + "flate2", + "pkg-config", + "reqwest", + "serde", + "serde_json", + "tar", + "vcpkg", + "zip 6.0.0", +] [[package]] name = "libm" -version = "0.2.15" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" dependencies = [ "bitflags", "libc", - "redox_syscall 0.7.0", + "plain", + "redox_syscall 0.7.3", ] [[package]] @@ -3194,15 +3430,15 @@ dependencies = [ [[package]] name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" [[package]] name = "local-waker" @@ -3233,9 +3469,9 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" -version = "0.12.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" dependencies = [ "twox-hash", ] @@ -3277,9 +3513,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.6" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "metrics" @@ -3301,12 +3537,12 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", - "indexmap 2.13.0", + "indexmap 2.13.1", "ipnet", "metrics", "metrics-util", "quanta", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", ] @@ -3361,9 +3597,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "log", @@ -3373,9 +3609,9 @@ dependencies = [ [[package]] name = "moka" -version = "0.12.12" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3dec6bd31b08944e08b58fd99373893a6c17054d6f3ea5006cc894f4f4eee2a" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" 
dependencies = [ "async-lock", "crossbeam-channel", @@ -3411,17 +3647,17 @@ checksum = "e94e1e6445d314f972ff7395df2de295fe51b71821694f0b0e1e79c4f12c8577" [[package]] name = "native-tls" -version = "0.2.14" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" dependencies = [ "libc", "log", "openssl", - "openssl-probe 0.1.6", + "openssl-probe 0.2.1", "openssl-sys", "schannel", - "security-framework 2.11.1", + "security-framework 3.7.0", "security-framework-sys", "tempfile", ] @@ -3510,9 +3746,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" +checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" [[package]] name = "num-integer" @@ -3544,16 +3780,6 @@ dependencies = [ "libm", ] -[[package]] -name = "num_cpus" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "num_threads" version = "0.1.7" @@ -3565,18 +3791,18 @@ dependencies = [ [[package]] name = "objc2" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7c2599ce0ec54857b29ce62166b0ed9b4f6f1a70ccc9a71165b6154caca8c05" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" dependencies = [ "objc2-encode", ] [[package]] name = "objc2-cloud-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17614fdcd9b411e6ff1117dfb1d0150f908ba83a7df81b1f118005fe0a8ea15d" +checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c" dependencies = [ "bitflags", "objc2", @@ -3585,9 +3811,9 @@ dependencies = [ [[package]] name = "objc2-core-data" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291fbbf7d29287518e8686417cf7239c74700fd4b607623140a7d4a3c834329d" +checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa" dependencies = [ "objc2", "objc2-foundation", @@ -3595,9 +3821,9 @@ dependencies = [ [[package]] name = "objc2-core-foundation" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c10c2894a6fed806ade6027bcd50662746363a9589d3ec9d9bef30a4e4bc166" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" dependencies = [ "bitflags", "dispatch2", @@ -3606,9 +3832,9 @@ dependencies = [ [[package]] name = "objc2-core-graphics" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "989c6c68c13021b5c2d6b71456ebb0f9dc78d752e86a98da7c716f4f9470f5a4" +checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" dependencies = [ "bitflags", "dispatch2", @@ -3619,9 +3845,9 @@ dependencies = [ [[package]] name = "objc2-core-image" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79b3dc0cc4386b6ccf21c157591b34a7f44c8e75b064f85502901ab2188c007e" +checksum = 
"e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006" dependencies = [ "objc2", "objc2-foundation", @@ -3629,14 +3855,26 @@ dependencies = [ [[package]] name = "objc2-core-location" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac0f75792558aa9d618443bbb5db7426a7a0b6fddf96903f86ef9ad02e135740" +checksum = "ca347214e24bc973fc025fd0d36ebb179ff30536ed1f80252706db19ee452009" dependencies = [ "objc2", "objc2-foundation", ] +[[package]] +name = "objc2-core-text" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", + "objc2-core-graphics", +] + [[package]] name = "objc2-encode" version = "4.1.0" @@ -3645,9 +3883,9 @@ checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" [[package]] name = "objc2-foundation" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900831247d2fe1a09a683278e5384cfb8c80c79fe6b166f9d14bfdde0ea1b03c" +checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" dependencies = [ "bitflags", "block2", @@ -3658,9 +3896,9 @@ dependencies = [ [[package]] name = "objc2-io-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c1c64d6120e51cd86033f67176b1cb66780c2efe34dec55176f77befd93c0a" +checksum = "33fafba39597d6dc1fb709123dfa8289d39406734be322956a69f0931c73bb15" dependencies = [ "libc", "objc2-core-foundation", @@ -3668,9 +3906,9 @@ dependencies = [ [[package]] name = "objc2-io-surface" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7282e9ac92529fa3457ce90ebb15f4ecbc383e8338060960760fa2cf75420c3c" +checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" dependencies = [ "bitflags", "objc2", @@ -3679,9 +3917,9 @@ dependencies = [ [[package]] name = "objc2-quartz-core" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ffb6a0cd5f182dc964334388560b12a57f7b74b3e2dec5e2722aa2dfb2ccd5" +checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f" dependencies = [ "bitflags", "objc2", @@ -3691,9 +3929,9 @@ dependencies = [ [[package]] name = "objc2-ui-kit" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25b1312ad7bc8a0e92adae17aa10f90aae1fb618832f9b993b022b591027daed" +checksum = "d87d638e33c06f577498cbcc50491496a3ed4246998a7fbba7ccb98b1e7eab22" dependencies = [ "bitflags", "block2", @@ -3704,6 +3942,7 @@ dependencies = [ "objc2-core-graphics", "objc2-core-image", "objc2-core-location", + "objc2-core-text", "objc2-foundation", "objc2-quartz-core", "objc2-user-notifications", @@ -3711,9 +3950,9 @@ dependencies = [ [[package]] name = "objc2-user-notifications" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a3f5ec77a81d9e0c5a0b32159b0cb143d7086165e79708351e02bf37dfc65cd" +checksum = "9df9128cbbfef73cda168416ccf7f837b62737d748333bfe9ab71c245d76613e" dependencies = [ "objc2", "objc2-foundation", @@ -3730,9 +3969,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -3771,9 +4010,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.75" +version = "0.10.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" dependencies = [ "bitflags", "cfg-if", @@ -3792,7 +4031,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -3803,15 +4042,15 @@ checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "openssl-probe" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" [[package]] name = "openssl-sys" -version = "0.9.111" +version = "0.9.112" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +checksum = "57d55af3b3e226502be1526dfdba67ab0e9c96fc293004e79576b2b9edb0dbdb" dependencies = [ "cc", "libc", @@ -3894,18 +4133,18 @@ dependencies = [ [[package]] name = "parquet" -version = "57.2.0" +version = "57.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6a2926a30477c0b95fea6c28c3072712b139337a242c2cc64817bdc20a8854" +checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" dependencies = [ "ahash 0.8.12", - "arrow-array", - "arrow-buffer", - "arrow-cast", - "arrow-data", + "arrow-array 57.3.0", + "arrow-buffer 57.3.0", + "arrow-cast 57.3.0", + "arrow-data 57.3.0", "arrow-ipc", - "arrow-schema", - "arrow-select", + "arrow-schema 57.3.0", + "arrow-select 57.3.0", "base64", "brotli", "bytes", @@ -3959,6 +4198,15 @@ dependencies = [ "base64ct", ] +[[package]] +name = "pem-rfc7468" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6305423e0e7738146434843d1694d621cce767262b2a86910beab705e4493d9" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -3967,9 +4215,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pest" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" +checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662" dependencies = [ "memchr", "ucd-trie", @@ -3977,9 +4225,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" +checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77" dependencies = [ "pest", "pest_generator", @@ -3987,22 +4235,22 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" +checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "pest_meta" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" +checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" dependencies = [ "pest", "sha2", @@ -4016,7 +4264,7 @@ checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" dependencies = [ "fixedbitset", "hashbrown 0.15.5", - "indexmap 2.13.0", + "indexmap 2.13.1", ] [[package]] @@ -4058,7 +4306,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -4072,35 +4320,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "pin-project-lite" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" - -[[package]] -name = "pin-utils" -version = "0.1.0" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" [[package]] name = "pkcs1" @@ -4108,7 +4350,7 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" dependencies = [ - "der", + "der 0.7.10", "pkcs8", "spki", ] @@ -4119,7 +4361,7 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" dependencies = [ - "der", + "der 0.7.10", "spki", ] @@ -4129,6 +4371,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "polonius-the-crab" version = "0.5.0" @@ -4141,15 +4389,15 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.4" +version = "0.2.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" dependencies = [ "portable-atomic", ] @@ -4162,7 +4410,7 @@ dependencies = [ "base64", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "hmac", "md-5", "memchr", @@ -4193,7 +4441,7 @@ source = "git+https://github.com/MaterializeInc/rust-postgres?rev=c4b473b478b3ad dependencies = [ "bytes", "chrono", - "fallible-iterator", + "fallible-iterator 0.2.0", "postgres-protocol", "serde", "serde_json", @@ -4202,9 +4450,9 @@ dependencies = [ [[package]] name = "potential_utf" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" dependencies = [ "zerovec", ] @@ -4231,23 +4479,23 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ "toml_edit", ] [[package]] name = "proc-macro2" -version = "1.0.105" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "535d180e0ecab6268a3e718bb9fd44db66bbbc256257165fc699dadf70d16fe7" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" dependencies = [ "unicode-ident", ] @@ -4269,7 +4517,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools", + "itertools 0.14.0", "log", "multimap", "petgraph", @@ -4279,7 +4527,7 @@ dependencies = [ "pulldown-cmark", "pulldown-cmark-to-cmark", "regex", - "syn 2.0.114", + "syn 2.0.117", "tempfile", ] @@ -4290,10 +4538,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools", + "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -4327,9 +4575,9 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.13.0" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" dependencies = [ "bitflags", "memchr", @@ -4399,8 +4647,8 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.1", - "thiserror 2.0.17", + "socket2 0.6.3", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -4408,9 +4656,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.13" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "bytes", "getrandom 0.3.4", @@ 
-4421,7 +4669,7 @@ dependencies = [ "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -4436,16 +4684,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.1", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.43" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc74d9a594b72ae6656596548f56f667211f8a97b3d4c3d467150794690dc40a" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -4456,6 +4704,23 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "r2d2" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51de85fb3fb6524929c8a2eb85e6b6d363de4e8c48f9e2c2eac4944abc181c93" +dependencies = [ + "log", + "parking_lot", + "scheduled-thread-pool", +] + [[package]] name = "radium" version = "0.7.0" @@ -4550,9 +4815,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" dependencies = [ "bitflags", ] @@ -4574,14 +4839,14 @@ checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -4591,9 +4856,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -4602,15 +4867,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "rend" @@ -4695,7 +4960,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.5", + "webpki-roots 1.0.6", ] [[package]] @@ -4791,7 +5056,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.114", + "syn 2.0.117", "walkdir", ] @@ -4817,9 +5082,9 @@ dependencies = 
[ [[package]] name = "rust_decimal" -version = "1.40.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61f703d19852dbf87cbc513643fa81428361eb6940f1ac14fd58155d295a3eb0" +checksum = "2ce901f9a19d251159075a4c37af514c3b8ef99c22e02dd8c19161cf397ee94a" dependencies = [ "arrayvec", "borsh", @@ -4829,19 +5094,20 @@ dependencies = [ "rkyv", "serde", "serde_json", + "wasm-bindgen", ] [[package]] name = "rustc-demangle" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" [[package]] name = "rustc-hash" -version = "2.1.1" +version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" [[package]] name = "rustc_version" @@ -4854,9 +5120,9 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags", "errno", @@ -4867,9 +5133,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.36" +version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ "aws-lc-rs", "log", @@ -4900,10 +5166,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe 0.2.0", + "openssl-probe 0.2.1", "rustls-pki-types", "schannel", - "security-framework 3.5.1", + "security-framework 3.7.0", ] [[package]] @@ -4917,9 +5183,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -4927,9 +5193,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "aws-lc-rs", "ring", @@ -4945,9 +5211,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.22" +version = "1.0.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "same-file" @@ -4960,13 +5226,22 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "scheduled-thread-pool" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbc66816425a074528352f5789333ecff06ca41b36b0b0efdfbb29edc391a19" +dependencies = [ + "parking_lot", +] + [[package]] name = "schemars" version = "0.8.22" @@ -4993,9 +5268,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e910108742c57a770f492731f99be216a52fadd361b06c8fb59d74ccc267d2" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "dyn-clone", "ref-cast", @@ -5012,7 +5287,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5052,9 +5327,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.5.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", "core-foundation 0.10.1", @@ -5065,9 +5340,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -5075,9 +5350,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.27" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" [[package]] name = "sentry" @@ -5194,7 +5469,7 @@ dependencies = [ "rand 0.9.2", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "url", "uuid", @@ -5253,7 +5528,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5264,7 +5539,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5288,7 +5563,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5305,17 +5580,17 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "base64", "chrono", "hex", "indexmap 1.9.3", - "indexmap 2.13.0", + "indexmap 2.13.1", "schemars 0.9.0", - "schemars 1.2.0", + "schemars 1.2.1", "serde_core", "serde_json", "serde_with_macros", @@ -5324,14 +5599,14 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.18.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ - "darling 0.21.3", + "darling 0.23.0", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5340,7 +5615,7 @@ version = "0.9.34+deprecated" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "itoa", "ryu", "serde", @@ -5406,9 +5681,9 @@ dependencies = [ [[package]] name = "simd-adler32" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" [[package]] name = "simdutf8" @@ -5424,21 +5699,21 @@ checksum = "bbbb5d9659141646ae647b42fe094daf6c6192d1620870b449d9557f748b2daa" [[package]] name = "siphasher" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "sketches-ddsketch" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -5467,12 +5742,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17129e116933cf371d018bb80ae557e889637989d8638274fb25622827b03881" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5491,7 +5766,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" dependencies = [ "base64ct", - "der", + "der 0.7.10", ] [[package]] @@ -5525,7 +5800,7 @@ dependencies = [ "futures-util", "hashbrown 0.15.5", "hashlink 0.10.0", - "indexmap 2.13.0", + "indexmap 2.13.1", "log", "memchr", "once_cell", @@ -5535,7 +5810,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tokio-stream", "tracing", @@ -5553,7 +5828,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5576,7 +5851,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.114", + "syn 2.0.117", "tokio", "url", ] @@ -5618,7 +5893,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.17", + "thiserror 2.0.18", "tracing", "whoami", ] @@ -5655,7 +5930,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.17", + "thiserror 2.0.18", "tracing", "whoami", ] @@ -5679,7 +5954,7 @@ 
dependencies = [ "serde", "serde_urlencoded", "sqlx-core", - "thiserror 2.0.17", + "thiserror 2.0.18", "tracing", "url", ] @@ -5731,7 +6006,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5753,9 +6028,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.114" +version = "2.0.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" dependencies = [ "proc-macro2", "quote", @@ -5779,14 +6054,14 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "sysinfo" -version = "0.38.2" +version = "0.38.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1efc19935b4b66baa6f654ac7924c192f55b175c00a7ab72410fc24284dacda8" +checksum = "92ab6a2f8bfe508deb3c6406578252e491d299cbbf3bc0529ecc3313aee4a52f" dependencies = [ "libc", "memchr", @@ -5798,9 +6073,9 @@ dependencies = [ [[package]] name = "system-configuration" -version = "0.6.1" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ "bitflags", "core-foundation 0.9.4", @@ -5829,14 +6104,25 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" +[[package]] +name = "tar" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" -version = "3.24.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix", "windows-sys 0.61.2", @@ -5853,11 +6139,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -5868,18 +6154,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -5977,9 +6263,9 @@ dependencies = [ [[package]] name = "tinystr" -version = "0.8.2" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", "zerovec", @@ -5987,9 +6273,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ "tinyvec_macros", ] @@ -6002,9 +6288,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.49.0" +version = "1.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +checksum = "2bd1c4c0fc4a7ab90fc15ef6daaa3ec3b893f004f915f2392557ed23237820cd" dependencies = [ "bytes", "libc", @@ -6012,20 +6298,20 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.1", + "socket2 0.6.3", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -6046,7 +6332,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "fallible-iterator", + "fallible-iterator 0.2.0", "futures-channel", "futures-util", "log", @@ -6101,20 +6387,20 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.1.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +checksum = "3165f65f62e28e0115a00b2ebdd37eb6f3b641855f9d636d3cd4103767159ad7" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "a82418ca169e235e6c399a84e395ab6debeb3bc90edc959bf0f48647c6a32d1b" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "toml_datetime", "toml_parser", "winnow", @@ -6122,18 +6408,18 @@ dependencies = [ [[package]] name = "toml_parser" -version = "1.0.6+spec-1.1.0" +version = "1.1.2+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" +checksum = "a2abe9b86193656635d2411dc43050282ca48aa31c2451210f4202550afb7526" dependencies = [ "winnow", ] [[package]] name = "tonic" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203" +checksum = "fec7c61a0695dc1887c1b53952990f3ad2e3a31453e1f49f10e75424943a93ec" dependencies = [ "async-trait", "base64", @@ -6161,21 +6447,21 @@ dependencies = [ [[package]] name = "tonic-build" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c40aaccc9f9eccf2cd82ebc111adc13030d23e887244bc9cfa5d1d636049de3" +checksum = 
"1882ac3bf5ef12877d7ed57aad87e75154c11931c2ba7e6cde5e22d63522c734" dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "tonic-prost" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67" +checksum = "a55376a0bbaa4975a3f10d009ad763d8f4108f067c7c2e74f3001fb49778d309" dependencies = [ "bytes", "prost", @@ -6184,16 +6470,16 @@ dependencies = [ [[package]] name = "tonic-prost-build" -version = "0.14.2" +version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4a16cba4043dc3ff43fcb3f96b4c5c154c64cbd18ca8dce2ab2c6a451d058a2" +checksum = "f3144df636917574672e93d0f56d7edec49f90305749c668df5101751bb8f95a" dependencies = [ "prettyplease", "proc-macro2", "prost-build", "prost-types", "quote", - "syn 2.0.114", + "syn 2.0.117", "tempfile", "tonic-build", ] @@ -6206,7 +6492,7 @@ checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" dependencies = [ "futures-core", "futures-util", - "indexmap 2.13.0", + "indexmap 2.13.1", "pin-project-lite", "slab", "sync_wrapper", @@ -6282,7 +6568,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "786d480bce6247ab75f005b14ae1624ad978d3029d9113f0a22fa1ac773faeaf" dependencies = [ "crossbeam-channel", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "tracing-subscriber", ] @@ -6295,7 +6581,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -6331,9 +6617,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.22" +version = "0.3.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +checksum = "cb7f578e5945fb242538965c2d0b04418d38ec25c79d160cd279bf0731c8d319" dependencies = [ "matchers", "nu-ansi-term", @@ -6379,7 +6665,7 @@ checksum = "3c36781cc0e46a83726d9879608e4cf6c2505237e263a8eb8c24502989cfdb28" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -6417,9 +6703,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-normalization" @@ -6438,9 +6724,15 @@ checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" [[package]] name = "unicode-segmentation" -version = "1.12.0" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" + +[[package]] +name = "unicode-width" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" [[package]] name = "unicode-xid" @@ -6462,26 +6754,26 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.1.4" +version = "3.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64", - "der", + "der 0.8.0", "log", "native-tls", "percent-encoding", "rustls-pki-types", "ureq-proto", - "utf-8", + "utf8-zero", "webpki-root-certs", ] [[package]] name = "ureq-proto" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64", "http 1.4.0", @@ -6503,10 +6795,10 @@ dependencies = [ ] [[package]] -name = "utf-8" -version = "0.7.6" +name = "utf8-zero" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" [[package]] name = "utf8_iter" @@ -6526,7 +6818,7 @@ version = "5.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2fcc29c80c21c31608227e0912b2d7fddba57ad76b606890627ba8ee7964e993" dependencies = [ - "indexmap 2.13.0", + "indexmap 2.13.1", "serde", "serde_json", "utoipa-gen", @@ -6541,7 +6833,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -6559,7 +6851,7 @@ dependencies = [ "serde_json", "utoipa", "utoipa-swagger-ui-vendored", - "zip", + "zip 3.0.0", ] [[package]] @@ -6570,11 +6862,11 @@ checksum = "e2eebbbfe4093922c2b6734d7c679ebfebd704a0d7e56dfcb0d05818ce28977d" [[package]] name = "uuid" -version = "1.19.0" +version = "1.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" +checksum = "5ac8b6f42ead25368cf5b098aeb3dc8a1a2c05a3eee8a9a1a68c640edbfc79d9" dependencies = [ - "getrandom 0.3.4", + "getrandom 0.4.2", "js-sys", "serde_core", "wasm-bindgen", @@ -6625,9 +6917,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" dependencies = [ "wit-bindgen", ] @@ -6640,36 +6941,33 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.108" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +checksum = "0551fc1bb415591e3372d0bc4780db7e587d84e2a7e79da121051c5c4b89d0b0" dependencies = [ "cfg-if", "once_cell", "rustversion", + "serde", "wasm-bindgen-macro", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.58" +version = "0.4.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +checksum = "03623de6905b7206edd0a75f69f747f134b7f0a2323392d664448bf2d3c5d87e" dependencies = [ - "cfg-if", - "futures-util", "js-sys", - "once_cell", "wasm-bindgen", - "web-sys", ] [[package]] name = "wasm-bindgen-macro" -version = "0.2.108" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +checksum = "7fbdf9a35adf44786aecd5ff89b4563a90325f9da0923236f6104e603c7e86be" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6677,26 +6975,48 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.108" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +checksum = "dca9693ef2bab6d4e6707234500350d8dad079eb508dca05530c85dc3a529ff2" dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.108" +version = "0.2.117" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +checksum = "39129a682a6d2d841b6c429d0c51e5cb0ed1a03829d8b3d1e69a011e62cb3d3b" dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.1", + "wasm-encoder", + "wasmparser", +] + [[package]] name = "wasm-streams" version = "0.4.2" @@ -6710,11 +7030,23 @@ dependencies = [ "web-sys", ] +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.13.1", + "semver", +] + [[package]] name = "web-sys" -version = "0.3.85" +version = "0.3.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +checksum = "cd70027e39b12f0849461e08ffc50b9cd7688d942c1c8e3c7b22273236b4dd0a" dependencies = [ "js-sys", "wasm-bindgen", @@ -6732,9 +7064,9 @@ dependencies = [ [[package]] name = "webpki-root-certs" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36a29fc0408b113f68cf32637857ab740edfafdf460c326cd2afaa2d84cc05dc" +checksum = "804f18a4ac2676ffb4e8b5b5fa9ae38af06df08162314f96a68d2a363e21a8ca" dependencies = [ "rustls-pki-types", ] @@ -6745,14 +7077,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.5", + "webpki-roots 1.0.6", ] [[package]] name = "webpki-roots" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" 
+checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" dependencies = [ "rustls-pki-types", ] @@ -6852,7 +7184,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -6863,7 +7195,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -7144,24 +7476,106 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.14" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "09dac053f1cd375980747450bfc7250c264eaae0583872e845c0c7cd578872b5" dependencies = [ "memchr", ] [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.1", + "prettyplease", + "syn 2.0.117", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.117", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.13.1", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.1", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] [[package]] name = "writeable" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "wyz" @@ -7179,10 +7593,34 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1301e935010a701ae5f8655edc0ad17c44bad3ac5ce8c39185f75453b720ae94" dependencies = [ "const-oid", - "der", + "der 0.7.10", 
"spki", ] +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + +[[package]] +name = "xtask" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "k8s-openapi", + "kube", + "schemars 0.8.22", + "serde", + "serde_json", + "tokio", +] + [[package]] name = "yaml-rust2" version = "0.8.1" @@ -7196,9 +7634,9 @@ dependencies = [ [[package]] name = "yoke" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +checksum = "abe8c5fda708d9ca3df187cae8bfb9ceda00dd96231bed36e445a1a48e66f9ca" dependencies = [ "stable_deref_trait", "yoke-derive", @@ -7207,13 +7645,13 @@ dependencies = [ [[package]] name = "yoke-derive" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", "synstructure", ] @@ -7236,7 +7674,7 @@ dependencies = [ "seahash", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "tokio", "url", @@ -7244,42 +7682,42 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.33" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" +checksum = "eed437bf9d6692032087e337407a86f04cd8d6a16a37199ed57949d415bd68e9" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.33" +version = "0.8.48" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" +checksum = "70e3cd084b1788766f53af483dd21f93881ff30d7320490ec3ef7526d203bad4" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] name = "zerofrom" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +checksum = "69faa1f2a1ea75661980b013019ed6687ed0e83d069bc1114e2cc74c6c04c4df" dependencies = [ "zerofrom-derive", ] [[package]] name = "zerofrom-derive" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", "synstructure", ] @@ -7291,9 +7729,9 @@ checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" [[package]] name = "zerotrie" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" dependencies = [ "displaydoc", "yoke", @@ -7302,9 +7740,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.5" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ "yoke", "zerofrom", @@ -7313,13 +7751,13 @@ dependencies = [ [[package]] name = "zerovec-derive" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" dependencies = [ "proc-macro2", "quote", - "syn 2.0.114", + "syn 2.0.117", ] [[package]] @@ -7331,22 +7769,36 @@ dependencies = [ "arbitrary", "crc32fast", "flate2", - "indexmap 2.13.0", + "indexmap 2.13.1", + "memchr", + "zopfli", +] + +[[package]] +name = "zip" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2a05c7c36fde6c09b08576c9f7fb4cda705990f73b58fe011abf7dfb24168b" +dependencies = [ + "arbitrary", + "crc32fast", + "flate2", + "indexmap 2.13.1", "memchr", "zopfli", ] [[package]] name = "zlib-rs" -version = "0.5.5" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zopfli" diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml index 86e7a5bac..a2451aaa4 100644 --- a/etl-destinations/Cargo.toml +++ b/etl-destinations/Cargo.toml @@ -87,7 +87,6 @@ tonic = { workspace = true, optional = true } tracing = { workspace = true, optional = true, default-features = true } url = { workspace = true, optional = true } uuid = { workspace = true, optional = true, features = ["v4"] } -parking_lot = { workspace = true, optional = true } [dev-dependencies] duckdb = { workspace = true, features = ["bundled"] } diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index 99ffa6030..7ab48b7b4 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -249,12 +249,24 @@ pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec) -> EtlRes // ClickHouse RowBinary UUID = two little-endian u64 (high bits then low bits). // Our bytes are in standard UUID big-endian order, so we split into two u64 // and write each in little-endian. 
- let high = u64::from_be_bytes(bytes[0..8].try_into().map_err(|e: std::array::TryFromSliceError| { - etl_error!(ErrorKind::ConversionError, "UUID high-half conversion failed", e) - })?); - let low = u64::from_be_bytes(bytes[8..16].try_into().map_err(|e: std::array::TryFromSliceError| { - etl_error!(ErrorKind::ConversionError, "UUID low-half conversion failed", e) - })?); + let high = u64::from_be_bytes(bytes[0..8].try_into().map_err( + |e: std::array::TryFromSliceError| { + etl_error!( + ErrorKind::ConversionError, + "UUID high-half conversion failed", + e + ) + }, + )?); + let low = u64::from_be_bytes(bytes[8..16].try_into().map_err( + |e: std::array::TryFromSliceError| { + etl_error!( + ErrorKind::ConversionError, + "UUID low-half conversion failed", + e + ) + }, + )?); buf.extend_from_slice(&high.to_le_bytes()); buf.extend_from_slice(&low.to_le_bytes()); } diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index c62dddbdb..cd6dae260 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -129,7 +129,10 @@ mod tests { #[test] fn test_scalar_type_mapping() { - assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::BOOL), "Boolean"); + assert_eq!( + postgres_column_type_to_clickhouse_sql(&Type::BOOL), + "Boolean" + ); assert_eq!( postgres_column_type_to_clickhouse_sql(&Type::CHAR), "String" ); @@ -245,7 +248,10 @@ mod tests { }, ]; let sql = build_create_table_sql("public_users", &schemas); - assert!(sql.contains("\"id\" Int32"), "id should be non-nullable Int32"); + assert!( + sql.contains("\"id\" Int32"), + "id should be non-nullable Int32" + ); assert!( sql.contains("\"name\" Nullable(String)"), "name should be Nullable(String)" diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index 9c7a6b445..e838ebe34 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -45,8 +45,7 @@ pub fn skip_if_missing_clickhouse_env_vars() -> bool { /// /// Panics if [`CLICKHOUSE_URL_ENV`] is not set. pub fn get_clickhouse_url() -> String { - std::env::var(CLICKHOUSE_URL_ENV) - .unwrap_or_else(|_| panic!("{CLICKHOUSE_URL_ENV} must be set")) + std::env::var(CLICKHOUSE_URL_ENV).unwrap_or_else(|_| panic!("{CLICKHOUSE_URL_ENV} must be set")) } /// Returns the ClickHouse user name from the environment. /// @@ -61,7 +60,9 @@ pub fn get_clickhouse_user() -> String { /// Returns the ClickHouse password from the environment, or `None` if unset. pub fn get_clickhouse_password() -> Option<String> { - std::env::var(CLICKHOUSE_PASSWORD_ENV).ok().filter(|s| !s.is_empty()) + std::env::var(CLICKHOUSE_PASSWORD_ENV) + .ok() + .filter(|s| !s.is_empty()) } /// Generates a unique database name for test isolation. @@ -121,10 +122,7 @@ impl ClickHouseTestDatabase { /// Drops the test database from ClickHouse.
pub async fn drop_database(&self) { self.root_client - .query(&format!( - "DROP DATABASE IF EXISTS `{}`", - self.database - )) + .query(&format!("DROP DATABASE IF EXISTS `{}`", self.database)) .execute() .await .expect("Failed to drop test ClickHouse database"); diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs index 7e6fb2599..5878d0c9a 100644 --- a/etl-destinations/tests/support/clickhouse.rs +++ b/etl-destinations/tests/support/clickhouse.rs @@ -31,11 +31,11 @@ pub struct AllTypesRow { pub json_col: String, pub integer_array_col: Vec<Option<i32>>, pub text_array_col: Vec<Option<String>>, - pub bytea_col: String, // hex-encoded + pub bytea_col: String, // hex-encoded pub inet_col: String, pub cidr_col: String, pub macaddr_col: String, - pub uuid_col: String, // via toString() in SELECT + pub uuid_col: String, // via toString() in SELECT pub cdc_operation: String, } diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index 94d14e3db..bedc0a0e1 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -203,8 +203,8 @@ async fn main_impl() -> Result<(), Box<dyn std::error::Error>> { // total_memory * memory_budget_ratio / max_table_sync_workers let max_bytes_per_insert = { let total_memory = MemorySnapshot::from_system(&mut sysinfo::System::new()).total(); - let budget = (total_memory as f64 * f64::from(BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO)) - as u64; + let budget = + (total_memory as f64 * f64::from(BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO)) as u64; (budget / u64::from(args.ch_args.max_table_sync_workers)).max(1) }; @@ -215,7 +215,9 @@ async fn main_impl() -> Result<(), Box<dyn std::error::Error>> { args.ch_args.ch_user, args.ch_args.ch_password, args.ch_args.ch_database, - ClickHouseInserterConfig { max_bytes_per_insert }, + ClickHouseInserterConfig { + max_bytes_per_insert, + }, store.clone(), )?; diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 3bc55c443..9c271da28 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -145,7 +145,6 @@ pub async fn start_replicator_with_config( let pipeline = Pipeline::new(replicator_config.pipeline, state_store, destination); start_pipeline(pipeline).await?; } -<<<<<<< HEAD DestinationConfig::Ducklake { catalog_url, data_path, From 0be4e5ff1790164c60e929293a45a23b899a639c Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 7 Apr 2026 17:52:20 +0900 Subject: [PATCH 11/86] Adapt ClickHouse destination to async flushing API Main introduced AsyncResult parameters on the Destination trait (truncate_table, write_table_rows, write_events). Update the ClickHouse implementation to conform, add missing K8sClient trait stubs, and fix imports that drifted during rebase.
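The AsyncResult handles referenced here are defined on main and do not appear in this patch, so their exact shape is an assumption; conceptually each behaves like a oneshot completion channel, roughly:

use tokio::sync::oneshot;

// Hypothetical sketch of the completion handle; the real type lives in
// etl::destination::async_result and may differ in detail.
pub struct AsyncResult<T> {
    tx: oneshot::Sender<T>,
}

impl<T> AsyncResult<T> {
    pub fn send(self, value: T) {
        // The receiver may already be gone during shutdown; ignoring the
        // error keeps completion signaling fire-and-forget.
        let _ = self.tx.send(value);
    }
}

Under this assumed shape, a TruncateTableResult<()> would carry an EtlResult<()>: the trait method returns Ok(()) immediately, and the real outcome of the flush travels back through the handle, which is what the implementation below does.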
--- etl-api/src/k8s/core.rs | 12 +++++++++++ etl-destinations/src/clickhouse/core.rs | 28 ++++++++++++++++++++----- etl-replicator/src/core.rs | 9 +++++--- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/etl-api/src/k8s/core.rs b/etl-api/src/k8s/core.rs index 8bb0a3552..dff63f2ef 100644 --- a/etl-api/src/k8s/core.rs +++ b/etl-api/src/k8s/core.rs @@ -481,6 +481,14 @@ mod tests { Ok(()) } + async fn create_or_update_clickhouse_secret( + &self, + _prefix: &str, + _password: Option<&str>, + ) -> Result<(), K8sError> { + Ok(()) + } + async fn create_or_update_ducklake_secret( &self, prefix: &str, @@ -505,6 +513,10 @@ mod tests { Ok(()) } + async fn delete_clickhouse_secret(&self, _prefix: &str) -> Result<(), K8sError> { + Ok(()) + } + async fn delete_ducklake_secret(&self, prefix: &str) -> Result<(), K8sError> { self.calls .lock() diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 425db8a12..7c6610d7a 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -7,6 +7,9 @@ use crate::clickhouse::client::ClickHouseClient; use crate::clickhouse::encoding::{ClickHouseValue, cell_to_clickhouse_value}; use crate::clickhouse::metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}; use crate::clickhouse::schema::{build_create_table_sql, table_name_to_clickhouse_table_name}; +use etl::destination::async_result::{ + TruncateTableResult, WriteEventsResult, WriteTableRowsResult, +}; use etl::error::{ErrorKind, EtlResult}; use etl::etl_error; use etl::store::schema::SchemaStore; @@ -378,20 +381,35 @@ where "clickhouse" } - async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> { - self.truncate_table_inner(table_id).await + async fn truncate_table( + &self, + table_id: TableId, + async_result: TruncateTableResult<()>, + ) -> EtlResult<()> { + let result = self.truncate_table_inner(table_id).await; + async_result.send(result); + Ok(()) } async fn write_table_rows( &self, table_id: TableId, table_rows: Vec<TableRow>, + async_result: WriteTableRowsResult<()>, ) -> EtlResult<()> { - self.write_table_rows_inner(table_id, table_rows).await + let result = self.write_table_rows_inner(table_id, table_rows).await; + async_result.send(result); + Ok(()) } - async fn write_events(&self, events: Vec<Event>) -> EtlResult<()> { - self.write_events_inner(events).await + async fn write_events( + &self, + events: Vec<Event>, + async_result: WriteEventsResult<()>, + ) -> EtlResult<()> { + let result = self.write_events_inner(events).await; + async_result.send(result); + Ok(()) } } diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 9c271da28..8fa54a338 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -4,7 +4,6 @@ use crate::error::{ReplicatorError, ReplicatorResult}; use crate::error_notification::ErrorNotificationClient; use crate::error_reporting::ErrorReportingStateStore; use crate::metrics; -use crate::migrations::migrate_state_store; use crate::sentry::set_destination_tag; use etl::concurrency::memory_monitor::MemorySnapshot; use etl::config::MemoryBackpressureConfig; @@ -31,7 +30,7 @@ use etl_destinations::{ use secrecy::ExposeSecret; use sysinfo::{MemoryRefreshKind, RefreshKind, System}; use tokio::signal::unix::{SignalKind, signal}; -use tracing::{error, info, warn}; +use tracing::{debug, error, info, warn}; /// Starts the replicator service with the provided configuration.
/// @@ -313,7 +312,11 @@ fn log_destination_config(config: &DestinationConfig) { database, password: _, } => debug!(url, user, database, "using clickhouse destination config"), - DestinationConfig::Ducklake { catalog_url, data_path, .. } => { + DestinationConfig::Ducklake { + catalog_url, + data_path, + .. + } => { debug!(catalog_url, data_path, "using ducklake destination config") } } From 99dd0542bd4d8c6ad8bda3c8c1d9643a624dfb1d Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 7 Apr 2026 18:25:19 +0900 Subject: [PATCH 12/86] Remove unused log_config helpers from replicator These were carried over during the ClickHouse rebase but the startup path on main no longer calls them, triggering dead_code warnings. --- etl-replicator/src/core.rs | 111 +------------------------------------ 1 file changed, 2 insertions(+), 109 deletions(-) diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 8fa54a338..8b66aa764 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -14,9 +14,7 @@ use etl::store::schema::SchemaStore; use etl::store::state::StateStore; use etl::types::PipelineId; use etl::{config::IcebergConfig, destination::Destination}; -use etl_config::shared::{ - BatchConfig, DestinationConfig, PgConnectionConfig, PipelineConfig, ReplicatorConfig, -}; +use etl_config::shared::{DestinationConfig, PgConnectionConfig, ReplicatorConfig}; use etl_config::{Environment, parse_ducklake_url}; use etl_destinations::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig}; use etl_destinations::iceberg::{ @@ -30,7 +28,7 @@ use etl_destinations::{ use secrecy::ExposeSecret; use sysinfo::{MemoryRefreshKind, RefreshKind, System}; use tokio::signal::unix::{SignalKind, signal}; -use tracing::{debug, error, info, warn}; +use tracing::{error, info, warn}; /// Starts the replicator service with the provided configuration. /// @@ -248,111 +246,6 @@ pub fn create_props( props } -fn log_config(config: &ReplicatorConfig) { - log_destination_config(&config.destination); - log_pipeline_config(&config.pipeline); -} - -fn log_destination_config(config: &DestinationConfig) { - match config { - DestinationConfig::BigQuery { - project_id, - dataset_id, - service_account_key: _, - max_staleness_mins, - connection_pool_size, - } => { - debug!( - project_id, - dataset_id, - max_staleness_mins, - connection_pool_size, - "using bigquery destination config" - ) - } - DestinationConfig::Iceberg { - config: - IcebergConfig::Supabase { - namespace, - project_ref, - catalog_token: _, - warehouse_name, - s3_access_key_id: _, - s3_secret_access_key: _, - s3_region, - }, - } => { - debug!( - namespace, - project_ref, warehouse_name, s3_region, "using supabase iceberg destination config" - ) - } - DestinationConfig::Iceberg { - config: - IcebergConfig::Rest { - catalog_uri, - warehouse_name, - namespace, - s3_access_key_id: _, - s3_secret_access_key: _, - s3_endpoint, - }, - } => { - debug!( - catalog_uri, - warehouse_name, - namespace, - s3_endpoint, - "using generic rest iceberg destination config" - ) - } - DestinationConfig::ClickHouse { - url, - user, - database, - password: _, - } => debug!(url, user, database, "using clickhouse destination config"), - DestinationConfig::Ducklake { - catalog_url, - data_path, - .. 
- } => { - debug!(catalog_url, data_path, "using ducklake destination config") - } - } -} - -fn log_pipeline_config(config: &PipelineConfig) { - debug!( - pipeline_id = config.id, - publication_name = config.publication_name, - table_error_retry_delay_ms = config.table_error_retry_delay_ms, - max_table_sync_workers = config.max_table_sync_workers, - "pipeline config" - ); - log_pg_connection_config(&config.pg_connection); - log_batch_config(&config.batch); -} - -fn log_pg_connection_config(config: &PgConnectionConfig) { - debug!( - host = config.host, - port = config.port, - dbname = config.name, - username = config.username, - tls_enabled = config.tls.enabled, - "source postgres connection config", - ); -} - -fn log_batch_config(config: &BatchConfig) { - debug!( - max_fill_ms = config.max_fill_ms, - memory_budget_ratio = config.memory_budget_ratio, - "batch config" - ); -} - /// Initializes the state store. /// /// Creates a [`PostgresStore`] instance for the given pipeline and connection From 2831b72dbab38f4bdbeba68f757a03d9a9195e31 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 7 Apr 2026 18:29:22 +0900 Subject: [PATCH 13/86] Sort Cargo.toml files --- Cargo.toml | 2 +- etl-destinations/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2637972a7..7db83f553 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,8 +45,8 @@ base64 = { version = "0.22.1", default-features = false } byteorder = { version = "1.5.0", default-features = false } bytes = { version = "1.10.1" } chrono = { version = "0.4.41", default-features = false } -clickhouse = { version = "0.14", default-features = false } clap = { version = "4.5.42", default-features = false } +clickhouse = { version = "0.14", default-features = false } config = { version = "0.14", default-features = false } configcat = { version = "0.1.3", default-features = false } const-oid = { version = "0.9.6", default-features = false } diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml index a2451aaa4..85cef3cfd 100644 --- a/etl-destinations/Cargo.toml +++ b/etl-destinations/Cargo.toml @@ -51,7 +51,7 @@ clickhouse = [ "dep:tokio", "dep:serde", "dep:futures", - "dep:parking_lot" + "dep:parking_lot", ] egress = ["etl/egress"] # We assume that `test-utils` is always used in conjunction with `bigquery` or `iceberg` thus we only From 04ead7c981afb6531cb9261204ec1e5d32085d67 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 8 Apr 2026 20:55:17 +0900 Subject: [PATCH 14/86] Remove skip_if_missing_clickhouse_env_vars from tests Silently skipping tests when env vars are missing gives false confidence. The tests should fail explicitly so missing configuration is always noticed. --- etl-destinations/src/clickhouse/test_utils.rs | 23 ------------------- etl-destinations/tests/clickhouse_pipeline.rs | 12 +--------- 2 files changed, 1 insertion(+), 34 deletions(-) diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index e838ebe34..70662c9ef 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -16,29 +16,6 @@ pub const CLICKHOUSE_USER_ENV: &str = "TESTS_CLICKHOUSE_USER"; /// ClickHouse password (optional — omit or leave empty for passwordless access). pub const CLICKHOUSE_PASSWORD_ENV: &str = "TESTS_CLICKHOUSE_PASSWORD"; -/// Returns whether ClickHouse integration tests should be skipped. 
-/// -/// Prints a warning and returns `true` when any required env var is missing. -/// Required: [`CLICKHOUSE_URL_ENV`], [`CLICKHOUSE_USER_ENV`]. -/// Optional: [`CLICKHOUSE_PASSWORD_ENV`]. -pub fn skip_if_missing_clickhouse_env_vars() -> bool { - let missing: Vec<&str> = [CLICKHOUSE_URL_ENV, CLICKHOUSE_USER_ENV] - .iter() - .copied() - .filter(|var| std::env::var_os(var).is_none()) - .collect(); - - if missing.is_empty() { - return false; - } - - eprintln!( - "skipping clickhouse integration test: missing {}", - missing.join(", ") - ); - true -} - /// Returns the ClickHouse HTTP URL from the environment. /// /// # Panics diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 8205c5022..d9bb12908 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -5,9 +5,7 @@ use etl::test_utils::database::{spawn_source_database, test_table_name}; use etl::test_utils::notifying_store::NotifyingStore; use etl::test_utils::pipeline::create_pipeline; use etl::types::PipelineId; -use etl_destinations::clickhouse::test_utils::{ - ClickHouseTestDatabase, setup_clickhouse_database, skip_if_missing_clickhouse_env_vars, -}; +use etl_destinations::clickhouse::test_utils::{ClickHouseTestDatabase, setup_clickhouse_database}; use etl_telemetry::tracing::init_test_tracing; use rand::random; use std::sync::Once; @@ -114,10 +112,6 @@ async fn wait_for_update_flow_rows( /// Row 2 has **non-empty** arrays (fails with the old code, passes with the fix). #[tokio::test(flavor = "multi_thread")] async fn all_types_table_copy() { - if skip_if_missing_clickhouse_env_vars() { - return; - } - init_test_tracing(); install_crypto_provider(); @@ -340,10 +334,6 @@ async fn all_types_table_copy() { /// a second row with `cdc_operation = "UPDATE"` and a positive LSN. 
#[tokio::test(flavor = "multi_thread")] async fn updates_are_streamed_to_clickhouse() { - if skip_if_missing_clickhouse_env_vars() { - return; - } - init_test_tracing(); install_crypto_provider(); From 0a92f3d878c36d4be2da53fe37742923f5abd839 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 9 Apr 2026 15:49:07 +0900 Subject: [PATCH 15/86] Add boundary-values integration test for ClickHouse Exercises encoding edge cases the all_types test does not cover: - NULL scalars (nullable columns with SQL NULL) - NULL elements inside arrays (ARRAY[1, NULL, 3]) - Empty strings distinct from NULLs - Single-element arrays (varint length = 0x01) - Multi-byte UTF-8 (emoji, CJK) in scalars and arrays - Zero integer distinct from NULL --- etl-destinations/tests/clickhouse_pipeline.rs | 194 +++++++++++++++++- etl-destinations/tests/support/clickhouse.rs | 14 ++ 2 files changed, 207 insertions(+), 1 deletion(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index d9bb12908..152b0dfa1 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -12,7 +12,7 @@ use std::sync::Once; use std::time::Duration; use tokio::time::sleep; -use crate::support::clickhouse::AllTypesRow; +use crate::support::clickhouse::{AllTypesRow, BoundaryValuesRow}; mod support; @@ -408,3 +408,195 @@ async fn updates_are_streamed_to_clickhouse() { "streamed update should have a positive LSN" ); } + +// ── Boundary-values test ───────────────────────────────────────────────────── + +const BOUNDARY_VALUES_SELECT: &str = concat!( + "SELECT id, nullable_text, nullable_int, ", + "int_array_col, text_array_col, ", + "cdc_operation ", + "FROM \"test_boundary__values\" ", + "ORDER BY id", +); + +/// Tests that edge-case values survive the Postgres → ClickHouse pipeline +/// without data loss or corruption. +/// +/// # GIVEN +/// +/// A Postgres table with nullable scalar columns and nullable array columns, +/// populated with four rows that exercise encoding boundary conditions: +/// +/// 1. **All NULLs** — nullable scalars are NULL, arrays are empty. +/// 2. **NULL elements inside arrays** — `{1, NULL, 3}`, `{'a', NULL, 'c'}`. +/// 3. **Empty strings** — a present-but-empty text value next to a NULL integer, +/// plus single-element arrays (varint length = 1). +/// 4. **Multi-byte UTF-8** — emoji and CJK characters, verifying that the +/// RowBinary varint encodes byte length (not character count) correctly. +/// +/// # WHEN +/// +/// The pipeline runs initial table copy from Postgres to ClickHouse. +/// +/// # THEN +/// +/// Every row in ClickHouse exactly matches what was inserted into Postgres: +/// - SQL NULLs remain NULL (not empty string, not zero). +/// - Empty strings remain empty strings (not NULL). +/// - Array elements preserve their position, including interior NULLs. +/// - Multi-byte text round-trips byte-for-byte. 
+#[tokio::test(flavor = "multi_thread")] +async fn boundary_values_table_copy() { + init_test_tracing(); + install_crypto_provider(); + + // ── GIVEN: Postgres source with boundary-value rows ────────────────────── + + let database = spawn_source_database().await; + let table_name = test_table_name("boundary_values"); + + let table_id = database + .create_table( + table_name.clone(), + true, + &[ + ("nullable_text", "text"), // nullable + ("nullable_int", "integer"), // nullable + ("int_array_col", "integer[]"), // Array(Nullable(Int32)) + ("text_array_col", "text[]"), // Array(Nullable(String)) + ], + ) + .await + .expect("Failed to create boundary_values table"); + + let publication_name = "test_pub_ch_boundary"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create publication"); + + // Row 1: all nullable columns are NULL, arrays are empty. + database + .run_sql(&format!( + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ + VALUES (NULL, NULL, ARRAY[]::integer[], ARRAY[]::text[])", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert row 1 (all NULLs)"); + + // Row 2: arrays with interior NULL elements — the element at index 1 is NULL + // while surrounding elements are present. + database + .run_sql(&format!( + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ + VALUES ('present', 42, ARRAY[1, NULL, 3]::integer[], ARRAY['a', NULL, 'c']::text[])", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert row 2 (NULL array elements)"); + + // Row 3: empty string (not NULL) for text, NULL for integer, and + // single-element arrays (varint length byte = 0x01). + database + .run_sql(&format!( + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ + VALUES ('', NULL, ARRAY[99]::integer[], ARRAY['only']::text[])", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert row 3 (empty string + single-element arrays)"); + + // Row 4: multi-byte UTF-8 — emoji (4 bytes per char) and CJK (3 bytes per char). + // The RowBinary varint encodes byte length, not character count. + database + .run_sql(&format!( + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ + VALUES ('hello 🌍🚀', 0, ARRAY[1, 2], ARRAY['日本語', '中文'])", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert row 4 (multi-byte UTF-8)"); + + // ── WHEN: pipeline copies data to ClickHouse ───────────────────────────── + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store.clone(), + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + pipeline.shutdown_and_wait().await.unwrap(); + + // ── THEN: ClickHouse data matches Postgres exactly ─────────────────────── + + let rows: Vec = ch_db.query(BOUNDARY_VALUES_SELECT).await; + assert_eq!(rows.len(), 4, "expected 4 rows in ClickHouse"); + + // Row 1: NULL scalars stay NULL, empty arrays stay empty. 
+    let r = &rows[0];
+    assert_eq!(
+        r.nullable_text, None,
+        "NULL text must not become empty string"
+    );
+    assert_eq!(r.nullable_int, None, "NULL int must not become zero");
+    assert!(r.int_array_col.is_empty());
+    assert!(r.text_array_col.is_empty());
+
+    // Row 2: interior NULLs preserved in position.
+    let r = &rows[1];
+    assert_eq!(r.nullable_text.as_deref(), Some("present"));
+    assert_eq!(r.nullable_int, Some(42));
+    assert_eq!(
+        r.int_array_col,
+        vec![Some(1), None, Some(3)],
+        "interior NULL in integer array must be preserved"
+    );
+    assert_eq!(
+        r.text_array_col,
+        vec![Some("a".to_string()), None, Some("c".to_string())],
+        "interior NULL in text array must be preserved"
+    );
+
+    // Row 3: empty string is distinct from NULL.
+    let r = &rows[2];
+    assert_eq!(
+        r.nullable_text.as_deref(),
+        Some(""),
+        "empty string must round-trip as empty string, not NULL"
+    );
+    assert_eq!(r.nullable_int, None);
+    assert_eq!(r.int_array_col, vec![Some(99)], "single-element array");
+    assert_eq!(
+        r.text_array_col,
+        vec![Some("only".to_string())],
+        "single-element array"
+    );
+
+    // Row 4: multi-byte UTF-8 preserved byte-for-byte.
+    let r = &rows[3];
+    assert_eq!(
+        r.nullable_text.as_deref(),
+        Some("hello 🌍🚀"),
+        "multi-byte UTF-8 must round-trip exactly"
+    );
+    assert_eq!(r.nullable_int, Some(0), "zero must not become NULL");
+    assert_eq!(
+        r.text_array_col,
+        vec![Some("日本語".to_string()), Some("中文".to_string())],
+        "multi-byte UTF-8 in arrays must round-trip exactly"
+    );
+}
diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs
index 5878d0c9a..cb5d3ccc7 100644
--- a/etl-destinations/tests/support/clickhouse.rs
+++ b/etl-destinations/tests/support/clickhouse.rs
@@ -76,3 +76,17 @@ impl PartialEq for AllTypesRow {
 }
 
 impl Eq for AllTypesRow {}
+
+/// A row read back from the ClickHouse `boundary_values` test table.
+///
+/// Covers edge cases that the `all_types` test does not: nullable scalars,
+/// NULL array elements, empty strings, and multi-byte UTF-8.
+#[derive(clickhouse::Row, serde::Deserialize, Debug)]
+pub struct BoundaryValuesRow {
+    pub id: i64,
+    pub nullable_text: Option<String>,
+    pub nullable_int: Option<i32>,
+    pub int_array_col: Vec<Option<i32>>,
+    pub text_array_col: Vec<Option<String>>,
+    pub cdc_operation: String,
+}

From 6ba07bb5707a33bdbe9ac5f9c97ea13c2dc5b998 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Thu, 9 Apr 2026 15:56:30 +0900
Subject: [PATCH 16/86] Add ClickHouse env vars to CI

The docker-compose stack already starts a ClickHouse container but the
test env vars were not set, causing the integration tests to panic.
Wire up the same defaults used by docker-compose.
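The test support code reads these at startup; a sketch of the pattern it
presumably follows (the helper name is illustrative, not the actual
support API):

    fn clickhouse_url_from_env() -> String {
        std::env::var("TESTS_CLICKHOUSE_URL")
            .expect("TESTS_CLICKHOUSE_URL must be set for ClickHouse tests")
    }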
---
 .github/workflows/ci.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3c38ec412..54bf11b31 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -147,6 +147,9 @@ jobs:
       TESTS_DATABASE_PORT: 5430
       TESTS_DATABASE_USERNAME: postgres
       TESTS_DATABASE_PASSWORD: postgres
+      TESTS_CLICKHOUSE_URL: http://localhost:8123
+      TESTS_CLICKHOUSE_USER: etl
+      TESTS_CLICKHOUSE_PASSWORD: etl
       ETL_DUCKDB_EXTENSION_ROOT: ${{ github.workspace }}/vendor/duckdb/extensions
     steps:
       - name: Checkout

From 363aa29dab280521d37f4a8df73e4c1377d6bbb5 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 10 Apr 2026 14:24:54 +0900
Subject: [PATCH 17/86] Fix missing ClickHouse secret deletion on pipeline
 teardown

delete_dynamic_replicator_secrets deleted BigQuery, Iceberg, and Ducklake
secrets but not ClickHouse, leaving orphaned K8s secrets when ClickHouse
pipelines were deleted or changed destination type.
---
 etl-api/src/k8s/core.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/etl-api/src/k8s/core.rs b/etl-api/src/k8s/core.rs
index dff63f2ef..02d6c93e8 100644
--- a/etl-api/src/k8s/core.rs
+++ b/etl-api/src/k8s/core.rs
@@ -391,13 +391,12 @@ async fn delete_dynamic_replicator_secrets(
 ) -> Result<(), PipelineError> {
     k8s_client.delete_postgres_secret(prefix).await?;
 
-    // Although it won't happen that there are both bq and iceberg secrets at the same time
-    // we delete them both here because the state in the db might not be the same as that
-    // running in the k8s cluster. E.g. if a pipeline is updated from bq to iceberg or vice-versa
-    // then there's a risk of wrong secret type being attempted for deletion which might leave
-    // the actual secret behind. So for simplicty we just delete both kinds of secrets. The
-    // one which doesn't exist will be safely ignored.
+    // Delete all destination-specific secret types unconditionally. Only one will
+    // exist at a time, but if a pipeline's destination was changed (e.g. BigQuery →
+    // ClickHouse) the old secret type might still be present. Deleting a
+    // non-existent secret is a safe no-op.
     k8s_client.delete_bigquery_secret(prefix).await?;
+    k8s_client.delete_clickhouse_secret(prefix).await?;
     k8s_client.delete_iceberg_secret(prefix).await?;
     k8s_client.delete_ducklake_secret(prefix).await?;

From bc376b02f60002783cc49b66131fce8e07aa1e18 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 10 Apr 2026 14:32:01 +0900
Subject: [PATCH 18/86] Error on NULL value in non-nullable ClickHouse column

Previously this silently wrote a zero-length string, corrupting data
without any signal. Now returns a ConversionError so the problem is
surfaced to the operator.
---
 etl-destinations/src/clickhouse/encoding.rs | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index 7ab48b7b4..de2663b4c 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -228,9 +228,13 @@ pub(crate) fn rb_encode_nullable(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlResult<()>
 pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlResult<()> {
     match val {
         ClickHouseValue::Null => {
-            // A non-nullable column unexpectedly received NULL (data quality issue from
-            // Postgres). Write a zero-length string as the least-harmful fallback.
- buf.push(0); // varint 0 = empty string + // The Postgres schema says this column is NOT NULL, but a NULL arrived. + // If this proves too strict (e.g. transient schema mismatches), we could + // downgrade to writing a zero-length string as a silent fallback. + return Err(etl_error!( + ErrorKind::ConversionError, + "NULL value for non-nullable ClickHouse column" + )); } ClickHouseValue::Bool(b) => buf.push(b as u8), ClickHouseValue::Int16(v) => buf.extend_from_slice(&v.to_le_bytes()), From 935112a6fa26ed68ee7917a27c4d42a9c820a454 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 10 Apr 2026 16:06:30 +0900 Subject: [PATCH 19/86] Use ApplyWorkerPanic error kind for ClickHouse JoinSet failures The previous ErrorKind::Unknown discarded the JoinError, losing the panic message. Now preserves the error as a detail string and uses a more specific error kind matching the DuckLake convention. --- etl-destinations/src/clickhouse/core.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 7c6610d7a..cd3403c74 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -346,8 +346,13 @@ where } while let Some(result) = join_set.join_next().await { - result - .map_err(|_| etl_error!(ErrorKind::Unknown, "Failed to join future"))??; + result.map_err(|e| { + etl_error!( + ErrorKind::ApplyWorkerPanic, + "insert task failed", + e.to_string() + ) + })??; } } From 0201934b9a385ac454cd9606833c8605605fdcdd Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 13 Apr 2026 15:07:33 +0900 Subject: [PATCH 20/86] Remove unused PartialEq/Eq impls from AllTypesRow --- etl-destinations/tests/support/clickhouse.rs | 38 -------------------- 1 file changed, 38 deletions(-) diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs index cb5d3ccc7..484dcf19f 100644 --- a/etl-destinations/tests/support/clickhouse.rs +++ b/etl-destinations/tests/support/clickhouse.rs @@ -39,44 +39,6 @@ pub struct AllTypesRow { pub cdc_operation: String, } -impl PartialEq for AllTypesRow { - fn eq(&self, other: &Self) -> bool { - fn f32_eq(a: f32, b: f32) -> bool { - (a - b).abs() < 1e-3 - } - fn f64_eq(a: f64, b: f64) -> bool { - (a - b).abs() < 1e-6 - } - - self.id == other.id - && self.smallint_col == other.smallint_col - && self.integer_col == other.integer_col - && self.bigint_col == other.bigint_col - && f32_eq(self.real_col, other.real_col) - && f64_eq(self.double_col, other.double_col) - && self.numeric_col == other.numeric_col - && self.boolean_col == other.boolean_col - && self.text_col == other.text_col - && self.varchar_col == other.varchar_col - && self.date_col == other.date_col - && self.timestamp_col == other.timestamp_col - && self.timestamptz_col == other.timestamptz_col - && self.time_col == other.time_col - && self.jsonb_col == other.jsonb_col - && self.json_col == other.json_col - && self.integer_array_col == other.integer_array_col - && self.text_array_col == other.text_array_col - && self.bytea_col == other.bytea_col - && self.inet_col == other.inet_col - && self.cidr_col == other.cidr_col - && self.macaddr_col == other.macaddr_col - && self.uuid_col.to_lowercase() == other.uuid_col.to_lowercase() - && self.cdc_operation == other.cdc_operation - } -} - -impl Eq for AllTypesRow {} - /// A row read back from the ClickHouse `boundary_values` test table. 
/// /// Covers edge cases that the `all_types` test does not: nullable scalars, From 6e8f6d2c7d5b081df5ebcb68c44b4c580b898faf Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 13 Apr 2026 16:04:29 +0900 Subject: [PATCH 21/86] Improve wait_for_update_flow_rows timeout diagnostic --- etl-destinations/tests/clickhouse_pipeline.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 152b0dfa1..be444fb92 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -82,15 +82,20 @@ async fn wait_for_update_flow_rows( ch_db: &ClickHouseTestDatabase, expected_rows: usize, ) -> Vec { + let mut rows: Vec = Vec::with_capacity(expected_rows); for _ in 0..50 { - let rows: Vec = ch_db.query(UPDATE_FLOW_SELECT).await; + rows = ch_db.query(UPDATE_FLOW_SELECT).await; if rows.len() >= expected_rows { return rows; } sleep(Duration::from_millis(100)).await; } - panic!("timed out waiting for clickhouse update_flow rows"); + panic!( + "timed out waiting for clickhouse update_flow rows: got {} of {}", + rows.len(), + expected_rows, + ); } /// Tests that all Postgres column types (including nullable arrays) round-trip From b16f6f0b63da3b15dc06dc51564dbf3047e035b0 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 14 Apr 2026 15:39:17 +0900 Subject: [PATCH 22/86] Add DELETE streaming integration test for ClickHouse MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uses REPLICA IDENTITY FULL so Postgres sends all column values in the DELETE event. Inserts two rows, deletes one, and verifies the correct row appears with cdc_operation = "DELETE", old data preserved, and a positive LSN — while the other row remains untouched. --- etl-destinations/tests/clickhouse_pipeline.rs | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index be444fb92..9b3209f8e 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -66,6 +66,13 @@ const UPDATE_FLOW_SELECT: &str = concat!( "ORDER BY id, cdc_lsn", ); +/// SELECT query used to verify the `delete_flow` streaming test. +const DELETE_FLOW_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_delete__flow\" ", + "ORDER BY id, cdc_lsn", +); + /// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip). /// /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737 @@ -98,6 +105,27 @@ async fn wait_for_update_flow_rows( ); } +/// Waits until ClickHouse returns at least `expected_rows` from `DELETE_FLOW_SELECT`. +async fn wait_for_delete_flow_rows( + ch_db: &ClickHouseTestDatabase, + expected_rows: usize, +) -> Vec { + let mut rows: Vec = Vec::with_capacity(expected_rows); + for _ in 0..50 { + rows = ch_db.query(DELETE_FLOW_SELECT).await; + if rows.len() >= expected_rows { + return rows; + } + sleep(Duration::from_millis(100)).await; + } + + panic!( + "timed out waiting for clickhouse delete_flow rows: got {} of {}", + rows.len(), + expected_rows, + ); +} + /// Tests that all Postgres column types (including nullable arrays) round-trip /// correctly through the ClickHouse RowBinary encoding. 
/// @@ -605,3 +633,128 @@ async fn boundary_values_table_copy() { "multi-byte UTF-8 in arrays must round-trip exactly" ); } + +/// Tests that DELETE events are streamed to ClickHouse after the initial table copy. +/// +/// # GIVEN +/// +/// A Postgres table with `REPLICA IDENTITY FULL` (so Postgres sends all column +/// values in DELETE events, not just the primary key), populated with two rows: +/// +/// 1. `id=1, value='keep_me'` — will remain untouched. +/// 2. `id=2, value='delete_me'` — will be deleted after table copy. +/// +/// # WHEN +/// +/// The pipeline copies both rows, then a `DELETE ... WHERE id = 2` is issued +/// against Postgres. +/// +/// # THEN +/// +/// ClickHouse contains three rows (append-only CDC): +/// - Two `INSERT` rows from the initial table copy (`cdc_lsn = 0`). +/// - One `DELETE` row for `id=2` with the old row data preserved and a positive LSN. +/// - The `id=1` row has no corresponding `DELETE`. +#[tokio::test(flavor = "multi_thread")] +async fn deletes_are_streamed_to_clickhouse() { + init_test_tracing(); + install_crypto_provider(); + + // ── GIVEN: Postgres source with two rows, REPLICA IDENTITY FULL ───────── + + let database = spawn_source_database().await; + let table_name = test_table_name("delete_flow"); + + let table_id = database + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create delete_flow test table"); + + database + .run_sql(&format!( + "ALTER TABLE {} REPLICA IDENTITY FULL", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to set replica identity full"); + + let publication_name = "test_pub_clickhouse_deletes"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create delete_flow publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('keep_me'), ('delete_me')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert delete_flow rows"); + + // ── WHEN: pipeline copies data, then a DELETE is streamed ──────────────── + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + + database + .run_sql(&format!( + "DELETE FROM {} WHERE id = 2", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to delete delete_flow row"); + + let rows = wait_for_delete_flow_rows(&ch_db, 3).await; + + pipeline.shutdown_and_wait().await.unwrap(); + + // ── THEN: two INSERTs from table copy, one DELETE from streaming ───────── + + assert_eq!( + rows.len(), + 3, + "expected 2 copied rows plus 1 streamed delete" + ); + + // Row 1: copied, untouched. + let r = &rows[0]; + assert_eq!(r.id, 1); + assert_eq!(r.value, "keep_me"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0); + + // Row 2: copied, then deleted. + let r = &rows[1]; + assert_eq!(r.id, 2); + assert_eq!(r.value, "delete_me"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0); + + // Row 3: the streamed DELETE for id=2, preserving old row data. 
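+    // (The old column values are available here only because REPLICA IDENTITY
+    // FULL was set above; under the default replica identity, Postgres would
+    // send just the primary key.)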
+    let r = &rows[2];
+    assert_eq!(r.id, 2, "delete must target the correct row");
+    assert_eq!(
+        r.value, "delete_me",
+        "old row data must be preserved in DELETE"
+    );
+    assert_eq!(r.cdc_operation, "DELETE");
+    assert!(r.cdc_lsn > 0, "streamed delete should have a positive LSN");
+}

From 40742da948fb07c6dc4daf622813284071420799 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 14 Apr 2026 16:01:39 +0900
Subject: [PATCH 23/86] Add GIVEN/WHEN/THEN structure to ClickHouse integration
 tests

---
 etl-destinations/tests/clickhouse_pipeline.rs | 118 +++++++++++------
 etl-destinations/tests/support/clickhouse.rs  |  14 +--
 2 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index 9b3209f8e..f2cdd71d9 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -34,9 +34,9 @@ fn install_crypto_provider() {
 /// canonical `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` string form.
 ///
 /// All other columns are read with their native ClickHouse types:
-/// - `Date` → u16 (days since 1970-01-01)
-/// - `DateTime64(6)` → i64 (microseconds since epoch)
-/// - `Array(Nullable(T))` → `Vec<Option<T>>`
+/// - `Date` -> u16 (days since 1970-01-01)
+/// - `DateTime64(6)` -> i64 (microseconds since epoch)
+/// - `Array(Nullable(T))` -> `Vec<Option<T>>`
 const ALL_TYPES_SELECT: &str = concat!(
     "SELECT ",
     "id, smallint_col, integer_col, bigint_col, real_col, double_col, ",
@@ -129,26 +129,41 @@
 /// Tests that all Postgres column types (including nullable arrays) round-trip
 /// correctly through the ClickHouse RowBinary encoding.
 ///
-/// # Regression test
+/// # GIVEN
+///
+/// A Postgres table covering every supported column type -- scalars (integers,
+/// floats, numeric, boolean, text, varchar, date, timestamp, timestamptz, time,
+/// interval, jsonb, json, bytea, inet, cidr, macaddr, uuid) and nullable array
+/// columns (`integer[]`, `text[]`). Two rows are inserted before the pipeline
+/// starts:
+///
+/// 1. Positive/typical values with **empty** arrays.
+/// 2. Boundary values (min-ints, negative floats) with **non-empty** arrays.
+///
+/// # WHEN
 ///
-/// This test specifically catches the nullable-array encoding bug where
-/// `nullable_flags[i] = true` for array columns caused `rb_encode_nullable` to
-/// prepend an extra null-indicator byte. ClickHouse read that byte as `varint(0)`
-/// (empty array) and then parsed the actual element bytes as subsequent column
-/// data, ultimately failing with "Cannot read all data" at row 2.
+/// The pipeline runs initial table copy from Postgres to ClickHouse.
+///
+/// # THEN
 ///
-/// The fix: array columns always use `nullable_flags[i] = false` because the DDL
-/// emits `Array(Nullable(T))` without an outer `Nullable` wrapper.
+/// Every column round-trips correctly:
+/// - Scalars match their inserted values exactly (floats within epsilon).
+/// - Empty arrays remain empty; non-empty arrays preserve elements.
+/// - Both rows have `cdc_operation = "INSERT"`.
 ///
-/// Row 1 has **empty** arrays (accidentally passed with the old code because
-/// `0x00` null-indicator == `varint(0)` = empty array).
-/// Row 2 has **non-empty** arrays (fails with the old code, passes with the fix).
+/// # Regression
+///
+/// Row 2's non-empty arrays specifically catch the nullable-array encoding bug
+/// where `nullable_flags[i] = true` for array columns caused `rb_encode_nullable`
+/// to prepend an extra null-indicator byte. ClickHouse read that byte as
+/// `varint(0)` (empty array) and then parsed the actual element bytes as
+/// subsequent column data, failing with "Cannot read all data" at row 2.
 #[tokio::test(flavor = "multi_thread")]
 async fn all_types_table_copy() {
     init_test_tracing();
     install_crypto_provider();
 
-    // ── Postgres source ───────────────────────────────────────────────────────
+    // --- GIVEN: Postgres source with all supported column types ---
 
     let database = spawn_source_database().await;
     let table_name = test_table_name("all_types_encoding");
@@ -196,9 +211,6 @@ async fn all_types_table_copy() {
         .await
         .expect("Failed to create publication");
 
-    // Insert rows BEFORE starting the pipeline — they will be captured by the
-    // initial table-copy phase (write_table_rows path).
-    //
     // Row 1: empty arrays. With the old encoding bug, this accidentally
     // produced valid RowBinary because `0x00` (null-indicator) ==
     // varint(0) == empty array.
@@ -260,7 +272,7 @@ async fn all_types_table_copy() {
         .await
         .expect("Failed to insert row 2");
 
-    // ── ClickHouse destination ────────────────────────────────────────────────
+    // --- WHEN: pipeline copies data to ClickHouse ---
     let ch_db = setup_clickhouse_database().await;
     let store = NotifyingStore::new();
     let pipeline_id: PipelineId = random();
@@ -282,12 +294,12 @@ async fn all_types_table_copy() {
     table_ready.notified().await;
     pipeline.shutdown_and_wait().await.unwrap();
 
-    // ── Verify ClickHouse data ────────────────────────────────────────────────
+    // --- THEN: every column round-trips correctly ---
     let rows: Vec<AllTypesRow> = ch_db.query(ALL_TYPES_SELECT).await;
     assert_eq!(rows.len(), 2, "expected 2 rows in ClickHouse");
 
-    // ── Row 1 assertions ─────────────────────────────────────────────────────
+    // Row 1: positive/typical values, empty arrays.
     let r1 = &rows[0];
     assert_eq!(r1.id, 1);
     assert_eq!(r1.smallint_col, 42);
@@ -321,7 +333,7 @@ async fn all_types_table_copy() {
         "f47ac10b-58cc-4372-a567-0e02b2c3d479"
     );
     assert_eq!(r1.cdc_operation, "INSERT");
-    // Empty arrays — the regression case that accidentally worked before the fix.
+    // Empty arrays -- the regression case that accidentally worked before the fix.
     assert_eq!(
         r1.integer_array_col,
         Vec::<Option<i32>>::new(),
         "row 1 integer_array_col should be empty"
     );
@@ -333,7 +345,7 @@ async fn all_types_table_copy() {
         "row 1 text_array_col should be empty"
     );
 
-    // ── Row 2 assertions ─────────────────────────────────────────────────────
+    // Row 2: boundary values, non-empty arrays (the regression case).
     let r2 = &rows[1];
     assert_eq!(r2.id, 2);
     assert_eq!(r2.smallint_col, -32768);
@@ -347,29 +359,41 @@ async fn all_types_table_copy() {
         "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
     );
     assert_eq!(r2.cdc_operation, "INSERT");
-    // Non-empty arrays — the regression case that triggered the bug before the fix.
+    // Non-empty arrays -- the regression case that triggered the bug before the fix.
assert_eq!( r2.integer_array_col, vec![Some(1), Some(2), Some(3)], - "row 2 integer_array_col mismatch — nullable-array encoding bug likely present" + "row 2 integer_array_col mismatch -- nullable-array encoding bug likely present" ); assert_eq!( r2.text_array_col, vec![Some("alpha".to_string()), Some("beta".to_string())], - "row 2 text_array_col mismatch — nullable-array encoding bug likely present" + "row 2 text_array_col mismatch -- nullable-array encoding bug likely present" ); } /// Tests that UPDATE events are streamed to ClickHouse after the initial table copy. /// -/// ClickHouse is append-only for CDC in this destination, so the original copied row -/// remains present with `cdc_operation = "INSERT"` and the streamed change arrives as -/// a second row with `cdc_operation = "UPDATE"` and a positive LSN. +/// # GIVEN +/// +/// A Postgres table with a single row (`id=1, value='before'`). +/// +/// # WHEN +/// +/// The pipeline copies the row, then an `UPDATE ... SET value = 'after'` is +/// issued against Postgres. +/// +/// # THEN +/// +/// ClickHouse contains two rows (append-only CDC): +/// - The original `INSERT` from table copy with `cdc_lsn = 0`. +/// - The streamed `UPDATE` with the new value and a positive LSN. #[tokio::test(flavor = "multi_thread")] async fn updates_are_streamed_to_clickhouse() { init_test_tracing(); install_crypto_provider(); + // --- GIVEN: Postgres source with one row --- let database = spawn_source_database().await; let table_name = test_table_name("update_flow"); @@ -392,6 +416,7 @@ async fn updates_are_streamed_to_clickhouse() { .await .expect("Failed to insert initial update_flow row"); + // --- WHEN: pipeline copies data, then an UPDATE is streamed --- let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); @@ -424,6 +449,7 @@ async fn updates_are_streamed_to_clickhouse() { pipeline.shutdown_and_wait().await.unwrap(); + // --- THEN: one INSERT from table copy, one UPDATE from streaming --- assert_eq!(rows.len(), 2, "expected copied row plus streamed update"); let insert_row = &rows[0]; @@ -442,8 +468,6 @@ async fn updates_are_streamed_to_clickhouse() { ); } -// ── Boundary-values test ───────────────────────────────────────────────────── - const BOUNDARY_VALUES_SELECT: &str = concat!( "SELECT id, nullable_text, nullable_int, ", "int_array_col, text_array_col, ", @@ -452,7 +476,7 @@ const BOUNDARY_VALUES_SELECT: &str = concat!( "ORDER BY id", ); -/// Tests that edge-case values survive the Postgres → ClickHouse pipeline +/// Tests that edge-case values survive the Postgres -> ClickHouse pipeline /// without data loss or corruption. /// /// # GIVEN @@ -460,11 +484,11 @@ const BOUNDARY_VALUES_SELECT: &str = concat!( /// A Postgres table with nullable scalar columns and nullable array columns, /// populated with four rows that exercise encoding boundary conditions: /// -/// 1. **All NULLs** — nullable scalars are NULL, arrays are empty. -/// 2. **NULL elements inside arrays** — `{1, NULL, 3}`, `{'a', NULL, 'c'}`. -/// 3. **Empty strings** — a present-but-empty text value next to a NULL integer, +/// 1. **All NULLs** -- nullable scalars are NULL, arrays are empty. +/// 2. **NULL elements inside arrays** -- `{1, NULL, 3}`, `{'a', NULL, 'c'}`. +/// 3. **Empty strings** -- a present-but-empty text value next to a NULL integer, /// plus single-element arrays (varint length = 1). -/// 4. **Multi-byte UTF-8** — emoji and CJK characters, verifying that the +/// 4. 
**Multi-byte UTF-8** -- emoji and CJK characters, verifying that the
 /// RowBinary varint encodes byte length (not character count) correctly.
 ///
 /// # WHEN
@@ -483,7 +507,7 @@ async fn boundary_values_table_copy() {
     init_test_tracing();
     install_crypto_provider();
 
-    // ── GIVEN: Postgres source with boundary-value rows ──────────────────────
+    // --- GIVEN: Postgres source with boundary-value rows ---
 
     let database = spawn_source_database().await;
     let table_name = test_table_name("boundary_values");
@@ -518,7 +542,7 @@ async fn boundary_values_table_copy() {
         .await
         .expect("Failed to insert row 1 (all NULLs)");
 
-    // Row 2: arrays with interior NULL elements — the element at index 1 is NULL
+    // Row 2: arrays with interior NULL elements -- the element at index 1 is NULL
     // while surrounding elements are present.
     database
         .run_sql(&format!(
@@ -540,7 +564,7 @@ async fn boundary_values_table_copy() {
         .await
         .expect("Failed to insert row 3 (empty string + single-element arrays)");
 
-    // Row 4: multi-byte UTF-8 — emoji (4 bytes per char) and CJK (3 bytes per char).
+    // Row 4: multi-byte UTF-8 -- emoji (4 bytes per char) and CJK (3 bytes per char).
     // The RowBinary varint encodes byte length, not character count.
     database
         .run_sql(&format!(
@@ -551,8 +575,7 @@ async fn boundary_values_table_copy() {
         .await
         .expect("Failed to insert row 4 (multi-byte UTF-8)");
 
-    // ── WHEN: pipeline copies data to ClickHouse ─────────────────────────────
-
+    // --- WHEN: pipeline copies data to ClickHouse ---
     let ch_db = setup_clickhouse_database().await;
     let store = NotifyingStore::new();
     let pipeline_id: PipelineId = random();
@@ -574,8 +597,7 @@ async fn boundary_values_table_copy() {
     table_ready.notified().await;
     pipeline.shutdown_and_wait().await.unwrap();
 
-    // ── THEN: ClickHouse data matches Postgres exactly ───────────────────────
-
+    // --- THEN: ClickHouse data matches Postgres exactly ---
    let rows: Vec<BoundaryValuesRow> = ch_db.query(BOUNDARY_VALUES_SELECT).await;
    assert_eq!(rows.len(), 4, "expected 4 rows in ClickHouse");
@@ -641,8 +663,8 @@ async fn boundary_values_table_copy() {
 /// A Postgres table with `REPLICA IDENTITY FULL` (so Postgres sends all column
 /// values in DELETE events, not just the primary key), populated with two rows:
 ///
-/// 1. `id=1, value='keep_me'` — will remain untouched.
-/// 2. `id=2, value='delete_me'` — will be deleted after table copy.
+/// 1. `id=1, value='keep_me'` -- will remain untouched.
+/// 2. `id=2, value='delete_me'` -- will be deleted after table copy.
 ///
 /// # WHEN
 ///
 /// The pipeline copies both rows, then a `DELETE ... WHERE id = 2` is issued
 /// against Postgres.
@@ -660,7 +682,7 @@ async fn deletes_are_streamed_to_clickhouse() {
     init_test_tracing();
     install_crypto_provider();
 
-    // ── GIVEN: Postgres source with two rows, REPLICA IDENTITY FULL ─────────
+    // --- GIVEN: Postgres source with two rows, REPLICA IDENTITY FULL ---
 
     let database = spawn_source_database().await;
     let table_name = test_table_name("delete_flow");
@@ -692,7 +714,7 @@ async fn deletes_are_streamed_to_clickhouse() {
     .await
     .expect("Failed to insert delete_flow rows");
 
-    // ── WHEN: pipeline copies data, then a DELETE is streamed ────────────────
+    // --- WHEN: pipeline copies data, then a DELETE is streamed ---
 
     let ch_db = setup_clickhouse_database().await;
     let store = NotifyingStore::new();
@@ -726,7 +748,7 @@ async fn deletes_are_streamed_to_clickhouse() {
 
     pipeline.shutdown_and_wait().await.unwrap();
 
-    // ── THEN: two INSERTs from table copy, one DELETE from streaming ─────────
+    // --- THEN: two INSERTs from table copy, one DELETE from streaming ---
 
     assert_eq!(
         rows.len(),
diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs
index 484dcf19f..a4a0829d7 100644
--- a/etl-destinations/tests/support/clickhouse.rs
+++ b/etl-destinations/tests/support/clickhouse.rs
@@ -4,10 +4,10 @@
 /// A row read back from the ClickHouse `all_types_encoding` test table.
 ///
 /// Column-to-type mapping:
-/// - `Date` → `u16` (days since 1970-01-01 in RowBinary)
-/// - `DateTime64(6)` → `i64` (microseconds since epoch in RowBinary)
-/// - `UUID` → `String` (via `toString()` in the SELECT query)
-/// - `Array(Nullable(T))` → `Vec<Option<T>>`
+/// - `Date` -> `u16` (days since 1970-01-01 in RowBinary)
+/// - `DateTime64(6)` -> `i64` (microseconds since epoch in RowBinary)
+/// - `UUID` -> `String` (via `toString()` in the SELECT query)
+/// - `Array(Nullable(T))` -> `Vec<Option<T>>`
 ///
 /// Fields must match the SELECT column list in the test query exactly.
 #[derive(clickhouse::Row, serde::Deserialize, Debug, Clone)]
@@ -22,9 +22,9 @@ pub struct AllTypesRow {
     pub boolean_col: bool,
     pub text_col: String,
     pub varchar_col: String,
-    pub date_col: u16, // Date → days since epoch
-    pub timestamp_col: i64, // DateTime64(6) → microseconds
-    pub timestamptz_col: i64, // DateTime64(6,'UTC') → microseconds
+    pub date_col: u16, // Date -> days since epoch
+    pub timestamp_col: i64, // DateTime64(6) -> microseconds
+    pub timestamptz_col: i64, // DateTime64(6,'UTC') -> microseconds
     pub time_col: String,
     pub interval_col: String,
     pub jsonb_col: String,

From 19d074b9ecab8300ed6619a7eaef04cd99b31fe3 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 14 Apr 2026 17:57:26 +0900
Subject: [PATCH 24/86] Add pipeline restart/recovery integration test for
 ClickHouse

Verifies that rebuilding the destination and pipeline with the same
store and pipeline_id resumes CDC streaming without re-running the
initial table copy.
---
 etl-destinations/tests/clickhouse_pipeline.rs | 152 ++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index f2cdd71d9..5b673d038 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -73,6 +73,13 @@
+/// SELECT query used to verify the `restart_flow` test.
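+/// (The quoted name `test_restart__flow` appears to come from flattening the
+/// Postgres `test.restart_flow` identifier into a single ClickHouse table
+/// name, with pre-existing underscores doubled; the other test tables follow
+/// the same pattern.)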
+const RESTART_FLOW_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_restart__flow\" ", + "ORDER BY id, cdc_lsn", +); + /// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip). /// /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737 @@ -126,6 +133,27 @@ async fn wait_for_delete_flow_rows( ); } +/// Waits until ClickHouse returns at least `expected_rows` from `RESTART_FLOW_SELECT`. +async fn wait_for_restart_flow_rows( + ch_db: &ClickHouseTestDatabase, + expected_rows: usize, +) -> Vec { + let mut rows: Vec = Vec::with_capacity(expected_rows); + for _ in 0..50 { + rows = ch_db.query(RESTART_FLOW_SELECT).await; + if rows.len() >= expected_rows { + return rows; + } + sleep(Duration::from_millis(100)).await; + } + + panic!( + "timed out waiting for clickhouse restart_flow rows: got {} of {}", + rows.len(), + expected_rows, + ); +} + /// Tests that all Postgres column types (including nullable arrays) round-trip /// correctly through the ClickHouse RowBinary encoding. /// @@ -780,3 +808,127 @@ async fn deletes_are_streamed_to_clickhouse() { assert_eq!(r.cdc_operation, "DELETE"); assert!(r.cdc_lsn > 0, "streamed delete should have a positive LSN"); } + +/// Tests that a pipeline restart resumes CDC streaming without re-running +/// the initial table copy. +/// +/// # GIVEN +/// +/// A Postgres table with one row (`id=1, value='before_restart'`), copied +/// to ClickHouse by a first pipeline run that then shuts down cleanly. +/// +/// # WHEN +/// +/// A new `ClickHouseDestination` and `Pipeline` are built with the same +/// store and pipeline_id (simulating process restart), the pipeline is +/// started, and a second row (`id=2, value='after_restart'`) is inserted +/// into Postgres. +/// +/// # THEN +/// +/// ClickHouse contains exactly two rows: +/// - `id=1` from the initial table copy (`cdc_lsn = 0`). +/// - `id=2` from CDC streaming in the second run (`cdc_lsn > 0`). +/// No duplicate `id=1` row exists -- table copy must not re-run. +#[tokio::test(flavor = "multi_thread")] +async fn pipeline_restart_resumes_streaming() { + init_test_tracing(); + install_crypto_provider(); + + // --- GIVEN: first pipeline run copies one row --- + let database = spawn_source_database().await; + let table_name = test_table_name("restart_flow"); + + let table_id = database + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create restart_flow test table"); + + let publication_name = "test_pub_clickhouse_restart"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create restart_flow publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('before_restart')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert initial restart_flow row"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store.clone(), + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + pipeline.shutdown_and_wait().await.unwrap(); + + // Verify first run produced exactly one row. 
+ let rows: Vec = ch_db.query(RESTART_FLOW_SELECT).await; + assert_eq!(rows.len(), 1, "first run should copy exactly one row"); + assert_eq!(rows[0].id, 1); + assert_eq!(rows[0].value, "before_restart"); + + // --- WHEN: rebuild destination and pipeline, then stream a new insert --- + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('after_restart')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert post-restart row"); + + let rows = wait_for_restart_flow_rows(&ch_db, 2).await; + + pipeline.shutdown_and_wait().await.unwrap(); + + // --- THEN: exactly two rows, no duplicate from re-running table copy --- + assert_eq!( + rows.len(), + 2, + "expected original copied row plus one streamed insert, no duplicates" + ); + + let r = &rows[0]; + assert_eq!(r.id, 1); + assert_eq!(r.value, "before_restart"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0, "first row should be from table copy"); + + let r = &rows[1]; + assert_eq!(r.id, 2); + assert_eq!(r.value, "after_restart"); + assert_eq!(r.cdc_operation, "INSERT"); + assert!( + r.cdc_lsn > 0, + "second row should be from CDC streaming after restart" + ); +} From adb28cd5d49f113dc02fc2a458054570152cd13d Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 14 Apr 2026 18:11:26 +0900 Subject: [PATCH 25/86] Add truncate integration test for ClickHouse Copies two rows, truncates the Postgres table, verifies ClickHouse is empty, then inserts a new row and verifies only the post-truncate data exists. --- etl-destinations/tests/clickhouse_pipeline.rs | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 5b673d038..eeb150a82 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -80,6 +80,13 @@ const RESTART_FLOW_SELECT: &str = concat!( "ORDER BY id, cdc_lsn", ); +/// SELECT query used to verify the `truncate_flow` test. +const TRUNCATE_FLOW_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_truncate__flow\" ", + "ORDER BY id, cdc_lsn", +); + /// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip). /// /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737 @@ -154,6 +161,40 @@ async fn wait_for_restart_flow_rows( ); } +/// Waits until ClickHouse returns exactly zero rows from `TRUNCATE_FLOW_SELECT`. +async fn wait_for_truncate_flow_empty(ch_db: &ClickHouseTestDatabase) { + for _ in 0..50 { + let rows: Vec = ch_db.query(TRUNCATE_FLOW_SELECT).await; + if rows.is_empty() { + return; + } + sleep(Duration::from_millis(100)).await; + } + + panic!("timed out waiting for clickhouse truncate_flow table to become empty"); +} + +/// Waits until ClickHouse returns at least `expected_rows` from `TRUNCATE_FLOW_SELECT`. 
+async fn wait_for_truncate_flow_rows( + ch_db: &ClickHouseTestDatabase, + expected_rows: usize, +) -> Vec { + let mut rows: Vec = Vec::with_capacity(expected_rows); + for _ in 0..50 { + rows = ch_db.query(TRUNCATE_FLOW_SELECT).await; + if rows.len() >= expected_rows { + return rows; + } + sleep(Duration::from_millis(100)).await; + } + + panic!( + "timed out waiting for clickhouse truncate_flow rows: got {} of {}", + rows.len(), + expected_rows, + ); +} + /// Tests that all Postgres column types (including nullable arrays) round-trip /// correctly through the ClickHouse RowBinary encoding. /// @@ -932,3 +973,111 @@ async fn pipeline_restart_resumes_streaming() { "second row should be from CDC streaming after restart" ); } + +/// Tests that TRUNCATE clears the ClickHouse table and that subsequent inserts +/// produce a clean slate with only post-truncate data. +/// +/// # GIVEN +/// +/// A Postgres table with two rows (`id=1, value='alpha'` and `id=2, +/// value='beta'`), copied to ClickHouse by the initial table copy. +/// +/// # WHEN +/// +/// 1. Postgres issues `TRUNCATE` on the table. +/// 2. After the table becomes empty in ClickHouse, a new row +/// (`id=3, value='gamma'`) is inserted into Postgres. +/// +/// # THEN +/// +/// After truncate, ClickHouse contains zero rows. +/// After the post-truncate insert, ClickHouse contains exactly one row: +/// - `id=3, value='gamma', cdc_operation='INSERT', cdc_lsn > 0`. +/// No pre-truncate rows survive. +#[tokio::test(flavor = "multi_thread")] +async fn truncate_clears_table_and_accepts_new_inserts() { + init_test_tracing(); + install_crypto_provider(); + + // --- GIVEN: two rows copied to ClickHouse --- + let database = spawn_source_database().await; + let table_name = test_table_name("truncate_flow"); + + let table_id = database + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create truncate_flow test table"); + + let publication_name = "test_pub_clickhouse_truncate"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create truncate_flow publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('alpha'), ('beta')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert truncate_flow rows"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + + // Verify both rows arrived from table copy. 
+    let rows: Vec = ch_db.query(TRUNCATE_FLOW_SELECT).await;
+    assert_eq!(rows.len(), 2, "table copy should produce two rows");
+
+    // --- WHEN: truncate, then insert a new row ---
+    database
+        .truncate_table(table_name.clone())
+        .await
+        .expect("Failed to truncate table in Postgres");
+
+    wait_for_truncate_flow_empty(&ch_db).await;
+
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (value) VALUES ('gamma')",
+            table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert post-truncate row");
+
+    let rows = wait_for_truncate_flow_rows(&ch_db, 1).await;
+
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // --- THEN: only the post-truncate row exists ---
+    assert_eq!(rows.len(), 1, "only post-truncate row should exist");
+
+    let r = &rows[0];
+    assert_eq!(
+        r.id, 3,
+        "post-truncate row should have id=3 (serial continues)"
+    );
+    assert_eq!(r.value, "gamma");
+    assert_eq!(r.cdc_operation, "INSERT");
+    assert!(
+        r.cdc_lsn > 0,
+        "post-truncate insert should come from CDC streaming"
+    );
+}

From a3b00438e2cb05f3b5ba75cdc4270e76109e4cca Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 14 Apr 2026 18:17:10 +0900
Subject: [PATCH 26/86] Add intermediate INSERT flush integration test for
 ClickHouse

Adds build_destination_with_config to ClickHouseTestDatabase for
overriding the inserter config. Tests that setting max_bytes_per_insert
to 1 byte (forcing a flush after every row) does not lose any rows
during initial table copy.
---
 etl-destinations/src/clickhouse/test_utils.rs | 29 ++++-
 etl-destinations/tests/clickhouse_pipeline.rs | 102 ++++++++++++++++++
 2 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs
index 70662c9ef..75a745114 100644
--- a/etl-destinations/src/clickhouse/test_utils.rs
+++ b/etl-destinations/src/clickhouse/test_utils.rs
@@ -105,11 +105,33 @@ impl ClickHouseTestDatabase {
         .expect("Failed to drop test ClickHouse database");
     }
 
-    /// Builds a [`ClickHouseDestination`] scoped to this test database.
+    /// Builds a [`ClickHouseDestination`] scoped to this test database with
+    /// default inserter config (100 MiB per INSERT -- large enough that tests
+    /// never hit an intermediate flush).
     pub fn build_destination<S>(
+        &self,
+        pipeline_id: PipelineId,
+        store: S,
+    ) -> ClickHouseDestination<S>
+    where
+        S: StateStore + SchemaStore + Send + Sync,
+    {
+        self.build_destination_with_config(
+            pipeline_id,
+            store,
+            ClickHouseInserterConfig {
+                max_bytes_per_insert: 100 * 1024 * 1024,
+            },
+        )
+    }
+
+    /// Builds a [`ClickHouseDestination`] scoped to this test database with
+    /// a caller-supplied [`ClickHouseInserterConfig`].
+    pub fn build_destination_with_config<S>(
         &self,
         _pipeline_id: PipelineId,
         store: S,
+        config: ClickHouseInserterConfig,
     ) -> ClickHouseDestination<S>
     where
         S: StateStore + SchemaStore + Send + Sync,
     {
         ClickHouseDestination::new(
             &self.url,
             &self.user,
             self.password.clone(),
             &self.database,
-            ClickHouseInserterConfig {
-                // 100 MiB — large enough that tests never hit an intermediate flush.
-                max_bytes_per_insert: 100 * 1024 * 1024,
-            },
+            config,
             store,
         )
         .expect("Failed to create ClickHouseDestination for test")
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index eeb150a82..e5a41b7f4 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -5,6 +5,7 @@ use etl::test_utils::database::{spawn_source_database, test_table_name};
 use etl::test_utils::notifying_store::NotifyingStore;
 use etl::test_utils::pipeline::create_pipeline;
 use etl::types::PipelineId;
+use etl_destinations::clickhouse::ClickHouseInserterConfig;
 use etl_destinations::clickhouse::test_utils::{ClickHouseTestDatabase, setup_clickhouse_database};
 use etl_telemetry::tracing::init_test_tracing;
 use rand::random;
@@ -1081,3 +1082,104 @@ async fn truncate_clears_table_and_accepts_new_inserts() {
         "post-truncate insert should come from CDC streaming"
     );
 }
+
+/// SELECT query used to verify the `flush_split` test.
+const FLUSH_SPLIT_SELECT: &str = concat!(
+    "SELECT id, value, cdc_operation, cdc_lsn ",
+    "FROM \"test_flush__split\" ",
+    "ORDER BY id, cdc_lsn",
+);
+
+/// Tests that the intermediate INSERT flush (`max_bytes_per_insert`) does not
+/// lose rows when a batch is split across multiple INSERT statements.
+///
+/// # GIVEN
+///
+/// A Postgres table with 10 rows, and a ClickHouse destination configured with
+/// `max_bytes_per_insert = 1` (forcing a new INSERT after every single row).
+///
+/// # WHEN
+///
+/// The pipeline runs initial table copy from Postgres to ClickHouse.
+///
+/// # THEN
+///
+/// All 10 rows arrive in ClickHouse despite being split across many INSERT
+/// statements. No rows are lost at flush boundaries.
+#[tokio::test(flavor = "multi_thread")]
+async fn intermediate_flush_preserves_all_rows() {
+    init_test_tracing();
+    install_crypto_provider();
+
+    // --- GIVEN: 10 rows and a tiny max_bytes_per_insert ---
+    let database = spawn_source_database().await;
+    let table_name = test_table_name("flush_split");
+
+    let table_id = database
+        .create_table(table_name.clone(), true, &[("value", "text not null")])
+        .await
+        .expect("Failed to create flush_split test table");
+
+    let publication_name = "test_pub_clickhouse_flush";
+    database
+        .create_publication(publication_name, std::slice::from_ref(&table_name))
+        .await
+        .expect("Failed to create flush_split publication");
+
+    let row_count = 10;
+    let values: Vec<String> = (1..=row_count).map(|i| format!("('row_{i}')")).collect();
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (value) VALUES {}",
+            table_name.as_quoted_identifier(),
+            values.join(", "),
+        ))
+        .await
+        .expect("Failed to insert flush_split rows");
+
+    let ch_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = ch_db.build_destination_with_config(
+        pipeline_id,
+        store.clone(),
+        ClickHouseInserterConfig {
+            // 1 byte -- forces a new INSERT after every row.
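+            // (Every encoded row exceeds a 1-byte budget immediately, so each
+            // buffered row is flushed as its own INSERT; the assertions below
+            // then verify no rows are dropped at flush boundaries.)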
+ max_bytes_per_insert: 1, + }, + ); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + // --- WHEN: pipeline copies data with aggressive flush splitting --- + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + pipeline.shutdown_and_wait().await.unwrap(); + + // --- THEN: all rows arrive despite being split across many INSERTs --- + let rows: Vec = ch_db.query(FLUSH_SPLIT_SELECT).await; + assert_eq!( + rows.len(), + row_count, + "all rows must survive intermediate flush splits" + ); + + for (i, r) in rows.iter().enumerate() { + let expected_id = (i + 1) as i64; + let expected_value = format!("row_{}", i + 1); + assert_eq!(r.id, expected_id, "row {} id mismatch", i + 1); + assert_eq!(r.value, expected_value, "row {} value mismatch", i + 1); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0, "all rows should be from table copy"); + } +} From 7ebab69c56d2d6bfcefb498846ce2b6181a00dfb Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 15 Apr 2026 15:28:22 +0900 Subject: [PATCH 27/86] Add unit test for NULL rejection in non-nullable ClickHouse column Verifies that rb_encode_value returns ConversionError and writes no bytes when given a NULL value for a non-nullable column. --- etl-destinations/src/clickhouse/encoding.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index de2663b4c..93de8fcf1 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -467,4 +467,23 @@ mod tests { assert_eq!(bytes_to_hex([0xff].to_vec()), "ff"); assert_eq!(bytes_to_hex([0xde, 0xad, 0xbe, 0xef].to_vec()), "deadbeef"); } + + /// # GIVEN + /// A NULL ClickHouseValue passed to the non-nullable encoder. + /// + /// # WHEN + /// `rb_encode_value` is called. + /// + /// # THEN + /// It returns a ConversionError rather than writing invalid RowBinary. + #[test] + fn test_rb_encode_value_rejects_null_for_non_nullable_column() { + let mut buf = Vec::new(); + let result = rb_encode_value(ClickHouseValue::Null, &mut buf); + + assert!(result.is_err(), "NULL in non-nullable column must error"); + let err = result.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::ConversionError); + assert!(buf.is_empty(), "no bytes should be written on error"); + } } From 2d42b86d07b6f9b5684bf1fb1ff050e0f0dfa05c Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 15 Apr 2026 15:52:33 +0900 Subject: [PATCH 28/86] Add multi-table integration test for ClickHouse Two tables in a single publication each receive independent table copy and CDC streaming writes. Verifies that events for different tables are routed correctly. --- etl-destinations/tests/clickhouse_pipeline.rs | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index e5a41b7f4..06747f628 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1183,3 +1183,155 @@ async fn intermediate_flush_preserves_all_rows() { assert_eq!(r.cdc_lsn, 0, "all rows should be from table copy"); } } + +/// Tests that CDC events for multiple tables in the same publication are +/// written correctly to separate ClickHouse tables. 
+/// +/// # GIVEN +/// +/// Two Postgres tables in the same publication, each with one pre-existing +/// row: +/// - `multi_a` with `(id=1, value='init_a')` +/// - `multi_b` with `(id=1, value='init_b')` +/// +/// # WHEN +/// +/// The pipeline copies both tables, then one new row is inserted into each +/// Postgres table (`'streamed_a'` and `'streamed_b'`). +/// +/// # THEN +/// +/// Each ClickHouse table contains exactly two rows: +/// - One from table copy (`cdc_lsn = 0`) +/// - One from CDC streaming (`cdc_lsn > 0`) +#[tokio::test(flavor = "multi_thread")] +async fn multiple_tables_receive_independent_writes() { + init_test_tracing(); + install_crypto_provider(); + + // --- GIVEN: two tables in one publication, each with one row --- + let database = spawn_source_database().await; + let table_a = test_table_name("multi_a"); + let table_b = test_table_name("multi_b"); + + let table_a_id = database + .create_table(table_a.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create multi_a table"); + + let table_b_id = database + .create_table(table_b.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create multi_b table"); + + let publication_name = "test_pub_clickhouse_multi"; + database + .create_publication(publication_name, &[table_a.clone(), table_b.clone()]) + .await + .expect("Failed to create multi-table publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('init_a')", + table_a.as_quoted_identifier(), + )) + .await + .expect("Failed to insert into multi_a"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('init_b')", + table_b.as_quoted_identifier(), + )) + .await + .expect("Failed to insert into multi_b"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_a_ready = store + .notify_on_table_state_type(table_a_id, TableReplicationPhaseType::Ready) + .await; + let table_b_ready = store + .notify_on_table_state_type(table_b_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + tokio::join!(table_a_ready.notified(), table_b_ready.notified()); + + // --- WHEN: insert one row into each table --- + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('streamed_a')", + table_a.as_quoted_identifier(), + )) + .await + .expect("Failed to insert streamed row into multi_a"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('streamed_b')", + table_b.as_quoted_identifier(), + )) + .await + .expect("Failed to insert streamed row into multi_b"); + + let select_a = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_multi__a\" ", + "ORDER BY id, cdc_lsn", + ); + let select_b = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_multi__b\" ", + "ORDER BY id, cdc_lsn", + ); + + // Poll until both tables have 2 rows. 
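+    // (CDC apply is asynchronous, so poll on the same 50 x 100 ms schedule the
+    // wait_for_*_rows helpers use before asserting.)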
+ let mut rows_a: Vec = Vec::with_capacity(2); + let mut rows_b: Vec = Vec::with_capacity(2); + for _ in 0..50 { + rows_a = ch_db.query(select_a).await; + rows_b = ch_db.query(select_b).await; + if rows_a.len() >= 2 && rows_b.len() >= 2 { + break; + } + sleep(Duration::from_millis(100)).await; + } + + pipeline.shutdown_and_wait().await.unwrap(); + + // --- THEN: each table has one copied row and one streamed row --- + assert_eq!(rows_a.len(), 2, "multi_a should have 2 rows"); + assert_eq!(rows_b.len(), 2, "multi_b should have 2 rows"); + + assert_eq!(rows_a[0].id, 1); + assert_eq!(rows_a[0].value, "init_a"); + assert_eq!(rows_a[0].cdc_operation, "INSERT"); + assert_eq!(rows_a[0].cdc_lsn, 0); + + assert_eq!(rows_a[1].id, 2); + assert_eq!(rows_a[1].value, "streamed_a"); + assert_eq!(rows_a[1].cdc_operation, "INSERT"); + assert!(rows_a[1].cdc_lsn > 0); + + assert_eq!(rows_b[0].id, 1); + assert_eq!(rows_b[0].value, "init_b"); + assert_eq!(rows_b[0].cdc_operation, "INSERT"); + assert_eq!(rows_b[0].cdc_lsn, 0); + + assert_eq!(rows_b[1].id, 2); + assert_eq!(rows_b[1].value, "streamed_b"); + assert_eq!(rows_b[1].cdc_operation, "INSERT"); + assert!(rows_b[1].cdc_lsn > 0); +} From e5fa4b2a1645e4eea5367e928c234d29c45c83c8 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 15 Apr 2026 16:15:49 +0900 Subject: [PATCH 29/86] Add sequential transaction ordering test for ClickHouse Two transactions on separate Postgres connections update the same row and commit sequentially. Verifies that ClickHouse CDC rows have strictly increasing LSNs matching Postgres commit order. --- etl-destinations/tests/clickhouse_pipeline.rs | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 06747f628..3e73559dd 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1335,3 +1335,133 @@ async fn multiple_tables_receive_independent_writes() { assert_eq!(rows_b[1].cdc_operation, "INSERT"); assert!(rows_b[1].cdc_lsn > 0); } + +/// SELECT query used to verify the `tx_order` test. +const TX_ORDER_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_tx__order\" ", + "ORDER BY id, cdc_lsn", +); + +/// Tests that updates from separately committed transactions arrive in +/// ClickHouse with LSNs reflecting Postgres commit order. +/// +/// # GIVEN +/// +/// A Postgres table with one row (`id=1, value='original'`), copied to +/// ClickHouse. Two database connections to the same source. +/// +/// # WHEN +/// +/// Transaction A (on connection 1) updates the row to `'update_a'` and +/// commits. Then transaction B (on connection 2) updates the row to +/// `'update_b'` and commits. 
+/// +/// # THEN +/// +/// ClickHouse contains three rows (append-only CDC) with strictly +/// increasing `cdc_lsn`: +/// - `value='original'`, `cdc_operation='INSERT'`, `cdc_lsn=0` +/// - `value='update_a'`, `cdc_operation='UPDATE'`, `cdc_lsn > 0` +/// - `value='update_b'`, `cdc_operation='UPDATE'`, `cdc_lsn > update_a's lsn` +#[tokio::test(flavor = "multi_thread")] +async fn sequential_transactions_preserve_commit_order() { + init_test_tracing(); + install_crypto_provider(); + + // --- GIVEN: one row, two database connections --- + let mut database_1 = spawn_source_database().await; + let mut database_2 = database_1.duplicate().await; + let table_name = test_table_name("tx_order"); + + let table_id = database_1 + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create tx_order test table"); + + let publication_name = "test_pub_clickhouse_tx_order"; + database_1 + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create tx_order publication"); + + database_1 + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('original')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert initial tx_order row"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database_1.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + + // --- WHEN: two transactions commit sequentially on separate connections --- + let tx_a = database_1.begin_transaction().await; + tx_a.run_sql(&format!( + "UPDATE {} SET value = 'update_a' WHERE id = 1", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to execute update_a"); + tx_a.commit_transaction().await; + + let tx_b = database_2.begin_transaction().await; + tx_b.run_sql(&format!( + "UPDATE {} SET value = 'update_b' WHERE id = 1", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to execute update_b"); + tx_b.commit_transaction().await; + + // Poll until all three rows arrive. 
+ let mut rows: Vec = Vec::with_capacity(3); + for _ in 0..50 { + rows = ch_db.query(TX_ORDER_SELECT).await; + if rows.len() >= 3 { + break; + } + sleep(Duration::from_millis(100)).await; + } + + pipeline.shutdown_and_wait().await.unwrap(); + + // --- THEN: three rows with strictly increasing LSNs --- + assert_eq!(rows.len(), 3, "expected INSERT + two UPDATEs"); + + let r = &rows[0]; + assert_eq!(r.value, "original"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0); + + let r = &rows[1]; + assert_eq!(r.value, "update_a"); + assert_eq!(r.cdc_operation, "UPDATE"); + assert!(r.cdc_lsn > 0); + + let r = &rows[2]; + assert_eq!(r.value, "update_b"); + assert_eq!(r.cdc_operation, "UPDATE"); + assert!( + r.cdc_lsn > rows[1].cdc_lsn, + "update_b must have a higher LSN than update_a" + ); +} From ef46b2573b8e1264059ea9f51f4925b9fed4781b Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 15 Apr 2026 18:19:28 +0900 Subject: [PATCH 30/86] Add default replica identity DELETE test for ClickHouse Verifies that a DELETE under default replica identity (PK only) writes a valid CDC row with the correct PK and zero-value non-PK columns, and that the pipeline continues operating afterward. --- etl-destinations/tests/clickhouse_pipeline.rs | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 3e73559dd..2c1f5f262 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1465,3 +1465,137 @@ async fn sequential_transactions_preserve_commit_order() { "update_b must have a higher LSN than update_a" ); } + +/// SELECT query used to verify the `default_identity_delete` test. +const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_default__identity__delete\" ", + "ORDER BY id, cdc_lsn", +); + +/// Tests that a DELETE under default replica identity (PK only) produces a +/// valid CDC row in ClickHouse with the correct PK and zero-value non-PK +/// columns. +/// +/// # GIVEN +/// +/// A Postgres table with **default replica identity** (not FULL) and one row +/// (`id=1, value='to_delete'`), copied to ClickHouse. +/// +/// # WHEN +/// +/// The row is deleted in Postgres, then a new row (`id=2, value='after'`) is +/// inserted. +/// +/// # THEN +/// +/// ClickHouse contains three rows: +/// - `id=1, value='to_delete', cdc_operation='INSERT', cdc_lsn=0` (table copy) +/// - `id=1, value='', cdc_operation='DELETE', cdc_lsn > 0` (streamed +/// delete -- Postgres only sent the PK, so the non-PK `value` column is a +/// zero-value empty string, not the original data) +/// - `id=2, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves +/// the pipeline continued after the delete) +#[tokio::test(flavor = "multi_thread")] +async fn delete_with_default_replica_identity() { + init_test_tracing(); + install_crypto_provider(); + + // --- GIVEN: one row, default replica identity (NOT full) --- + let database = spawn_source_database().await; + let table_name = test_table_name("default_identity_delete"); + + let table_id = database + .create_table(table_name.clone(), true, &[("value", "text not null")]) + .await + .expect("Failed to create test table"); + + // Deliberately NOT setting REPLICA IDENTITY FULL -- default (PK only). 
+ + let publication_name = "test_pub_ch_default_ident"; + database + .create_publication(publication_name, std::slice::from_ref(&table_name)) + .await + .expect("Failed to create publication"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('to_delete')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert row"); + + let ch_db = setup_clickhouse_database().await; + let store = NotifyingStore::new(); + let pipeline_id: PipelineId = random(); + let destination = ch_db.build_destination(pipeline_id, store.clone()); + + let table_ready = store + .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) + .await; + + let mut pipeline = create_pipeline( + &database.config, + pipeline_id, + publication_name.to_owned(), + store, + destination, + ); + + pipeline.start().await.unwrap(); + table_ready.notified().await; + + // --- WHEN: delete the row, then insert a new one --- + database + .run_sql(&format!( + "DELETE FROM {} WHERE id = 1", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to delete row"); + + database + .run_sql(&format!( + "INSERT INTO {} (value) VALUES ('after')", + table_name.as_quoted_identifier(), + )) + .await + .expect("Failed to insert post-delete row"); + + // Poll for 3 rows: original INSERT + DELETE + new INSERT. + let mut rows: Vec = Vec::with_capacity(3); + for _ in 0..50 { + rows = ch_db.query(DEFAULT_IDENTITY_DELETE_SELECT).await; + if rows.len() >= 3 { + break; + } + sleep(Duration::from_millis(100)).await; + } + + pipeline.shutdown_and_wait().await.unwrap(); + + // --- THEN: DELETE row has correct PK, zero-value non-PK columns --- + assert_eq!(rows.len(), 3, "expected INSERT + DELETE + INSERT"); + + let r = &rows[0]; + assert_eq!(r.id, 1); + assert_eq!(r.value, "to_delete"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0); + + let r = &rows[1]; + assert_eq!(r.id, 1, "DELETE row must have the correct PK"); + assert_eq!( + r.value, "", + "non-PK column should be zero-value (empty string) under default replica identity" + ); + assert_eq!(r.cdc_operation, "DELETE"); + assert!(r.cdc_lsn > 0); + + let r = &rows[2]; + assert_eq!(r.id, 2); + assert_eq!(r.value, "after"); + assert_eq!(r.cdc_operation, "INSERT"); + assert!(r.cdc_lsn > 0, "pipeline must continue after the delete"); +} From 6ceefab9e01d16538e02997dc502fe1664b94d54 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 15 Apr 2026 18:55:04 +0900 Subject: [PATCH 31/86] Use two pre-existing rows in default replica identity DELETE test With two rows before the delete, the PK assertion on the DELETE row is meaningful -- it proves the correct row was identified among multiple candidates. --- etl-destinations/tests/clickhouse_pipeline.rs | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 2c1f5f262..4af376041 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1479,29 +1479,30 @@ const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!( /// /// # GIVEN /// -/// A Postgres table with **default replica identity** (not FULL) and one row -/// (`id=1, value='to_delete'`), copied to ClickHouse. +/// A Postgres table with **default replica identity** (not FULL) and two rows +/// (`id=1, value='keep'` and `id=2, value='to_delete'`), copied to ClickHouse. 
/// /// # WHEN /// -/// The row is deleted in Postgres, then a new row (`id=2, value='after'`) is -/// inserted. +/// Row `id=2` is deleted in Postgres, then a new row (`id=3, value='after'`) +/// is inserted. /// /// # THEN /// -/// ClickHouse contains three rows: -/// - `id=1, value='to_delete', cdc_operation='INSERT', cdc_lsn=0` (table copy) -/// - `id=1, value='', cdc_operation='DELETE', cdc_lsn > 0` (streamed +/// ClickHouse contains four rows: +/// - `id=1, value='keep', cdc_operation='INSERT', cdc_lsn=0` (untouched) +/// - `id=2, value='to_delete', cdc_operation='INSERT', cdc_lsn=0` (table copy) +/// - `id=2, value='', cdc_operation='DELETE', cdc_lsn > 0` (streamed /// delete -- Postgres only sent the PK, so the non-PK `value` column is a /// zero-value empty string, not the original data) -/// - `id=2, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves +/// - `id=3, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves /// the pipeline continued after the delete) #[tokio::test(flavor = "multi_thread")] async fn delete_with_default_replica_identity() { init_test_tracing(); install_crypto_provider(); - // --- GIVEN: one row, default replica identity (NOT full) --- + // --- GIVEN: two rows, default replica identity (NOT full) --- let database = spawn_source_database().await; let table_name = test_table_name("default_identity_delete"); @@ -1520,11 +1521,11 @@ async fn delete_with_default_replica_identity() { database .run_sql(&format!( - "INSERT INTO {} (value) VALUES ('to_delete')", + "INSERT INTO {} (value) VALUES ('keep'), ('to_delete')", table_name.as_quoted_identifier(), )) .await - .expect("Failed to insert row"); + .expect("Failed to insert rows"); let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); @@ -1546,10 +1547,10 @@ async fn delete_with_default_replica_identity() { pipeline.start().await.unwrap(); table_ready.notified().await; - // --- WHEN: delete the row, then insert a new one --- + // --- WHEN: delete id=2, then insert a new row --- database .run_sql(&format!( - "DELETE FROM {} WHERE id = 1", + "DELETE FROM {} WHERE id = 2", table_name.as_quoted_identifier(), )) .await @@ -1563,11 +1564,11 @@ async fn delete_with_default_replica_identity() { .await .expect("Failed to insert post-delete row"); - // Poll for 3 rows: original INSERT + DELETE + new INSERT. - let mut rows: Vec = Vec::with_capacity(3); + // Poll for 4 rows: 2 copied INSERTs + DELETE + new INSERT. 
+ let mut rows: Vec = Vec::with_capacity(4); for _ in 0..50 { rows = ch_db.query(DEFAULT_IDENTITY_DELETE_SELECT).await; - if rows.len() >= 3 { + if rows.len() >= 4 { break; } sleep(Duration::from_millis(100)).await; @@ -1575,17 +1576,27 @@ async fn delete_with_default_replica_identity() { pipeline.shutdown_and_wait().await.unwrap(); - // --- THEN: DELETE row has correct PK, zero-value non-PK columns --- - assert_eq!(rows.len(), 3, "expected INSERT + DELETE + INSERT"); + // --- THEN: DELETE targets the correct row, non-PK columns are zero-values --- + assert_eq!( + rows.len(), + 4, + "expected 2 copied INSERTs + DELETE + new INSERT" + ); let r = &rows[0]; assert_eq!(r.id, 1); - assert_eq!(r.value, "to_delete"); + assert_eq!(r.value, "keep"); assert_eq!(r.cdc_operation, "INSERT"); assert_eq!(r.cdc_lsn, 0); let r = &rows[1]; - assert_eq!(r.id, 1, "DELETE row must have the correct PK"); + assert_eq!(r.id, 2); + assert_eq!(r.value, "to_delete"); + assert_eq!(r.cdc_operation, "INSERT"); + assert_eq!(r.cdc_lsn, 0); + + let r = &rows[2]; + assert_eq!(r.id, 2, "DELETE must target the correct row among multiple"); assert_eq!( r.value, "", "non-PK column should be zero-value (empty string) under default replica identity" @@ -1593,8 +1604,8 @@ async fn delete_with_default_replica_identity() { assert_eq!(r.cdc_operation, "DELETE"); assert!(r.cdc_lsn > 0); - let r = &rows[2]; - assert_eq!(r.id, 2); + let r = &rows[3]; + assert_eq!(r.id, 3); assert_eq!(r.value, "after"); assert_eq!(r.cdc_operation, "INSERT"); assert!(r.cdc_lsn > 0, "pipeline must continue after the delete"); From 589de1b0e080729f67d1c01751f3e509867238a2 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 16 Apr 2026 13:29:31 +0900 Subject: [PATCH 32/86] Add large batch (1024 rows) table copy test for ClickHouse Inserts 1024 rows with unique values into Postgres and verifies all arrive in ClickHouse after table copy. Spot-checks a sample of rows at known positions for correct id and value. --- etl-destinations/tests/clickhouse_pipeline.rs | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 4af376041..d5ca8eb75 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1610,3 +1610,100 @@ async fn delete_with_default_replica_identity() { assert_eq!(r.cdc_operation, "INSERT"); assert!(r.cdc_lsn > 0, "pipeline must continue after the delete"); } + +/// SELECT query used to verify the `large_batch` test. +const LARGE_BATCH_SELECT: &str = concat!( + "SELECT id, value, cdc_operation, cdc_lsn ", + "FROM \"test_large__batch\" ", + "ORDER BY id, cdc_lsn", +); + +/// Tests that a large table copy (1024 rows) completes without data loss or +/// corruption. +/// +/// # GIVEN +/// +/// A Postgres table with 1024 rows, each with a unique value derived from its +/// row number. +/// +/// # WHEN +/// +/// The pipeline runs initial table copy from Postgres to ClickHouse. +/// +/// # THEN +/// +/// All 1024 rows arrive in ClickHouse. A sample of rows at known positions +/// (first, last, powers of two, and a few interior points) are spot-checked +/// for correct id and value. 
+#[tokio::test(flavor = "multi_thread")]
+async fn large_batch_table_copy() {
+    init_test_tracing();
+    install_crypto_provider();
+
+    // --- GIVEN: 1024 rows with unique values ---
+    let database = spawn_source_database().await;
+    let table_name = test_table_name("large_batch");
+
+    let table_id = database
+        .create_table(table_name.clone(), true, &[("value", "text not null")])
+        .await
+        .expect("Failed to create large_batch test table");
+
+    let publication_name = "test_pub_clickhouse_large_batch";
+    database
+        .create_publication(publication_name, std::slice::from_ref(&table_name))
+        .await
+        .expect("Failed to create large_batch publication");
+
+    let row_count: usize = 1024;
+    let values: Vec<String> = (1..=row_count).map(|i| format!("('val_{i:04}')")).collect();
+    // Insert in chunks to avoid exceeding Postgres query size limits.
+    for chunk in values.chunks(256) {
+        database
+            .run_sql(&format!(
+                "INSERT INTO {} (value) VALUES {}",
+                table_name.as_quoted_identifier(),
+                chunk.join(", "),
+            ))
+            .await
+            .expect("Failed to insert large_batch rows");
+    }
+
+    let ch_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = ch_db.build_destination(pipeline_id, store.clone());
+
+    let table_ready = store
+        .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready)
+        .await;
+
+    // --- WHEN: pipeline copies all rows ---
+    let mut pipeline = create_pipeline(
+        &database.config,
+        pipeline_id,
+        publication_name.to_owned(),
+        store,
+        destination,
+    );
+
+    pipeline.start().await.unwrap();
+    table_ready.notified().await;
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // --- THEN: all rows arrive, spot-check a sample ---
+    let rows: Vec = ch_db.query(LARGE_BATCH_SELECT).await;
+    assert_eq!(rows.len(), row_count, "all 1024 rows must arrive");
+
+    // Spot-check: first, last, powers of two, and a few interior points.
+    let sample_ids: &[usize] = &[
+        1, 2, 4, 8, 16, 32, 64, 100, 128, 256, 500, 512, 750, 1000, 1024,
+    ];
+    for &id in sample_ids {
+        let r = &rows[id - 1];
+        assert_eq!(r.id, id as i64, "row {id} id mismatch");
+        assert_eq!(r.value, format!("val_{id:04}"), "row {id} value mismatch");
+        assert_eq!(r.cdc_operation, "INSERT");
+        assert_eq!(r.cdc_lsn, 0);
+    }
+}
From b26235b238ce6a074c99215ec78ca8ae315e3bb2 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Thu, 16 Apr 2026 13:36:07 +0900
Subject: [PATCH 33/86] Add ping connectivity tests for ClickHouseClient

Verifies ping returns Ok against a running instance and Err against an
unreachable URL. This code path is used by the API validation layer for
connection checks.
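For orientation, the validation layer's use of this path amounts to building
a client and propagating the ping result. A minimal sketch, assuming a
surrounding handler (map_unreachable_error is a hypothetical name used only
for illustration; ClickHouseClient::new and ping are the real entry points):

    // Sketch: reject a destination config whose endpoint is unreachable.
    let client = ClickHouseClient::new(url, user, password, database);
    if let Err(error) = client.ping().await {
        // Hypothetical error mapping; the actual validator wiring is
        // added in a later patch in this series.
        return Err(map_unreachable_error(error));
    }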
---
 etl-destinations/tests/clickhouse_pipeline.rs | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index d5ca8eb75..ff71000c9 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -6,7 +6,11 @@ use etl::test_utils::notifying_store::NotifyingStore;
 use etl::test_utils::pipeline::create_pipeline;
 use etl::types::PipelineId;
 use etl_destinations::clickhouse::ClickHouseInserterConfig;
-use etl_destinations::clickhouse::test_utils::{ClickHouseTestDatabase, setup_clickhouse_database};
+use etl_destinations::clickhouse::client::ClickHouseClient;
+use etl_destinations::clickhouse::test_utils::{
+    ClickHouseTestDatabase, get_clickhouse_password, get_clickhouse_url, get_clickhouse_user,
+    setup_clickhouse_database,
+};
 use etl_telemetry::tracing::init_test_tracing;
 use rand::random;
 use std::sync::Once;
@@ -1707,3 +1711,36 @@ async fn large_batch_table_copy() {
         assert_eq!(r.cdc_lsn, 0);
     }
 }
+
+/// # GIVEN
+/// A ClickHouseClient pointed at the running test ClickHouse instance.
+///
+/// # WHEN
+/// `ping()` is called.
+///
+/// # THEN
+/// It returns Ok(()).
+#[tokio::test(flavor = "multi_thread")]
+async fn ping_succeeds_against_running_clickhouse() {
+    let client = ClickHouseClient::new(
+        get_clickhouse_url(),
+        get_clickhouse_user(),
+        get_clickhouse_password(),
+        "default",
+    );
+    assert!(client.ping().await.is_ok());
+}
+
+/// # GIVEN
+/// A ClickHouseClient pointed at a URL where nothing is listening.
+///
+/// # WHEN
+/// `ping()` is called.
+///
+/// # THEN
+/// It returns Err.
+#[tokio::test(flavor = "multi_thread")]
+async fn ping_fails_against_unreachable_clickhouse() {
+    let client = ClickHouseClient::new("http://localhost:1", "nobody", None::<String>, "default");
+    assert!(client.ping().await.is_err());
+}
From a6d09a9f81866b0a1b576f3d70f8ac39c6cd71cd Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 17 Apr 2026 15:31:28 +0900
Subject: [PATCH 34/86] Refine default replica identity DELETE test style and LSN check

Reword composite WHEN to avoid 'then' and assert that the post-delete
INSERT has a strictly higher LSN than the DELETE row.
---
 etl-destinations/tests/clickhouse_pipeline.rs | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index ff71000c9..67420dba7 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -1488,7 +1488,7 @@ const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!(
 ///
 /// # WHEN
 ///
-/// Row `id=2` is deleted in Postgres, then a new row (`id=3, value='after'`)
+/// Row `id=2` is deleted in Postgres, and a new row (`id=3, value='after'`)
 /// is inserted.
 ///
 /// # THEN
@@ -1551,7 +1551,7 @@ async fn delete_with_default_replica_identity() {
     pipeline.start().await.unwrap();
     table_ready.notified().await;
 
-    // --- WHEN: delete id=2, then insert a new row ---
+    // --- WHEN: delete id=2, and insert a new row ---
     database
         .run_sql(&format!(
             "DELETE FROM {} WHERE id = 2",
@@ -1613,6 +1613,10 @@ async fn delete_with_default_replica_identity() {
     assert_eq!(r.value, "after");
     assert_eq!(r.cdc_operation, "INSERT");
     assert!(r.cdc_lsn > 0, "pipeline must continue after the delete");
+    assert!(
+        r.cdc_lsn > rows[2].cdc_lsn,
+        "post-delete INSERT must have a higher LSN than the DELETE"
+    );
 }
 
 /// SELECT query used to verify the `large_batch` test.
From d57f3c8464cecaef1a15e5e333238953b2f29a8c Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 20 Apr 2026 11:51:34 +0900
Subject: [PATCH 35/86] Use Url for ClickHouse

---
 Cargo.toml | 2 +-
 etl-api/src/configs/destination.rs | 145 +++++++++++++++++-
 etl-api/src/utils.rs | 48 ++++++
 etl-api/src/validation/validators.rs | 10 +-
 etl-config/src/shared/destination.rs | 9 +-
 etl-destinations/Cargo.toml | 1 +
 etl-destinations/src/clickhouse/client.rs | 5 +-
 etl-destinations/src/clickhouse/core.rs | 3 +-
 etl-destinations/src/clickhouse/test_utils.rs | 3 +-
 etl-destinations/tests/clickhouse_pipeline.rs | 10 +-
 etl-examples/src/bin/clickhouse.rs | 3 +-
 etl-replicator/src/core.rs | 2 +-
 12 files changed, 215 insertions(+), 26 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 7db83f553..eeaa96ce1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -92,7 +92,7 @@ tracing-actix-web = { version = "0.7.19", default-features = false }
 tracing-appender = { version = "0.2.3", default-features = false }
 tracing-log = { version = "0.2.0", default-features = false }
 tracing-subscriber = { version = "0.3.19", default-features = false }
-url = { version = "2.5.8" }
+url = { version = "2.5.8", features = ["serde"] }
 utoipa = { version = "5.4.0", default-features = false }
 utoipa-swagger-ui = { version = "9.0.2", default-features = false, features = ["vendored"] }
 uuid = { version = "1.17.0", default-features = false }
diff --git a/etl-api/src/configs/destination.rs b/etl-api/src/configs/destination.rs
index 8826ba772..17e6093d3 100644
--- a/etl-api/src/configs/destination.rs
+++ b/etl-api/src/configs/destination.rs
@@ -2,6 +2,7 @@ use etl_config::SerializableSecretString;
 use etl_config::shared::{DestinationConfig, IcebergConfig};
 use secrecy::ExposeSecret;
 use serde::{Deserialize, Serialize};
+use url::Url;
 use utoipa::ToSchema;
 
 use crate::configs::encryption::{
@@ -40,10 +41,10 @@ pub enum FullApiDestinationConfig {
         connection_pool_size: Option<usize>,
     },
     ClickHouse {
-        /// ClickHouse HTTP(S) endpoint URL
-        #[schema(example = "http://test:8123")]
-        #[serde(deserialize_with = "crate::utils::trim_string")]
-        url: String, //TODO: use url type instead
+        /// ClickHouse HTTP(S) endpoint URL.
+        #[schema(value_type = String, example = "http://test:8123")]
+        #[serde(deserialize_with = "crate::utils::trim_http_url")]
+        url: Url,
         /// ClickHouse user name
         #[schema(example = "foo")]
         #[serde(deserialize_with = "crate::utils::trim_string")]
         user: String,
@@ -219,7 +220,7 @@ pub enum StoredDestinationConfig {
         connection_pool_size: usize,
     },
     ClickHouse {
-        url: String, //TODO: use url type instead
+        url: Url,
         user: String,
         password: Option<SerializableSecretString>,
         database: String,
@@ -574,7 +575,7 @@ pub enum EncryptedStoredDestinationConfig {
         connection_pool_size: usize,
     },
     ClickHouse {
-        url: String, //TODO: use url type instead
+        url: Url,
         user: String,
         password: Option<SerializableSecretString>,
         database: String,
@@ -1028,6 +1029,138 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_stored_destination_config_encryption_decryption_clickhouse() {
+        let config = StoredDestinationConfig::ClickHouse {
+            url: Url::parse("https://example.com:8443").unwrap(),
+            user: "etl".to_string(),
+            password: Some(SerializableSecretString::from("secret".to_string())),
+            database: "analytics".to_string(),
+        };
+
+        let key = EncryptionKey {
+            id: 1,
+            key: generate_random_key::<32>().unwrap(),
+        };
+
+        let encrypted = config.clone().encrypt(&key).unwrap();
+        let decrypted = encrypted.decrypt(&key).unwrap();
+
+        match (config, decrypted) {
+            (
+                StoredDestinationConfig::ClickHouse {
+                    url: u1,
+                    user: user1,
+                    password: p1,
+                    database: d1,
+                },
+                StoredDestinationConfig::ClickHouse {
+                    url: u2,
+                    user: user2,
+                    password: p2,
+                    database: d2,
+                },
+            ) => {
+                assert_eq!(u1, u2);
+                assert_eq!(user1, user2);
+                assert_eq!(d1, d2);
+                assert_eq!(
+                    p1.as_ref().map(|value| value.expose_secret()),
+                    p2.as_ref().map(|value| value.expose_secret())
+                );
+            }
+            _ => panic!("Config types don't match"),
+        }
+    }
+
+    #[test]
+    fn test_full_api_destination_config_conversion_clickhouse() {
+        let full_config = FullApiDestinationConfig::ClickHouse {
+            url: Url::parse("https://example.com:8443").unwrap(),
+            user: "etl".to_string(),
+            password: Some(SerializableSecretString::from("secret".to_string())),
+            database: "analytics".to_string(),
+        };
+
+        let stored: StoredDestinationConfig = full_config.clone().into();
+        let back_to_full: FullApiDestinationConfig = stored.into();
+
+        match (full_config, back_to_full) {
+            (
+                FullApiDestinationConfig::ClickHouse {
+                    url: u1,
+                    user: user1,
+                    password: p1,
+                    database: d1,
+                },
+                FullApiDestinationConfig::ClickHouse {
+                    url: u2,
+                    user: user2,
+                    password: p2,
+                    database: d2,
+                },
+            ) => {
+                assert_eq!(u1, u2);
+                assert_eq!(user1, user2);
+                assert_eq!(d1, d2);
+                assert_eq!(
+                    p1.as_ref().map(|value| value.expose_secret()),
+                    p2.as_ref().map(|value| value.expose_secret())
+                );
+            }
+            _ => panic!("Config types don't match"),
+        }
+    }
+
+    #[test]
+    fn test_full_api_destination_config_deserializes_clickhouse_url() {
+        let json = r#"
+        {
+            "click_house": {
+                "url": " https://example.com:8443 ",
+                "user": "etl",
+                "database": "analytics"
+            }
+        }
+        "#;
+
+        let deserialized: FullApiDestinationConfig = serde_json::from_str(json).unwrap();
+        match deserialized {
+            FullApiDestinationConfig::ClickHouse {
+                url,
+                user,
+                password,
+                database,
+            } => {
+                assert_eq!(url.as_str(), "https://example.com:8443/");
+                assert_eq!(user, "etl");
+                assert!(password.is_none());
+                assert_eq!(database, "analytics");
+            }
+            _ => panic!("Deserialization failed or variant mismatch"),
+        }
+    }
+
+    #[test]
+    fn test_full_api_destination_config_rejects_non_http_clickhouse_url() {
+        let json = r#"
+        {
+            "click_house": {
+                "url": "ftp://example.com/data",
+                "user": "etl",
+                "database": "analytics"
+            }
+        }
+        "#;
+
+        let error = serde_json::from_str::<FullApiDestinationConfig>(json).unwrap_err();
+        assert!(
+            error
+                .to_string()
+                .contains("url must use http or https scheme")
+        );
+    }
+
     #[test]
     fn test_full_api_destination_config_conversion_bigquery() {
         let full_config = FullApiDestinationConfig::BigQuery {
diff --git a/etl-api/src/utils.rs b/etl-api/src/utils.rs
index c5184a5ac..5a8214d1e 100644
--- a/etl-api/src/utils.rs
+++ b/etl-api/src/utils.rs
@@ -1,6 +1,8 @@
 use etl_config::SerializableSecretString;
 use rand::Rng;
+use serde::de::Error as _;
 use serde::{Deserialize, Deserializer};
+use url::Url;
 
 /// Deserializes a string and trims leading and trailing whitespace.
 pub fn trim_string<'de, D>(deserializer: D) -> Result<String, D::Error>
 where
@@ -31,6 +33,22 @@ where
     Ok(opt.map(|s| SerializableSecretString::from(s.trim().to_string())))
 }
 
+/// Deserializes an HTTP(S) URL string and trims leading and trailing whitespace.
+pub fn trim_http_url<'de, D>(deserializer: D) -> Result<Url, D::Error>
+where
+    D: Deserializer<'de>,
+{
+    let s = String::deserialize(deserializer)?;
+    let url = Url::parse(s.trim()).map_err(D::Error::custom)?;
+
+    match url.scheme() {
+        "http" | "https" => Ok(url),
+        scheme => Err(D::Error::custom(format!(
+            "url must use http or https scheme, got {scheme}"
+        ))),
+    }
+}
+
 /// Generates a random alphabetic string of length `len`.
 pub fn generate_random_alpha_str(len: usize) -> String {
     let chars = [
@@ -296,4 +314,34 @@ mod tests {
         let result: TestStruct = serde_json::from_str(json).unwrap();
         assert_eq!(result.value, Some("".to_string()));
     }
+
+    #[test]
+    fn test_trim_http_url_trims_and_parses() {
+        #[derive(Debug, Deserialize)]
+        struct TestStruct {
+            #[serde(rename = "value", deserialize_with = "trim_http_url")]
+            _value: Url,
+        }
+
+        let json = r#"{"value": " https://example.com:8443/path "}"#;
+        let result: TestStruct = serde_json::from_str(json).unwrap();
+        assert_eq!(result._value.as_str(), "https://example.com:8443/path");
+    }
+
+    #[test]
+    fn test_trim_http_url_rejects_non_http_scheme() {
+        #[derive(Debug, Deserialize)]
+        struct TestStruct {
+            #[serde(rename = "value", deserialize_with = "trim_http_url")]
+            _value: Url,
+        }
+
+        let json = r#"{"value": "ftp://example.com/data"}"#;
+        let error = serde_json::from_str::<TestStruct>(json).unwrap_err();
+        assert!(
+            error
+                .to_string()
+                .contains("url must use http or https scheme")
+        );
+    }
 }
diff --git a/etl-api/src/validation/validators.rs b/etl-api/src/validation/validators.rs
index 31a178d83..e0eb905fd 100644
--- a/etl-api/src/validation/validators.rs
+++ b/etl-api/src/validation/validators.rs
@@ -13,6 +13,7 @@ use etl_destinations::iceberg::{
 };
 use secrecy::ExposeSecret;
 use sqlx::FromRow;
+use url::Url;
 
 use crate::configs::destination::{FullApiDestinationConfig, FullApiIcebergConfig};
 use crate::configs::pipeline::FullApiPipelineConfig;
@@ -656,19 +657,14 @@ impl Validator for BigQueryValidator {
 
 /// Validates Clickhouse destination connectivity and dataset accessibility.
 #[derive(Debug)]
 struct ClickHouseValidator {
-    url: String, //TODO: use url type instead
+    url: Url,
     user: String,
     password: Option<String>,
     database: String,
 }
 
 impl ClickHouseValidator {
-    fn new(
-        url: String, //TODO: use url type instead
-        user: String,
-        password: Option<String>,
-        database: String,
-    ) -> Self {
+    fn new(url: Url, user: String, password: Option<String>, database: String) -> Self {
         Self {
             url,
             user,
diff --git a/etl-config/src/shared/destination.rs b/etl-config/src/shared/destination.rs
index c06b52dbc..80764ab4a 100644
--- a/etl-config/src/shared/destination.rs
+++ b/etl-config/src/shared/destination.rs
@@ -1,5 +1,6 @@
 use secrecy::SecretString;
 use serde::{Deserialize, Serialize};
+use url::Url;
 
 const fn default_connection_pool_size() -> usize {
     DestinationConfig::DEFAULT_CONNECTION_POOL_SIZE
@@ -47,8 +48,8 @@ pub enum DestinationConfig {
         connection_pool_size: usize,
     },
     ClickHouse {
-        /// ClickHouse HTTP(S) endpoint URL
-        url: String, //TODO: use url instead
+        /// ClickHouse HTTP(S) endpoint URL.
+        url: Url,
         /// ClickHouse user name
         user: String,
        /// ClickHouse password (omit for passwordless access)
@@ -235,8 +236,8 @@ pub enum DestinationConfigWithoutSecrets {
         connection_pool_size: usize,
     },
     ClickHouse {
-        /// ClickHouse HTTP(S) endpoint URL
-        url: String, //TODO: use url instead
+        /// ClickHouse HTTP(S) endpoint URL.
+        url: Url,
         /// ClickHouse user name
         user: String,
         /// ClickHouse target database
diff --git a/etl-destinations/Cargo.toml b/etl-destinations/Cargo.toml
index 85cef3cfd..452b0465f 100644
--- a/etl-destinations/Cargo.toml
+++ b/etl-destinations/Cargo.toml
@@ -52,6 +52,7 @@ clickhouse = [
     "dep:serde",
     "dep:futures",
     "dep:parking_lot",
+    "dep:url",
 ]
 egress = ["etl/egress"]
 # We assume that `test-utils` is always used in conjunction with `bigquery` or `iceberg` thus we only
diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index 37b00cb4a..80628a086 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -4,6 +4,7 @@ use std::time::Instant;
 use clickhouse::Client;
 use etl::error::{ErrorKind, EtlResult};
 use etl::etl_error;
+use url::Url;
 
 use crate::clickhouse::encoding::{ClickHouseValue, rb_encode_row};
 use crate::clickhouse::metrics::ETL_CH_INSERT_DURATION_SECONDS;
@@ -31,13 +32,13 @@ impl ClickHouseClient {
     /// When `url` starts with `https://`, TLS is handled automatically by the
     /// `rustls-tls` feature using webpki root certificates.
     pub fn new(
-        url: impl Into<String>,
+        url: Url,
         user: impl Into<String>,
         password: Option<String>,
         database: impl Into<String>,
     ) -> Self {
         let mut client = Client::default()
-            .with_url(url)
+            .with_url(url.to_string())
             .with_user(user)
             .with_database(database);
diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index cd3403c74..f2190ffde 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -20,6 +20,7 @@ use parking_lot::RwLock;
 use std::time::Instant;
 use tokio::task::JoinSet;
 use tracing::{debug, info};
+use url::Url;
 
 // ── CDC operation type ────────────────────────────────────────────────────────
 
@@ -95,7 +96,7 @@ where
     /// When using an `https://` URL, TLS is handled automatically by the `rustls-tls`
     /// feature using webpki root certificates.
     pub fn new(
-        url: impl Into<String>,
+        url: Url,
         user: impl Into<String>,
         password: Option<String>,
         database: impl Into<String>,
diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs
index 75a745114..d9e64fe68 100644
--- a/etl-destinations/src/clickhouse/test_utils.rs
+++ b/etl-destinations/src/clickhouse/test_utils.rs
@@ -5,6 +5,7 @@ use etl::store::schema::SchemaStore;
 use etl::store::state::StateStore;
 use etl::types::PipelineId;
 use tokio::runtime::Handle;
+use url::Url;
 use uuid::Uuid;
 
 use crate::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
@@ -137,7 +138,7 @@ impl ClickHouseTestDatabase {
         S: StateStore + SchemaStore + Send + Sync,
     {
         ClickHouseDestination::new(
-            &self.url,
+            Url::parse(&self.url).expect("failed to parse test ClickHouse URL"),
             &self.user,
             self.password.clone(),
             &self.database,
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index 67420dba7..5c312c306 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -16,6 +16,7 @@ use rand::random;
 use std::sync::Once;
 use std::time::Duration;
 use tokio::time::sleep;
+use url::Url;
 
 use crate::support::clickhouse::{AllTypesRow, BoundaryValuesRow};
 
@@ -1727,7 +1728,7 @@ async fn large_batch_table_copy() {
 #[tokio::test(flavor = "multi_thread")]
 async fn ping_succeeds_against_running_clickhouse() {
     let client = ClickHouseClient::new(
-        get_clickhouse_url(),
+        Url::parse(&get_clickhouse_url()).unwrap(),
         get_clickhouse_user(),
         get_clickhouse_password(),
         "default",
@@ -1745,6 +1746,11 @@ async fn ping_succeeds_against_running_clickhouse() {
 /// It returns Err.
 #[tokio::test(flavor = "multi_thread")]
 async fn ping_fails_against_unreachable_clickhouse() {
-    let client = ClickHouseClient::new("http://localhost:1", "nobody", None::<String>, "default");
+    let client = ClickHouseClient::new(
+        Url::parse("http://localhost:1").unwrap(),
+        "nobody",
+        None::<String>,
+        "default",
+    );
     assert!(client.ping().await.is_err());
 }
diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs
index bedc0a0e1..50efc75f3 100644
--- a/etl-examples/src/bin/clickhouse.rs
+++ b/etl-examples/src/bin/clickhouse.rs
@@ -50,6 +50,7 @@ use std::sync::Once;
 use tokio::signal;
 use tracing::{error, info};
 use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt};
+use url::Url;
 
 /// Ensures crypto provider is only initialized once.
 static INIT_CRYPTO: Once = Once::new();
@@ -211,7 +212,7 @@ async fn main_impl() -> Result<(), Box<dyn std::error::Error>> {
     // Initialize the ClickHouse destination.
     // Tables are created automatically as append-only MergeTree tables.
     let clickhouse_destination = ClickHouseDestination::new(
-        args.ch_args.ch_url,
+        Url::parse(&args.ch_args.ch_url)?,
         args.ch_args.ch_user,
         args.ch_args.ch_password,
         args.ch_args.ch_database,
diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs
index 8b66aa764..1d9583570 100644
--- a/etl-replicator/src/core.rs
+++ b/etl-replicator/src/core.rs
@@ -211,7 +211,7 @@ pub async fn start_replicator_with_config(
         max_bytes_per_insert,
     };
     let destination = ClickHouseDestination::new(
-        url,
+        url.clone(),
         user,
         password.as_ref().map(|p| p.expose_secret().to_string()),
         database,
From c1ef49d6f7affc732e0445bf020d4bb9dcb7a943 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 20 Apr 2026 13:14:49 +0900
Subject: [PATCH 36/86] Keep ClickHouse password secret

---
 etl-api/src/validation/validators.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/etl-api/src/validation/validators.rs b/etl-api/src/validation/validators.rs
index e0eb905fd..cf6187977 100644
--- a/etl-api/src/validation/validators.rs
+++ b/etl-api/src/validation/validators.rs
@@ -4,7 +4,7 @@ use std::collections::HashMap;
 
 use async_trait::async_trait;
 use etl::store::both::memory::MemoryStore;
-use etl_config::parse_ducklake_url;
+use etl_config::{SerializableSecretString, parse_ducklake_url};
 use etl_destinations::bigquery::BigQueryClient;
 use etl_destinations::clickhouse::ClickHouseClient;
 use etl_destinations::ducklake::{DuckLakeDestination, S3Config as DucklakeS3Config};
@@ -659,12 +659,17 @@ struct ClickHouseValidator {
     url: Url,
     user: String,
-    password: Option<String>,
+    password: Option<SerializableSecretString>,
     database: String,
 }
 
 impl ClickHouseValidator {
-    fn new(url: Url, user: String, password: Option<String>, database: String) -> Self {
+    fn new(
+        url: Url,
+        user: String,
+        password: Option<SerializableSecretString>,
+        database: String,
+    ) -> Self {
         Self {
             url,
             user,
@@ -683,7 +688,9 @@ impl Validator for ClickHouseValidator {
         let client = ClickHouseClient::new(
             self.url.clone(),
             self.user.clone(),
-            self.password.clone(),
+            self.password
+                .as_ref()
+                .map(|password| password.expose_secret().to_owned()),
             self.database.clone(),
         );
         match client.ping().await {
@@ -963,7 +970,7 @@ impl Validator for DestinationValidator {
                 let validator = ClickHouseValidator::new(
                     url.clone(),
                     user.clone(),
-                    password.as_ref().map(|p| p.expose_secret().to_string()),
+                    password.clone(),
                     database.clone(),
                 );
                 validator.validate(ctx).await
From 7ba200a6e0853e731f3426897141b64e50d3e871 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 20 Apr 2026 17:57:07 +0900
Subject: [PATCH 37/86] Update ColumnSchema in ClickHouse schema tests for new field layout

ColumnSchema.primary was replaced with ordinal_position and
primary_key_ordinal_position upstream.
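Concretely, primary-key membership moves from a boolean flag to explicit
ordinals. A minimal sketch using the same literals as the test diff below
(only the two new fields differ from the old layout):

    // Old upstream shape (removed): `primary: true` on the column itself.
    // New shape: every column records its ordinal_position, and key columns
    // additionally record their position within the primary key.
    let id_column = ColumnSchema {
        name: "id".to_string(),
        typ: Type::INT4,
        modifier: -1,
        ordinal_position: 1,
        primary_key_ordinal_position: Some(1), // None on non-key columns
        nullable: false,
    };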
---
 etl-destinations/src/clickhouse/schema.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs
index cd6dae260..c373475e6 100644
--- a/etl-destinations/src/clickhouse/schema.rs
+++ b/etl-destinations/src/clickhouse/schema.rs
@@ -236,15 +236,17 @@ mod tests {
                 name: "id".to_string(),
                 typ: Type::INT4,
                 modifier: -1,
+                ordinal_position: 1,
+                primary_key_ordinal_position: Some(1),
                 nullable: false,
-                primary: true,
             },
             ColumnSchema {
                 name: "name".to_string(),
                 typ: Type::TEXT,
                 modifier: -1,
+                ordinal_position: 2,
+                primary_key_ordinal_position: None,
                 nullable: true,
-                primary: false,
             },
         ];
         let sql = build_create_table_sql("public_users", &schemas);
@@ -264,8 +266,9 @@ mod tests {
             name: "id".to_string(),
             typ: Type::INT4,
             modifier: -1,
+            ordinal_position: 1,
+            primary_key_ordinal_position: Some(1),
             nullable: false,
-            primary: true,
         }];
         let sql = build_create_table_sql("public_t", &schemas);
         assert!(
@@ -286,8 +289,9 @@ mod tests {
             name: "tags".to_string(),
             typ: Type::TEXT_ARRAY,
             modifier: -1,
+            ordinal_position: 1,
+            primary_key_ordinal_position: None,
             nullable: false,
-            primary: false,
         }];
         let sql = build_create_table_sql("public_t", &schemas);
         assert!(
From f0ef83d374475224d97fdd48b1f4cb04482d906a Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 20 Apr 2026 18:20:40 +0900
Subject: [PATCH 38/86] Adapt ClickHouse destination to new Destination trait and event APIs

- ensure_table_exists now takes &ReplicatedTableSchema instead of
  TableId, deriving table name and column schemas directly from the
  schema. No more store lookups for schema or table mappings (see the
  sketch below).
- ClickHouseDestination is no longer generic over S (store). The store
  field and SchemaStore/StateStore trait bounds are removed since the
  schema is now passed in via method arguments and events.
- Event processing uses replicated_table_schema.id() instead of
  table_id, and truncate uses truncated_tables instead of rel_ids.
- Test helpers simplified: build_destination() no longer takes
  pipeline_id or store arguments.
- clickhouse_pipeline tests registered in consolidated main.rs test
  binary.
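The sketch referenced above: table-name resolution collapses from two
fallible, async store lookups into a pure derivation from the schema that
now travels with every event (condensed from the core.rs diff below):

    // Before: self.store.get_table_schema(..) + get_table_mapping(..),
    // each an awaited, fallible store round-trip.
    // After: derived locally from the ReplicatedTableSchema argument.
    let table_name = schema.name();
    let ch_table_name =
        table_name_to_clickhouse_table_name(&table_name.schema, &table_name.name);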
---
 etl-destinations/src/clickhouse/core.rs | 128 ++++++++----------
 etl-destinations/src/clickhouse/test_utils.rs | 49 +++----
 etl-destinations/tests/clickhouse_pipeline.rs | 33 ++---
 etl-destinations/tests/main.rs | 2 +
 etl-destinations/tests/support/clickhouse.rs | 1 -
 etl-examples/src/bin/clickhouse.rs | 1 -
 etl-replicator/src/core.rs | 1 -
 7 files changed, 86 insertions(+), 129 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index f2190ffde..9dd6b04d6 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -1,7 +1,4 @@
-use std::{
-    collections::{HashMap, HashSet},
-    sync::Arc,
-};
+use std::{collections::HashMap, sync::Arc};
 
 use crate::clickhouse::client::ClickHouseClient;
 use crate::clickhouse::encoding::{ClickHouseValue, cell_to_clickhouse_value};
@@ -12,9 +9,7 @@ use etl::destination::async_result::{
 };
 use etl::error::{ErrorKind, EtlResult};
 use etl::etl_error;
-use etl::store::schema::SchemaStore;
-use etl::store::state::StateStore;
-use etl::types::{Cell, Event, TableId, TableRow, is_array_type};
+use etl::types::{Cell, Event, ReplicatedTableSchema, TableId, TableRow, is_array_type};
 use etl::{destination::Destination, types::PgLsn};
 use parking_lot::RwLock;
 use std::time::Instant;
@@ -68,16 +63,15 @@ pub struct ClickHouseInserterConfig {
 ///
 /// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`, `cdc_lsn`)
 /// appended to each row. Rows are encoded as RowBinary and sent via
-/// `INSERT INTO "table" FORMAT RowBinary` — no column-name header required.
+/// `INSERT INTO "table" FORMAT RowBinary` -- no column-name header required.
 ///
 /// The struct is cheaply cloneable: `client` wraps an `Arc` internally, and
-/// `table_cache` is wrapped in `Arc>`.
+/// `table_cache` is wrapped in `Arc>`.
 #[derive(Clone)]
-pub struct ClickHouseDestination<S> {
+pub struct ClickHouseDestination {
     client: ClickHouseClient,
     inserter_config: Arc<ClickHouseInserterConfig>,
-    store: Arc<S>,
-    /// Cache: ClickHouse table name → `Arc<[bool]>` (nullable flags per column,
+    /// Cache: ClickHouse table name -> `Arc<[bool]>` (nullable flags per column,
     /// including the two trailing CDC columns which are always `false`).
     ///
     /// `std::sync::RwLock` is appropriate here: both reads (hot path) and writes (rare,
@@ -87,10 +81,7 @@ pub struct ClickHouseDestination {
     table_cache: Arc<RwLock<HashMap<String, Arc<[bool]>>>>,
 }
 
-impl<S> ClickHouseDestination<S>
-where
-    S: StateStore + SchemaStore + Send + Sync,
-{
+impl ClickHouseDestination {
     /// Creates a new `ClickHouseDestination`.
     ///
     /// When using an `https://` URL, TLS is handled automatically by the `rustls-tls`
     /// feature using webpki root certificates.
     pub fn new(
         url: Url,
         user: impl Into<String>,
         password: Option<String>,
         database: impl Into<String>,
         inserter_config: ClickHouseInserterConfig,
-        store: S,
     ) -> EtlResult<Self> {
         register_metrics();
 
         Ok(Self {
             client: ClickHouseClient::new(url, user, password, database),
             inserter_config: Arc::new(inserter_config),
-            store: Arc::new(store),
             table_cache: Arc::new(RwLock::new(HashMap::new())),
         })
     }
 
-    /// Ensures the ClickHouse table for `table_id` exists, returning
+    /// Ensures the ClickHouse table for the given schema exists, returning
     /// `(ch_table_name, nullable_flags)`.
     ///
     /// Uses a two-phase locking strategy:
-    /// 1. Fast-path read (no await) → return cached entry if present.
+    /// 1. Fast-path read (no await) -- return cached entry if present.
     /// 2. Slow-path: compute DDL, run `CREATE TABLE IF NOT EXISTS` (await, no lock held),
     ///    then write-lock to insert (using `or_insert` for the concurrent first-writer race).
-    async fn ensure_table_exists(&self, table_id: TableId) -> EtlResult<(String, Arc<[bool]>)> {
-        let table_schema = self
-            .store
-            .get_table_schema(&table_id)
-            .await?
-            .ok_or_else(|| {
-                etl_error!(
-                    ErrorKind::MissingTableSchema,
-                    "Table schema not found",
-                    format!("No schema found for table {table_id}")
-                )
-            })?;
-
-        let ch_table_name = {
-            if let Some(name) = self.store.get_table_mapping(&table_id).await? {
-                name
-            } else {
-                let name = table_name_to_clickhouse_table_name(
-                    &table_schema.name.schema,
-                    &table_schema.name.name,
-                );
-                self.store
-                    .store_table_mapping(table_id, name.clone())
-                    .await?;
-                name
-            }
-        };
+    async fn ensure_table_exists(
+        &self,
+        schema: &ReplicatedTableSchema,
+    ) -> EtlResult<(String, Arc<[bool]>)> {
+        let table_name = schema.name();
+        let ch_table_name =
+            table_name_to_clickhouse_table_name(&table_name.schema, &table_name.name);
 
         {
             let guard = self.table_cache.read();
@@ -162,7 +131,7 @@ where
         // `nullable_flags[i] = true` for an array column, `rb_encode_nullable` would prepend a
         // spurious `0x00` byte that ClickHouse reads as `varint(0)` (empty array), causing every
         // subsequent column to be read from the wrong offset and ultimately "Cannot read all data".
-        let column_schemas = &table_schema.column_schemas;
+        let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
         let mut nullable_flags_vec: Vec<bool> = column_schemas
             .iter()
             .map(|c| c.nullable && !is_array_type(&c.typ))
@@ -172,7 +141,7 @@ where
         let nullable_flags: Arc<[bool]> = nullable_flags_vec.into();
 
         // Execute DDL (no lock held during this await).
-        let ddl = build_create_table_sql(&ch_table_name, column_schemas);
+        let ddl = build_create_table_sql(&ch_table_name, &column_schemas);
         let ddl_start = Instant::now();
         self.client.execute_ddl(&ddl).await?;
         metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone())
@@ -191,17 +160,17 @@ where
         Ok((ch_table_name, stored_flags))
     }
 
-    async fn truncate_table_inner(&self, table_id: TableId) -> EtlResult<()> {
-        let (ch_table_name, _) = self.ensure_table_exists(table_id).await?;
+    async fn truncate_table_inner(&self, schema: &ReplicatedTableSchema) -> EtlResult<()> {
+        let (ch_table_name, _) = self.ensure_table_exists(schema).await?;
         self.client.truncate_table(&ch_table_name).await
     }
 
     async fn write_table_rows_inner(
         &self,
-        table_id: TableId,
+        schema: &ReplicatedTableSchema,
         table_rows: Vec<TableRow>,
     ) -> EtlResult<()> {
-        let (ch_table_name, nullable_flags) = self.ensure_table_exists(table_id).await?;
+        let (ch_table_name, nullable_flags) = self.ensure_table_exists(schema).await?;
 
         let rows: Vec<Vec<ClickHouseValue>> = table_rows
             .into_iter()
@@ -234,7 +203,7 @@ where
     /// 2. Writes those rows concurrently.
     /// 3. Drains consecutive Truncate events (deduplicated) and executes them.
     ///
-    /// Breaking at a Truncate never skips events — the outer loop resumes from that
+    /// Breaking at a Truncate never skips events -- the outer loop resumes from that
     /// position, so rows accumulated before the Truncate are flushed first, then the
     /// Truncate fires, then subsequent events (including inserts on the same table)
     /// are processed in the next pass.
@@ -243,6 +212,8 @@ where
         while event_iter.peek().is_some() {
             // Accumulate non-truncate events grouped by table_id.
+            // We also track the ReplicatedTableSchema per table for ensure_table_exists.
+            let mut table_schemas: HashMap<TableId, ReplicatedTableSchema> = HashMap::new();
             let mut table_id_to_rows: HashMap<TableId, Vec<PendingRow>> = HashMap::new();
 
             while let Some(event) = event_iter.peek() {
@@ -255,8 +226,12 @@ where
                     .expect("event iterator should not be empty, we peeked at the next event; qed");
                 match event {
                     Event::Insert(insert) => {
+                        let table_id = insert.replicated_table_schema.id();
+                        table_schemas
+                            .entry(table_id)
+                            .or_insert_with(|| insert.replicated_table_schema.clone());
                         table_id_to_rows
-                            .entry(insert.table_id)
+                            .entry(table_id)
                             .or_default()
                             .push(PendingRow {
                                 operation: CdcOperation::Insert,
@@ -265,8 +240,12 @@ where
                             });
                     }
                     Event::Update(update) => {
+                        let table_id = update.replicated_table_schema.id();
+                        table_schemas
+                            .entry(table_id)
+                            .or_insert_with(|| update.replicated_table_schema.clone());
                         table_id_to_rows
-                            .entry(update.table_id)
+                            .entry(table_id)
                             .or_default()
                             .push(PendingRow {
                                 operation: CdcOperation::Update,
@@ -279,8 +258,12 @@ where
                         info!("delete event has no row data, skipping");
                         continue;
                     };
+                    let table_id = delete.replicated_table_schema.id();
+                    table_schemas
+                        .entry(table_id)
+                        .or_insert_with(|| delete.replicated_table_schema.clone());
                     table_id_to_rows
-                        .entry(delete.table_id)
+                        .entry(table_id)
                         .or_default()
                         .push(PendingRow {
                             operation: CdcOperation::Delete,
@@ -302,8 +285,8 @@ where
             // Phase 1: ensure all tables exist (must happen outside JoinSet spawns
             // since ensure_table_exists holds &self which is not 'static).
             let mut table_meta: HashMap<TableId, (String, Arc<[bool]>)> = HashMap::new();
-            for &table_id in table_id_to_rows.keys() {
-                let (name, flags) = self.ensure_table_exists(table_id).await?;
+            for (&table_id, schema) in &table_schemas {
+                let (name, flags) = self.ensure_table_exists(schema).await?;
                 table_meta.insert(table_id, (name, flags));
             }
@@ -358,19 +341,19 @@ where
             }
 
             // Collect and deduplicate truncate events.
-            let mut truncate_table_ids = HashSet::new();
+            let mut truncate_schemas: HashMap<TableId, ReplicatedTableSchema> = HashMap::new();
             while let Some(Event::Truncate(_)) = event_iter.peek() {
                 if let Some(Event::Truncate(truncate_event)) = event_iter.next() {
-                    for table_id in truncate_event.rel_ids {
-                        truncate_table_ids.insert(TableId::new(table_id));
+                    for schema in truncate_event.truncated_tables {
+                        truncate_schemas.entry(schema.id()).or_insert(schema);
                     }
                 }
             }
 
             futures::future::try_join_all(
-                truncate_table_ids
-                    .into_iter()
-                    .map(|table_id| self.truncate_table_inner(table_id)),
+                truncate_schemas
+                    .values()
+                    .map(|schema| self.truncate_table_inner(schema)),
             )
             .await?;
         }
 
         Ok(())
     }
 }
 
-impl<S> Destination for ClickHouseDestination<S>
-where
-    S: StateStore + SchemaStore + Send + Sync,
-{
+impl Destination for ClickHouseDestination {
     fn name() -> &'static str {
         "clickhouse"
     }
 
     async fn truncate_table(
         &self,
-        table_id: TableId,
+        replicated_table_schema: &ReplicatedTableSchema,
         async_result: TruncateTableResult<()>,
     ) -> EtlResult<()> {
-        let result = self.truncate_table_inner(table_id).await;
+        let result = self.truncate_table_inner(replicated_table_schema).await;
         async_result.send(result);
 
         Ok(())
     }
 
     async fn write_table_rows(
         &self,
-        table_id: TableId,
+        replicated_table_schema: &ReplicatedTableSchema,
         table_rows: Vec<TableRow>,
         async_result: WriteTableRowsResult<()>,
     ) -> EtlResult<()> {
-        let result = self.write_table_rows_inner(table_id, table_rows).await;
+        let result = self
+            .write_table_rows_inner(replicated_table_schema, table_rows)
+            .await;
         async_result.send(result);
 
         Ok(())
     }
diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs
index d9e64fe68..898639e08 100644
--- a/etl-destinations/src/clickhouse/test_utils.rs
+++ b/etl-destinations/src/clickhouse/test_utils.rs
@@ -1,9 +1,6 @@
 //! Test utilities for ClickHouse destinations.
 
 use clickhouse::Client;
-use etl::store::schema::SchemaStore;
-use etl::store::state::StateStore;
-use etl::types::PipelineId;
 use tokio::runtime::Handle;
 use url::Url;
 use uuid::Uuid;
@@ -21,9 +18,12 @@ pub const CLICKHOUSE_PASSWORD_ENV: &str = "TESTS_CLICKHOUSE_PASSWORD";
 ///
 /// # Panics
 ///
-/// Panics if [`CLICKHOUSE_URL_ENV`] is not set.
-pub fn get_clickhouse_url() -> String {
-    std::env::var(CLICKHOUSE_URL_ENV).unwrap_or_else(|_| panic!("{CLICKHOUSE_URL_ENV} must be set"))
+/// Panics if [`CLICKHOUSE_URL_ENV`] is not set or is not a valid URL.
+pub fn get_clickhouse_url() -> Url {
+    let value = std::env::var(CLICKHOUSE_URL_ENV)
+        .unwrap_or_else(|_| panic!("{CLICKHOUSE_URL_ENV} must be set"));
+    Url::parse(&value)
+        .unwrap_or_else(|error| panic!("{CLICKHOUSE_URL_ENV} must be a valid URL: {error}"))
 }
 
 /// Returns the ClickHouse user name from the environment.
@@ -56,16 +56,16 @@ pub struct ClickHouseTestDatabase {
     root_client: Client,
     /// Client scoped to the test database for queries.
     db_client: Client,
-    url: String,
+    url: Url,
     user: String,
     password: Option<String>,
     database: String,
 }
 
 impl ClickHouseTestDatabase {
-    fn new(url: String, user: String, password: Option<String>, database: String) -> Self {
+    fn new(url: Url, user: String, password: Option<String>, database: String) -> Self {
         let build_client = |db: Option<&str>| {
-            let mut c = Client::default().with_url(&url).with_user(&user);
+            let mut c = Client::default().with_url(url.as_str()).with_user(&user);
             if let Some(db) = db {
                 c = c.with_database(db);
             }
@@ -109,41 +109,24 @@ impl ClickHouseTestDatabase {
     /// Builds a [`ClickHouseDestination`] scoped to this test database with
     /// default inserter config (100 MiB per INSERT -- large enough that tests
     /// never hit an intermediate flush).
-    pub fn build_destination<S>(
-        &self,
-        pipeline_id: PipelineId,
-        store: S,
-    ) -> ClickHouseDestination<S>
-    where
-        S: StateStore + SchemaStore + Send + Sync,
-    {
-        self.build_destination_with_config(
-            pipeline_id,
-            store,
-            ClickHouseInserterConfig {
-                max_bytes_per_insert: 100 * 1024 * 1024,
-            },
-        )
+    pub fn build_destination(&self) -> ClickHouseDestination {
+        self.build_destination_with_config(ClickHouseInserterConfig {
+            max_bytes_per_insert: 100 * 1024 * 1024,
+        })
     }
 
     /// Builds a [`ClickHouseDestination`] scoped to this test database with
     /// a caller-supplied [`ClickHouseInserterConfig`].
-    pub fn build_destination_with_config<S>(
+    pub fn build_destination_with_config(
         &self,
-        _pipeline_id: PipelineId,
-        store: S,
         config: ClickHouseInserterConfig,
-    ) -> ClickHouseDestination<S>
-    where
-        S: StateStore + SchemaStore + Send + Sync,
-    {
+    ) -> ClickHouseDestination {
         ClickHouseDestination::new(
             self.url.clone(),
             &self.user,
             self.password.clone(),
             &self.database,
             config,
-            store,
         )
         .expect("Failed to create ClickHouseDestination for test")
     }
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index 5c312c306..b835bb76e 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -1,5 +1,3 @@
-#![cfg(all(feature = "clickhouse", feature = "test-utils"))]
-
 use etl::state::table::TableReplicationPhaseType;
 use etl::test_utils::database::{spawn_source_database, test_table_name};
 use etl::test_utils::notifying_store::NotifyingStore;
@@ -20,8 +18,6 @@ use url::Url;
 
 use crate::support::clickhouse::{AllTypesRow, BoundaryValuesRow};
 
-mod support;
-
 /// Ensures the rustls crypto provider is only installed once across all tests.
static INIT_CRYPTO: Once = Once::new(); @@ -351,7 +347,7 @@ async fn all_types_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -495,7 +491,7 @@ async fn updates_are_streamed_to_clickhouse() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -654,7 +650,7 @@ async fn boundary_values_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -794,7 +790,7 @@ async fn deletes_are_streamed_to_clickhouse() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -908,7 +904,7 @@ async fn pipeline_restart_resumes_streaming() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -933,7 +929,7 @@ async fn pipeline_restart_resumes_streaming() { assert_eq!(rows[0].value, "before_restart"); // --- WHEN: rebuild destination and pipeline, then stream a new insert --- - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let mut pipeline = create_pipeline( &database.config, @@ -1031,7 +1027,7 @@ async fn truncate_clears_table_and_accepts_new_inserts() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1145,10 +1141,7 @@ async fn intermediate_flush_preserves_all_rows() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination_with_config( - pipeline_id, - store.clone(), - ClickHouseInserterConfig { + let destination = ch_db.build_destination_with_config(ClickHouseInserterConfig { // 1 byte -- forces a new INSERT after every row. 
max_bytes_per_insert: 1, }, @@ -1254,7 +1247,7 @@ async fn multiple_tables_receive_independent_writes() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_a_ready = store .notify_on_table_state_type(table_a_id, TableReplicationPhaseType::Ready) @@ -1401,7 +1394,7 @@ async fn sequential_transactions_preserve_commit_order() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1535,7 +1528,7 @@ async fn delete_with_default_replica_identity() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1681,7 +1674,7 @@ async fn large_batch_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(pipeline_id, store.clone()); + let destination = ch_db.build_destination(); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1728,7 +1721,7 @@ async fn large_batch_table_copy() { #[tokio::test(flavor = "multi_thread")] async fn ping_succeeds_against_running_clickhouse() { let client = ClickHouseClient::new( - Url::parse(&get_clickhouse_url()).unwrap(), + get_clickhouse_url(), get_clickhouse_user(), get_clickhouse_password(), "default", diff --git a/etl-destinations/tests/main.rs b/etl-destinations/tests/main.rs index 92582972c..e2fc3220a 100644 --- a/etl-destinations/tests/main.rs +++ b/etl-destinations/tests/main.rs @@ -2,6 +2,8 @@ mod support; #[cfg(all(feature = "bigquery", feature = "test-utils"))] mod bigquery_pipeline; +#[cfg(all(feature = "clickhouse", feature = "test-utils"))] +mod clickhouse_pipeline; #[cfg(feature = "ducklake")] mod ducklake_destination; #[cfg(feature = "ducklake")] diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs index a4a0829d7..72758ea92 100644 --- a/etl-destinations/tests/support/clickhouse.rs +++ b/etl-destinations/tests/support/clickhouse.rs @@ -1,5 +1,4 @@ #![allow(dead_code)] -#![cfg(all(feature = "clickhouse", feature = "test-utils"))] /// A row read back from the ClickHouse `all_types_encoding` test table. 
/// diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index 50efc75f3..3b6287985 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -219,7 +219,6 @@ async fn main_impl() -> Result<(), Box> { ClickHouseInserterConfig { max_bytes_per_insert, }, - store.clone(), )?; let mut pipeline = Pipeline::new(pipeline_config, store, clickhouse_destination); diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 1d9583570..c278d596c 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -216,7 +216,6 @@ pub async fn start_replicator_with_config( password.as_ref().map(|p| p.expose_secret().to_string()), database, inserter_config, - state_store.clone(), )?; let pipeline = Pipeline::new(replicator_config.pipeline, state_store, destination); From b3e910f63882148b81cd6aa2902123b956724bc5 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 20 Apr 2026 19:21:09 +0900 Subject: [PATCH 39/86] Add schema change (ALTER TABLE) support for ClickHouse destination - Re-add StateStore + SchemaStore to ClickHouseDestination for tracking destination table metadata (snapshot_id, replication_mask, apply status). - handle_relation_event detects schema changes by comparing snapshot_id and replication_mask against stored metadata, computes a SchemaDiff, and applies ALTER TABLE ADD/RENAME/DROP COLUMN statements. - ensure_table_exists now stores DestinationTableMetadata with Applying -> Applied transitions on initial table creation. - write_events_inner breaks at Event::Relation boundaries (same as Truncate), flushes accumulated rows, then processes schema changes before continuing. - New client methods: add_column, rename_column, drop_column. - Extracted clickhouse_column_type helper in schema.rs for shared use between CREATE TABLE and ALTER TABLE ADD COLUMN. --- etl-destinations/src/clickhouse/client.rs | 36 +++ etl-destinations/src/clickhouse/core.rs | 250 ++++++++++++++---- etl-destinations/src/clickhouse/schema.rs | 33 ++- etl-destinations/src/clickhouse/test_utils.rs | 25 +- etl-destinations/tests/clickhouse_pipeline.rs | 26 +- etl-examples/src/bin/clickhouse.rs | 1 + etl-replicator/src/core.rs | 1 + 7 files changed, 296 insertions(+), 76 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 80628a086..4276f1d35 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -8,6 +8,7 @@ use url::Url; use crate::clickhouse::encoding::{ClickHouseValue, rb_encode_row}; use crate::clickhouse::metrics::ETL_CH_INSERT_DURATION_SECONDS; +use crate::clickhouse::schema::clickhouse_column_type; /// Capacity of the internal write buffer used per INSERT statement. /// @@ -77,6 +78,41 @@ impl ClickHouseClient { }) } + /// Adds a column to an existing ClickHouse table. + /// + /// New columns are always Nullable since ClickHouse cannot backfill + /// existing rows with a NOT NULL default. + pub(crate) async fn add_column( + &self, + table_name: &str, + column: &etl::types::ColumnSchema, + ) -> EtlResult<()> { + let col_type = clickhouse_column_type(column, true); + let sql = format!( + "ALTER TABLE \"{table_name}\" ADD COLUMN \"{}\" {col_type}", + column.name + ); + self.execute_ddl(&sql).await + } + + /// Drops a column from an existing ClickHouse table. 
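+    ///
+    /// Note: DROP COLUMN discards the dropped column's data for every
+    /// existing row, which is why `apply_schema_diff` orders drops after the
+    /// additive ADD/RENAME statements.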
+ pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> { + let sql = format!("ALTER TABLE \"{table_name}\" DROP COLUMN \"{column_name}\""); + self.execute_ddl(&sql).await + } + + /// Renames a column in an existing ClickHouse table. + pub(crate) async fn rename_column( + &self, + table_name: &str, + old_name: &str, + new_name: &str, + ) -> EtlResult<()> { + let sql = + format!("ALTER TABLE \"{table_name}\" RENAME COLUMN \"{old_name}\" TO \"{new_name}\""); + self.execute_ddl(&sql).await + } + /// Executes `TRUNCATE TABLE IF EXISTS "<table>"`. pub(crate) async fn truncate_table(&self, table_name: &str) -> EtlResult<()> { self.inner diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 9dd6b04d6..982853f53 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -8,16 +8,20 @@ use etl::destination::async_result::{ TruncateTableResult, WriteEventsResult, WriteTableRowsResult, }; use etl::error::{ErrorKind, EtlResult}; -use etl::etl_error; -use etl::types::{Cell, Event, ReplicatedTableSchema, TableId, TableRow, is_array_type}; -use etl::{destination::Destination, types::PgLsn}; +use etl::state::destination_metadata::{DestinationTableMetadata, DestinationTableSchemaStatus}; +use etl::store::schema::SchemaStore; +use etl::store::state::StateStore; +use etl::types::{ + Cell, Event, ReplicatedTableSchema, SchemaDiff, TableId, TableRow, is_array_type, +}; +use etl::{bail, destination::Destination, etl_error, types::PgLsn}; use parking_lot::RwLock; use std::time::Instant; use tokio::task::JoinSet; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use url::Url; -// ── CDC operation type ──────────────────────────────────────────────────────── +// -- CDC operation type -- #[derive(Copy, Clone)] enum CdcOperation { @@ -43,7 +47,7 @@ struct PendingRow { cells: Vec<Cell>, } -// ── Inserter configuration ──────────────────────────────────────────────────── +// -- Inserter configuration -- /// Controls intermediate flushing inside a single `write_table_rows` / `write_events` call. /// @@ -52,12 +56,12 @@ pub struct ClickHouseInserterConfig { /// Start a new INSERT after this many uncompressed bytes. /// - /// Derive this from `BatchConfig::memory_budget_ratio × total_memory / max_table_sync_workers` + /// Derive this from `BatchConfig::memory_budget_ratio * total_memory / max_table_sync_workers` /// (the same formula used by `BatchBudget::ideal_batch_size_bytes`). pub max_bytes_per_insert: u64, } -// ── Destination struct ──────────────────────────────────────────────────────── +// -- Destination struct -- /// CDC-capable ClickHouse destination that replicates Postgres tables. /// @@ -68,20 +72,19 @@ pub struct ClickHouseInserterConfig { /// The struct is cheaply cloneable: `client` wraps an `Arc` internally, and /// `table_cache` is wrapped in `Arc<RwLock<...>>`. #[derive(Clone)] -pub struct ClickHouseDestination { +pub struct ClickHouseDestination<S> { client: ClickHouseClient, inserter_config: Arc<ClickHouseInserterConfig>, + store: Arc<S>, /// Cache: ClickHouse table name -> `Arc<[bool]>` (nullable flags per column, /// including the two trailing CDC columns which are always `false`). - /// - /// `std::sync::RwLock` is appropriate here: both reads (hot path) and writes (rare, - /// only on first encounter of a new table) are brief in-memory operations.
The lock - /// is always released before any `.await` point (DDL is executed with no lock held), - /// so the async `tokio::sync::RwLock` would be unnecessary overhead. table_cache: Arc<RwLock<HashMap<String, Arc<[bool]>>>>, } -impl ClickHouseDestination { +impl<S> ClickHouseDestination<S> +where + S: StateStore + SchemaStore + Send + Sync, +{ /// Creates a new `ClickHouseDestination`. /// /// When using an `https://` URL, TLS is handled automatically by the `rustls-tls` @@ -92,11 +95,13 @@ impl ClickHouseDestination { password: Option<String>, database: impl Into<String>, inserter_config: ClickHouseInserterConfig, + store: S, ) -> EtlResult<Self> { register_metrics(); Ok(Self { client: ClickHouseClient::new(url, user, password, database), inserter_config: Arc::new(inserter_config), + store: Arc::new(store), table_cache: Arc::new(RwLock::new(HashMap::new())), }) } @@ -104,10 +109,9 @@ impl ClickHouseDestination { /// Ensures the ClickHouse table for the given schema exists, returning /// `(ch_table_name, nullable_flags)`. /// - /// Uses a two-phase locking strategy: - /// 1. Fast-path read (no await) -- return cached entry if present. - /// 2. Slow-path: compute DDL, run `CREATE TABLE IF NOT EXISTS` (await, no lock held), - /// then write-lock to insert (using `or_insert` for the concurrent first-writer race). + /// On first encounter, executes `CREATE TABLE IF NOT EXISTS` and stores + /// destination metadata with `Applied` status. Subsequent calls return + /// the cached result. async fn ensure_table_exists( &self, schema: &ReplicatedTableSchema, @@ -123,14 +127,25 @@ impl ClickHouseDestination { } } + let table_id = schema.id(); + let snapshot_id = schema.inner().snapshot_id; + let replication_mask = schema.replication_mask().clone(); + + // Store metadata as Applying before DDL. + let metadata = DestinationTableMetadata::new_applying( + ch_table_name.clone(), + snapshot_id, + replication_mask, + ); + self.store + .store_destination_table_metadata(table_id, metadata.clone()) + .await?; + // Compute nullable flags (user columns + 2 CDC columns always non-nullable). // - // Array columns are NEVER marked nullable here, even if the Postgres column is nullable. - // The DDL always emits `Array(Nullable(T))` (no outer `Nullable` wrapper), so ClickHouse - // does not expect a null-indicator byte before the array. If we mistakenly set - // `nullable_flags[i] = true` for an array column, `rb_encode_nullable` would prepend a - // spurious `0x00` byte that ClickHouse reads as `varint(0)` (empty array), causing every - // subsequent column to be read from the wrong offset and ultimately "Cannot read all data". + // Array columns are NEVER marked nullable here, even if the Postgres column + // is nullable. The DDL always emits `Array(Nullable(T))` (no outer `Nullable` + // wrapper), so ClickHouse does not expect a null-indicator byte before the array. let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let mut nullable_flags_vec: Vec<bool> = column_schemas .iter() @@ -147,6 +162,11 @@ impl ClickHouseDestination { metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone()) .record(ddl_start.elapsed().as_secs_f64()); + // Mark as Applied after successful DDL. + self.store + .store_destination_table_metadata(table_id, metadata.to_applied()) + .await?; + // Write-lock: insert, using or_insert to handle concurrent first-writer race.
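        // If two workers race past the DDL for the same table, `or_insert`
        // keeps the first-written entry; both computed their flags from the
        // same schema, so either value is equivalent.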
let stored_flags = { let mut guard = self.table_cache.write(); @@ -197,33 +217,166 @@ impl ClickHouseDestination { .await } - /// Processes events in passes driven by an outer loop that runs until the iterator - /// is exhausted. Each pass: - /// 1. Accumulates Insert/Update/Delete rows per table until a Truncate (or end). + // -- Schema change handling -- + + /// Handles a schema change event (Relation) by computing the diff and applying + /// ALTER TABLE statements. + async fn handle_relation_event(&self, new_schema: &ReplicatedTableSchema) -> EtlResult<()> { + let table_id = new_schema.id(); + let new_snapshot_id = new_schema.inner().snapshot_id; + let new_replication_mask = new_schema.replication_mask().clone(); + + let Some(metadata) = self + .store + .get_applied_destination_table_metadata(table_id) + .await? + else { + bail!( + ErrorKind::CorruptedTableSchema, + "Missing destination table metadata", + format!( + "No destination table metadata found for table {} when processing \ + schema change. The metadata should have been recorded during initial \ + table synchronization.", + table_id + ) + ); + }; + + let current_snapshot_id = metadata.snapshot_id; + let current_replication_mask = metadata.replication_mask.clone(); + + if current_snapshot_id == new_snapshot_id + && current_replication_mask == new_replication_mask + { + info!( + "schema for table {} unchanged (snapshot_id: {})", + table_id, new_snapshot_id + ); + return Ok(()); + } + + info!( + "schema change detected for table {}: snapshot_id {} -> {}", + table_id, current_snapshot_id, new_snapshot_id + ); + + // Retrieve the old schema to compute the diff. + let current_table_schema = self + .store + .get_table_schema(&table_id, current_snapshot_id) + .await? + .ok_or_else(|| { + etl_error!( + ErrorKind::InvalidState, + "Old schema not found", + format!( + "Could not find schema for table {} at snapshot_id {}", + table_id, current_snapshot_id + ) + ) + })?; + + let current_schema = ReplicatedTableSchema::from_mask( + current_table_schema, + current_replication_mask.clone(), + ); + + let ch_table_name = &metadata.destination_table_id; + + // Mark as Applying before DDL changes. + let updated_metadata = DestinationTableMetadata::new_applied( + ch_table_name.clone(), + current_snapshot_id, + current_replication_mask, + ) + .with_schema_change( + new_snapshot_id, + new_replication_mask, + DestinationTableSchemaStatus::Applying, + ); + self.store + .store_destination_table_metadata(table_id, updated_metadata.clone()) + .await?; + + // Compute and apply the diff. + let diff = current_schema.diff(new_schema); + if let Err(err) = self.apply_schema_diff(ch_table_name, &diff).await { + warn!( + "schema change failed for table {}: {}. Manual intervention may be required.", + table_id, err + ); + return Err(err); + } + + // Mark as Applied. + self.store + .store_destination_table_metadata(table_id, updated_metadata.to_applied()) + .await?; + + // Invalidate cached nullable flags so the next write recomputes them. + { + let mut guard = self.table_cache.write(); + guard.remove(ch_table_name); + } + + info!( + "schema change completed for table {}: snapshot_id {} applied", + table_id, new_snapshot_id + ); + + Ok(()) + } + + /// Applies a schema diff to a ClickHouse table: add columns, rename columns, + /// then drop columns (in that order for safety). 
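+    ///
+    /// Drops run last so the only destructive statements execute after the
+    /// additive ones have succeeded; a failure mid-sequence then leaves
+    /// surplus columns rather than lost data.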
+ async fn apply_schema_diff(&self, ch_table_name: &str, diff: &SchemaDiff) -> EtlResult<()> { + if diff.is_empty() { + return Ok(()); + } + + for column in &diff.columns_to_add { + self.client.add_column(ch_table_name, column).await?; + } + + for rename in &diff.columns_to_rename { + self.client + .rename_column(ch_table_name, &rename.old_name, &rename.new_name) + .await?; + } + + for column in &diff.columns_to_remove { + self.client.drop_column(ch_table_name, &column.name).await?; + } + + Ok(()) + } + + // -- Event processing -- + + /// Processes events in passes driven by an outer loop that runs until the + /// iterator is exhausted. Each pass: + /// 1. Accumulates Insert/Update/Delete rows per table until a Truncate, + /// Relation, or end of events. /// 2. Writes those rows concurrently. - /// 3. Drains consecutive Truncate events (deduplicated) and executes them. - /// - /// Breaking at a Truncate never skips events -- the outer loop resumes from that - /// position, so rows accumulated before the Truncate are flushed first, then the - /// Truncate fires, then subsequent events (including inserts on the same table) - /// are processed in the next pass. + /// 3. Processes any Relation events (schema changes) sequentially. + /// 4. Drains consecutive Truncate events (deduplicated) and executes them. async fn write_events_inner(&self, events: Vec<Event>) -> EtlResult<()> { let mut event_iter = events.into_iter().peekable(); while event_iter.peek().is_some() { - // Accumulate non-truncate events grouped by table_id. - // We also track the ReplicatedTableSchema per table for ensure_table_exists. let mut table_schemas: HashMap<TableId, ReplicatedTableSchema> = HashMap::new(); let mut table_id_to_rows: HashMap<TableId, Vec<PendingRow>> = HashMap::new(); + // Accumulate data events until we hit a Truncate or Relation boundary. while let Some(event) = event_iter.peek() { - if matches!(event, Event::Truncate(_)) { + if matches!(event, Event::Truncate(_) | Event::Relation(_)) { break; } let event = event_iter .next() - .expect("event iterator should not be empty, we peeked at the next event; qed"); + .expect("peeked event must be present; qed") match event { Event::Insert(insert) => { let table_id = insert.replicated_table_schema.id(); @@ -280,18 +433,14 @@ impl ClickHouseDestination { } } - // Write accumulated rows concurrently, one JoinSet task per table. + // Flush accumulated rows concurrently, one JoinSet task per table. if !table_id_to_rows.is_empty() { - // Phase 1: ensure all tables exist (must happen outside JoinSet spawns - // since ensure_table_exists holds &self which is not 'static). let mut table_meta: HashMap<TableId, (String, Arc<[bool]>)> = HashMap::new(); for (&table_id, schema) in &table_schemas { let (name, flags) = self.ensure_table_exists(schema).await?; table_meta.insert(table_id, (name, flags)); } - // Phase 2: spawn concurrent writers with pre-resolved metadata. - // Only the ClickHouseClient (cheaply cloneable, 'static) goes into spawn. let mut join_set: JoinSet<EtlResult<()>> = JoinSet::new(); for (table_id, row_data) in table_id_to_rows { let (ch_table_name, nullable_flags) = @@ -340,6 +489,14 @@ impl ClickHouseDestination { } } + // Process Relation events (schema changes) sequentially. + while let Some(Event::Relation(_)) = event_iter.peek() { + if let Some(Event::Relation(relation)) = event_iter.next() { + self.handle_relation_event(&relation.replicated_table_schema) + .await?; + } + } + + // Collect and deduplicate truncate events.
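+            // Repeated Truncate events for the same table within one pass
+            // collapse into a single TRUNCATE: the map below is keyed per
+            // table, so only the last schema seen for each table is kept.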
let mut truncate_schemas: HashMap = HashMap::new(); while let Some(Event::Truncate(_)) = event_iter.peek() { @@ -362,7 +519,10 @@ impl ClickHouseDestination { } } -impl Destination for ClickHouseDestination { +impl Destination for ClickHouseDestination +where + S: StateStore + SchemaStore + Send + Sync, +{ fn name() -> &'static str { "clickhouse" } @@ -401,8 +561,6 @@ impl Destination for ClickHouseDestination { } } -// ── Unit tests ──────────────────────────────────────────────────────────────── - #[cfg(test)] mod tests { #[test] diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index c373475e6..90203fc6f 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -71,9 +71,28 @@ pub fn table_name_to_clickhouse_table_name(schema: &str, table: &str) -> String format!("{escaped_schema}_{escaped_table}") } +/// Returns the full ClickHouse type string for a column, including Nullable +/// wrapping for scalar columns and Array(Nullable(T)) for array columns. +/// +/// New columns added via ALTER TABLE are always Nullable regardless of the +/// Postgres NOT NULL constraint, since ClickHouse cannot backfill existing rows. +pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> String { + if is_array_type(&col.typ) { + let elem = postgres_array_element_clickhouse_sql(&col.typ); + format!("Array(Nullable({elem}))") + } else { + let base = postgres_column_type_to_clickhouse_sql(&col.typ); + if col.nullable || force_nullable { + format!("Nullable({base})") + } else { + base.to_string() + } + } +} + /// Generates a `CREATE TABLE IF NOT EXISTS` SQL statement for the given columns. /// -/// - Non-nullable columns use the bare ClickHouse type (`Int32`, `String`, …). +/// - Non-nullable columns use the bare ClickHouse type (`Int32`, `String`, ...). /// - Nullable columns use `Nullable(T)`. /// - Array columns always use `Array(Nullable(T))` (Postgres array elements are nullable). /// - Two CDC trailing columns are always appended as non-nullable: @@ -83,17 +102,7 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) let mut cols = Vec::with_capacity(column_schemas.len() + 2); for col in column_schemas { - let col_type = if is_array_type(&col.typ) { - let elem = postgres_array_element_clickhouse_sql(&col.typ); - format!("Array(Nullable({elem}))") - } else { - let base = postgres_column_type_to_clickhouse_sql(&col.typ); - if col.nullable { - format!("Nullable({base})") - } else { - base.to_string() - } - }; + let col_type = clickhouse_column_type(col, false); cols.push(format!(" \"{}\" {}", col.name, col_type)); } diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index 898639e08..bd36cafe8 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -1,6 +1,8 @@ //! Test utilities for ClickHouse destinations. use clickhouse::Client; +use etl::store::schema::SchemaStore; +use etl::store::state::StateStore; use tokio::runtime::Handle; use url::Url; use uuid::Uuid; @@ -109,24 +111,35 @@ impl ClickHouseTestDatabase { /// Builds a [`ClickHouseDestination`] scoped to this test database with /// default inserter config (100 MiB per INSERT -- large enough that tests /// never hit an intermediate flush). 
- pub fn build_destination(&self) -> ClickHouseDestination { - self.build_destination_with_config(ClickHouseInserterConfig { - max_bytes_per_insert: 100 * 1024 * 1024, - }) + pub fn build_destination(&self, store: S) -> ClickHouseDestination + where + S: StateStore + SchemaStore + Send + Sync, + { + self.build_destination_with_config( + store, + ClickHouseInserterConfig { + max_bytes_per_insert: 100 * 1024 * 1024, + }, + ) } /// Builds a [`ClickHouseDestination`] scoped to this test database with /// a caller-supplied [`ClickHouseInserterConfig`]. - pub fn build_destination_with_config( + pub fn build_destination_with_config( &self, + store: S, config: ClickHouseInserterConfig, - ) -> ClickHouseDestination { + ) -> ClickHouseDestination + where + S: StateStore + SchemaStore + Send + Sync, + { ClickHouseDestination::new( self.url.clone(), &self.user, self.password.clone(), &self.database, config, + store, ) .expect("Failed to create ClickHouseDestination for test") } diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index b835bb76e..f4f03403d 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -347,7 +347,7 @@ async fn all_types_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -491,7 +491,7 @@ async fn updates_are_streamed_to_clickhouse() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -650,7 +650,7 @@ async fn boundary_values_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -790,7 +790,7 @@ async fn deletes_are_streamed_to_clickhouse() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -904,7 +904,7 @@ async fn pipeline_restart_resumes_streaming() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -929,7 +929,7 @@ async fn pipeline_restart_resumes_streaming() { assert_eq!(rows[0].value, "before_restart"); // --- WHEN: rebuild destination and pipeline, then stream a new insert --- - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let mut pipeline = create_pipeline( &database.config, @@ -1027,7 +1027,7 @@ async fn 
truncate_clears_table_and_accepts_new_inserts() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1141,7 +1141,9 @@ async fn intermediate_flush_preserves_all_rows() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination_with_config(ClickHouseInserterConfig { + let destination = ch_db.build_destination_with_config( + store.clone(), + ClickHouseInserterConfig { // 1 byte -- forces a new INSERT after every row. max_bytes_per_insert: 1, }, @@ -1247,7 +1249,7 @@ async fn multiple_tables_receive_independent_writes() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_a_ready = store .notify_on_table_state_type(table_a_id, TableReplicationPhaseType::Ready) @@ -1394,7 +1396,7 @@ async fn sequential_transactions_preserve_commit_order() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1528,7 +1530,7 @@ async fn delete_with_default_replica_identity() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) @@ -1674,7 +1676,7 @@ async fn large_batch_table_copy() { let ch_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(); + let destination = ch_db.build_destination(store.clone()); let table_ready = store .notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready) diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index 3b6287985..50efc75f3 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -219,6 +219,7 @@ async fn main_impl() -> Result<(), Box> { ClickHouseInserterConfig { max_bytes_per_insert, }, + store.clone(), )?; let mut pipeline = Pipeline::new(pipeline_config, store, clickhouse_destination); diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index c278d596c..1d9583570 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -216,6 +216,7 @@ pub async fn start_replicator_with_config( password.as_ref().map(|p| p.expose_secret().to_string()), database, inserter_config, + state_store.clone(), )?; let pipeline = Pipeline::new(replicator_config.pipeline, state_store, destination); From f7f834df1ab09269b146b1264efa2be7057d57e9 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Tue, 21 Apr 2026 23:52:46 +0900 Subject: [PATCH 40/86] Add schema change integration tests and fix ALTER TABLE bugs Two new tests: - schema_change_add_column: ADD COLUMN propagates to 
ClickHouse, Alice's pre-change row has NULL for the new column, Bob's post-change row has the real value. - schema_change_add_drop_rename: combined ADD + DROP + RENAME, verifies column names, data preservation, and metadata snapshot_id advancement. Two bugs found and fixed by these tests: - Column ordering: ALTER TABLE ADD COLUMN appended after the CDC columns (cdc_operation, cdc_lsn), misaligning RowBinary encoding. Fixed by using AFTER clause to place new columns before CDC columns. - Metadata overwrite: ensure_table_exists unconditionally stored Applying metadata on cache miss, overwriting the Applied metadata set by handle_relation_event. Fixed by checking for existing metadata before storing. Also adds column_names() and db_client() helpers to ClickHouseTestDatabase. --- etl-api/src/k8s/base.rs | 4 +- etl-api/src/k8s/http.rs | 7 +- etl-api/src/utils.rs | 5 +- etl-api/src/validation/validators.rs | 9 +- etl-destinations/src/clickhouse/client.rs | 42 +- etl-destinations/src/clickhouse/core.rs | 174 ++++--- etl-destinations/src/clickhouse/encoding.rs | 89 ++-- etl-destinations/src/clickhouse/metrics.rs | 11 +- etl-destinations/src/clickhouse/mod.rs | 3 +- etl-destinations/src/clickhouse/schema.rs | 40 +- etl-destinations/src/clickhouse/test_utils.rs | 33 +- etl-destinations/tests/clickhouse_pipeline.rs | 462 ++++++++++++++++-- etl-examples/src/bin/clickhouse.rs | 33 +- etl/tests/replication.rs | 18 +- 14 files changed, 697 insertions(+), 233 deletions(-) diff --git a/etl-api/src/k8s/base.rs b/etl-api/src/k8s/base.rs index ef5e08853..249f9b5c3 100644 --- a/etl-api/src/k8s/base.rs +++ b/etl-api/src/k8s/base.rs @@ -141,9 +141,7 @@ pub trait K8sClient: Send + Sync { bq_service_account_key: &str, ) -> Result<(), K8sError>; - /// Creates or updates the ClickHouse password for a replicator. - /// - /// The secret name is derived from `prefix` and stored in the data-plane namespace. + /// Creates or updates the ClickHouse password secret for a replicator. async fn create_or_update_clickhouse_secret( &self, prefix: &str, diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs index 52a0da3e1..b35a20b57 100644 --- a/etl-api/src/k8s/http.rs +++ b/etl-api/src/k8s/http.rs @@ -355,9 +355,10 @@ impl K8sClient for HttpK8sClient { ); let secret: Secret = serde_json::from_value(clickhouse_secret_json)?; - // We are forcing the update since we are the field manager that should own the fields. If - // there is an override (likely during an incident or SREs intervention), we want to override - // their changes. The API database is the source of truth for credentials. + // We are forcing the update since we are the field manager that should own the + // fields. If there is an override (likely during an incident or + // SREs intervention), we want to override their changes. The API + // database is the source of truth for credentials. let pp = PatchParams::apply(&clickhouse_secret_name).force(); self.secrets_api.patch(&clickhouse_secret_name, &pp, &Patch::Apply(secret)).await?; } diff --git a/etl-api/src/utils.rs b/etl-api/src/utils.rs index 353b0e910..18b28239b 100644 --- a/etl-api/src/utils.rs +++ b/etl-api/src/utils.rs @@ -1,7 +1,6 @@ use etl_config::SerializableSecretString; use rand::Rng; -use serde::de::Error as _; -use serde::{Deserialize, Deserializer}; +use serde::{Deserialize, Deserializer, de::Error as _}; use url::Url; /// Deserializes a string and trims leading and trailing whitespace. 
@@ -35,7 +34,7 @@ where Ok(opt.map(|s| SerializableSecretString::from(s.trim().to_string()))) } -/// Deserializes an HTTP(S) URL string and trims leading and trailing whitespace. +/// Deserializes an HTTP(S) URL string, trimming whitespace. pub fn trim_http_url<'de, D>(deserializer: D) -> Result where D: Deserializer<'de>, diff --git a/etl-api/src/validation/validators.rs b/etl-api/src/validation/validators.rs index 3f67ce113..44fcca754 100644 --- a/etl-api/src/validation/validators.rs +++ b/etl-api/src/validation/validators.rs @@ -668,12 +668,9 @@ impl Validator for ClickHouseValidator { Ok(_) => Ok(Vec::new()), Err(_) => Ok(vec![ValidationFailure::critical( "ClickHouse Connection Failed", - "Unable to create clickhouse client.\n\n\ - Please verify:\n\ - (1) The url is valid and accessible\n\ - (2) The username is correct\n\ - (3) You set the right password\n\ - (4) You set the right database name + "Unable to create clickhouse client.\n\nPlease verify:\n(1) The url is valid and \ + accessible\n(2) The username is correct\n(3) You set the right password\n(4) You \ + set the right database name ", )]), } diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index aef1fac53..7be316932 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -1,14 +1,17 @@ -use std::sync::Arc; -use std::time::Instant; +use std::{sync::Arc, time::Instant}; use clickhouse::Client; -use etl::error::{ErrorKind, EtlResult}; -use etl::etl_error; +use etl::{ + error::{ErrorKind, EtlResult}, + etl_error, +}; use url::Url; -use crate::clickhouse::encoding::{ClickHouseValue, rb_encode_row}; -use crate::clickhouse::metrics::ETL_CH_INSERT_DURATION_SECONDS; -use crate::clickhouse::schema::clickhouse_column_type; +use crate::clickhouse::{ + encoding::{ClickHouseValue, rb_encode_row}, + metrics::ETL_CH_INSERT_DURATION_SECONDS, + schema::clickhouse_column_type, +}; /// Capacity of the internal write buffer used per INSERT statement. /// @@ -19,9 +22,10 @@ const BUFFERED_CAPACITY: usize = 256 * 1024; /// High-level ClickHouse client used by [`super::core::ClickHouseDestination`]. /// -/// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL, truncation, -/// and RowBinary bulk inserts. Cheaply cloneable — the inner client holds an `Arc` -/// internally, and the outer `Arc` here ensures a single shared instance. +/// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL, +/// truncation, and RowBinary bulk inserts. Cheaply cloneable — the inner client +/// holds an `Arc` internally, and the outer `Arc` here ensures a single shared +/// instance. #[derive(Clone)] pub struct ClickHouseClient { inner: Arc, @@ -72,13 +76,22 @@ impl ClickHouseClient { /// /// New columns are always Nullable since ClickHouse cannot backfill /// existing rows with a NOT NULL default. + /// + /// `after_column` controls placement: the new column is inserted AFTER + /// the named column. This is critical because RowBinary encoding is + /// positional -- new user columns must appear before the CDC columns + /// (`cdc_operation`, `cdc_lsn`), not appended after them. 
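+    ///
+    /// As a sketch with hypothetical table and column names, adding a
+    /// nullable integer `age` column after the last user column `name`
+    /// issues:
+    ///
+    ///     ALTER TABLE "public_users" ADD COLUMN "age" Nullable(Int32) AFTER "name"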
pub(crate) async fn add_column( &self, table_name: &str, column: &etl::types::ColumnSchema, + after_column: &str, ) -> EtlResult<()> { let col_type = clickhouse_column_type(column, true); - let sql = format!("ALTER TABLE \"{table_name}\" ADD COLUMN \"{}\" {col_type}", column.name); + let sql = format!( + "ALTER TABLE \"{table_name}\" ADD COLUMN \"{}\" {col_type} AFTER \"{after_column}\"", + column.name + ); self.execute_ddl(&sql).await } @@ -121,9 +134,10 @@ impl ClickHouseClient { /// [`ClickHouseValue`]s in column order (user columns + CDC columns). /// `nullable_flags` must have the same length as each row. /// - /// When the accumulated uncompressed byte count reaches `max_bytes_per_insert` - /// the current INSERT statement is committed and a new one is opened, keeping - /// peak memory usage bounded for large initial copies. + /// When the accumulated uncompressed byte count reaches + /// `max_bytes_per_insert` the current INSERT statement is committed and + /// a new one is opened, keeping peak memory usage bounded for large + /// initial copies. /// /// The `source` label (`"copy"` or `"streaming"`) is attached to the /// `etl_ch_insert_duration_seconds` histogram recorded after each committed diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 102b4aa7e..fe638aec4 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -1,26 +1,31 @@ -use std::{collections::HashMap, sync::Arc}; - -use crate::clickhouse::client::ClickHouseClient; -use crate::clickhouse::encoding::{ClickHouseValue, cell_to_clickhouse_value}; -use crate::clickhouse::metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}; -use crate::clickhouse::schema::{build_create_table_sql, table_name_to_clickhouse_table_name}; -use etl::destination::async_result::{ - TruncateTableResult, WriteEventsResult, WriteTableRowsResult, +use std::{collections::HashMap, sync::Arc, time::Instant}; + +use etl::{ + bail, + destination::{ + Destination, + async_result::{TruncateTableResult, WriteEventsResult, WriteTableRowsResult}, + }, + error::{ErrorKind, EtlResult}, + etl_error, + state::destination_metadata::{DestinationTableMetadata, DestinationTableSchemaStatus}, + store::{schema::SchemaStore, state::StateStore}, + types::{ + Cell, Event, PgLsn, ReplicatedTableSchema, SchemaDiff, TableId, TableRow, is_array_type, + }, }; -use etl::error::{ErrorKind, EtlResult}; -use etl::state::destination_metadata::{DestinationTableMetadata, DestinationTableSchemaStatus}; -use etl::store::schema::SchemaStore; -use etl::store::state::StateStore; -use etl::types::{ - Cell, Event, ReplicatedTableSchema, SchemaDiff, TableId, TableRow, is_array_type, -}; -use etl::{bail, destination::Destination, etl_error, types::PgLsn}; use parking_lot::RwLock; -use std::time::Instant; use tokio::task::JoinSet; use tracing::{debug, info, warn}; use url::Url; +use crate::clickhouse::{ + client::ClickHouseClient, + encoding::{ClickHouseValue, cell_to_clickhouse_value}, + metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}, + schema::{build_create_table_sql, table_name_to_clickhouse_table_name}, +}; + // -- CDC operation type -- #[derive(Copy, Clone)] @@ -40,7 +45,7 @@ impl std::fmt::Display for CdcOperation { } } -/// A single row pending insertion, carrying the CDC metadata alongside the cell data. +/// A row pending insertion with its CDC metadata. 
struct PendingRow { operation: CdcOperation, lsn: PgLsn, @@ -49,15 +54,18 @@ struct PendingRow { // -- Inserter configuration -- -/// Controls intermediate flushing inside a single `write_table_rows` / `write_events` call. +/// Controls intermediate flushing inside a single `write_table_rows` / +/// `write_events` call. /// -/// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is called; -/// these limits prevent unbounded memory use for very large batches (e.g. initial copy). +/// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is +/// called; these limits prevent unbounded memory use for very large batches +/// (e.g. initial copy). pub struct ClickHouseInserterConfig { /// Start a new INSERT after this many uncompressed bytes. /// - /// Derive this from `BatchConfig::memory_budget_ratio * total_memory / max_table_sync_workers` - /// (the same formula used by `BatchBudget::ideal_batch_size_bytes`). + /// Derive this from `BatchConfig::memory_budget_ratio * total_memory / + /// max_table_sync_workers` (the same formula used by + /// `BatchBudget::ideal_batch_size_bytes`). pub max_bytes_per_insert: u64, } @@ -65,8 +73,8 @@ pub struct ClickHouseInserterConfig { /// CDC-capable ClickHouse destination that replicates Postgres tables. /// -/// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`, `cdc_lsn`) -/// appended to each row. Rows are encoded as RowBinary and sent via +/// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`, +/// `cdc_lsn`) appended to each row. Rows are encoded as RowBinary and sent via /// `INSERT INTO "table" FORMAT RowBinary` -- no column-name header required. /// /// The struct is cheaply cloneable: `client` wraps an `Arc` internally, and @@ -76,14 +84,15 @@ pub struct ClickHouseDestination { client: ClickHouseClient, inserter_config: Arc, store: Arc, - /// Cache: ClickHouse table name -> `Arc<[bool]>` (nullable flags per column, - /// including the two trailing CDC columns which are always `false`). + /// Cache: ClickHouse table name -> `Arc<[bool]>` (nullable flags per + /// column, including the two trailing CDC columns which are always + /// `false`). /// - /// `std::sync::RwLock` is appropriate here: both reads (hot path) and writes - /// (rare, only on first encounter of a new table) are brief in-memory - /// operations. The lock is always released before any `.await` point (DDL is - /// executed with no lock held), so the async `tokio::sync::RwLock` would be - /// unnecessary overhead. + /// `std::sync::RwLock` is appropriate here: both reads (hot path) and + /// writes (rare, only on first encounter of a new table) are brief + /// in-memory operations. The lock is always released before any + /// `.await` point (DDL is executed with no lock held), so the async + /// `tokio::sync::RwLock` would be unnecessary overhead. table_cache: Arc>>>, } @@ -137,19 +146,35 @@ where let snapshot_id = schema.inner().snapshot_id; let replication_mask = schema.replication_mask().clone(); - // Store metadata as Applying before DDL. - let metadata = DestinationTableMetadata::new_applying( - ch_table_name.clone(), - snapshot_id, - replication_mask, - ); - self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; + // Only store metadata and execute DDL on first table creation. After a + // schema change (handle_relation_event), the cache is invalidated but + // metadata already exists -- we just need to recompute nullable flags. 
+ let existing_metadata = self.store.get_destination_table_metadata(table_id).await?; + if existing_metadata.is_none() { + let metadata = DestinationTableMetadata::new_applying( + ch_table_name.clone(), + snapshot_id, + replication_mask, + ); + self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; + + // Execute CREATE TABLE DDL. + let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); + let ddl = build_create_table_sql(&ch_table_name, &column_schemas); + let ddl_start = Instant::now(); + self.client.execute_ddl(&ddl).await?; + metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone()) + .record(ddl_start.elapsed().as_secs_f64()); + + self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; + } // Compute nullable flags (user columns + 2 CDC columns always non-nullable). // // Array columns are NEVER marked nullable here, even if the Postgres column // is nullable. The DDL always emits `Array(Nullable(T))` (no outer `Nullable` - // wrapper), so ClickHouse does not expect a null-indicator byte before the array. + // wrapper), so ClickHouse does not expect a null-indicator byte before the + // array. let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let mut nullable_flags_vec: Vec = column_schemas.iter().map(|c| c.nullable && !is_array_type(&c.typ)).collect(); @@ -157,16 +182,6 @@ where nullable_flags_vec.push(false); // cdc_lsn let nullable_flags: Arc<[bool]> = nullable_flags_vec.into(); - // Execute DDL (no lock held during this await). - let ddl = build_create_table_sql(&ch_table_name, &column_schemas); - let ddl_start = Instant::now(); - self.client.execute_ddl(&ddl).await?; - metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone()) - .record(ddl_start.elapsed().as_secs_f64()); - - // Mark as Applied after successful DDL. - self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; - // Write-lock: insert, using or_insert to handle concurrent first-writer race. let stored_flags = { let mut guard = self.table_cache.write(); @@ -214,8 +229,8 @@ where // -- Schema change handling -- - /// Handles a schema change event (Relation) by computing the diff and applying - /// ALTER TABLE statements. + /// Handles a schema change event (Relation) by computing the diff and + /// applying ALTER TABLE statements. async fn handle_relation_event(&self, new_schema: &ReplicatedTableSchema) -> EtlResult<()> { let table_id = new_schema.id(); let new_snapshot_id = new_schema.inner().snapshot_id; @@ -227,9 +242,9 @@ where ErrorKind::CorruptedTableSchema, "Missing destination table metadata", format!( - "No destination table metadata found for table {} when processing \ - schema change. The metadata should have been recorded during initial \ - table synchronization.", + "No destination table metadata found for table {} when processing schema \ + change. The metadata should have been recorded during initial table \ + synchronization.", table_id ) ); @@ -287,7 +302,7 @@ where // Compute and apply the diff. let diff = current_schema.diff(new_schema); - if let Err(err) = self.apply_schema_diff(ch_table_name, &diff).await { + if let Err(err) = self.apply_schema_diff(ch_table_name, &diff, ¤t_schema).await { warn!( "schema change failed for table {}: {}. 
Manual intervention may be required.", table_id, err @@ -314,12 +329,17 @@ where Ok(()) } - /// Applies a schema diff to a ClickHouse table: add columns, rename columns, - /// then drop columns (in that order for safety). + /// Applies a schema diff to a ClickHouse table: add columns, rename + /// columns, then drop columns (in that order for safety). + /// + /// New columns are placed AFTER the last existing user column (before the + /// CDC columns) using ClickHouse's `AFTER` clause. This is critical because + /// RowBinary encoding is positional -- without explicit placement, ADD + /// COLUMN appends after `cdc_lsn`, misaligning the encoding. /// - /// Schema changes create an inherently inconsistent window: rows written before - /// the ALTER were encoded with the old column set, while rows after use the new - /// one. Specifically: + /// Schema changes create an inherently inconsistent window: rows written + /// before the ALTER were encoded with the old column set, while rows + /// after use the new one. Specifically: /// /// - ADD COLUMN: existing rows get NULL/default for the new column. /// - DROP COLUMN: data in the dropped column is lost for all rows. @@ -329,13 +349,25 @@ where /// killed between individual ALTER statements the table may be left in a /// partially altered state. The `DestinationTableMetadata` Applying/Applied /// status tracks this for diagnostic purposes. - async fn apply_schema_diff(&self, ch_table_name: &str, diff: &SchemaDiff) -> EtlResult<()> { + async fn apply_schema_diff( + &self, + ch_table_name: &str, + diff: &SchemaDiff, + current_schema: &ReplicatedTableSchema, + ) -> EtlResult<()> { if diff.is_empty() { return Ok(()); } + // Track the last user column name for AFTER placement. New columns are + // inserted after this column, and each added column becomes the new + // anchor for the next. 
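+        //
+        // Example with hypothetical names: for existing user columns [a, b]
+        // and columns_to_add [c1, c2], this issues
+        // `... ADD COLUMN "c1" ... AFTER "b"` and then
+        // `... ADD COLUMN "c2" ... AFTER "c1"`, keeping both ahead of the
+        // CDC columns.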
+ let mut last_user_column: String = + current_schema.column_schemas().last().map(|c| c.name.clone()).unwrap_or_default(); + for column in &diff.columns_to_add { - self.client.add_column(ch_table_name, column).await?; + self.client.add_column(ch_table_name, column, &last_user_column).await?; + last_user_column = column.name.clone(); } for rename in &diff.columns_to_rename { @@ -450,7 +482,11 @@ where values.push(ClickHouseValue::Int64( i64::try_from(u64::from(lsn)) .inspect_err(|error| { - tracing::error!(?error, "cannot convert u64 LSN to i64, falling back to i64::MAX"); + tracing::error!( + ?error, + "cannot convert u64 LSN to i64, falling back to \ + i64::MAX" + ); }) .unwrap_or(i64::MAX), )); @@ -459,7 +495,13 @@ where .collect(); client - .insert_rows(&ch_table_name, rows, &nullable_flags, max_bytes, "streaming") + .insert_rows( + &ch_table_name, + rows, + &nullable_flags, + max_bytes, + "streaming", + ) .await }); } @@ -541,7 +583,7 @@ where #[cfg(test)] mod tests { #[test] - fn test_nullable_flags_includes_cdc() { + fn nullable_flags_includes_cdc() { let mut all_flags: Vec = vec![true, false]; all_flags.push(false); // cdc_operation all_flags.push(false); // cdc_lsn diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index a67cac468..1e9388687 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -1,28 +1,36 @@ use std::fmt; use chrono::NaiveDate; -use etl::error::{ErrorKind, EtlResult}; -use etl::etl_error; -use etl::types::{ArrayCell, Cell}; - -// ── RowBinary encoding ──────────────────────────────────────────────────────── +use etl::{ + error::{ErrorKind, EtlResult}, + etl_error, + types::{ArrayCell, Cell}, +}; + +// ── RowBinary encoding +// ──────────────────────────────────────────────────────── // -// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes directly -// via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT RowBinary")`. +// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes +// directly via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT +// RowBinary")`. // // This avoids two fatal issues with the `Inserter` path: // -// 1. `Insert::new` always calls `join_column_names::().expect(…)`, which panics -// when `COLUMN_NAMES = &[]` regardless of whether validation is enabled. +// 1. `Insert::new` always calls `join_column_names::().expect(…)`, which +// panics when `COLUMN_NAMES = &[]` regardless of whether validation is +// enabled. // -// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut` at -// every `serialize_some` call, telescoping the type to `&mut &mut … BytesMut` for -// nullable array elements and overflowing the compiler's recursion limit. +// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut` +// at every `serialize_some` call, telescoping the type to `&mut &mut … +// BytesMut` for nullable array elements and overflowing the compiler's +// recursion limit. // -// Direct binary encoding has neither problem: it is a simple recursive function that -// writes bytes to a `Vec` with no generics and no type-level recursion. +// Direct binary encoding has neither problem: it is a simple recursive function +// that writes bytes to a `Vec` with no generics and no type-level +// recursion. 
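+//
+// As a worked sketch (hypothetical row, not tied to any table here): a row of
+// (Int32 42, String "hi") encodes to
+//
+//     2a 00 00 00    Int32 42, little-endian
+//     02 68 69       varint length 2, then the UTF-8 bytes of "hi"
+//
+// and a Nullable(T) column prepends a single indicator byte (0x00 = value
+// follows, 0x01 = NULL).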
-// ── ClickHouseValue ─────────────────────────────────────────────────────────── +// ── ClickHouseValue +// ─────────────────────────────────────────────────────────── /// Owned ClickHouse-compatible value, moved (not cloned) from a [`Cell`]. pub(crate) enum ClickHouseValue { @@ -38,9 +46,11 @@ pub(crate) enum ClickHouseValue { String(String), /// Days since Unix epoch (ClickHouse `Date` on wire = UInt16 LE) Date(u16), - /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire = Int64 LE) + /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire = + /// Int64 LE) DateTime64(i64), - /// UUID in standard 16-byte big-endian order (converted to ClickHouse wire format on encode) + /// UUID in standard 16-byte big-endian order (converted to ClickHouse wire + /// format on encode) Uuid([u8; 16]), Array(Vec), } @@ -172,9 +182,11 @@ fn bytes_to_hex(bytes: Vec) -> String { s } -// ── RowBinary wire encoding ─────────────────────────────────────────────────── +// ── RowBinary wire encoding +// ─────────────────────────────────────────────────── -/// Encodes a variable-length integer (LEB128) used by ClickHouse for string/array lengths. +/// Encodes a variable-length integer (LEB128) for ClickHouse string/array +/// lengths. pub(crate) fn rb_varint(mut v: usize, buf: &mut Vec) { loop { let byte = (v & 0x7f) as u8; @@ -187,7 +199,7 @@ pub(crate) fn rb_varint(mut v: usize, buf: &mut Vec) { } } -/// Encodes a value for a `Nullable(T)` column (1-byte null indicator + value if present). +/// Encodes a value for a `Nullable(T)` column (1-byte null indicator + value). pub(crate) fn rb_encode_nullable(val: ClickHouseValue, buf: &mut Vec) -> EtlResult<()> { match val { ClickHouseValue::Null => buf.push(1), @@ -252,7 +264,8 @@ pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec) -> EtlRes Ok(()) } -/// Encodes a complete row into `buf`, selecting nullable vs non-nullable encoding per column. +/// Encodes a complete row into `buf`, selecting nullable vs non-nullable +/// encoding per column. 
pub(crate) fn rb_encode_row( values: Vec, nullable_flags: &[bool], @@ -268,32 +281,34 @@ pub(crate) fn rb_encode_row( Ok(()) } -// ── Unit tests ──────────────────────────────────────────────────────────────── +// ── Unit tests +// ──────────────────────────────────────────────────────────────── #[cfg(test)] mod tests { - use super::*; use chrono::NaiveDate; use etl::types::Cell; use uuid::Uuid; + use super::*; + #[test] - fn test_cell_to_clickhouse_value_null() { + fn cell_to_clickhouse_value_null() { assert!(matches!(cell_to_clickhouse_value(Cell::Null), ClickHouseValue::Null)); } #[test] - fn test_cell_to_clickhouse_value_bool() { + fn cell_to_clickhouse_value_bool() { assert!(matches!(cell_to_clickhouse_value(Cell::Bool(true)), ClickHouseValue::Bool(true))); } #[test] - fn test_cell_to_clickhouse_value_i32() { + fn cell_to_clickhouse_value_i32() { assert!(matches!(cell_to_clickhouse_value(Cell::I32(42)), ClickHouseValue::Int32(42))); } #[test] - fn test_cell_to_clickhouse_value_string() { + fn cell_to_clickhouse_value_string() { if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::String("hello".to_string())) { @@ -304,7 +319,7 @@ mod tests { } #[test] - fn test_cell_to_clickhouse_value_date() { + fn cell_to_clickhouse_value_date() { let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); assert!(matches!(cell_to_clickhouse_value(Cell::Date(epoch)), ClickHouseValue::Date(0))); @@ -313,7 +328,7 @@ mod tests { } #[test] - fn test_cell_to_clickhouse_value_timestamp() { + fn cell_to_clickhouse_value_timestamp() { let epoch = chrono::DateTime::from_timestamp(0, 0).unwrap().naive_utc(); assert!(matches!( cell_to_clickhouse_value(Cell::Timestamp(epoch)), @@ -322,7 +337,7 @@ mod tests { } #[test] - fn test_cell_to_clickhouse_value_uuid() { + fn cell_to_clickhouse_value_uuid() { let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); let expected_bytes = *u.as_bytes(); if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) { @@ -333,7 +348,7 @@ mod tests { } #[test] - fn test_cell_to_clickhouse_value_bytes_hex() { + fn cell_to_clickhouse_value_bytes_hex() { let bytes = vec![0xde, 0xad, 0xbe, 0xef]; if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)) { assert_eq!(s, "deadbeef"); @@ -343,7 +358,7 @@ mod tests { } #[test] - fn test_rb_encode_value_scalars() { + fn rb_encode_value_scalars() { let mut buf = Vec::new(); buf.clear(); @@ -364,7 +379,7 @@ mod tests { } #[test] - fn test_rb_encode_uuid_wire_format() { + fn rb_encode_uuid_wire_format() { let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); let val = ClickHouseValue::Uuid(*u.as_bytes()); let mut buf = Vec::new(); @@ -380,7 +395,7 @@ mod tests { } #[test] - fn test_rb_encode_nullable() { + fn encode_nullable() { let mut buf = Vec::new(); rb_encode_nullable(ClickHouseValue::Null, &mut buf).unwrap(); @@ -394,7 +409,7 @@ mod tests { } #[test] - fn test_rb_varint() { + fn varint_encoding() { let mut buf = Vec::new(); rb_varint(0, &mut buf); assert_eq!(buf, [0x00]); @@ -413,7 +428,7 @@ mod tests { } #[test] - fn test_bytes_to_hex() { + fn hex_encoding() { assert_eq!(bytes_to_hex([].to_vec()), ""); assert_eq!(bytes_to_hex([0x00].to_vec()), "00"); assert_eq!(bytes_to_hex([0xff].to_vec()), "ff"); @@ -429,7 +444,7 @@ mod tests { /// # THEN /// It returns a ConversionError rather than writing invalid RowBinary. 
#[test] - fn test_rb_encode_value_rejects_null_for_non_nullable_column() { + fn rb_encode_value_rejects_null_for_non_nullable_column() { let mut buf = Vec::new(); let result = rb_encode_value(ClickHouseValue::Null, &mut buf); diff --git a/etl-destinations/src/clickhouse/metrics.rs b/etl-destinations/src/clickhouse/metrics.rs index 6b12a1bf3..61be889bf 100644 --- a/etl-destinations/src/clickhouse/metrics.rs +++ b/etl-destinations/src/clickhouse/metrics.rs @@ -8,8 +8,9 @@ static REGISTER_METRICS: Once = Once::new(); /// Labels: `table`. pub const ETL_CH_DDL_DURATION_SECONDS: &str = "etl_ch_ddl_duration_seconds"; -/// Duration of a single RowBinary INSERT statement from first write to server acknowledgement. -/// Labels: `table`, `source` (`copy` = initial table sync, `streaming` = CDC events). +/// Duration of a single RowBinary INSERT statement from first write to server +/// acknowledgement. Labels: `table`, `source` (`copy` = initial table sync, +/// `streaming` = CDC events). pub const ETL_CH_INSERT_DURATION_SECONDS: &str = "etl_ch_insert_duration_seconds"; /// Register ClickHouse-specific metrics. @@ -20,13 +21,15 @@ pub fn register_metrics() { describe_histogram!( ETL_CH_DDL_DURATION_SECONDS, Unit::Seconds, - "Duration of CREATE TABLE IF NOT EXISTS DDL operations sent to ClickHouse, labeled by table" + "Duration of CREATE TABLE IF NOT EXISTS DDL operations sent to ClickHouse, labeled by \ + table" ); describe_histogram!( ETL_CH_INSERT_DURATION_SECONDS, Unit::Seconds, - "Duration of RowBinary INSERT statements from first write to server acknowledgement, labeled by table and source" + "Duration of RowBinary INSERT statements from first write to server acknowledgement, \ + labeled by table and source" ); }); } diff --git a/etl-destinations/src/clickhouse/mod.rs b/etl-destinations/src/clickhouse/mod.rs index c293da0d8..87768db26 100644 --- a/etl-destinations/src/clickhouse/mod.rs +++ b/etl-destinations/src/clickhouse/mod.rs @@ -6,5 +6,6 @@ mod schema; #[cfg(feature = "test-utils")] pub mod test_utils; -pub use client::ClickHouseClient; pub use core::{ClickHouseDestination, ClickHouseInserterConfig}; + +pub use client::ClickHouseClient; diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index 0b47e0f02..c9c66e166 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -28,8 +28,6 @@ pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { } /// Returns the ClickHouse array element type for a Postgres array type. -/// -/// Used by [`build_create_table_sql`] to construct `Array(Nullable(T))` columns. fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL_ARRAY => "Boolean", @@ -56,8 +54,9 @@ fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { } } -/// Converts a Postgres `public.my_table` style table name into a ClickHouse table -/// name using the same double-underscore escaping convention used by DuckLake/Iceberg. +/// Converts a Postgres `public.my_table` style table name into a ClickHouse +/// table name using the same double-underscore escaping convention used by +/// DuckLake/Iceberg. 
/// /// - Schema and table are joined with `_` /// - Any literal `_` in the schema or table name is escaped to `__` @@ -71,11 +70,11 @@ pub fn table_name_to_clickhouse_table_name(schema: &str, table: &str) -> String format!("{escaped_schema}_{escaped_table}") } -/// Returns the full ClickHouse type string for a column, including Nullable -/// wrapping for scalar columns and Array(Nullable(T)) for array columns. +/// Returns the full ClickHouse type string for a column, with Nullable +/// wrapping. /// -/// New columns added via ALTER TABLE are always Nullable regardless of the -/// Postgres NOT NULL constraint, since ClickHouse cannot backfill existing rows. +/// When `force_nullable` is true (ALTER TABLE ADD), all scalar columns become +/// Nullable since ClickHouse cannot backfill existing rows. pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> String { if is_array_type(&col.typ) { let elem = postgres_array_element_clickhouse_sql(&col.typ); @@ -86,14 +85,10 @@ pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> Strin } } -/// Generates a `CREATE TABLE IF NOT EXISTS` SQL statement for the given columns. +/// Generates a `CREATE TABLE IF NOT EXISTS` DDL for the given columns. /// -/// - Non-nullable columns use the bare ClickHouse type (`Int32`, `String`, ...). -/// - Nullable columns use `Nullable(T)`. -/// - Array columns always use `Array(Nullable(T))` (Postgres array elements are nullable). -/// - Two CDC trailing columns are always appended as non-nullable: -/// `cdc_operation String, cdc_lsn Int64` -/// - The table uses `MergeTree()` with `ORDER BY tuple()` (pure append order). +/// Appends `cdc_operation String` and `cdc_lsn Int64` as trailing non-nullable +/// columns. Uses `MergeTree()` with `ORDER BY tuple()`. 
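+///
+/// Illustrative output for a `public.users(id int4 not null, name text)`
+/// source table (column rendering follows the `cols` entries built below):
+///
+/// ```text
+/// CREATE TABLE IF NOT EXISTS "public_users" (
+/// "id" Int32,
+/// "name" Nullable(String),
+/// "cdc_operation" String,
+/// "cdc_lsn" Int64
+/// ) ENGINE = MergeTree()
+/// ORDER BY tuple()
+/// ```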
 pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String {
     let mut cols = Vec::with_capacity(column_schemas.len() + 2);
@@ -108,7 +103,8 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema])
     let col_defs = cols.join(",\n");
 
     format!(
-        "CREATE TABLE IF NOT EXISTS \"{table_name}\" (\n{col_defs}\n) ENGINE = MergeTree()\nORDER BY tuple()"
+        "CREATE TABLE IF NOT EXISTS \"{table_name}\" (\n{col_defs}\n) ENGINE = MergeTree()\nORDER \
+         BY tuple()"
     )
 }
 
@@ -117,7 +113,7 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_table_name_escaping() {
+    fn table_name_escaping() {
         assert_eq!(table_name_to_clickhouse_table_name("public", "orders"), "public_orders");
         assert_eq!(
             table_name_to_clickhouse_table_name("my_schema", "my_table"),
@@ -130,7 +126,7 @@ mod tests {
     }
 
     #[test]
-    fn test_scalar_type_mapping() {
+    fn scalar_type_mapping() {
         assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::BOOL), "Boolean");
         assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::CHAR), "String");
         assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::BPCHAR), "String");
@@ -158,7 +154,7 @@ mod tests {
     }
 
     #[test]
-    fn test_array_type_mapping() {
+    fn array_type_mapping() {
         assert_eq!(postgres_array_element_clickhouse_sql(&Type::BOOL_ARRAY), "Boolean");
         assert_eq!(postgres_array_element_clickhouse_sql(&Type::TEXT_ARRAY), "String");
         assert_eq!(postgres_array_element_clickhouse_sql(&Type::INT4_ARRAY), "Int32");
@@ -169,7 +165,7 @@ mod tests {
     }
 
     #[test]
-    fn test_build_create_table_sql_nullable() {
+    fn build_create_table_sql_nullable() {
         let schemas = vec![
             ColumnSchema {
                 name: "id".to_string(),
                 typ: Type::INT4,
@@ -194,7 +190,7 @@ mod tests {
     }
 
     #[test]
-    fn test_build_create_table_sql_cdc_columns() {
+    fn build_create_table_sql_cdc_columns() {
         let schemas = vec![ColumnSchema {
             name: "id".to_string(),
             typ: Type::INT4,
@@ -211,7 +207,7 @@ mod tests {
     }
 
     #[test]
-    fn test_build_create_table_sql_array_columns() {
+    fn build_create_table_sql_array_columns() {
         let schemas = vec![ColumnSchema {
             name: "tags".to_string(),
             typ: Type::TEXT_ARRAY,
diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs
index ef3519226..d493d1167 100644
--- a/etl-destinations/src/clickhouse/test_utils.rs
+++ b/etl-destinations/src/clickhouse/test_utils.rs
@@ -1,8 +1,7 @@
 //! Test utilities for ClickHouse destinations.
 
 use clickhouse::Client;
-use etl::store::schema::SchemaStore;
-use etl::store::state::StateStore;
+use etl::store::{schema::SchemaStore, state::StateStore};
 use tokio::runtime::Handle;
 use url::Url;
 use uuid::Uuid;
@@ -13,7 +12,8 @@ use crate::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
 pub const CLICKHOUSE_URL_ENV: &str = "TESTS_CLICKHOUSE_URL";
 /// ClickHouse user name (required).
 pub const CLICKHOUSE_USER_ENV: &str = "TESTS_CLICKHOUSE_USER";
-/// ClickHouse password (optional — omit or leave empty for passwordless access).
+/// ClickHouse password (optional -- omit or leave empty for passwordless
+/// access).
 pub const CLICKHOUSE_PASSWORD_ENV: &str = "TESTS_CLICKHOUSE_PASSWORD";
 
 /// Returns the ClickHouse HTTP URL from the environment.
@@ -148,6 +148,33 @@ impl ClickHouseTestDatabase {
     {
         self.db_client.query(sql).fetch_all::<T>().await.expect("ClickHouse query failed")
     }
+
+    /// Returns the underlying ClickHouse client for fallible queries.
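+    /// Unlike [`Self::query`], which panics on any error, this gives tests a
+    /// way to issue queries that may legitimately fail (e.g. selecting a
+    /// column that a pending schema change has not created yet) and retry.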
+    pub fn db_client(&self) -> &Client {
+        &self.db_client
+    }
+
+    /// Returns the column names of a ClickHouse table in position order,
+    /// excluding the CDC columns (`cdc_operation`, `cdc_lsn`).
+    pub async fn column_names(&self, table_name: &str) -> Vec<String> {
+        #[derive(clickhouse::Row, serde::Deserialize)]
+        struct Col {
+            name: String,
+        }
+        let sql = format!(
+            "SELECT name FROM system.columns WHERE database = '{}' AND table = '{}' AND name NOT \
+             IN ('cdc_operation', 'cdc_lsn') ORDER BY position",
+            self.database, table_name
+        );
+        self.db_client
+            .query(&sql)
+            .fetch_all::<Col>()
+            .await
+            .expect("failed to query system.columns")
+            .into_iter()
+            .map(|c| c.name)
+            .collect()
+    }
 }
 
 impl Drop for ClickHouseTestDatabase {
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index 234429d14..f9c6aa1c5 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -1,18 +1,26 @@
-use etl::state::table::TableReplicationPhaseType;
-use etl::test_utils::database::{spawn_source_database, test_table_name};
-use etl::test_utils::notifying_store::NotifyingStore;
-use etl::test_utils::pipeline::create_pipeline;
-use etl::types::PipelineId;
-use etl_destinations::clickhouse::ClickHouseInserterConfig;
-use etl_destinations::clickhouse::client::ClickHouseClient;
-use etl_destinations::clickhouse::test_utils::{
-    ClickHouseTestDatabase, get_clickhouse_password, get_clickhouse_url, get_clickhouse_user,
-    setup_clickhouse_database,
+use std::{sync::Once, time::Duration};
+
+use etl::{
+    state::table::TableReplicationPhaseType,
+    store::state::StateStore,
+    test_utils::{
+        database::{spawn_source_database, test_table_name},
+        notifying_store::NotifyingStore,
+        pipeline::create_pipeline,
+    },
+    types::PipelineId,
 };
+use etl_destinations::clickhouse::{
+    ClickHouseInserterConfig,
+    client::ClickHouseClient,
+    test_utils::{
+        ClickHouseTestDatabase, get_clickhouse_password, get_clickhouse_url, get_clickhouse_user,
+        setup_clickhouse_database,
+    },
+};
+use etl_postgres::tokio::test_utils::TableModification;
 use etl_telemetry::tracing::init_test_tracing;
 use rand::random;
-use std::sync::Once;
-use std::time::Duration;
 use tokio::time::sleep;
 use url::Url;
 
@@ -31,9 +39,9 @@ fn install_crypto_provider() {
 /// SELECT query that fetches all verified columns from the ClickHouse table.
 ///
-/// `uuid_col` is projected via `toString()` because the ClickHouse UUID RowBinary
-/// wire format does not directly map to a Rust `String`; `toString()` gives us the
-/// canonical `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` string form.
+/// `uuid_col` is projected via `toString()` because the ClickHouse UUID
+/// RowBinary wire format does not directly map to a Rust `String`; `toString()`
+/// gives us the canonical `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` string form.
 ///
 /// All other columns are read with their native ClickHouse types:
 /// - `Date` -> u16 (days since 1970-01-01)
@@ -89,18 +97,17 @@ const TRUNCATE_FLOW_SELECT: &str = concat!(
     "ORDER BY id, cdc_lsn",
 );
 
-/// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col` round-trip).
+/// Days from 1970-01-01 to 2024-01-15 (used to verify the `date_col`
+/// round-trip).
 ///
 /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737
 const DATE_2024_01_15_DAYS: u16 = 19737;
 
 /// Microseconds from epoch for `2024-01-15 12:00:00 UTC`.
-/// -/// Python: `int(datetime(2024, 1, 15, 12, 0, 0, tzinfo=timezone.utc).timestamp() * 1_000_000)` -/// = 1705320000000000 const TS_2024_01_15_12_00_US: i64 = 1_705_320_000_000_000; -/// Waits until ClickHouse returns at least `expected_rows` from `UPDATE_FLOW_SELECT`. +/// Waits until ClickHouse returns at least `expected_rows` from +/// `UPDATE_FLOW_SELECT`. async fn wait_for_update_flow_rows( ch_db: &ClickHouseTestDatabase, expected_rows: usize, @@ -121,7 +128,8 @@ async fn wait_for_update_flow_rows( ); } -/// Waits until ClickHouse returns at least `expected_rows` from `DELETE_FLOW_SELECT`. +/// Waits until ClickHouse returns at least `expected_rows` from +/// `DELETE_FLOW_SELECT`. async fn wait_for_delete_flow_rows( ch_db: &ClickHouseTestDatabase, expected_rows: usize, @@ -142,7 +150,8 @@ async fn wait_for_delete_flow_rows( ); } -/// Waits until ClickHouse returns at least `expected_rows` from `RESTART_FLOW_SELECT`. +/// Waits until ClickHouse returns at least `expected_rows` from +/// `RESTART_FLOW_SELECT`. async fn wait_for_restart_flow_rows( ch_db: &ClickHouseTestDatabase, expected_rows: usize, @@ -163,7 +172,8 @@ async fn wait_for_restart_flow_rows( ); } -/// Waits until ClickHouse returns exactly zero rows from `TRUNCATE_FLOW_SELECT`. +/// Waits until ClickHouse returns exactly zero rows from +/// `TRUNCATE_FLOW_SELECT`. async fn wait_for_truncate_flow_empty(ch_db: &ClickHouseTestDatabase) { for _ in 0..50 { let rows: Vec = ch_db.query(TRUNCATE_FLOW_SELECT).await; @@ -176,7 +186,8 @@ async fn wait_for_truncate_flow_empty(ch_db: &ClickHouseTestDatabase) { panic!("timed out waiting for clickhouse truncate_flow table to become empty"); } -/// Waits until ClickHouse returns at least `expected_rows` from `TRUNCATE_FLOW_SELECT`. +/// Waits until ClickHouse returns at least `expected_rows` from +/// `TRUNCATE_FLOW_SELECT`. async fn wait_for_truncate_flow_rows( ch_db: &ClickHouseTestDatabase, expected_rows: usize, @@ -225,10 +236,11 @@ async fn wait_for_truncate_flow_rows( /// # Regression /// /// Row 2's non-empty arrays specifically catch the nullable-array encoding bug -/// where `nullable_flags[i] = true` for array columns caused `rb_encode_nullable` -/// to prepend an extra null-indicator byte. ClickHouse read that byte as -/// `varint(0)` (empty array) and then parsed the actual element bytes as -/// subsequent column data, failing with "Cannot read all data" at row 2. +/// where `nullable_flags[i] = true` for array columns caused +/// `rb_encode_nullable` to prepend an extra null-indicator byte. ClickHouse +/// read that byte as `varint(0)` (empty array) and then parsed the actual +/// element bytes as subsequent column data, failing with "Cannot read all data" +/// at row 2. #[tokio::test(flavor = "multi_thread")] async fn all_types_table_copy() { init_test_tracing(); @@ -414,7 +426,8 @@ async fn all_types_table_copy() { assert_eq!(r2.bytea_col, "cafebabe"); assert_eq!(r2.uuid_col.to_lowercase(), "a1b2c3d4-e5f6-7890-abcd-ef1234567890"); assert_eq!(r2.cdc_operation, "INSERT"); - // Non-empty arrays -- the regression case that triggered the bug before the fix. + // Non-empty arrays -- the regression case that triggered the bug before the + // fix. assert_eq!( r2.integer_array_col, vec![Some(1), Some(2), Some(3)], @@ -427,7 +440,8 @@ async fn all_types_table_copy() { ); } -/// Tests that UPDATE events are streamed to ClickHouse after the initial table copy. +/// Tests that UPDATE events are streamed to ClickHouse after initial table +/// copy. 
/// /// # GIVEN /// @@ -537,8 +551,8 @@ const BOUNDARY_VALUES_SELECT: &str = concat!( /// /// 1. **All NULLs** -- nullable scalars are NULL, arrays are empty. /// 2. **NULL elements inside arrays** -- `{1, NULL, 3}`, `{'a', NULL, 'c'}`. -/// 3. **Empty strings** -- a present-but-empty text value next to a NULL integer, -/// plus single-element arrays (varint length = 1). +/// 3. **Empty strings** -- a present-but-empty text value next to a NULL +/// integer, plus single-element arrays (varint length = 1). /// 4. **Multi-byte UTF-8** -- emoji and CJK characters, verifying that the /// RowBinary varint encodes byte length (not character count) correctly. /// @@ -586,8 +600,8 @@ async fn boundary_values_table_copy() { // Row 1: all nullable columns are NULL, arrays are empty. database .run_sql(&format!( - "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ - VALUES (NULL, NULL, ARRAY[]::integer[], ARRAY[]::text[])", + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) VALUES \ + (NULL, NULL, ARRAY[]::integer[], ARRAY[]::text[])", table_name.as_quoted_identifier(), )) .await @@ -597,8 +611,8 @@ async fn boundary_values_table_copy() { // while surrounding elements are present. database .run_sql(&format!( - "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ - VALUES ('present', 42, ARRAY[1, NULL, 3]::integer[], ARRAY['a', NULL, 'c']::text[])", + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) VALUES \ + ('present', 42, ARRAY[1, NULL, 3]::integer[], ARRAY['a', NULL, 'c']::text[])", table_name.as_quoted_identifier(), )) .await @@ -608,19 +622,19 @@ async fn boundary_values_table_copy() { // single-element arrays (varint length byte = 0x01). database .run_sql(&format!( - "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ - VALUES ('', NULL, ARRAY[99]::integer[], ARRAY['only']::text[])", + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) VALUES \ + ('', NULL, ARRAY[99]::integer[], ARRAY['only']::text[])", table_name.as_quoted_identifier(), )) .await .expect("Failed to insert row 3 (empty string + single-element arrays)"); - // Row 4: multi-byte UTF-8 -- emoji (4 bytes per char) and CJK (3 bytes per char). - // The RowBinary varint encodes byte length, not character count. + // Row 4: multi-byte UTF-8 -- emoji (4 bytes per char) and CJK (3 bytes per + // char). The RowBinary varint encodes byte length, not character count. database .run_sql(&format!( - "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) \ - VALUES ('hello 🌍🚀', 0, ARRAY[1, 2], ARRAY['日本語', '中文'])", + "INSERT INTO {} (nullable_text, nullable_int, int_array_col, text_array_col) VALUES \ + ('hello 🌍🚀', 0, ARRAY[1, 2], ARRAY['日本語', '中文'])", table_name.as_quoted_identifier(), )) .await @@ -699,7 +713,8 @@ async fn boundary_values_table_copy() { ); } -/// Tests that DELETE events are streamed to ClickHouse after the initial table copy. +/// Tests that DELETE events are streamed to ClickHouse after initial table +/// copy. /// /// # GIVEN /// @@ -718,7 +733,8 @@ async fn boundary_values_table_copy() { /// /// ClickHouse contains three rows (append-only CDC): /// - Two `INSERT` rows from the initial table copy (`cdc_lsn = 0`). -/// - One `DELETE` row for `id=2` with the old row data preserved and a positive LSN. +/// - One `DELETE` row for `id=2` with the old row data preserved and a positive +/// LSN. /// - The `id=1` row has no corresponding `DELETE`. 
 #[tokio::test(flavor = "multi_thread")]
 async fn deletes_are_streamed_to_clickhouse() {
@@ -944,8 +960,8 @@ async fn pipeline_restart_resumes_streaming() {
 /// # WHEN
 ///
 /// 1. Postgres issues `TRUNCATE` on the table.
-/// 2. After the table becomes empty in ClickHouse, a new row
-///    (`id=3, value='gamma'`) is inserted into Postgres.
+/// 2. After the table becomes empty in ClickHouse, a new row (`id=3,
+///    value='gamma'`) is inserted into Postgres.
 ///
 /// # THEN
 ///
@@ -1434,8 +1450,8 @@ const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!(
 /// - `id=2, value='', cdc_operation='DELETE', cdc_lsn > 0` (streamed
 ///   delete -- Postgres only sent the PK, so the non-PK `value` column is a
 ///   zero-value empty string, not the original data)
-/// - `id=3, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves
-///   the pipeline continued after the delete)
+/// - `id=3, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves the
+///   pipeline continued after the delete)
 #[tokio::test(flavor = "multi_thread")]
 async fn delete_with_default_replica_identity() {
     init_test_tracing();
@@ -1677,3 +1693,353 @@ async fn ping_fails_against_unreachable_clickhouse() {
     );
     assert!(client.ping().await.is_err());
 }
+
+/// Row struct for the ADD COLUMN test after schema change.
+/// Columns: id, name, age, email (email is Nullable after ALTER TABLE ADD).
+#[derive(clickhouse::Row, serde::Deserialize, Debug)]
+struct AddColumnRow {
+    id: i64,
+    name: String,
+    age: i32,
+    email: Option<String>,
+    cdc_operation: String,
+}
+
+/// Tests that ALTER TABLE ADD COLUMN in Postgres propagates to ClickHouse
+/// and subsequent inserts include the new column.
+///
+/// # GIVEN
+///
+/// A Postgres table with columns (id serial PK, name text not null, age integer
+/// not null) and one row ('Alice', 25), copied to ClickHouse.
+///
+/// # WHEN
+///
+/// A new column `email text` is added in Postgres, and a row ('Bob', 30,
+/// 'bob@example.com') is inserted with the new schema.
+///
+/// # THEN
+///
+/// The ClickHouse table has an `email` column. Alice's row has NULL for email.
+/// Bob's row has 'bob@example.com'. The destination metadata snapshot_id has
+/// increased.
+#[tokio::test(flavor = "multi_thread")]
+async fn schema_change_add_column() {
+    init_test_tracing();
+    install_crypto_provider();
+
+    let ch_table_name = "test_schema__add__col";
+
+    // --- GIVEN: table with one row, copied to ClickHouse ---
+    let database = spawn_source_database().await;
+    let table_name = test_table_name("schema_add_col");
+
+    let table_id = database
+        .create_table(
+            table_name.clone(),
+            true,
+            &[("name", "text not null"), ("age", "integer not null")],
+        )
+        .await
+        .expect("Failed to create table");
+
+    let publication_name = "test_pub_ch_schema_add";
+    database
+        .create_publication(publication_name, std::slice::from_ref(&table_name))
+        .await
+        .expect("Failed to create publication");
+
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (name, age) VALUES ('Alice', 25)",
+            table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert Alice");
+
+    let ch_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = ch_db.build_destination(store.clone());
+
+    let table_ready =
+        store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await;
+
+    let mut pipeline = create_pipeline(
+        &database.config,
+        pipeline_id,
+        publication_name.to_owned(),
+        store.clone(),
+        destination,
+    );
+
+    pipeline.start().await.unwrap();
+    table_ready.notified().await;
+
+    // Verify initial state.
+    let initial_columns = ch_db.column_names(ch_table_name).await;
+    assert_eq!(initial_columns, vec!["id", "name", "age"]);
+
+    let initial_metadata = store
+        .get_applied_destination_table_metadata(table_id)
+        .await
+        .unwrap()
+        .expect("metadata should exist after table creation");
+    let initial_snapshot_id = initial_metadata.snapshot_id;
+
+    // --- WHEN: add column, then insert with new schema ---
+    database
+        .alter_table(
+            table_name.clone(),
+            &[TableModification::AddColumn { name: "email", data_type: "text" }],
+        )
+        .await
+        .unwrap();
+
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (name, age, email) VALUES ('Bob', 30, 'bob@example.com')",
+            table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert Bob");
+
+    // Poll until Bob's row arrives (2 rows total = Alice from copy + Bob from
+    // streaming).
+    let select = concat!(
+        "SELECT id, name, age, email, cdc_operation ",
+        "FROM \"test_schema__add__col\" ",
+        "ORDER BY id",
+    );
+    let mut rows: Vec<AddColumnRow> = Vec::new();
+    for _ in 0..50 {
+        // The SELECT will fail if the email column doesn't exist yet, so
+        // catch errors and retry.
+        if let Ok(r) = ch_db.db_client().query(select).fetch_all::<AddColumnRow>().await {
+            if r.len() >= 2 {
+                rows = r;
+                break;
+            }
+        }
+        sleep(Duration::from_millis(200)).await;
+    }
+
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // --- THEN: ClickHouse has the new column, both rows present ---
+    let final_columns = ch_db.column_names(ch_table_name).await;
+    assert_eq!(final_columns, vec!["id", "name", "age", "email"]);
+
+    assert_eq!(rows.len(), 2, "expected Alice + Bob");
+
+    // Alice: pre-change row, email should be NULL.
+    assert_eq!(rows[0].id, 1);
+    assert_eq!(rows[0].name, "Alice");
+    assert_eq!(rows[0].age, 25);
+    assert_eq!(rows[0].email, None, "Alice's email should be NULL (column added after her row)");
+    assert_eq!(rows[0].cdc_operation, "INSERT");
+
+    // Bob: post-change row, email present.
+    assert_eq!(rows[1].id, 2);
+    assert_eq!(rows[1].name, "Bob");
+    assert_eq!(rows[1].age, 30);
+    assert_eq!(rows[1].email, Some("bob@example.com".to_string()));
+    assert_eq!(rows[1].cdc_operation, "INSERT");
+
+    // Metadata snapshot_id should have advanced.
+    let final_metadata = store
+        .get_applied_destination_table_metadata(table_id)
+        .await
+        .unwrap()
+        .expect("metadata should exist after schema change");
+    assert!(
+        final_metadata.snapshot_id > initial_snapshot_id,
+        "snapshot_id should increase after schema change"
+    );
+}
+
+/// Row struct for the combined schema change test after all changes.
+/// Columns: id, full_name (renamed), status (kept), email (added).
+/// age is dropped.
+#[derive(clickhouse::Row, serde::Deserialize, Debug)]
+struct CombinedSchemaChangeRow {
+    id: i64,
+    full_name: String,
+    status: Option<String>,
+    email: Option<String>,
+    cdc_operation: String,
+}
+
+/// Tests that multiple schema changes (ADD, DROP, RENAME) in Postgres all
+/// propagate to ClickHouse correctly.
+///
+/// # GIVEN
+///
+/// A Postgres table with columns (id serial PK, name text not null, age integer
+/// not null, status text) and one row ('Alice', 25, 'active'), copied to
+/// ClickHouse.
+///
+/// # WHEN
+///
+/// Three schema changes are applied:
+/// 1. RENAME COLUMN name TO full_name
+/// 2. DROP COLUMN age
+/// 3. ADD COLUMN email text
+///
+/// A new row ('Bob', 'pending', 'bob@example.com') is inserted with the updated
+/// schema.
+///
+/// # THEN
+///
+/// The ClickHouse table has columns: id, full_name, status, email.
+/// - 'age' is dropped.
+/// - 'name' is renamed to 'full_name'.
+/// - 'email' is added.
+///
+/// Alice's row has 'Alice' under 'full_name', 'active' for status, NULL for
+/// email. Bob's row has the new values.
+/// The destination metadata snapshot_id has increased.
+#[tokio::test(flavor = "multi_thread")]
+async fn schema_change_add_drop_rename() {
+    init_test_tracing();
+    install_crypto_provider();
+
+    let ch_table_name = "test_schema__multi";
+
+    // --- GIVEN: table with one row, copied to ClickHouse ---
+    let database = spawn_source_database().await;
+    let table_name = test_table_name("schema_multi");
+
+    let table_id = database
+        .create_table(
+            table_name.clone(),
+            true,
+            &[("name", "text not null"), ("age", "integer not null"), ("status", "text")],
+        )
+        .await
+        .expect("Failed to create table");
+
+    let publication_name = "test_pub_ch_schema_multi";
+    database
+        .create_publication(publication_name, std::slice::from_ref(&table_name))
+        .await
+        .expect("Failed to create publication");
+
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (name, age, status) VALUES ('Alice', 25, 'active')",
+            table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert Alice");
+
+    let ch_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = ch_db.build_destination(store.clone());
+
+    let table_ready =
+        store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await;
+
+    let mut pipeline = create_pipeline(
+        &database.config,
+        pipeline_id,
+        publication_name.to_owned(),
+        store.clone(),
+        destination,
+    );
+
+    pipeline.start().await.unwrap();
+    table_ready.notified().await;
+
+    // Verify initial schema.
+    let initial_columns = ch_db.column_names(ch_table_name).await;
+    assert_eq!(initial_columns, vec!["id", "name", "age", "status"]);
+
+    let initial_metadata = store
+        .get_applied_destination_table_metadata(table_id)
+        .await
+        .unwrap()
+        .expect("metadata should exist after table creation");
+    let initial_snapshot_id = initial_metadata.snapshot_id;
+
+    // --- WHEN: rename + drop + add, then insert with new schema ---
+    database
+        .alter_table(
+            table_name.clone(),
+            &[TableModification::RenameColumn { old_name: "name", new_name: "full_name" }],
+        )
+        .await
+        .unwrap();
+
+    database
+        .alter_table(table_name.clone(), &[TableModification::DropColumn { name: "age" }])
+        .await
+        .unwrap();
+
+    database
+        .alter_table(
+            table_name.clone(),
+            &[TableModification::AddColumn { name: "email", data_type: "text" }],
+        )
+        .await
+        .unwrap();
+
+    database
+        .run_sql(&format!(
+            "INSERT INTO {} (full_name, status, email) VALUES ('Bob', 'pending', \
+             'bob@example.com')",
+            table_name.as_quoted_identifier(),
+        ))
+        .await
+        .expect("Failed to insert Bob");
+
+    // Poll until Bob's row arrives.
+    let select = concat!(
+        "SELECT id, full_name, status, email, cdc_operation ",
+        "FROM \"test_schema__multi\" ",
+        "ORDER BY id",
+    );
+    let mut rows: Vec<CombinedSchemaChangeRow> = Vec::new();
+    for _ in 0..50 {
+        if let Ok(r) = ch_db.db_client().query(select).fetch_all::<CombinedSchemaChangeRow>().await
+        {
+            if r.len() >= 2 {
+                rows = r;
+                break;
+            }
+        }
+        sleep(Duration::from_millis(200)).await;
+    }
+
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // --- THEN: ClickHouse schema reflects all changes ---
+    let final_columns = ch_db.column_names(ch_table_name).await;
+    assert_eq!(final_columns, vec!["id", "full_name", "status", "email"]);
+
+    assert_eq!(rows.len(), 2, "expected Alice + Bob");
+
+    // Alice: pre-change row.
+    assert_eq!(rows[0].id, 1);
+    assert_eq!(rows[0].full_name, "Alice", "renamed column should preserve data");
+    assert_eq!(rows[0].status, Some("active".to_string()));
+    assert_eq!(rows[0].email, None, "Alice's email should be NULL (added after her row)");
+    assert_eq!(rows[0].cdc_operation, "INSERT");
+
+    // Bob: post-change row.
+    assert_eq!(rows[1].id, 2);
+    assert_eq!(rows[1].full_name, "Bob");
+    assert_eq!(rows[1].status, Some("pending".to_string()));
+    assert_eq!(rows[1].email, Some("bob@example.com".to_string()));
+    assert_eq!(rows[1].cdc_operation, "INSERT");
+
+    // Metadata snapshot_id should have advanced.
+    let final_metadata = store
+        .get_applied_destination_table_metadata(table_id)
+        .await
+        .unwrap()
+        .expect("metadata should exist after schema change");
+    assert!(
+        final_metadata.snapshot_id > initial_snapshot_id,
+        "snapshot_id should increase after schema change"
+    );
+}
diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs
index 216541c25..aa0eed567 100644
--- a/etl-examples/src/bin/clickhouse.rs
+++ b/etl-examples/src/bin/clickhouse.rs
@@ -36,16 +36,18 @@ requires authentication.
 */
 
+use std::{error::Error, sync::Once};
+
 use clap::{Args, Parser};
-use etl::config::{
-    BatchConfig, InvalidatedSlotBehavior, MemoryBackpressureConfig, PgConnectionConfig,
-    PipelineConfig, TableSyncCopyConfig, TcpKeepaliveConfig, TlsConfig,
+use etl::{
+    config::{
+        BatchConfig, InvalidatedSlotBehavior, MemoryBackpressureConfig, PgConnectionConfig,
+        PipelineConfig, TableSyncCopyConfig, TcpKeepaliveConfig, TlsConfig,
+    },
+    pipeline::Pipeline,
+    store::both::memory::MemoryStore,
 };
-use etl::pipeline::Pipeline;
-use etl::store::both::memory::MemoryStore;
 use etl_destinations::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig};
-use std::error::Error;
-use std::sync::Once;
 use sysinfo::MemoryRefreshKind;
 use tokio::signal;
 use tracing::{error, info};
@@ -74,7 +76,8 @@ struct AppArgs {
     /// ClickHouse destination parameters
     #[clap(flatten)]
    ch_args: ChArgs,
-    /// Postgres publication name (must be created beforehand with CREATE PUBLICATION)
+    /// Postgres publication name (must be created beforehand with CREATE
+    /// PUBLICATION)
     #[arg(long)]
     publication: String,
 }
@@ -114,10 +117,12 @@ struct ChArgs {
     /// ClickHouse target database
     #[arg(long)]
     ch_database: String,
-    /// Maximum time to wait for a batch to fill in milliseconds (lower values = lower latency, less throughput)
+    /// Maximum time to wait for a batch to fill in milliseconds (lower values =
+    /// lower latency, lower throughput)
     #[arg(long, default_value = "5000")]
     max_batch_fill_duration_ms: u64,
-    /// Maximum number of concurrent table sync workers (higher values = faster initial sync, more resource usage)
+    /// Maximum number of concurrent table sync workers (higher values = faster
+    /// initial sync, more resource usage)
     #[arg(long, default_value = "4")]
     max_table_sync_workers: u16,
 }
@@ -133,7 +138,8 @@ async fn main() -> Result<(), Box<dyn Error>> {
     Ok(())
 }
 
-/// Initialize structured logging with configurable log levels via RUST_LOG environment variable.
+/// Initialize structured logging with configurable log levels via the RUST_LOG
+/// environment variable.
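+///
+/// For example, `RUST_LOG=etl=debug,info` enables debug logging for the etl
+/// crates and info elsewhere (standard `tracing_subscriber` EnvFilter
+/// directive syntax).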
 fn init_tracing() {
     tracing_subscriber::registry()
         .with(
@@ -200,8 +206,9 @@ async fn main_impl() -> Result<(), Box<dyn Error>> {
         max_copy_connections_per_table: PipelineConfig::DEFAULT_MAX_COPY_CONNECTIONS_PER_TABLE,
     };
 
-    // Compute max_bytes_per_insert using the same formula as BatchBudget::ideal_batch_size_bytes:
-    // total_memory * memory_budget_ratio / max_table_sync_workers
+    // Compute max_bytes_per_insert using the same formula as
+    // BatchBudget::ideal_batch_size_bytes: total_memory * memory_budget_ratio
+    // / max_table_sync_workers
     let max_bytes_per_insert = {
         let mut sys = sysinfo::System::new();
         sys.refresh_memory_specifics(MemoryRefreshKind::nothing().with_ram());
diff --git a/etl/tests/replication.rs b/etl/tests/replication.rs
index b37d00eea..505594ad0 100644
--- a/etl/tests/replication.rs
+++ b/etl/tests/replication.rs
@@ -51,16 +51,14 @@ fn test_column(
 }
 
 fn column_schemas_from_ddl_message(message: &JsonValue) -> Vec<ColumnSchema> {
-    let primary_key_positions: std::collections::HashMap<i32, i32> =
-        message["identity"]["primary_key_attnums"]
-            .as_array()
-            .unwrap()
-            .iter()
-            .enumerate()
-            .map(|(index, attnum)| {
-                (attnum.as_i64().unwrap() as i32, i32::try_from(index + 1).unwrap())
-            })
-            .collect();
+    let primary_key_positions: std::collections::HashMap<i32, i32> = message["identity"]
+        ["primary_key_attnums"]
+        .as_array()
+        .unwrap()
+        .iter()
+        .enumerate()
+        .map(|(index, attnum)| (attnum.as_i64().unwrap() as i32, i32::try_from(index + 1).unwrap()))
+        .collect();
 
     let mut columns = message["columns"]
         .as_array()

From 4ab4ab4f5e615e5e46e3a1dd616b3e97f3394d36 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Wed, 22 Apr 2026 00:40:36 +0900
Subject: [PATCH 41/86] Remove unused MemorySnapshot::total() method

Dead code after inlining the sysinfo call in the replicator and example.
Fixes CI -Dwarnings failure.
---
 etl/src/concurrency/memory_monitor.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/etl/src/concurrency/memory_monitor.rs b/etl/src/concurrency/memory_monitor.rs
index 254f16a3f..acd7d8045 100644
--- a/etl/src/concurrency/memory_monitor.rs
+++ b/etl/src/concurrency/memory_monitor.rs
@@ -49,10 +49,6 @@ impl MemorySnapshot {
 
         used_percent.clamp(0.0, 1.0)
     }
-
-    pub fn total(&self) -> u64 {
-        self.total
-    }
 }
 
 /// Internal shared state for memory backpressure.

From 2ffd34ad6c7fa39c87f7a19c6e814337fed918cd Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Wed, 22 Apr 2026 09:45:45 +0900
Subject: [PATCH 42/86] Allow match_same_arms in type mapping and add retry to
 test setup

Explicit match arms in type mapping functions document intentional
Postgres-to-ClickHouse type decisions. Suppressing the lint preserves
this clarity and prevents silent breakage if the wildcard default
changes.

Add retry with backoff to create_database in test setup to absorb
transient connection failures under parallel test load.
---
 etl-destinations/src/clickhouse/schema.rs     |  2 ++
 etl-destinations/src/clickhouse/test_utils.rs | 20 +++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs
index c9c66e166..49a544e55 100644
--- a/etl-destinations/src/clickhouse/schema.rs
+++ b/etl-destinations/src/clickhouse/schema.rs
@@ -5,6 +5,7 @@ use etl::types::{ColumnSchema, Type, is_array_type};
 /// The returned string does not include `Nullable(...)` wrapping — callers are
 /// responsible for applying that when the column is nullable. Arrays always use
 /// `Array(Nullable(T))` since Postgres array elements are nullable.
+#[allow(clippy::match_same_arms)] // Explicit arms document intentional type mappings. pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL => "Boolean", @@ -28,6 +29,7 @@ pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { } /// Returns the ClickHouse array element type for a Postgres array type. +#[allow(clippy::match_same_arms)] // Explicit arms document intentional type mappings. fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL_ARRAY => "Boolean", diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index d493d1167..a15d8c1e6 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -85,13 +85,21 @@ impl ClickHouseTestDatabase { } } - /// Creates the test database in ClickHouse. + /// Creates the test database in ClickHouse, retrying on transient errors. pub async fn create_database(&self) { - self.root_client - .query(&format!("CREATE DATABASE IF NOT EXISTS `{}`", self.database)) - .execute() - .await - .expect("Failed to create test ClickHouse database"); + let query = format!("CREATE DATABASE IF NOT EXISTS `{}`", self.database); + for attempt in 1..=5 { + match self.root_client.query(&query).execute().await { + Ok(()) => return, + Err(e) if attempt < 5 => { + eprintln!( + "warning: create_database attempt {attempt}/5 failed: {e}, retrying..." + ); + tokio::time::sleep(std::time::Duration::from_millis(200 * attempt)).await; + } + Err(e) => panic!("Failed to create test ClickHouse database after 5 attempts: {e}"), + } + } } /// Drops the test database from ClickHouse. From 5706d1afd90e79c8c5621260634ef546bcea838e Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 22 Apr 2026 09:52:38 +0900 Subject: [PATCH 43/86] Remove match_same_arms allow attributes The lint was removed upstream in 57bff199. --- etl-destinations/src/clickhouse/schema.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index 49a544e55..c9c66e166 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -5,7 +5,6 @@ use etl::types::{ColumnSchema, Type, is_array_type}; /// The returned string does not include `Nullable(...)` wrapping — callers are /// responsible for applying that when the column is nullable. Arrays always use /// `Array(Nullable(T))` since Postgres array elements are nullable. -#[allow(clippy::match_same_arms)] // Explicit arms document intentional type mappings. pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL => "Boolean", @@ -29,7 +28,6 @@ pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { } /// Returns the ClickHouse array element type for a Postgres array type. -#[allow(clippy::match_same_arms)] // Explicit arms document intentional type mappings. fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL_ARRAY => "Boolean", From ff33dc2e26b40a3babf2c01079e2b7ad69b54f52 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 22 Apr 2026 18:45:32 +0900 Subject: [PATCH 44/86] Add schema change crash recovery and fix test flakiness Recovery: if ensure_table_exists finds metadata in Applying state (interrupted schema change), it re-applies the diff idempotently using previous_snapshot_id and marks Applied. 
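The window this closes, schematically (all names as in this patch):

    store metadata (Applying) -> run DDL -> store metadata (Applied)

A crash between the first and last step leaves an Applying record; on
restart, ensure_table_exists detects it and replays the DDL -- which the
changes below make safe to run twice -- before marking it Applied.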
Idempotent ALTER statements: ADD COLUMN IF NOT EXISTS, DROP COLUMN IF
EXISTS, and catch-and-skip for already-applied RENAME COLUMN.

Test flakiness: add nextest thread group (max 4) for ClickHouse tests
and --test-threads=4 to the local test script. Also retry
create_database with backoff on transient connection failures.
---
 etl-api/src/utils.rs                          |   4 +-
 etl-destinations/src/clickhouse/client.rs     |  25 ++++-
 etl-destinations/src/clickhouse/core.rs       | 105 ++++++++++++++----
 etl-destinations/tests/clickhouse_pipeline.rs |  19 ++--
 4 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/etl-api/src/utils.rs b/etl-api/src/utils.rs
index 18b28239b..3f0ff62af 100644
--- a/etl-api/src/utils.rs
+++ b/etl-api/src/utils.rs
@@ -307,7 +307,7 @@ mod tests {
     }
 
     #[test]
-    fn test_trim_http_url_trims_and_parses() {
+    fn trim_http_url_trims_and_parses() {
         #[derive(Debug, Deserialize)]
         struct TestStruct {
             #[serde(rename = "value", deserialize_with = "trim_http_url")]
@@ -320,7 +320,7 @@ mod tests {
     }
 
     #[test]
-    fn test_trim_http_url_rejects_non_http_scheme() {
+    fn trim_http_url_rejects_non_http_scheme() {
         #[derive(Debug, Deserialize)]
         struct TestStruct {
             #[serde(rename = "value", deserialize_with = "trim_http_url")]
diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index 7be316932..960405c48 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -5,6 +5,7 @@ use etl::{
     error::{ErrorKind, EtlResult},
     etl_error,
 };
+use tracing::info;
 use url::Url;
 
 use crate::clickhouse::{
@@ -89,25 +90,43 @@ impl ClickHouseClient {
     ) -> EtlResult<()> {
         let col_type = clickhouse_column_type(column, true);
         let sql = format!(
-            "ALTER TABLE \"{table_name}\" ADD COLUMN \"{}\" {col_type} AFTER \"{after_column}\"",
+            "ALTER TABLE \"{table_name}\" ADD COLUMN IF NOT EXISTS \"{}\" {col_type} AFTER \
+             \"{after_column}\"",
             column.name
         );
         self.execute_ddl(&sql).await
     }
 
-    /// Drops a column from an existing ClickHouse table.
+    /// Drops a column from an existing ClickHouse table (idempotent).
     pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> {
-        let sql = format!("ALTER TABLE \"{table_name}\" DROP COLUMN \"{column_name}\"");
+        let sql = format!("ALTER TABLE \"{table_name}\" DROP COLUMN IF EXISTS \"{column_name}\"");
         self.execute_ddl(&sql).await
     }
 
     /// Renames a column in an existing ClickHouse table.
+    ///
+    /// Idempotent: checks system.columns before renaming. If the old column
+    /// doesn't exist, the rename is assumed already applied and skipped.
     pub(crate) async fn rename_column(
         &self,
         table_name: &str,
         old_name: &str,
         new_name: &str,
     ) -> EtlResult<()> {
+        let exists: u64 = self
+            .inner
+            .query(&format!(
+                "SELECT count() FROM system.columns WHERE database = currentDatabase() AND \
+                 table = '{table_name}' AND name = '{old_name}'"
+            ))
+            .fetch_one()
+            .await
+            .map_err(|e| etl_error!(ErrorKind::Unknown, "ClickHouse column check failed", e))?;
+        if exists == 0 {
+            info!("rename {old_name} -> {new_name} already applied, skipping");
+            return Ok(());
+        }
+
         let sql =
             format!("ALTER TABLE \"{table_name}\" RENAME COLUMN \"{old_name}\" TO \"{new_name}\"");
         self.execute_ddl(&sql).await
diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index fe638aec4..ca8ec3161 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -121,6 +121,34 @@ where
         })
     }
 
+    /// Creates a new ClickHouse table with Applying -> DDL -> Applied metadata.
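+    ///
+    /// The Applying record is written before the DDL so that a crash mid-DDL
+    /// leaves a detectable marker; `ensure_table_exists` re-runs the
+    /// (idempotent) DDL for such tables on restart, and the record is only
+    /// flipped to Applied once ClickHouse acknowledges the CREATE TABLE.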
+ async fn create_table_with_metadata( + &self, + table_id: TableId, + ch_table_name: &str, + schema: &ReplicatedTableSchema, + snapshot_id: etl::types::SnapshotId, + replication_mask: etl::types::ReplicationMask, + ) -> EtlResult<()> { + let metadata = DestinationTableMetadata::new_applying( + ch_table_name.to_string(), + snapshot_id, + replication_mask, + ); + self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; + + let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); + let ddl = build_create_table_sql(ch_table_name, &column_schemas); + let ddl_start = Instant::now(); + self.client.execute_ddl(&ddl).await?; + metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.to_string()) + .record(ddl_start.elapsed().as_secs_f64()); + + self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; + + Ok(()) + } + /// Ensures the ClickHouse table for the given schema exists, returning /// `(ch_table_name, nullable_flags)`. /// @@ -146,27 +174,66 @@ where let snapshot_id = schema.inner().snapshot_id; let replication_mask = schema.replication_mask().clone(); - // Only store metadata and execute DDL on first table creation. After a - // schema change (handle_relation_event), the cache is invalidated but - // metadata already exists -- we just need to recompute nullable flags. let existing_metadata = self.store.get_destination_table_metadata(table_id).await?; - if existing_metadata.is_none() { - let metadata = DestinationTableMetadata::new_applying( - ch_table_name.clone(), - snapshot_id, - replication_mask, - ); - self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; - - // Execute CREATE TABLE DDL. - let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); - let ddl = build_create_table_sql(&ch_table_name, &column_schemas); - let ddl_start = Instant::now(); - self.client.execute_ddl(&ddl).await?; - metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.clone()) - .record(ddl_start.elapsed().as_secs_f64()); + match existing_metadata { + None => { + // First table creation: Applying -> CREATE TABLE -> Applied. + self.create_table_with_metadata( + table_id, + &ch_table_name, + schema, + snapshot_id, + replication_mask, + ) + .await?; + } + Some(metadata) if metadata.is_applying() => { + // Crash recovery: the replicator was killed during a DDL + // operation. Re-apply idempotently and mark Applied. + warn!("table {} has Applying metadata, recovering interrupted operation", table_id); + + match metadata.previous_snapshot_id { + Some(prev_snapshot_id) => { + // Interrupted schema change: re-apply the diff. + let old_table_schema = self + .store + .get_table_schema(&table_id, prev_snapshot_id) + .await? + .ok_or_else(|| { + etl_error!( + ErrorKind::InvalidState, + "Old schema not found for recovery", + format!( + "Cannot find schema for table {} at snapshot_id {}", + table_id, prev_snapshot_id + ) + ) + })?; + let old_schema = ReplicatedTableSchema::from_mask( + old_table_schema, + metadata.replication_mask.clone(), + ); + let diff = old_schema.diff(schema); + self.apply_schema_diff(&ch_table_name, &diff, &old_schema).await?; + } + None => { + // Interrupted initial table creation: re-run CREATE + // TABLE IF NOT EXISTS (idempotent). 
+                        let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
+                        let ddl = build_create_table_sql(&ch_table_name, &column_schemas);
+                        self.client.execute_ddl(&ddl).await?;
+                    }
+                }
 
-        self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?;
+                self.store
+                    .store_destination_table_metadata(table_id, metadata.to_applied())
+                    .await?;
+            }
+            Some(_applied) => {
+                // Applied metadata, cache miss after handle_relation_event
+                // invalidated the cache. No DDL needed -- fall through to
+                // recompute nullable flags below.
+            }
         }
 
         // Compute nullable flags (user columns + 2 CDC columns always non-nullable).
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index f9c6aa1c5..14dfae151 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -1587,7 +1587,7 @@ const LARGE_BATCH_SELECT: &str = concat!(
 /// (first, last, powers of two, and a few interior points) are spot-checked
 /// for correct id and value.
 #[tokio::test(flavor = "multi_thread")]
-async fn large_batch_table_copy() {
+async fn exclusive_large_batch_table_copy() {
     init_test_tracing();
     install_crypto_provider();
 
@@ -1815,11 +1815,11 @@ async fn schema_change_add_column() {
     for _ in 0..50 {
         // The SELECT will fail if the email column doesn't exist yet, so
         // catch errors and retry.
-        if let Ok(r) = ch_db.db_client().query(select).fetch_all::<AddColumnRow>().await {
-            if r.len() >= 2 {
-                rows = r;
-                break;
-            }
+        if let Ok(r) = ch_db.db_client().query(select).fetch_all::<AddColumnRow>().await
+            && r.len() >= 2
+        {
+            rows = r;
+            break;
         }
         sleep(Duration::from_millis(200)).await;
     }
@@ -2001,11 +2001,10 @@ async fn schema_change_add_drop_rename() {
     let mut rows: Vec<CombinedSchemaChangeRow> = Vec::new();
     for _ in 0..50 {
         if let Ok(r) = ch_db.db_client().query(select).fetch_all::<CombinedSchemaChangeRow>().await
+            && r.len() >= 2
         {
-            if r.len() >= 2 {
-                rows = r;
-                break;
-            }
+            rows = r;
+            break;
         }
         sleep(Duration::from_millis(200)).await;
     }

From 892e480b566ff4ce38909500e3030937f068ae68 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 24 Apr 2026 14:02:51 +0900
Subject: [PATCH 45/86] Expand key-only DELETE rows into full tombstones for
 ClickHouse

Key-only DELETE rows (default replica identity) now produce proper
tombstone rows by expanding PK-only data to full column width:

- Non-nullable scalars: type-appropriate zero values (0, false, "")
- Date/Timestamp/UUID: typed epoch/nil defaults
- Nullable scalars: NULL
- Array columns: empty arrays (not NULL, since ClickHouse arrays use
  Array(Nullable(T)) without an outer Nullable wrapper)

The test now exercises all supported column types in the tombstone:
smallint, integer, bigint, real, double, numeric, boolean, text,
varchar, date, timestamp, timestamptz, time, jsonb, bytea, uuid, plus
nullable scalars and arrays.

Also adds timeout panics to five inline poll loops that previously fell
through silently with stale data.
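Worked example (illustrative): with default replica identity, deleting
id=2 from (id bigint PK, name text not null, age int) arrives as the
key-only row [I64(2)]; expand_key_row turns it into
[I64(2), String(""), Null], so the RowBinary encoder sees a full-width,
type-correct tombstone.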
---
 etl-destinations/src/clickhouse/core.rs       |  78 ++++++-
 etl-destinations/tests/clickhouse_pipeline.rs | 216 ++++++++++++++----
 2 files changed, 249 insertions(+), 45 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index a71552353..22bd9bce4 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -12,7 +12,7 @@ use etl::{
     store::{schema::SchemaStore, state::StateStore},
     types::{
         Cell, Event, OldTableRow, PgLsn, ReplicatedTableSchema, SchemaDiff, TableId, TableRow,
-        UpdatedTableRow, is_array_type,
+        Type, UpdatedTableRow, is_array_type,
     },
 };
 use parking_lot::RwLock;
@@ -508,7 +508,10 @@ where
                     continue;
                 };
                 let old_row = match old_table_row {
-                    OldTableRow::Full(row) | OldTableRow::Key(row) => row,
+                    OldTableRow::Full(row) => row,
+                    OldTableRow::Key(key_row) => {
+                        expand_key_row(key_row, &delete.replicated_table_schema)
+                    }
                 };
                 let table_id = delete.replicated_table_schema.id();
                 table_schemas
@@ -618,6 +621,77 @@ where
     }
 }
 
+/// Expands a key-only delete row to full column width for RowBinary encoding.
+///
+/// PK columns keep their real values. Non-PK columns get `Cell::Null` if
+/// nullable, or a type-appropriate zero value if non-nullable (since RowBinary
+/// rejects NULL for non-nullable columns).
+fn expand_key_row(key_row: TableRow, schema: &ReplicatedTableSchema) -> TableRow {
+    let key_cells = key_row.into_values();
+    let mut key_iter = key_cells.into_iter();
+    let cells: Vec<Cell> = schema
+        .column_schemas()
+        .map(|col| {
+            if col.primary_key_ordinal_position.is_some() {
+                key_iter.next().unwrap_or(Cell::Null)
+            } else if col.nullable && !is_array_type(&col.typ) {
+                // Nullable scalars -> NULL. Array columns are never nullable
+                // in ClickHouse (Array(Nullable(T)) without outer Nullable),
+                // so they must use an empty array default instead.
+                Cell::Null
+            } else {
+                default_cell(&col.typ)
+            }
+        })
+        .collect();
+    TableRow::new(cells)
+}
+
+/// Returns a zero-value Cell for a Postgres type, used to fill non-PK columns
+/// in key-only DELETE tombstones. Array types produce empty arrays. All other
+/// non-primitive types fall through to an empty String, which is a valid zero
+/// value for every ClickHouse String-mapped type (numeric, time, json, bytea).
+/// Date, Timestamp, and UUID use typed zero values because their ClickHouse
+/// wire format is not String.
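+///
+/// For example, `default_cell(&Type::INT4)` yields `Cell::I32(0)`, and
+/// `default_cell(&Type::TEXT_ARRAY)` yields the empty
+/// `Cell::Array(ArrayCell::String(Vec::new()))`.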
+fn default_cell(typ: &Type) -> Cell { + use etl::types::ArrayCell; + + match *typ { + Type::BOOL => Cell::Bool(false), + Type::INT2 => Cell::I16(0), + Type::INT4 => Cell::I32(0), + Type::INT8 => Cell::I64(0), + Type::OID => Cell::U32(0), + Type::FLOAT4 => Cell::F32(0.0), + Type::FLOAT8 => Cell::F64(0.0), + Type::DATE => Cell::Date(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()), + Type::TIMESTAMP => Cell::Timestamp(chrono::NaiveDateTime::UNIX_EPOCH), + Type::TIMESTAMPTZ => Cell::TimestampTz(chrono::DateTime::UNIX_EPOCH), + Type::UUID => Cell::Uuid(uuid::Uuid::nil()), + Type::BOOL_ARRAY => Cell::Array(ArrayCell::Bool(Vec::new())), + Type::INT2_ARRAY => Cell::Array(ArrayCell::I16(Vec::new())), + Type::INT4_ARRAY => Cell::Array(ArrayCell::I32(Vec::new())), + Type::INT8_ARRAY => Cell::Array(ArrayCell::I64(Vec::new())), + Type::OID_ARRAY => Cell::Array(ArrayCell::U32(Vec::new())), + Type::FLOAT4_ARRAY => Cell::Array(ArrayCell::F32(Vec::new())), + Type::FLOAT8_ARRAY => Cell::Array(ArrayCell::F64(Vec::new())), + Type::TEXT_ARRAY + | Type::VARCHAR_ARRAY + | Type::CHAR_ARRAY + | Type::BPCHAR_ARRAY + | Type::NAME_ARRAY => Cell::Array(ArrayCell::String(Vec::new())), + Type::NUMERIC_ARRAY => Cell::Array(ArrayCell::Numeric(Vec::new())), + Type::DATE_ARRAY => Cell::Array(ArrayCell::Date(Vec::new())), + Type::TIME_ARRAY => Cell::Array(ArrayCell::Time(Vec::new())), + Type::TIMESTAMP_ARRAY => Cell::Array(ArrayCell::Timestamp(Vec::new())), + Type::TIMESTAMPTZ_ARRAY => Cell::Array(ArrayCell::TimestampTz(Vec::new())), + Type::UUID_ARRAY => Cell::Array(ArrayCell::Uuid(Vec::new())), + Type::JSON_ARRAY | Type::JSONB_ARRAY => Cell::Array(ArrayCell::Json(Vec::new())), + Type::BYTEA_ARRAY => Cell::Array(ArrayCell::Bytes(Vec::new())), + _ => Cell::String(String::new()), + } +} + impl Destination for ClickHouseDestination where S: StateStore + SchemaStore + Send + Sync, diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 14dfae151..6324098c8 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1267,6 +1267,12 @@ async fn multiple_tables_receive_independent_writes() { } sleep(Duration::from_millis(100)).await; } + assert!( + rows_a.len() >= 2 && rows_b.len() >= 2, + "timed out: multi_a has {} rows, multi_b has {} rows", + rows_a.len(), + rows_b.len() + ); pipeline.shutdown_and_wait().await.unwrap(); @@ -1399,6 +1405,7 @@ async fn sequential_transactions_preserve_commit_order() { } sleep(Duration::from_millis(100)).await; } + assert!(rows.len() >= 3, "timed out waiting for tx_order rows: got {} of 3", rows.len()); pipeline.shutdown_and_wait().await.unwrap(); @@ -1421,48 +1428,108 @@ async fn sequential_transactions_preserve_commit_order() { assert!(r.cdc_lsn > rows[1].cdc_lsn, "update_b must have a higher LSN than update_a"); } -/// SELECT query used to verify the `default_identity_delete` test. +/// Row struct for the wide default-identity delete test. 
+#[derive(clickhouse::Row, serde::Deserialize, Debug)] +struct DefaultIdentityDeleteRow { + id: i64, + smallint_col: i16, + integer_col: i32, + bigint_col: i64, + real_col: f32, + double_col: f64, + numeric_col: String, + boolean_col: bool, + text_col: String, + varchar_col: String, + date_col: u16, + timestamp_col: i64, + timestamptz_col: i64, + time_col: String, + jsonb_col: String, + bytea_col: String, + uuid_col: String, + nullable_text: Option, + nullable_int: Option, + int_array_col: Vec>, + text_array_col: Vec>, + cdc_operation: String, + cdc_lsn: i64, +} + const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!( - "SELECT id, value, cdc_operation, cdc_lsn ", + "SELECT id, ", + "smallint_col, integer_col, bigint_col, real_col, double_col, ", + "numeric_col, boolean_col, text_col, varchar_col, ", + "date_col, timestamp_col, timestamptz_col, time_col, ", + "jsonb_col, bytea_col, toString(uuid_col) AS uuid_col, ", + "nullable_text, nullable_int, ", + "int_array_col, text_array_col, ", + "cdc_operation, cdc_lsn ", "FROM \"test_default__identity__delete\" ", "ORDER BY id, cdc_lsn", ); /// Tests that a DELETE under default replica identity (PK only) produces a -/// valid CDC row in ClickHouse with the correct PK and zero-value non-PK -/// columns. +/// tombstone row with the correct PK and zero-value defaults for every +/// supported column type. /// /// # GIVEN /// -/// A Postgres table with **default replica identity** (not FULL) and two rows -/// (`id=1, value='keep'` and `id=2, value='to_delete'`), copied to ClickHouse. +/// A wide Postgres table with **default replica identity** (not FULL) covering: +/// - Non-nullable scalars: smallint, integer, bigint, real, double, numeric, +/// boolean, text, varchar, date, timestamp, timestamptz, time, jsonb, bytea, +/// uuid +/// - Nullable scalars: text, integer +/// - Nullable arrays: integer[], text[] +/// Two rows inserted and copied to ClickHouse. /// /// # WHEN /// -/// Row `id=2` is deleted in Postgres, and a new row (`id=3, value='after'`) -/// is inserted. +/// Row `id=2` is deleted in Postgres, and a new row (`id=3`) is inserted. 
/// /// # THEN /// -/// ClickHouse contains four rows: -/// - `id=1, value='keep', cdc_operation='INSERT', cdc_lsn=0` (untouched) -/// - `id=2, value='to_delete', cdc_operation='INSERT', cdc_lsn=0` (table copy) -/// - `id=2, value='', cdc_operation='DELETE', cdc_lsn > 0` (streamed -/// delete -- Postgres only sent the PK, so the non-PK `value` column is a -/// zero-value empty string, not the original data) -/// - `id=3, value='after', cdc_operation='INSERT', cdc_lsn > 0` (proves the -/// pipeline continued after the delete) +/// The DELETE tombstone has: +/// - Correct PK (`id=2`) +/// - Non-nullable scalars filled with type-appropriate zero values +/// - Nullable scalars filled with NULL +/// - Arrays filled with empty arrays #[tokio::test(flavor = "multi_thread")] async fn delete_with_default_replica_identity() { init_test_tracing(); install_crypto_provider(); - // --- GIVEN: two rows, default replica identity (NOT full) --- + // --- GIVEN: wide table with all types, default replica identity --- let database = spawn_source_database().await; let table_name = test_table_name("default_identity_delete"); let table_id = database - .create_table(table_name.clone(), true, &[("value", "text not null")]) + .create_table( + table_name.clone(), + true, + &[ + ("smallint_col", "smallint not null"), + ("integer_col", "integer not null"), + ("bigint_col", "bigint not null"), + ("real_col", "real not null"), + ("double_col", "double precision not null"), + ("numeric_col", "numeric(10,2) not null"), + ("boolean_col", "boolean not null"), + ("text_col", "text not null"), + ("varchar_col", "varchar(100) not null"), + ("date_col", "date not null"), + ("timestamp_col", "timestamp not null"), + ("timestamptz_col", "timestamptz not null"), + ("time_col", "time not null"), + ("jsonb_col", "jsonb not null"), + ("bytea_col", "bytea not null"), + ("uuid_col", "uuid not null"), + ("nullable_text", "text"), + ("nullable_int", "integer"), + ("int_array_col", "integer[]"), + ("text_array_col", "text[]"), + ], + ) .await .expect("Failed to create test table"); @@ -1476,8 +1543,26 @@ async fn delete_with_default_replica_identity() { database .run_sql(&format!( - "INSERT INTO {} (value) VALUES ('keep'), ('to_delete')", - table_name.as_quoted_identifier(), + r#"INSERT INTO {table} ( + smallint_col, integer_col, bigint_col, real_col, double_col, + numeric_col, boolean_col, text_col, varchar_col, + date_col, timestamp_col, timestamptz_col, time_col, + jsonb_col, bytea_col, uuid_col, + nullable_text, nullable_int, int_array_col, text_array_col + ) VALUES + (1, 10, 100, 1.5, 2.5, 123.45, true, + 'keep', 'keeper', '2024-01-15', '2024-01-15 12:00:00', + '2024-01-15 12:00:00+00', '14:30:00', + '{{"key":"value"}}', '\xdeadbeef', + 'f47ac10b-58cc-4372-a567-0e02b2c3d479', + 'present', 42, ARRAY[1,2,3]::integer[], ARRAY['a','b']::text[]), + (2, 20, 200, 3.5, 4.5, 678.90, false, + 'delete_me', 'doomed', '2024-06-01', '2024-06-01 08:00:00', + '2024-06-01 08:00:00+00', '09:00:00', + '{{"x":1}}', '\xcafebabe', + 'a1b2c3d4-e5f6-7890-abcd-ef1234567890', + 'also_present', 99, ARRAY[4,5]::integer[], ARRAY['c']::text[])"#, + table = table_name.as_quoted_identifier(), )) .await .expect("Failed to insert rows"); @@ -1501,22 +1586,34 @@ async fn delete_with_default_replica_identity() { pipeline.start().await.unwrap(); table_ready.notified().await; - // --- WHEN: delete id=2, and insert a new row --- + // --- WHEN: delete id=2, insert id=3 --- database - .run_sql(&format!("DELETE FROM {} WHERE id = 2", table_name.as_quoted_identifier(),)) + 
.run_sql(&format!("DELETE FROM {} WHERE id = 2", table_name.as_quoted_identifier())) .await .expect("Failed to delete row"); database .run_sql(&format!( - "INSERT INTO {} (value) VALUES ('after')", - table_name.as_quoted_identifier(), + r#"INSERT INTO {table} ( + smallint_col, integer_col, bigint_col, real_col, double_col, + numeric_col, boolean_col, text_col, varchar_col, + date_col, timestamp_col, timestamptz_col, time_col, + jsonb_col, bytea_col, uuid_col, + int_array_col, text_array_col + ) VALUES ( + 3, 30, 300, 5.5, 6.5, 111.11, true, + 'after', 'survivor', '2025-01-01', '2025-01-01 00:00:00', + '2025-01-01 00:00:00+00', '00:00:00', + '{{"new":true}}', '\x00', + 'b2c3d4e5-f6a7-8901-bcde-f12345678901', + ARRAY[7]::integer[], ARRAY['d']::text[])"#, + table = table_name.as_quoted_identifier(), )) .await .expect("Failed to insert post-delete row"); - // Poll for 4 rows: 2 copied INSERTs + DELETE + new INSERT. - let mut rows: Vec = Vec::with_capacity(4); + // Poll for 4 rows: 2 copied INSERTs + DELETE tombstone + new INSERT. + let mut rows: Vec = Vec::new(); for _ in 0..50 { rows = ch_db.query(DEFAULT_IDENTITY_DELETE_SELECT).await; if rows.len() >= 4 { @@ -1524,42 +1621,65 @@ async fn delete_with_default_replica_identity() { } sleep(Duration::from_millis(100)).await; } + assert!( + rows.len() >= 4, + "timed out waiting for default_identity_delete rows: got {} of 4", + rows.len() + ); pipeline.shutdown_and_wait().await.unwrap(); - // --- THEN: DELETE targets the correct row, non-PK columns are zero-values --- + // --- THEN: DELETE tombstone has zero-value defaults for all types --- assert_eq!(rows.len(), 4, "expected 2 copied INSERTs + DELETE + new INSERT"); + // Row 1: copied, untouched -- spot check. let r = &rows[0]; assert_eq!(r.id, 1); - assert_eq!(r.value, "keep"); + assert_eq!(r.text_col, "keep"); + assert_eq!(r.integer_col, 10); + assert_eq!(r.boolean_col, true); + assert_eq!(r.nullable_text, Some("present".to_string())); + assert_eq!(r.int_array_col, vec![Some(1), Some(2), Some(3)]); assert_eq!(r.cdc_operation, "INSERT"); - assert_eq!(r.cdc_lsn, 0); + // Row 2: copied, will be deleted. let r = &rows[1]; assert_eq!(r.id, 2); - assert_eq!(r.value, "to_delete"); + assert_eq!(r.text_col, "delete_me"); assert_eq!(r.cdc_operation, "INSERT"); - assert_eq!(r.cdc_lsn, 0); + // Row 3: DELETE tombstone -- every non-PK column type verified. let r = &rows[2]; - assert_eq!(r.id, 2, "DELETE must target the correct row among multiple"); - assert_eq!( - r.value, "", - "non-PK column should be zero-value (empty string) under default replica identity" - ); + assert_eq!(r.id, 2, "DELETE must target the correct row"); assert_eq!(r.cdc_operation, "DELETE"); assert!(r.cdc_lsn > 0); - + // Non-nullable scalars -> zero values. 
+ assert_eq!(r.smallint_col, 0, "smallint -> 0"); + assert_eq!(r.integer_col, 0, "integer -> 0"); + assert_eq!(r.bigint_col, 0, "bigint -> 0"); + assert!(r.real_col.abs() < 1e-6, "real -> 0.0"); + assert!(r.double_col.abs() < 1e-9, "double -> 0.0"); + assert_eq!(r.numeric_col, "", "numeric -> empty string"); + assert_eq!(r.boolean_col, false, "boolean -> false"); + assert_eq!(r.text_col, "", "text -> empty string"); + assert_eq!(r.varchar_col, "", "varchar -> empty string"); + assert_eq!(r.time_col, "", "time -> empty string (String-mapped)"); + assert_eq!(r.jsonb_col, "", "jsonb -> empty string (String-mapped)"); + assert_eq!(r.bytea_col, "", "bytea -> empty string"); + assert_eq!(r.uuid_col, "00000000-0000-0000-0000-000000000000", "uuid -> nil UUID"); + // Nullable scalars -> NULL. + assert_eq!(r.nullable_text, None, "nullable text -> NULL"); + assert_eq!(r.nullable_int, None, "nullable int -> NULL"); + // Arrays -> empty. + assert!(r.int_array_col.is_empty(), "int array -> empty"); + assert!(r.text_array_col.is_empty(), "text array -> empty"); + + // Row 4: post-delete INSERT proves pipeline continued. let r = &rows[3]; assert_eq!(r.id, 3); - assert_eq!(r.value, "after"); + assert_eq!(r.text_col, "after"); assert_eq!(r.cdc_operation, "INSERT"); - assert!(r.cdc_lsn > 0, "pipeline must continue after the delete"); - assert!( - r.cdc_lsn > rows[2].cdc_lsn, - "post-delete INSERT must have a higher LSN than the DELETE" - ); + assert!(r.cdc_lsn > 0); } /// SELECT query used to verify the `large_batch` test. @@ -1823,6 +1943,11 @@ async fn schema_change_add_column() { } sleep(Duration::from_millis(200)).await; } + assert!( + rows.len() >= 2, + "timed out waiting for schema_change_add_column rows: got {} of 2", + rows.len() + ); pipeline.shutdown_and_wait().await.unwrap(); @@ -2008,6 +2133,11 @@ async fn schema_change_add_drop_rename() { } sleep(Duration::from_millis(200)).await; } + assert!( + rows.len() >= 2, + "timed out waiting for schema_change_add_drop_rename rows: got {} of 2", + rows.len() + ); pipeline.shutdown_and_wait().await.unwrap(); From 90b8016d0bd47ed69ee2c0a84feb95cb7e8b8bd0 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 24 Apr 2026 16:45:57 +0900 Subject: [PATCH 46/86] Use DateTime::UNIX_EPOCH.naive_utc() for Timestamp default cell NaiveDateTime::UNIX_EPOCH is not available in the pinned chrono version; derive it from DateTime::UNIX_EPOCH instead to match the TIMESTAMPTZ arm just below. 
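For context, a minimal sketch of the equivalence this fix relies on (illustrative only, not code from the patch; assumes a chrono version that exposes the `DateTime::UNIX_EPOCH` constant):

    use chrono::{DateTime, NaiveDate, NaiveDateTime, Utc};

    // DateTime::UNIX_EPOCH is 1970-01-01T00:00:00Z as a DateTime<Utc>;
    // naive_utc() strips the timezone, leaving the same wall-clock instant
    // that NaiveDateTime::UNIX_EPOCH names on newer chrono versions.
    fn epoch_naive() -> NaiveDateTime {
        DateTime::<Utc>::UNIX_EPOCH.naive_utc()
    }

    fn main() {
        let manual = NaiveDate::from_ymd_opt(1970, 1, 1)
            .unwrap()
            .and_hms_opt(0, 0, 0)
            .unwrap();
        assert_eq!(epoch_naive(), manual);
    }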
--- etl-destinations/src/clickhouse/core.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 22bd9bce4..1432881e6 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -665,7 +665,7 @@ fn default_cell(typ: &Type) -> Cell { Type::FLOAT4 => Cell::F32(0.0), Type::FLOAT8 => Cell::F64(0.0), Type::DATE => Cell::Date(chrono::NaiveDate::from_ymd_opt(1970, 1, 1).unwrap()), - Type::TIMESTAMP => Cell::Timestamp(chrono::NaiveDateTime::UNIX_EPOCH), + Type::TIMESTAMP => Cell::Timestamp(chrono::DateTime::UNIX_EPOCH.naive_utc()), Type::TIMESTAMPTZ => Cell::TimestampTz(chrono::DateTime::UNIX_EPOCH), Type::UUID => Cell::Uuid(uuid::Uuid::nil()), Type::BOOL_ARRAY => Cell::Array(ArrayCell::Bool(Vec::new())), From a86e453d1ff1e49899e0a4b2a2aca3a2295b6574 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 24 Apr 2026 17:10:43 +0900 Subject: [PATCH 47/86] Use RENAME COLUMN IF EXISTS for idempotent column rename Collapses the read-then-write pair into one DDL statement that does the existence check and rename atomically on the server, matching ADD COLUMN IF NOT EXISTS / DROP COLUMN IF EXISTS used by add_column and drop_column in the same file. --- etl-destinations/src/clickhouse/client.rs | 27 ++++++----------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 960405c48..7d450d23a 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -5,7 +5,6 @@ use etl::{ error::{ErrorKind, EtlResult}, etl_error, }; -use tracing::info; use url::Url; use crate::clickhouse::{ @@ -103,32 +102,20 @@ impl ClickHouseClient { self.execute_ddl(&sql).await } - /// Renames a column in an existing ClickHouse table. + /// Renames a column in an existing ClickHouse table (idempotent). /// - /// Idempotent: checks system.columns before renaming. If the old column - /// doesn't exist, the rename is assumed already applied and skipped. + /// `RENAME COLUMN IF EXISTS` makes the ALTER a server-side noop when the + /// old column is already absent, so the check and the rename happen in + /// one statement without a racy read-then-write. pub(crate) async fn rename_column( &self, table_name: &str, old_name: &str, new_name: &str, ) -> EtlResult<()> { - let exists: u64 = self - .inner - .query(&format!( - "SELECT count() FROM system.columns WHERE table = '{table_name}' AND name = \ - '{old_name}'" - )) - .fetch_one() - .await - .map_err(|e| etl_error!(ErrorKind::Unknown, "ClickHouse column check failed", e))?; - if exists == 0 { - info!("rename {old_name} -> {new_name} already applied, skipping"); - return Ok(()); - } - - let sql = - format!("ALTER TABLE \"{table_name}\" RENAME COLUMN \"{old_name}\" TO \"{new_name}\""); + let sql = format!( + "ALTER TABLE \"{table_name}\" RENAME COLUMN IF EXISTS \"{old_name}\" TO \"{new_name}\"" + ); self.execute_ddl(&sql).await } From cd5d5fab46bfb52edacf23ff19eb3541a3c13c3b Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 24 Apr 2026 17:32:02 +0900 Subject: [PATCH 48/86] Assert date/time zero values in default-identity DELETE tombstone test The DELETE tombstone test already verifies every other non-PK column type gets the expected zero value. date_col, timestamp_col, and timestamptz_col were decoded from the SELECT but never asserted on, which tripped the dead_code lint. 
Add the missing zero-value assertions and clean up two assert_eq! calls on bool literals. --- etl-destinations/tests/clickhouse_pipeline.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 6324098c8..4dc5c8be4 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1637,7 +1637,7 @@ async fn delete_with_default_replica_identity() { assert_eq!(r.id, 1); assert_eq!(r.text_col, "keep"); assert_eq!(r.integer_col, 10); - assert_eq!(r.boolean_col, true); + assert!(r.boolean_col); assert_eq!(r.nullable_text, Some("present".to_string())); assert_eq!(r.int_array_col, vec![Some(1), Some(2), Some(3)]); assert_eq!(r.cdc_operation, "INSERT"); @@ -1660,9 +1660,12 @@ async fn delete_with_default_replica_identity() { assert!(r.real_col.abs() < 1e-6, "real -> 0.0"); assert!(r.double_col.abs() < 1e-9, "double -> 0.0"); assert_eq!(r.numeric_col, "", "numeric -> empty string"); - assert_eq!(r.boolean_col, false, "boolean -> false"); + assert!(!r.boolean_col, "boolean -> false"); assert_eq!(r.text_col, "", "text -> empty string"); assert_eq!(r.varchar_col, "", "varchar -> empty string"); + assert_eq!(r.date_col, 0, "date -> 1970-01-01 (day 0)"); + assert_eq!(r.timestamp_col, 0, "timestamp -> unix epoch"); + assert_eq!(r.timestamptz_col, 0, "timestamptz -> unix epoch"); assert_eq!(r.time_col, "", "time -> empty string (String-mapped)"); assert_eq!(r.jsonb_col, "", "jsonb -> empty string (String-mapped)"); assert_eq!(r.bytea_col, "", "bytea -> empty string"); From 7eb9bbdd57e3f2ee4d85764b98f6aa36dc82c772 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 13:43:54 +0900 Subject: [PATCH 49/86] Fix passwordless ClickHouse K8s deployment Passwordless ClickHouse configs created no password secret, but the replicator StatefulSet still referenced the ClickHouse password secret unconditionally. That could leave pods unable to start with a missing secretKeyRef, or keep using stale credentials after a password was removed. Carry whether the password secret is required into env generation, omit the password env var when it is not needed, and delete any stale ClickHouse password secret during reconciliation. --- etl-api/src/k8s/base.rs | 10 ++++-- etl-api/src/k8s/core.rs | 74 ++++++++++++++++++++++++++++++++++++++--- etl-api/src/k8s/http.rs | 71 +++++++++++++++++++++++++++++++++++---- 3 files changed, 142 insertions(+), 13 deletions(-) diff --git a/etl-api/src/k8s/base.rs b/etl-api/src/k8s/base.rs index d6c35e680..3bfaf4014 100644 --- a/etl-api/src/k8s/base.rs +++ b/etl-api/src/k8s/base.rs @@ -46,7 +46,11 @@ pub enum DestinationType { /// Apache Iceberg destination. Iceberg, /// ClickHouse destination. - ClickHouse, + ClickHouse { + /// Whether the StatefulSet must reference the ClickHouse password + /// secret. + password_secret_required: bool, + }, /// DuckLake destination. Ducklake, } @@ -57,7 +61,9 @@ impl From<&StoredDestinationConfig> for DestinationType { match value { StoredDestinationConfig::BigQuery { .. } => DestinationType::BigQuery, StoredDestinationConfig::Iceberg { .. } => DestinationType::Iceberg, - StoredDestinationConfig::ClickHouse { .. } => DestinationType::ClickHouse, + StoredDestinationConfig::ClickHouse { password, .. } => { + DestinationType::ClickHouse { password_secret_required: password.is_some() } + } StoredDestinationConfig::Ducklake { .. 
} => DestinationType::Ducklake, } } diff --git a/etl-api/src/k8s/core.rs b/etl-api/src/k8s/core.rs index 9252355f9..0d4eed0c0 100644 --- a/etl-api/src/k8s/core.rs +++ b/etl-api/src/k8s/core.rs @@ -327,7 +327,11 @@ async fn create_or_update_dynamic_replicator_secrets( } Secrets::ClickHouse { postgres_password, password } => { k8s_client.create_or_update_postgres_secret(prefix, &postgres_password).await?; - k8s_client.create_or_update_clickhouse_secret(prefix, password.as_deref()).await?; + if let Some(password) = password.as_deref() { + k8s_client.create_or_update_clickhouse_secret(prefix, Some(password)).await?; + } else { + k8s_client.delete_clickhouse_secret(prefix).await?; + } } Secrets::Ducklake { postgres_password, s3_access_key_id, s3_secret_access_key } => { k8s_client.create_or_update_postgres_secret(prefix, &postgres_password).await?; @@ -486,6 +490,25 @@ mod tests { } } + fn source_config_with_password() -> StoredSourceConfig { + StoredSourceConfig { + host: "localhost".to_string(), + port: 5432, + name: "postgres".to_string(), + username: "postgres".to_string(), + password: Some(SerializableSecretString::from("password".to_string())), + } + } + + fn clickhouse_destination_config(password: Option<&str>) -> StoredDestinationConfig { + StoredDestinationConfig::ClickHouse { + url: "http://localhost:8123".parse().unwrap(), + user: "default".to_string(), + password: password.map(ToOwned::to_owned).map(SerializableSecretString::from), + database: "default".to_string(), + } + } + #[async_trait] impl K8sClient for RecordingK8sClient { async fn create_or_update_postgres_secret( @@ -517,9 +540,13 @@ mod tests { async fn create_or_update_clickhouse_secret( &self, - _prefix: &str, - _password: Option<&str>, + prefix: &str, + password: Option<&str>, ) -> Result<(), K8sError> { + self.calls + .lock() + .unwrap() + .push(format!("clickhouse:{prefix}:{}", password.unwrap_or(""))); Ok(()) } @@ -548,7 +575,8 @@ mod tests { Ok(()) } - async fn delete_clickhouse_secret(&self, _prefix: &str) -> Result<(), K8sError> { + async fn delete_clickhouse_secret(&self, prefix: &str) -> Result<(), K8sError> { + self.calls.lock().unwrap().push(format!("delete-clickhouse:{prefix}")); Ok(()) } @@ -607,6 +635,44 @@ mod tests { assert!(is_active); } + #[tokio::test] + async fn clickhouse_with_password_creates_password_secret() { + let source_config = source_config_with_password(); + let destination_config = clickhouse_destination_config(Some("clickhouse-password")); + + let secrets = build_secrets_from_configs(&source_config, &destination_config); + let client = RecordingK8sClient::default(); + + create_or_update_dynamic_replicator_secrets(&client, "tenant-42", secrets).await.unwrap(); + + assert_eq!( + client.calls(), + vec![ + "postgres:tenant-42:password".to_string(), + "clickhouse:tenant-42:clickhouse-password".to_string(), + ] + ); + } + + #[tokio::test] + async fn passwordless_clickhouse_deletes_any_stale_password_secret() { + let source_config = source_config_with_password(); + let destination_config = clickhouse_destination_config(None); + + let secrets = build_secrets_from_configs(&source_config, &destination_config); + let client = RecordingK8sClient::default(); + + create_or_update_dynamic_replicator_secrets(&client, "tenant-42", secrets).await.unwrap(); + + assert_eq!( + client.calls(), + vec![ + "postgres:tenant-42:password".to_string(), + "delete-clickhouse:tenant-42".to_string(), + ] + ); + } + #[tokio::test] async fn ducklake_creates_postgres_and_s3_secrets() { let source_config = StoredSourceConfig 
{ diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs index 5791a470d..6bdb4d410 100644 --- a/etl-api/src/k8s/http.rs +++ b/etl-api/src/k8s/http.rs @@ -911,16 +911,18 @@ fn create_container_environment_json( let bq_secret_env_var_json = create_bq_secret_env_var_json(&bq_secret_name); container_environment.push(bq_secret_env_var_json); } - DestinationType::ClickHouse => { + DestinationType::ClickHouse { password_secret_required } => { let postgres_secret_name = create_postgres_secret_name(prefix); let postgres_secret_env_var_json = create_postgres_secret_env_var_json(&postgres_secret_name); container_environment.push(postgres_secret_env_var_json); - let clickhouse_secret_name = create_clickhouse_secret_name(prefix); - let clickhouse_secret_env_var_json = - create_clickhouse_secret_env_var_json(&clickhouse_secret_name); - container_environment.push(clickhouse_secret_env_var_json); + if password_secret_required { + let clickhouse_secret_name = create_clickhouse_secret_name(prefix); + let clickhouse_secret_env_var_json = + create_clickhouse_secret_env_var_json(&clickhouse_secret_name); + container_environment.push(clickhouse_secret_env_var_json); + } } DestinationType::Iceberg => { let postgres_secret_name = create_postgres_secret_name(prefix); @@ -1103,12 +1105,12 @@ fn create_bq_secret_env_var_json(bq_secret_name: &str) -> serde_json::Value { }) } -fn create_clickhouse_secret_env_var_json(clickouse_secret_name: &str) -> serde_json::Value { +fn create_clickhouse_secret_env_var_json(clickhouse_secret_name: &str) -> serde_json::Value { json!({ "name": "APP_DESTINATION__CLICK_HOUSE__PASSWORD", "valueFrom": { "secretKeyRef": { - "name": clickouse_secret_name, + "name": clickhouse_secret_name, "key": CLICKHOUSE_PASSWORD_NAME } } @@ -1300,6 +1302,15 @@ mod tests { format!("{tenant_id}-{replicator_id}") } + fn container_environment_has_var( + container_environment: &[serde_json::Value], + name: &str, + ) -> bool { + container_environment + .iter() + .any(|entry| entry.get("name").and_then(serde_json::Value::as_str) == Some(name)) + } + #[test] fn test_replicator_resource_config_uses_environment_defaults() { let prod = ReplicatorResourceConfig::load(&Environment::Prod).unwrap(); @@ -1639,6 +1650,52 @@ mod tests { assert_json_snapshot!(container_environment); } + #[test] + fn clickhouse_with_password_references_password_secret() { + let prefix = create_k8s_object_prefix(TENANT_ID, 42); + let replicator_image = "ramsup/etl-replicator:2a41356af735f891de37d71c0e1a62864fe4630e"; + + let container_environment = create_container_environment_json( + &prefix, + &Environment::Dev, + replicator_image, + DestinationType::ClickHouse { password_secret_required: true }, + LogLevel::Info, + ); + + assert!(container_environment_has_var( + &container_environment, + "APP_PIPELINE__PG_CONNECTION__PASSWORD", + )); + assert!(container_environment_has_var( + &container_environment, + "APP_DESTINATION__CLICK_HOUSE__PASSWORD", + )); + } + + #[test] + fn passwordless_clickhouse_does_not_reference_missing_password_secret() { + let prefix = create_k8s_object_prefix(TENANT_ID, 42); + let replicator_image = "ramsup/etl-replicator:2a41356af735f891de37d71c0e1a62864fe4630e"; + + let container_environment = create_container_environment_json( + &prefix, + &Environment::Dev, + replicator_image, + DestinationType::ClickHouse { password_secret_required: false }, + LogLevel::Info, + ); + + assert!(container_environment_has_var( + &container_environment, + "APP_PIPELINE__PG_CONNECTION__PASSWORD", + )); + 
assert!(!container_environment_has_var( + &container_environment, + "APP_DESTINATION__CLICK_HOUSE__PASSWORD", + )); + } + #[test] fn test_create_node_selector() { let node_selector = create_node_selector_json(&Environment::Dev); From 690005dd27981bedb2dcdbfb996f700c59deb158 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 13:55:20 +0900 Subject: [PATCH 50/86] Quote ClickHouse SQL identifiers ClickHouse DDL and RowBinary insert statements interpolated table and column names inside double quotes without escaping embedded quotes. Legal Postgres identifiers containing quotes could therefore generate invalid ClickHouse SQL, and tenant-controlled identifiers could turn that into an injection risk. Centralize identifier quoting, use it for CREATE, ALTER, TRUNCATE, and INSERT SQL generation, and add targeted tests for quoted table and column names. --- etl-destinations/src/clickhouse/client.rs | 143 ++++++++++++++++++---- etl-destinations/src/clickhouse/schema.rs | 45 ++++++- 2 files changed, 159 insertions(+), 29 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 7d450d23a..476233c56 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -10,7 +10,7 @@ use url::Url; use crate::clickhouse::{ encoding::{ClickHouseValue, rb_encode_row}, metrics::ETL_CH_INSERT_DURATION_SECONDS, - schema::clickhouse_column_type, + schema::{clickhouse_column_type, quote_identifier}, }; /// Capacity of the internal write buffer used per INSERT statement. @@ -20,6 +20,50 @@ use crate::clickhouse::{ /// when `end()` is called or the `max_bytes_per_insert` limit is reached). const BUFFERED_CAPACITY: usize = 256 * 1024; +/// Builds the SQL used to add a column to a ClickHouse table. +fn build_add_column_sql( + table_name: &str, + column: &etl::types::ColumnSchema, + after_column: &str, +) -> String { + let col_type = clickhouse_column_type(column, true); + let table_name = quote_identifier(table_name); + let column_name = quote_identifier(&column.name); + let after_column = quote_identifier(after_column); + + format!( + "ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS {column_name} {col_type} AFTER \ + {after_column}" + ) +} + +/// Builds the SQL used to drop a column from a ClickHouse table. +fn build_drop_column_sql(table_name: &str, column_name: &str) -> String { + let table_name = quote_identifier(table_name); + let column_name = quote_identifier(column_name); + format!("ALTER TABLE {table_name} DROP COLUMN IF EXISTS {column_name}") +} + +/// Builds the SQL used to rename a column in a ClickHouse table. +fn build_rename_column_sql(table_name: &str, old_name: &str, new_name: &str) -> String { + let table_name = quote_identifier(table_name); + let old_name = quote_identifier(old_name); + let new_name = quote_identifier(new_name); + format!("ALTER TABLE {table_name} RENAME COLUMN IF EXISTS {old_name} TO {new_name}") +} + +/// Builds the SQL used to truncate a ClickHouse table. +fn build_truncate_table_sql(table_name: &str) -> String { + let table_name = quote_identifier(table_name); + format!("TRUNCATE TABLE IF EXISTS {table_name}") +} + +/// Builds the SQL used to insert RowBinary rows into a ClickHouse table. +fn build_insert_rows_sql(table_name: &str) -> String { + let table_name = quote_identifier(table_name); + format!("INSERT INTO {table_name} FORMAT RowBinary") +} + /// High-level ClickHouse client used by [`super::core::ClickHouseDestination`]. 
/// /// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL, @@ -87,18 +131,13 @@ impl ClickHouseClient { column: &etl::types::ColumnSchema, after_column: &str, ) -> EtlResult<()> { - let col_type = clickhouse_column_type(column, true); - let sql = format!( - "ALTER TABLE \"{table_name}\" ADD COLUMN IF NOT EXISTS \"{}\" {col_type} AFTER \ - \"{after_column}\"", - column.name - ); + let sql = build_add_column_sql(table_name, column, after_column); self.execute_ddl(&sql).await } /// Drops a column from an existing ClickHouse table (idempotent). pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> { - let sql = format!("ALTER TABLE \"{table_name}\" DROP COLUMN IF EXISTS \"{column_name}\""); + let sql = build_drop_column_sql(table_name, column_name); self.execute_ddl(&sql).await } @@ -113,25 +152,19 @@ impl ClickHouseClient { old_name: &str, new_name: &str, ) -> EtlResult<()> { - let sql = format!( - "ALTER TABLE \"{table_name}\" RENAME COLUMN IF EXISTS \"{old_name}\" TO \"{new_name}\"" - ); + let sql = build_rename_column_sql(table_name, old_name, new_name); self.execute_ddl(&sql).await } - /// Executes `TRUNCATE TABLE IF EXISTS ""`. + /// Executes `TRUNCATE TABLE IF EXISTS` for the supplied table. pub(crate) async fn truncate_table(&self, table_name: &str) -> EtlResult<()> { - self.inner - .query(&format!("TRUNCATE TABLE IF EXISTS \"{table_name}\"")) - .execute() - .await - .map_err(|e| { - etl_error!( - ErrorKind::Unknown, - "ClickHouse truncate failed", - format!("Failed to truncate table '{table_name}': {e}") - ) - }) + self.inner.query(&build_truncate_table_sql(table_name)).execute().await.map_err(|e| { + etl_error!( + ErrorKind::Unknown, + "ClickHouse truncate failed", + format!("Failed to truncate table '{table_name}': {e}") + ) + }) } /// Inserts `rows` into `table_name` using the RowBinary format. 
@@ -156,7 +189,7 @@ impl ClickHouseClient { max_bytes_per_insert: u64, source: &'static str, ) -> EtlResult<()> { - let sql = format!("INSERT INTO \"{table_name}\" FORMAT RowBinary"); + let sql = build_insert_rows_sql(table_name); let mut insert = self.inner.insert_formatted_with(sql.clone()).buffered_with_capacity(BUFFERED_CAPACITY); @@ -212,3 +245,65 @@ impl ClickHouseClient { Ok(()) } } + +#[cfg(test)] +mod tests { + use etl::types::{ColumnSchema, Type}; + + use super::*; + + fn column_schema(name: &str) -> ColumnSchema { + ColumnSchema { + name: name.to_string(), + typ: Type::INT4, + modifier: -1, + ordinal_position: 1, + primary_key_ordinal_position: Some(1), + nullable: false, + } + } + + #[test] + fn add_column_sql_quotes_identifiers() { + let column = column_schema("new\"column"); + let sql = build_add_column_sql("table\"name", &column, "old\"column"); + + assert_eq!( + sql, + "ALTER TABLE \"table\"\"name\" ADD COLUMN IF NOT EXISTS \"new\"\"column\" \ + Nullable(Int32) AFTER \"old\"\"column\"" + ); + } + + #[test] + fn drop_column_sql_quotes_identifiers() { + let sql = build_drop_column_sql("table\"name", "old\"column"); + + assert_eq!(sql, "ALTER TABLE \"table\"\"name\" DROP COLUMN IF EXISTS \"old\"\"column\""); + } + + #[test] + fn rename_column_sql_quotes_identifiers() { + let sql = build_rename_column_sql("table\"name", "old\"column", "new\"column"); + + assert_eq!( + sql, + "ALTER TABLE \"table\"\"name\" RENAME COLUMN IF EXISTS \"old\"\"column\" TO \ + \"new\"\"column\"" + ); + } + + #[test] + fn truncate_table_sql_quotes_identifiers() { + let sql = build_truncate_table_sql("table\"name"); + + assert_eq!(sql, "TRUNCATE TABLE IF EXISTS \"table\"\"name\""); + } + + #[test] + fn insert_rows_sql_quotes_identifiers() { + let sql = build_insert_rows_sql("table\"name"); + + assert_eq!(sql, "INSERT INTO \"table\"\"name\" FORMAT RowBinary"); + } +} diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index c9c66e166..658ef8bd5 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -54,6 +54,11 @@ fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { } } +/// Quotes a ClickHouse identifier, escaping embedded double quotes. +pub(crate) fn quote_identifier(identifier: &str) -> String { + format!("\"{}\"", identifier.replace('"', "\"\"")) +} + /// Converts a Postgres `public.my_table` style table name into a ClickHouse /// table name using the same double-underscore escaping convention used by /// DuckLake/Iceberg. 
@@ -94,17 +99,18 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) for col in column_schemas { let col_type = clickhouse_column_type(col, false); - cols.push(format!(" \"{}\" {}", col.name, col_type)); + cols.push(format!(" {} {}", quote_identifier(&col.name), col_type)); } // CDC columns — always non-nullable - cols.push(" \"cdc_operation\" String".to_string()); - cols.push(" \"cdc_lsn\" Int64".to_string()); + cols.push(format!(" {} String", quote_identifier("cdc_operation"))); + cols.push(format!(" {} Int64", quote_identifier("cdc_lsn"))); let col_defs = cols.join(",\n"); + let quoted_table_name = quote_identifier(table_name); format!( - "CREATE TABLE IF NOT EXISTS \"{table_name}\" (\n{col_defs}\n) ENGINE = MergeTree()\nORDER \ - BY tuple()" + "CREATE TABLE IF NOT EXISTS {quoted_table_name} (\n{col_defs}\n) ENGINE = \ + MergeTree()\nORDER BY tuple()" ) } @@ -112,6 +118,12 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) mod tests { use super::*; + #[test] + fn quote_identifier_escapes_embedded_quotes() { + assert_eq!(quote_identifier("plain"), "\"plain\""); + assert_eq!(quote_identifier("has\"quote"), "\"has\"\"quote\""); + } + #[test] fn table_name_escaping() { assert_eq!(table_name_to_clickhouse_table_name("public", "orders"), "public_orders"); @@ -125,6 +137,29 @@ mod tests { ); } + #[test] + fn build_create_table_sql_quotes_identifiers() { + let schemas = vec![ColumnSchema { + name: "id\"value".to_string(), + typ: Type::INT4, + modifier: -1, + ordinal_position: 1, + primary_key_ordinal_position: Some(1), + nullable: false, + }]; + let table_name = table_name_to_clickhouse_table_name("sche\"ma", "ta\"ble"); + let sql = build_create_table_sql(&table_name, &schemas); + + assert!( + sql.contains("CREATE TABLE IF NOT EXISTS \"sche\"\"ma_ta\"\"ble\""), + "schema-derived table name should be quoted and escaped: {sql}" + ); + assert!( + sql.contains("\"id\"\"value\" Int32"), + "column name should be quoted and escaped: {sql}" + ); + } + #[test] fn scalar_type_mapping() { assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::BOOL), "Boolean"); From c76b1c6226c7b842365c141e6e40f0651e9aad4d Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 14:00:24 +0900 Subject: [PATCH 51/86] Validate ClickHouse RowBinary row width The RowBinary encoder zipped values with nullable flags, which silently dropped extra values or flags when an internal schema/cache mismatch occurred. That could mask the real bug and emit malformed positional RowBinary. Check row width before encoding and return a ConversionError that includes both lengths. Add tests for too few and too many values. 
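To see why the old zip-based loop could hide the bug, here is a standalone illustration (hypothetical, not code from this patch): Rust's `zip` stops at the shorter iterator, so a missing flag silently drops a value instead of failing.

    fn main() {
        let values = vec![10i32, 20, 30];
        let nullable_flags = [false, false]; // one flag short of the row width

        // zip truncates to the shorter side: only two pairs come out, and the
        // third value vanishes without any error -- the failure mode the new
        // up-front width check turns into an explicit ConversionError.
        let pairs: Vec<(i32, bool)> =
            values.into_iter().zip(nullable_flags.iter().copied()).collect();

        assert_eq!(pairs, vec![(10, false), (20, false)]);
    }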
--- etl-destinations/src/clickhouse/encoding.rs | 42 +++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index 1e9388687..c6a50c06f 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -271,6 +271,18 @@ pub(crate) fn rb_encode_row( nullable_flags: &[bool], buf: &mut Vec<u8>, ) -> EtlResult<()> { + if values.len() != nullable_flags.len() { + return Err(etl_error!( + ErrorKind::ConversionError, + "ClickHouse RowBinary row width mismatch", + format!( + "values length {} does not match nullable flags length {}", + values.len(), + nullable_flags.len() + ) + )); + } + for (val, &is_nullable) in values.into_iter().zip(nullable_flags.iter()) { if is_nullable { rb_encode_nullable(val, buf)?; @@ -453,4 +465,34 @@ mod tests { assert_eq!(err.kind(), ErrorKind::ConversionError); assert!(buf.is_empty(), "no bytes should be written on error"); } + + #[test] + fn rb_encode_row_rejects_fewer_values_than_nullable_flags() { + let mut buf = vec![0xaa]; + let result = rb_encode_row(vec![ClickHouseValue::Int32(1)], &[false, false], &mut buf); + + assert!(result.is_err(), "row width mismatch must error"); + let err = result.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::ConversionError); + assert_eq!(err.description(), Some("ClickHouse RowBinary row width mismatch")); + assert_eq!(err.detail(), Some("values length 1 does not match nullable flags length 2")); + assert_eq!(buf, vec![0xaa], "no bytes should be written on error"); + } + + #[test] + fn rb_encode_row_rejects_more_values_than_nullable_flags() { + let mut buf = vec![0xaa]; + let result = rb_encode_row( + vec![ClickHouseValue::Int32(1), ClickHouseValue::Int32(2)], + &[false], + &mut buf, + ); + + assert!(result.is_err(), "row width mismatch must error"); + let err = result.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::ConversionError); + assert_eq!(err.description(), Some("ClickHouse RowBinary row width mismatch")); + assert_eq!(err.detail(), Some("values length 2 does not match nullable flags length 1")); + assert_eq!(buf, vec![0xaa], "no bytes should be written on error"); + } } From 2ca3aaf638c76a5fbd354f2bba3ef542bf82d22f Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 14:07:53 +0900 Subject: [PATCH 52/86] Store ClickHouse CDC LSN as UInt64 Postgres LSNs are u64 values, but ClickHouse cdc_lsn was stored as Int64. Values outside the signed range were logged and collapsed to i64::MAX, corrupting CDC metadata and losing ordering information. Change generated ClickHouse DDL to use UInt64, encode CDC LSNs as unsigned RowBinary values, remove the overflow fallback, and update tests to read cdc_lsn as u64. --- etl-destinations/src/clickhouse/core.rs | 31 ++++++++++++------- etl-destinations/src/clickhouse/encoding.rs | 7 +++++ etl-destinations/src/clickhouse/schema.rs | 6 ++-- etl-destinations/tests/clickhouse_pipeline.rs | 4 +-- 4 files changed, 31 insertions(+), 17 deletions(-) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 1432881e6..6a543dd1f 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -53,6 +53,11 @@ struct PendingRow { cells: Vec<Cell>, } +/// Converts a Postgres LSN into the ClickHouse CDC LSN value. 
+fn cdc_lsn_to_clickhouse_value(lsn: PgLsn) -> ClickHouseValue { + ClickHouseValue::UInt64(u64::from(lsn)) +} + // -- Inserter configuration -- /// Controls intermediate flushing inside a single `write_table_rows` / @@ -279,7 +284,7 @@ where let mut values: Vec<ClickHouseValue> = table_row.into_values().into_iter().map(cell_to_clickhouse_value).collect(); values.push(ClickHouseValue::String(String::from("INSERT"))); - values.push(ClickHouseValue::Int64(0)); + values.push(ClickHouseValue::UInt64(0)); values }) .collect(); @@ -560,17 +565,7 @@ where let mut values: Vec<ClickHouseValue> = cells.into_iter().map(cell_to_clickhouse_value).collect(); values.push(ClickHouseValue::String(operation.to_string())); - values.push(ClickHouseValue::Int64( - i64::try_from(u64::from(lsn)) - .inspect_err(|error| { - tracing::error!( - ?error, - "cannot convert u64 LSN to i64, falling back to \ - i64::MAX" - ); - }) - .unwrap_or(i64::MAX), - )); + values.push(cdc_lsn_to_clickhouse_value(lsn)); values }) .collect(); @@ -734,6 +729,18 @@ where #[cfg(test)] mod tests { + use super::*; + + #[test] + fn cdc_lsn_value_preserves_full_u64_range() { + let value = cdc_lsn_to_clickhouse_value(PgLsn::from(u64::MAX)); + + match value { + ClickHouseValue::UInt64(lsn) => assert_eq!(lsn, u64::MAX), + _ => panic!("expected UInt64 CDC LSN value"), + } + } + #[test] fn nullable_flags_includes_cdc() { let mut all_flags: Vec<bool> = vec![true, false]; diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs index c6a50c06f..b75a81b36 100644 --- a/etl-destinations/src/clickhouse/encoding.rs +++ b/etl-destinations/src/clickhouse/encoding.rs @@ -40,6 +40,8 @@ pub(crate) enum ClickHouseValue { Int32(i32), Int64(i64), UInt32(u32), + /// Unsigned 64-bit integer, used for CDC LSN metadata. + UInt64(u64), Float32(f32), Float64(f64), /// TEXT, NUMERIC (string), TIME (string), JSON, BYTEA (hex-encoded) @@ -228,6 +230,7 @@ pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlRes ClickHouseValue::Int32(v) => buf.extend_from_slice(&v.to_le_bytes()), ClickHouseValue::Int64(v) => buf.extend_from_slice(&v.to_le_bytes()), ClickHouseValue::UInt32(v) => buf.extend_from_slice(&v.to_le_bytes()), + ClickHouseValue::UInt64(v) => buf.extend_from_slice(&v.to_le_bytes()), ClickHouseValue::Float32(v) => buf.extend_from_slice(&v.to_le_bytes()), ClickHouseValue::Float64(v) => buf.extend_from_slice(&v.to_le_bytes()), ClickHouseValue::String(s) => { @@ -381,6 +384,10 @@ mod tests { rb_encode_value(ClickHouseValue::Int32(-1), &mut buf).unwrap(); assert_eq!(buf, (-1i32).to_le_bytes()); + buf.clear(); + rb_encode_value(ClickHouseValue::UInt64(u64::MAX), &mut buf).unwrap(); + assert_eq!(buf, u64::MAX.to_le_bytes()); + buf.clear(); rb_encode_value(ClickHouseValue::String("hi".to_string()), &mut buf).unwrap(); assert_eq!(buf, [2, b'h', b'i']); // varint(2) + bytes diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index 658ef8bd5..f8debd410 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -92,7 +92,7 @@ pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> Strin /// Generates a `CREATE TABLE IF NOT EXISTS` DDL for the given columns. /// -/// Appends `cdc_operation String` and `cdc_lsn Int64` as trailing non-nullable +/// Appends `cdc_operation String` and `cdc_lsn UInt64` as trailing non-nullable /// columns. Uses `MergeTree()` with `ORDER BY tuple()`. 
pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String { let mut cols = Vec::with_capacity(column_schemas.len() + 2); @@ -104,7 +104,7 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) // CDC columns — always non-nullable cols.push(format!(" {} String", quote_identifier("cdc_operation"))); - cols.push(format!(" {} Int64", quote_identifier("cdc_lsn"))); + cols.push(format!(" {} UInt64", quote_identifier("cdc_lsn"))); let col_defs = cols.join(",\n"); let quoted_table_name = quote_identifier(table_name); @@ -236,7 +236,7 @@ mod tests { }]; let sql = build_create_table_sql("public_t", &schemas); assert!(sql.contains("\"cdc_operation\" String"), "cdc_operation should be non-nullable"); - assert!(sql.contains("\"cdc_lsn\" Int64"), "cdc_lsn should be non-nullable Int64"); + assert!(sql.contains("\"cdc_lsn\" UInt64"), "cdc_lsn should be non-nullable UInt64"); assert!(sql.contains("ENGINE = MergeTree()")); assert!(sql.contains("ORDER BY tuple()")); } diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 4dc5c8be4..7c812192c 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -66,7 +66,7 @@ struct UpdateFlowRow { id: i64, value: String, cdc_operation: String, - cdc_lsn: i64, + cdc_lsn: u64, } /// SELECT query used to verify the `update_flow` streaming test. @@ -1453,7 +1453,7 @@ struct DefaultIdentityDeleteRow { int_array_col: Vec>, text_array_col: Vec>, cdc_operation: String, - cdc_lsn: i64, + cdc_lsn: u64, } const DEFAULT_IDENTITY_DELETE_SELECT: &str = concat!( From a60dd1c808b6d461f1c1ee38a6f7cdc5961c1a63 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 14:37:55 +0900 Subject: [PATCH 53/86] Derive ClickHouse nullable flags from destination schema ALTER TABLE ADD COLUMN forces new ClickHouse scalar columns to Nullable(T), even when the upstream Postgres column is NOT NULL. Recomputing RowBinary nullable flags from the Postgres schema could then omit the nullable marker byte and corrupt positional RowBinary encoding. Query system.columns on cache miss, validate the actual destination column order against the replicated schema, and derive nullable flags from the ClickHouse type strings. Strengthen the schema-change integration test with a NOT NULL DEFAULT column that fails before this fix. --- etl-destinations/src/clickhouse/client.rs | 31 ++++ etl-destinations/src/clickhouse/core.rs | 145 ++++++++++++++++-- etl-destinations/src/clickhouse/schema.rs | 9 +- etl-destinations/src/clickhouse/test_utils.rs | 21 ++- etl-destinations/tests/clickhouse_pipeline.rs | 49 ++++-- 5 files changed, 219 insertions(+), 36 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 476233c56..15bda78f6 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -20,6 +20,15 @@ use crate::clickhouse::{ /// when `end()` is called or the `max_bytes_per_insert` limit is reached). const BUFFERED_CAPACITY: usize = 256 * 1024; +/// A ClickHouse table column returned from `system.columns`. +#[derive(Debug, Clone, PartialEq, Eq, clickhouse::Row, serde::Deserialize)] +pub(crate) struct ClickHouseTableColumn { + /// Column name. + pub(crate) name: String, + /// ClickHouse type string, for example `Int32` or `Nullable(String)`. 
+ pub(crate) type_name: String, +} + /// Builds the SQL used to add a column to a ClickHouse table. fn build_add_column_sql( table_name: &str, @@ -116,6 +125,28 @@ impl ClickHouseClient { }) } + /// Returns ClickHouse columns for a table in position order. + pub(crate) async fn table_columns( + &self, + table_name: &str, + ) -> EtlResult<Vec<ClickHouseTableColumn>> { + self.inner + .query( + "SELECT name, type AS type_name FROM system.columns WHERE database = \ currentDatabase() AND table = ? ORDER BY position", + ) + .bind(table_name) + .fetch_all::<ClickHouseTableColumn>() + .await + .map_err(|e| { + etl_error!( + ErrorKind::Unknown, + "ClickHouse schema query failed", + format!("Failed to query columns for table '{table_name}': {e}") + ) + }) + } + /// Adds a column to an existing ClickHouse table. /// /// New columns are always Nullable since ClickHouse cannot backfill diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 6a543dd1f..f1929c8ba 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -21,10 +21,13 @@ use tracing::{debug, info, warn}; use url::Url; use crate::clickhouse::{ - client::ClickHouseClient, + client::{ClickHouseClient, ClickHouseTableColumn}, encoding::{ClickHouseValue, cell_to_clickhouse_value}, metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}, - schema::{build_create_table_sql, table_name_to_clickhouse_table_name}, + schema::{ + CDC_LSN_COLUMN_NAME, CDC_OPERATION_COLUMN_NAME, build_create_table_sql, + table_name_to_clickhouse_table_name, + }, }; // -- CDC operation type -- @@ -58,6 +61,63 @@ fn cdc_lsn_to_clickhouse_value(lsn: PgLsn) -> ClickHouseValue { ClickHouseValue::UInt64(u64::from(lsn)) } +/// Returns true if the ClickHouse type has an outer Nullable wrapper. +fn clickhouse_type_expects_nullable_marker(type_name: &str) -> bool { + type_name.starts_with("Nullable(") +} + +/// Returns expected ClickHouse column names for a replicated schema. +fn expected_clickhouse_column_names(schema: &ReplicatedTableSchema) -> Vec<String> { + let mut names: Vec<String> = + schema.column_schemas().map(|column| column.name.clone()).collect(); + names.push(CDC_OPERATION_COLUMN_NAME.to_string()); + names.push(CDC_LSN_COLUMN_NAME.to_string()); + names +} + +/// Derives RowBinary nullable flags from the actual ClickHouse table schema. 
+fn nullable_flags_from_clickhouse_columns( + ch_table_name: &str, + expected_column_names: &[String], + actual_columns: &[ClickHouseTableColumn], +) -> EtlResult<Arc<[bool]>> { + if actual_columns.len() != expected_column_names.len() { + return Err(etl_error!( + ErrorKind::CorruptedTableSchema, + "ClickHouse table schema does not match replicated schema", + format!( + "table '{}' has {} columns, but {} were expected", + ch_table_name, + actual_columns.len(), + expected_column_names.len() + ) + )); + } + + let mut nullable_flags = Vec::with_capacity(actual_columns.len()); + for (index, (actual_column, expected_name)) in + actual_columns.iter().zip(expected_column_names).enumerate() + { + if actual_column.name != *expected_name { + return Err(etl_error!( + ErrorKind::CorruptedTableSchema, + "ClickHouse table schema does not match replicated schema", + format!( + "table '{}' column {} is named '{}', but '{}' was expected", + ch_table_name, + index + 1, + actual_column.name, + expected_name + ) + )); + } + + nullable_flags.push(clickhouse_type_expects_nullable_marker(&actual_column.type_name)); + } + + Ok(nullable_flags.into()) +} + // -- Inserter configuration -- /// Controls intermediate flushing inside a single `write_table_rows` / @@ -242,18 +302,17 @@ where } } - // Compute nullable flags (user columns + 2 CDC columns always non-nullable). - // - // Array columns are NEVER marked nullable here, even if the Postgres column - // is nullable. The DDL always emits `Array(Nullable(T))` (no outer `Nullable` - // wrapper), so ClickHouse does not expect a null-indicator byte before the - // array. - let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); - let mut nullable_flags_vec: Vec<bool> = - column_schemas.iter().map(|c| c.nullable && !is_array_type(&c.typ)).collect(); - nullable_flags_vec.push(false); // cdc_operation - nullable_flags_vec.push(false); // cdc_lsn - let nullable_flags: Arc<[bool]> = nullable_flags_vec.into(); + // Compute nullable flags from the actual ClickHouse schema. This matters after + // `ALTER TABLE ADD COLUMN`: ClickHouse scalar columns are forced to + // `Nullable(T)` even when the Postgres column is `NOT NULL`, so RowBinary must + // include the nullable marker byte ClickHouse expects. + let actual_columns = self.client.table_columns(&ch_table_name).await?; + let expected_column_names = expected_clickhouse_column_names(schema); + let nullable_flags = nullable_flags_from_clickhouse_columns( + &ch_table_name, + &expected_column_names, + &actual_columns, + )?; // Write-lock: insert, using or_insert to handle concurrent first-writer race. 
let stored_flags = { @@ -731,6 +790,10 @@ mod tests { use super::*; + fn clickhouse_column(name: &str, type_name: &str) -> ClickHouseTableColumn { + ClickHouseTableColumn { name: name.to_string(), type_name: type_name.to_string() } + } + #[test] fn cdc_lsn_value_preserves_full_u64_range() { let value = cdc_lsn_to_clickhouse_value(PgLsn::from(u64::MAX)); @@ -741,6 +804,60 @@ } } + #[test] + fn nullable_flags_use_clickhouse_destination_nullability() { + let expected_names = vec![ + "id".to_string(), + "score".to_string(), + "tags".to_string(), + CDC_OPERATION_COLUMN_NAME.to_string(), + CDC_LSN_COLUMN_NAME.to_string(), + ]; + let actual_columns = vec![ + clickhouse_column("id", "Int64"), + clickhouse_column("score", "Nullable(Int32)"), + clickhouse_column("tags", "Array(Nullable(String))"), + clickhouse_column(CDC_OPERATION_COLUMN_NAME, "String"), + clickhouse_column(CDC_LSN_COLUMN_NAME, "UInt64"), + ]; + + let flags = + nullable_flags_from_clickhouse_columns("test_table", &expected_names, &actual_columns) + .unwrap(); + + assert_eq!(flags.as_ref(), [false, true, false, false, false]); + } + + #[test] + fn nullable_flags_reject_clickhouse_column_count_mismatch() { + let expected_names = vec!["id".to_string(), CDC_OPERATION_COLUMN_NAME.to_string()]; + let actual_columns = vec![clickhouse_column("id", "Int64")]; + + let err = + nullable_flags_from_clickhouse_columns("test_table", &expected_names, &actual_columns) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::CorruptedTableSchema); + assert_eq!(err.detail(), Some("table 'test_table' has 1 columns, but 2 were expected")); + } + + #[test] + fn nullable_flags_reject_clickhouse_column_order_mismatch() { + let expected_names = vec!["id".to_string(), "name".to_string()]; + let actual_columns = + vec![clickhouse_column("name", "String"), clickhouse_column("id", "Int64")]; + + let err = + nullable_flags_from_clickhouse_columns("test_table", &expected_names, &actual_columns) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::CorruptedTableSchema); + assert_eq!( + err.detail(), + Some("table 'test_table' column 1 is named 'name', but 'id' was expected") + ); + } + #[test] fn nullable_flags_includes_cdc() { let mut all_flags: Vec<bool> = vec![true, false]; diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index f8debd410..12de37943 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -1,5 +1,10 @@ use etl::types::{ColumnSchema, Type, is_array_type}; +/// Name of the CDC operation metadata column appended to ClickHouse tables. +pub(crate) const CDC_OPERATION_COLUMN_NAME: &str = "cdc_operation"; +/// Name of the CDC LSN metadata column appended to ClickHouse tables. +pub(crate) const CDC_LSN_COLUMN_NAME: &str = "cdc_lsn"; + /// Returns the base ClickHouse type string for a Postgres scalar type. 
/// /// The returned string does not include `Nullable(...)` wrapping — callers are @@ -103,8 +108,8 @@ pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) } // CDC columns — always non-nullable - cols.push(format!(" {} String", quote_identifier("cdc_operation"))); - cols.push(format!(" {} UInt64", quote_identifier("cdc_lsn"))); + cols.push(format!(" {} String", quote_identifier(CDC_OPERATION_COLUMN_NAME))); + cols.push(format!(" {} UInt64", quote_identifier(CDC_LSN_COLUMN_NAME))); let col_defs = cols.join(",\n"); let quoted_table_name = quote_identifier(table_name); diff --git a/etl-destinations/src/clickhouse/test_utils.rs b/etl-destinations/src/clickhouse/test_utils.rs index a15d8c1e6..cfb54a8af 100644 --- a/etl-destinations/src/clickhouse/test_utils.rs +++ b/etl-destinations/src/clickhouse/test_utils.rs @@ -165,22 +165,29 @@ impl ClickHouseTestDatabase { /// Returns the column names of a ClickHouse table in position order, /// excluding the CDC columns (`cdc_operation`, `cdc_lsn`). pub async fn column_names(&self, table_name: &str) -> Vec<String> { + self.column_types(table_name).await.into_iter().map(|(name, _)| name).collect() + } + + /// Returns the column names and ClickHouse type strings in position order, + /// excluding the CDC columns (`cdc_operation`, `cdc_lsn`). + pub async fn column_types(&self, table_name: &str) -> Vec<(String, String)> { #[derive(clickhouse::Row, serde::Deserialize)] struct Col { name: String, + type_name: String, } - let sql = format!( - "SELECT name FROM system.columns WHERE database = '{}' AND table = '{}' AND name NOT \ - IN ('cdc_operation', 'cdc_lsn') ORDER BY position", - self.database, table_name - ); self.db_client - .query(&sql) + .query( + "SELECT name, type AS type_name FROM system.columns WHERE database = ? AND table \ = ? AND name NOT IN ('cdc_operation', 'cdc_lsn') ORDER BY position", + ) + .bind(&self.database) + .bind(table_name) .fetch_all::<Col>() .await .expect("failed to query system.columns") .into_iter() - .map(|c| c.name) + .map(|c| (c.name, c.type_name)) .collect() } } diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index 7c812192c..c461c377c 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1818,13 +1818,14 @@ async fn ping_fails_against_unreachable_clickhouse() { } /// Row struct for the ADD COLUMN test after schema change. -/// Columns: id, name, age, email (email is Nullable after ALTER TABLE ADD). +/// Columns: id, name, age, email, score. #[derive(clickhouse::Row, serde::Deserialize, Debug)] struct AddColumnRow { id: i64, name: String, age: i32, email: Option<String>, + score: Option<i32>, cdc_operation: String, } @@ -1838,14 +1839,16 @@ struct AddColumnRow { /// /// # WHEN /// -/// A new column `email text` is added in Postgres, and a row ('Bob', 30, -/// 'bob@example.com') is inserted with the new schema. +/// A nullable `email text` column and a `score integer NOT NULL DEFAULT 0` +/// column are added in Postgres, and a row ('Bob', 30, 'bob@example.com', 7) +/// is inserted with the new schema. /// /// # THEN /// -/// The ClickHouse table has an `email` column. Alice's row has NULL for email. -/// Bob's row has 'bob@example.com'. The destination metadata snapshot_id has -/// increased. +/// The ClickHouse table has `email` and `score` columns. Alice's row has NULL +/// for both added columns because ClickHouse does not backfill historical CDC +/// rows. Bob's row has 'bob@example.com' and 7. 
The destination metadata +/// snapshot_id has increased. #[tokio::test(flavor = "multi_thread")] async fn schema_change_add_column() { init_test_tracing(); @@ -1914,14 +1917,20 @@ async fn schema_change_add_column() { database .alter_table( table_name.clone(), - &[TableModification::AddColumn { name: "email", data_type: "text" }], + &[ + TableModification::AddColumn { name: "email", data_type: "text" }, + TableModification::AddColumn { + name: "score", + data_type: "integer not null default 0", + }, + ], ) .await .unwrap(); database .run_sql(&format!( - "INSERT INTO {} (name, age, email) VALUES ('Bob', 30, 'bob@example.com')", + "INSERT INTO {} (name, age, email, score) VALUES ('Bob', 30, 'bob@example.com', 7)", table_name.as_quoted_identifier(), )) .await @@ -1930,7 +1939,7 @@ async fn schema_change_add_column() { // Poll until Bob's row arrives (2 rows total = Alice from copy + Bob from // streaming). let select = concat!( - "SELECT id, name, age, email, cdc_operation ", + "SELECT id, name, age, email, score, cdc_operation ", "FROM \"test_schema__add__col\" ", "ORDER BY id", ); @@ -1954,24 +1963,38 @@ async fn schema_change_add_column() { pipeline.shutdown_and_wait().await.unwrap(); - // --- THEN: ClickHouse has the new column, both rows present --- + // --- THEN: ClickHouse has the new columns, both rows present --- let final_columns = ch_db.column_names(ch_table_name).await; - assert_eq!(final_columns, vec!["id", "name", "age", "email"]); + assert_eq!(final_columns, vec!["id", "name", "age", "email", "score"]); + + let final_column_types = ch_db.column_types(ch_table_name).await; + assert_eq!( + final_column_types, + vec![ + ("id".to_string(), "Int64".to_string()), + ("name".to_string(), "String".to_string()), + ("age".to_string(), "Int32".to_string()), + ("email".to_string(), "Nullable(String)".to_string()), + ("score".to_string(), "Nullable(Int32)".to_string()), + ] + ); assert_eq!(rows.len(), 2, "expected Alice + Bob"); - // Alice: pre-change row, email should be NULL. + // Alice: pre-change row, added columns should be NULL. assert_eq!(rows[0].id, 1); assert_eq!(rows[0].name, "Alice"); assert_eq!(rows[0].age, 25); assert_eq!(rows[0].email, None, "Alice's email should be NULL (column added after her row)"); + assert_eq!(rows[0].score, None, "Alice's score should be NULL (column added after her row)"); assert_eq!(rows[0].cdc_operation, "INSERT"); - // Bob: post-change row, email present. + // Bob: post-change row, added columns present. assert_eq!(rows[1].id, 2); assert_eq!(rows[1].name, "Bob"); assert_eq!(rows[1].age, 30); assert_eq!(rows[1].email, Some("bob@example.com".to_string())); + assert_eq!(rows[1].score, Some(7)); assert_eq!(rows[1].cdc_operation, "INSERT"); // Metadata snapshot_id should have advanced. From 38316daa1d2455036ea2e8fd09a73fb9bc6f2a0e Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 27 Apr 2026 14:46:29 +0900 Subject: [PATCH 54/86] Fix ClickHouse test script target The script was added when tests/clickhouse_pipeline.rs was auto-discovered as its own Cargo integration test target. Later, integration tests were consolidated under tests/main.rs with autotests disabled, making the Cargo target name main and leaving clickhouse_pipeline as a module/test filter. Default TEST_TARGET to main and default TEST_NAME_FILTER to clickhouse_pipeline so the helper runs the ClickHouse tests against the current test layout. 
---
 scripts/test-clickhouse.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/test-clickhouse.sh b/scripts/test-clickhouse.sh
index 49fb63c98..669c7e5ac 100755
--- a/scripts/test-clickhouse.sh
+++ b/scripts/test-clickhouse.sh
@@ -12,8 +12,11 @@ CLICKHOUSE_HTTP_PORT="${CLICKHOUSE_HTTP_PORT:-8123}"
 CLICKHOUSE_USER="${CLICKHOUSE_USER:-etl}"
 CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-etl}"
 CARGO_TOOLCHAIN="${CARGO_TOOLCHAIN:-}"
-TEST_TARGET="${TEST_TARGET:-clickhouse_pipeline}"
-TEST_NAME_FILTER="${TEST_NAME_FILTER:-}"
+# Cargo integration test target name. ClickHouse tests live in tests/main.rs under
+# the clickhouse_pipeline module; use TEST_NAME_FILTER to select that module or a
+# specific test. Set TEST_NAME_FILTER='' to run all tests in the target.
+TEST_TARGET="${TEST_TARGET:-main}"
+TEST_NAME_FILTER="${TEST_NAME_FILTER-clickhouse_pipeline}"
 CARGO_PACKAGE="${CARGO_PACKAGE:-etl-destinations}"
 FEATURES="${FEATURES:-clickhouse,test-utils}"

From 6f1556b7ee4072774884530eef3f183bb501ff32 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 28 Apr 2026 14:21:34 +0900
Subject: [PATCH 55/86] Drop ClickHouse section banners and tighten doccomments

---
 etl-destinations/src/clickhouse/client.rs   |  4 +-
 etl-destinations/src/clickhouse/core.rs     | 68 ++++++++++-----------
 etl-destinations/src/clickhouse/encoding.rs | 39 +++---------
 3 files changed, 42 insertions(+), 69 deletions(-)

diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index 15bda78f6..c5b107e4e 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -76,9 +76,7 @@ fn build_insert_rows_sql(table_name: &str) -> String {
 /// High-level ClickHouse client used by [`super::core::ClickHouseDestination`].
 ///
 /// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL,
-/// truncation, and RowBinary bulk inserts. Cheaply cloneable — the inner client
-/// holds an `Arc` internally, and the outer `Arc` here ensures a single shared
-/// instance.
+/// truncation, and RowBinary bulk inserts.
 #[derive(Clone)]
 pub struct ClickHouseClient {
     inner: Arc<clickhouse::Client>,
 }
diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index f1929c8ba..d6acde29e 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -30,8 +30,7 @@ use crate::clickhouse::{
     },
 };
 
-// -- CDC operation type --
-
+/// Postgres CDC operation kind, written to the `cdc_operation` column.
 #[derive(Copy, Clone)]
 enum CdcOperation {
     Insert,
     Update,
     Delete,
 }
@@ -51,8 +50,14 @@ impl std::fmt::Display for CdcOperation {
 /// A row pending insertion with its CDC metadata.
 struct PendingRow {
+    /// CDC op kind, written into the `cdc_operation` column.
     operation: CdcOperation,
+    /// Commit LSN of the source transaction, written into the `cdc_lsn`
+    /// column.
     lsn: PgLsn,
+    /// User column values in source schema order. The two CDC columns
+    /// (`cdc_operation`, `cdc_lsn`) are appended at encode time and are
+    /// not present here.
     cells: Vec<Cell>,
 }
@@ -118,8 +123,6 @@ fn nullable_flags_from_clickhouse_columns(
     Ok(nullable_flags.into())
 }
 
-// -- Inserter configuration --
-
 /// Controls intermediate flushing inside a single `write_table_rows` /
 /// `write_events` call.
 ///
@@ -135,30 +138,28 @@ pub struct ClickHouseInserterConfig {
     pub max_bytes_per_insert: u64,
 }
 
-// -- Destination struct --
-
 /// CDC-capable ClickHouse destination that replicates Postgres tables.
///
/// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`,
/// `cdc_lsn`) appended to each row. Rows are encoded as RowBinary and sent via
/// `INSERT INTO "table" FORMAT RowBinary` -- no column-name header required.
-///
-/// The struct is cheaply cloneable: `client` wraps an `Arc` internally, and
-/// `table_cache` is wrapped in `Arc<RwLock<...>>`.
 #[derive(Clone)]
 pub struct ClickHouseDestination<S> {
+    /// HTTP client used for all DDL and RowBinary INSERT traffic.
     client: ClickHouseClient,
+    /// Per-INSERT byte budget; gates intermediate flushes within a single
+    /// `write_table_rows` / `write_events` call.
     inserter_config: Arc<ClickHouseInserterConfig>,
+    /// Schema/state store used to persist destination table metadata
+    /// (Applying / Applied) and to look up replicated schemas.
     store: Arc<S>,
-    /// Cache: ClickHouse table name -> `Arc<[bool]>` (nullable flags per
-    /// column, including the two trailing CDC columns which are always
-    /// `false`).
     ///
-    /// `std::sync::RwLock` is appropriate here: both reads (hot path) and
-    /// writes (rare, only on first encounter of a new table) are brief
-    /// in-memory operations. The lock is always released before any
-    /// `.await` point (DDL is executed with no lock held), so the async
-    /// `tokio::sync::RwLock` would be unnecessary overhead.
+    /// ClickHouse table name -> per-column nullable flags (in column order,
+    /// including the two trailing CDC columns which are always `false`).
    ///
+    /// Populated lazily on first encounter of a table and consulted on the
+    /// hot insert path. `std::sync::RwLock` is sufficient: every critical
+    /// section is a brief in-memory map op with no `.await` inside, so the
+    /// async `tokio::sync::RwLock` would be needless overhead.
     table_cache: Arc<RwLock<HashMap<String, Arc<[bool]>>>>,
 }
@@ -187,7 +188,21 @@ where
         })
     }
 
-    /// Creates a new ClickHouse table with Applying -> DDL -> Applied metadata.
+    /// Creates a ClickHouse table for a never-before-seen `table_id`,
+    /// bracketing the DDL with `DestinationTableMetadata` writes so the
+    /// operation is crash-recoverable.
+    ///
+    /// Sequence:
+    /// 1. Persist `Applying` metadata (so a crash between this write and step 3
+    ///    leaves a marker that lets restart logic detect the interrupted
+    ///    operation).
+    /// 2. Execute `CREATE TABLE IF NOT EXISTS` against ClickHouse.
+    /// 3. Persist `Applied` metadata.
+    ///
+    /// Recovery is handled by `ensure_table_exists`: on restart, an
+    /// `Applying` row signals that the previous run died mid-creation, so
+    /// it re-runs the idempotent DDL and transitions the metadata to
+    /// `Applied` itself.
     async fn create_table_with_metadata(
         &self,
         table_id: TableId,
@@ -359,8 +374,6 @@ where
             .await
     }
 
-    // -- Schema change handling --
-
     /// Handles a schema change event (Relation) by computing the diff and
     /// applying ALTER TABLE statements.
     async fn handle_relation_event(&self, new_schema: &ReplicatedTableSchema) -> EtlResult<()> {
@@ -513,8 +526,6 @@ where
         Ok(())
     }
 
-    // -- Event processing --
-
    /// Processes events in passes driven by an outer loop that runs until the
    /// iterator is exhausted. Each pass:
    /// 1.
Accumulates Insert/Update/Delete rows per table until a Truncate,
@@ -857,17 +868,4 @@ mod tests {
             Some("table 'test_table' column 1 is named 'name', but 'id' was expected")
         );
     }
-
-    #[test]
-    fn nullable_flags_includes_cdc() {
-        let mut all_flags: Vec<bool> = vec![true, false];
-        all_flags.push(false); // cdc_operation
-        all_flags.push(false); // cdc_lsn
-
-        assert_eq!(all_flags.len(), 4);
-        assert!(all_flags[0]);
-        assert!(!all_flags[1]);
-        assert!(!all_flags[2]);
-        assert!(!all_flags[3]);
-    }
 }
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index b75a81b36..bf844baf6 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -7,30 +7,15 @@ use etl::{
     types::{ArrayCell, Cell},
 };
 
-// ── RowBinary encoding
-// ────────────────────────────────────────────────────────
+// RowBinary bytes are written directly via `Client::insert_formatted_with`,
+// bypassing the typed `Inserter` / serde path because:
+// - `Insert::new` panics on empty `COLUMN_NAMES` (via `join_column_names`) even
+//   with validation disabled.
+// - The RowBinary serde serializer wraps `BufMut` with a fresh `&mut` on every
+//   `serialize_some`, telescoping `&mut &mut ... BytesMut` on nullable array
+//   elements and overflowing the compiler recursion limit.
 //
-// We bypass the `Row` / `Inserter` API entirely and write RowBinary bytes
-// directly via `Client::insert_formatted_with("INSERT INTO \"t\" FORMAT
-// RowBinary")`.
-//
-// This avoids two fatal issues with the `Inserter` path:
-//
-// 1. `Insert::new` always calls `join_column_names::().expect(…)`, which
-//    panics when `COLUMN_NAMES = &[]` regardless of whether validation is
-//    enabled.
-//
-// 2. The RowBinary serde serializer wraps its `BufMut` writer in a fresh `&mut`
-//    at every `serialize_some` call, telescoping the type to `&mut &mut …
-//    BytesMut` for nullable array elements and overflowing the compiler's
-//    recursion limit.
-//
-// Direct binary encoding has neither problem: it is a simple recursive function
-// that writes bytes to a `Vec<u8>` with no generics and no type-level
-// recursion.
-
-// ── ClickHouseValue
-// ───────────────────────────────────────────────────────────
+// Direct byte-writing has no generics and no type-level recursion.
 
 /// Owned ClickHouse-compatible value, moved (not cloned) from a [`Cell`].
 pub(crate) enum ClickHouseValue {
@@ -57,8 +42,6 @@ pub(crate) enum ClickHouseValue {
     Array(Vec<ClickHouseValue>),
 }
 
-// ── Cell → ClickHouseValue conversion ────────────────────────────────────────
-
 /// Converts a [`Cell`] to a [`ClickHouseValue`], consuming it (no clone).
 pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue {
     match cell {
@@ -184,9 +167,6 @@ fn bytes_to_hex(bytes: Vec<u8>) -> String {
     s
 }
 
-// ── RowBinary wire encoding
-// ───────────────────────────────────────────────────
-
/// Encodes a variable-length integer (LEB128) for ClickHouse string/array
/// lengths.
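/// (Illustrative worked example: 300 = 0b1_0010_1100; the low seven bits 0x2C
/// go out first with the continuation bit set, giving 0xAC, then the remaining
/// bits 0x02 with the bit clear, so `rb_varint(300, &mut buf)` appends
/// [0xAC, 0x02].)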
pub(crate) fn rb_varint(mut v: usize, buf: &mut Vec<u8>) {
@@ -296,9 +276,6 @@ pub(crate) fn rb_encode_row(
     Ok(())
 }
 
-// ── Unit tests
-// ────────────────────────────────────────────────────────────────
-
 #[cfg(test)]
 mod tests {
     use chrono::NaiveDate;

From b0cd1299f88e7c293c13340e5deae0c5b14d3ec7 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 28 Apr 2026 14:22:51 +0900
Subject: [PATCH 56/86] Rename ClickHouseClient::ping to validate_connectivity

Aligns the connectivity-probe method name with
IcebergClient::validate_connectivity so the etl-api validators treat both
destinations uniformly.

---
 etl-api/src/validation/validators.rs          |  2 +-
 etl-destinations/src/clickhouse/client.rs     |  8 +++++++-
 etl-destinations/tests/clickhouse_pipeline.rs | 12 ++++++------
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/etl-api/src/validation/validators.rs b/etl-api/src/validation/validators.rs
index f178013b0..7ce5e9e35 100644
--- a/etl-api/src/validation/validators.rs
+++ b/etl-api/src/validation/validators.rs
@@ -664,7 +664,7 @@ impl Validator for ClickHouseValidator {
             self.password.as_ref().map(|password| password.expose_secret().to_owned()),
             self.database.clone(),
         );
-        match client.ping().await {
+        match client.validate_connectivity().await {
             Ok(_) => Ok(Vec::new()),
             Err(_) => Ok(vec![ValidationFailure::critical(
                 "ClickHouse Connection Failed",
diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index c5b107e4e..3ce73c1f1 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -103,7 +103,13 @@ impl ClickHouseClient {
         Self { inner: Arc::new(client) }
     }
 
-    pub async fn ping(&self) -> EtlResult<()> {
+    /// Verifies that the ClickHouse server is reachable.
+    ///
+    /// Issues a `SELECT 1` round-trip; cheaper than any DDL or metadata
+    /// query and exercises the auth/transport path. Mirrors the Iceberg
+    /// destination's `validate_connectivity` so callers (notably the
+    /// `etl-api` validators) can treat the two destinations uniformly.
+    pub async fn validate_connectivity(&self) -> EtlResult<()> {
         self.inner
             .query("SELECT 1")
             .fetch_one::<u8>()
diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index c461c377c..d8cbbb597 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -1783,38 +1783,38 @@ async fn exclusive_large_batch_table_copy() {
 /// A ClickHouseClient pointed at the running test ClickHouse instance.
 ///
 /// # WHEN
-/// `ping()` is called.
+/// `validate_connectivity()` is called.
 ///
 /// # THEN
 /// It returns Ok(()).
 #[tokio::test(flavor = "multi_thread")]
-async fn ping_succeeds_against_running_clickhouse() {
+async fn validate_connectivity_succeeds_against_running_clickhouse() {
     let client = ClickHouseClient::new(
         get_clickhouse_url(),
         get_clickhouse_user(),
         get_clickhouse_password(),
         "default",
     );
-    assert!(client.ping().await.is_ok());
+    assert!(client.validate_connectivity().await.is_ok());
 }
 
 /// # GIVEN
 /// A ClickHouseClient pointed at a URL where nothing is listening.
 ///
 /// # WHEN
-/// `ping()` is called.
+/// `validate_connectivity()` is called.
 ///
 /// # THEN
 /// It returns Err.
#[tokio::test(flavor = "multi_thread")]
-async fn ping_fails_against_unreachable_clickhouse() {
+async fn validate_connectivity_fails_against_unreachable_clickhouse() {
     let client = ClickHouseClient::new(
         Url::parse("http://localhost:1").unwrap(),
         "nobody",
         None::<String>,
         "default",
     );
-    assert!(client.ping().await.is_err());
+    assert!(client.validate_connectivity().await.is_err());
 }
 
 /// Row struct for the ADD COLUMN test after schema change.

From 02696d37b3214ff49eb4356e6b5cc5b9396280b5 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 28 Apr 2026 14:23:31 +0900
Subject: [PATCH 57/86] Have ClickHouse insert_rows call insert.end() once per
 chunk

Restructures the row-encoding loop into an outer 'while rows remain' / inner
'while under byte budget' shape so each INSERT is closed in exactly one place.
The row that crosses the budget still goes into the current INSERT, matching
the prior behavior. An empty input now skips opening an INSERT instead of
issuing a no-op flush.

---
 etl-destinations/src/clickhouse/client.rs | 76 +++++++++++------------
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index 3ce73c1f1..6fa464039 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -225,57 +225,39 @@
         source: &'static str,
     ) -> EtlResult<()> {
         let sql = build_insert_rows_sql(table_name);
-
-        let mut insert =
-            self.inner.insert_formatted_with(sql.clone()).buffered_with_capacity(BUFFERED_CAPACITY);
-        let mut bytes = 0u64;
+        let mut rows = rows.into_iter().peekable();
         let mut row_buf = Vec::new();
-        let mut insert_start = Instant::now();
-
-        for row in rows {
-            row_buf.clear();
-            rb_encode_row(row, nullable_flags, &mut row_buf)?;
-
-            insert.write_buffered(&row_buf);
-            bytes += row_buf.len() as u64;
-
-            if bytes >= max_bytes_per_insert {
-                insert.end().await.map_err(|e| {
-                    etl_error!(
-                        ErrorKind::Unknown,
-                        "ClickHouse insert flush failed",
-                        format!("Failed to flush INSERT for '{table_name}': {e}")
-                    )
-                })?;
-                metrics::histogram!(
-                    ETL_CH_INSERT_DURATION_SECONDS,
-                    "table" => table_name.to_string(),
-                    "source" => source
-                )
-                .record(insert_start.elapsed().as_secs_f64());
-
-                insert = self
-                    .inner
-                    .insert_formatted_with(sql.clone())
-                    .buffered_with_capacity(BUFFERED_CAPACITY);
-                insert_start = Instant::now();
-                bytes = 0;
+
+        while rows.peek().is_some() {
+            let mut insert = self
+                .inner
+                .insert_formatted_with(sql.clone())
+                .buffered_with_capacity(BUFFERED_CAPACITY);
+            let mut bytes = 0u64;
+            let insert_start = Instant::now();
+
+            while bytes < max_bytes_per_insert {
+                let Some(row) = rows.next() else { break };
+                row_buf.clear();
+                rb_encode_row(row, nullable_flags, &mut row_buf)?;
+                insert.write_buffered(&row_buf);
+                bytes += row_buf.len() as u64;
             }
-        }
 
-        insert.end().await.map_err(|e| {
-            etl_error!(
-                ErrorKind::Unknown,
-                "ClickHouse insert flush failed",
-                format!("Failed to flush INSERT for '{table_name}': {e}")
+            insert.end().await.map_err(|e| {
+                etl_error!(
+                    ErrorKind::Unknown,
+                    "ClickHouse insert flush failed",
+                    format!("Failed to flush INSERT for '{table_name}': {e}")
+                )
+            })?;
+            metrics::histogram!(
+                ETL_CH_INSERT_DURATION_SECONDS,
+                "table" => table_name.to_string(),
+                "source" => source
             )
-        })?;
-        metrics::histogram!(
-            ETL_CH_INSERT_DURATION_SECONDS,
-            "table" => table_name.to_string(),
-            "source" => source
-        )
-        .record(insert_start.elapsed().as_secs_f64());
+
.record(insert_start.elapsed().as_secs_f64());
+        }
 
         Ok(())
     }

From 28d0bc30bd40178b6029bb219dd6a9ba73b4d082 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 28 Apr 2026 14:24:31 +0900
Subject: [PATCH 58/86] Simplify ClickHouse table-exists, event-flush, and
 array encoding paths

core.rs: extract recover_applying_metadata from ensure_table_exists to flatten
the crash-recovery branch out of the cache/metadata top-level. Merge the two
parallel maps in write_events_inner into a single
HashMap<TableId, (ReplicatedTableSchema, Vec<PendingRow>)>, eliminating the
defensive remove().ok_or_else() that dressed up an impossible mismatch.
Extract the JoinSet-based row flush into flush_pending_rows.

encoding.rs: introduce a map_array helper and date_to_days extractor so each
ArrayCell variant becomes a one-liner that mirrors cell_to_clickhouse_value.
Rewrite bytes_to_hex to encode via a 16-byte ASCII lookup table and take &[u8]
instead of consuming a Vec.

---
 etl-destinations/src/clickhouse/core.rs     | 256 ++++++++++----------
 etl-destinations/src/clickhouse/encoding.rs | 143 ++++-------
 2 files changed, 179 insertions(+), 220 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index d6acde29e..11d85d794 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -244,77 +244,28 @@ where
         let ch_table_name =
             table_name_to_clickhouse_table_name(&table_name.schema, &table_name.name);
 
-        {
-            let guard = self.table_cache.read();
-            if let Some(flags) = guard.get(&ch_table_name) {
-                return Ok((ch_table_name, Arc::clone(flags)));
-            }
+        if let Some(flags) = self.table_cache.read().get(&ch_table_name).cloned() {
+            return Ok((ch_table_name, flags));
         }
 
         let table_id = schema.id();
-        let snapshot_id = schema.inner().snapshot_id;
-        let replication_mask = schema.replication_mask().clone();
-
-        let existing_metadata = self.store.get_destination_table_metadata(table_id).await?;
-        match existing_metadata {
+        match self.store.get_destination_table_metadata(table_id).await? {
             None => {
-                // First table creation: Applying -> CREATE TABLE -> Applied.
                 self.create_table_with_metadata(
                     table_id,
                     &ch_table_name,
                     schema,
-                    snapshot_id,
-                    replication_mask,
+                    schema.inner().snapshot_id,
+                    schema.replication_mask().clone(),
                 )
                 .await?;
             }
             Some(metadata) if metadata.is_applying() => {
-                // Crash recovery: the replicator was killed during a DDL
-                // operation. Re-apply idempotently and mark Applied.
-                warn!("table {} has Applying metadata, recovering interrupted operation", table_id);
-
-                match metadata.previous_snapshot_id {
-                    Some(prev_snapshot_id) => {
-                        // Interrupted schema change: re-apply the diff.
-                        let old_table_schema = self
-                            .store
-                            .get_table_schema(&table_id, prev_snapshot_id)
-                            .await?
-                            .ok_or_else(|| {
-                                etl_error!(
-                                    ErrorKind::InvalidState,
-                                    "Old schema not found for recovery",
-                                    format!(
-                                        "Cannot find schema for table {} at snapshot_id {}",
-                                        table_id, prev_snapshot_id
-                                    )
-                                )
-                            })?;
-                        let old_schema = ReplicatedTableSchema::from_mask(
-                            old_table_schema,
-                            metadata.replication_mask.clone(),
-                        );
-                        let diff = old_schema.diff(schema);
-                        self.apply_schema_diff(&ch_table_name, &diff, &old_schema).await?;
-                    }
-                    None => {
-                        // Interrupted initial table creation: re-run CREATE
-                        // TABLE IF NOT EXISTS (idempotent).
-                        let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
-                        let ddl = build_create_table_sql(&ch_table_name, &column_schemas);
-                        self.client.execute_ddl(&ddl).await?;
-                    }
-                }
-
-                self.store
-                    .store_destination_table_metadata(table_id, metadata.to_applied())
-                    .await?;
-            }
-            Some(_applied) => {
-                // Applied metadata, cache miss after handle_relation_event
-                // invalidated the cache. No DDL needed -- fall through to
-                // recompute nullable flags below.
+                self.recover_applying_metadata(table_id, &ch_table_name, schema, metadata).await?;
             }
+            // Applied metadata, cache miss after `handle_relation_event`
+            // invalidated the cache. No DDL needed.
+            Some(_applied) => {}
         }
 
         // Compute nullable flags from the actual ClickHouse schema. This matters after
@@ -329,15 +280,62 @@
             &actual_columns,
         )?;
 
-        // Write-lock: insert, using or_insert to handle concurrent first-writer race.
-        let stored_flags = {
+        // `or_insert_with` handles the race where a concurrent caller populated
+        // the entry between our read-miss and this write.
+        let flags = {
            let mut guard = self.table_cache.write();
            Arc::clone(
                guard.entry(ch_table_name.clone()).or_insert_with(|| Arc::clone(&nullable_flags)),
            )
        };
 
-        Ok((ch_table_name, stored_flags))
+        Ok((ch_table_name, flags))
+    }
+
+    /// Re-runs an interrupted DDL idempotently and transitions metadata to
+    /// `Applied`. Distinguishes between an interrupted schema change (replays
+    /// the diff against the previous snapshot) and an interrupted initial
+    /// creation (re-issues `CREATE TABLE IF NOT EXISTS`).
+    async fn recover_applying_metadata(
+        &self,
+        table_id: TableId,
+        ch_table_name: &str,
+        schema: &ReplicatedTableSchema,
+        metadata: DestinationTableMetadata,
+    ) -> EtlResult<()> {
+        warn!("table {} has Applying metadata, recovering interrupted operation", table_id);
+
+        match metadata.previous_snapshot_id {
+            Some(prev_snapshot_id) => {
+                let old_table_schema =
+                    self.store.get_table_schema(&table_id, prev_snapshot_id).await?.ok_or_else(
+                        || {
+                            etl_error!(
+                                ErrorKind::InvalidState,
+                                "Old schema not found for recovery",
+                                format!(
+                                    "Cannot find schema for table {} at snapshot_id {}",
+                                    table_id, prev_snapshot_id
+                                )
+                            )
+                        },
+                    )?;
+                let old_schema = ReplicatedTableSchema::from_mask(
+                    old_table_schema,
+                    metadata.replication_mask.clone(),
+                );
+                let diff = old_schema.diff(schema);
+                self.apply_schema_diff(ch_table_name, &diff, &old_schema).await?;
+            }
+            None => {
+                let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
+                let ddl = build_create_table_sql(ch_table_name, &column_schemas);
+                self.client.execute_ddl(&ddl).await?;
+            }
+        }
+
+        self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?;
+        Ok(())
+    }
 
     async fn truncate_table_inner(&self, schema: &ReplicatedTableSchema) -> EtlResult<()> {
@@ -537,8 +535,8 @@ where
         let mut event_iter = events.into_iter().peekable();
 
         while event_iter.peek().is_some() {
-            let mut table_schemas: HashMap<TableId, ReplicatedTableSchema> = HashMap::new();
-            let mut table_id_to_rows: HashMap<TableId, Vec<PendingRow>> = HashMap::new();
+            let mut pending: HashMap<TableId, (ReplicatedTableSchema, Vec<PendingRow>)> =
+                HashMap::new();
 
             // Accumulate data events until we hit a Truncate or Relation boundary.
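            // (Illustrative: given [Insert t1, Insert t2, Update t1, Truncate t1] the
            // loop below buffers the first three rows into `pending` and stops at the
            // Truncate, so the buffered rows are flushed before the Truncate is handled.)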
while let Some(event) = event_iter.peek() {
@@ -550,28 +548,25 @@
                match event {
                    Event::Insert(insert) => {
                        let table_id = insert.replicated_table_schema.id();
-                        table_schemas
+                        let entry = pending
                             .entry(table_id)
-                            .or_insert_with(|| insert.replicated_table_schema.clone());
-                        table_id_to_rows.entry(table_id).or_default().push(PendingRow {
+                            .or_insert_with(|| (insert.replicated_table_schema, Vec::new()));
+                        entry.1.push(PendingRow {
                             operation: CdcOperation::Insert,
                             lsn: insert.commit_lsn,
                             cells: insert.table_row.into_values(),
                         });
                     }
                     Event::Update(update) => {
-                        let table_row = match update.updated_table_row {
-                            UpdatedTableRow::Full(row) => row,
-                            UpdatedTableRow::Partial(_) => {
-                                warn!("skipping partial update row for ClickHouse");
-                                continue;
-                            }
+                        let UpdatedTableRow::Full(table_row) = update.updated_table_row else {
+                            warn!("skipping partial update row for ClickHouse");
+                            continue;
                         };
                         let table_id = update.replicated_table_schema.id();
-                        table_schemas
+                        let entry = pending
                             .entry(table_id)
-                            .or_insert_with(|| update.replicated_table_schema.clone());
-                        table_id_to_rows.entry(table_id).or_default().push(PendingRow {
+                            .or_insert_with(|| (update.replicated_table_schema, Vec::new()));
+                        entry.1.push(PendingRow {
                             operation: CdcOperation::Update,
                             lsn: update.commit_lsn,
                             cells: table_row.into_values(),
@@ -589,10 +584,10 @@
                             }
                         };
                         let table_id = delete.replicated_table_schema.id();
-                        table_schemas
+                        let entry = pending
                             .entry(table_id)
-                            .or_insert_with(|| delete.replicated_table_schema.clone());
-                        table_id_to_rows.entry(table_id).or_default().push(PendingRow {
+                            .or_insert_with(|| (delete.replicated_table_schema, Vec::new()));
+                        entry.1.push(PendingRow {
                             operation: CdcOperation::Delete,
                             lsn: delete.commit_lsn,
                             cells: old_row.into_values(),
@@ -607,57 +602,7 @@
                 }
             }
 
-            // Flush accumulated rows concurrently, one JoinSet task per table.
-            if !table_id_to_rows.is_empty() {
-                let mut table_meta: HashMap<TableId, (String, Arc<[bool]>)> = HashMap::new();
-                for (&table_id, schema) in &table_schemas {
-                    let (name, flags) = self.ensure_table_exists(schema).await?;
-                    table_meta.insert(table_id, (name, flags));
-                }
-
-                let mut join_set: JoinSet<EtlResult<()>> = JoinSet::new();
-                for (table_id, row_data) in table_id_to_rows {
-                    let (ch_table_name, nullable_flags) =
-                        table_meta.remove(&table_id).ok_or_else(|| {
-                            etl_error!(
-                                ErrorKind::Unknown,
-                                "ClickHouse insert failed",
-                                format!("Failed to remove metadata for table ID {table_id}")
-                            )
-                        })?;
-                    let client = self.client.clone();
-                    let max_bytes = self.inserter_config.max_bytes_per_insert;
-
-                    join_set.spawn(async move {
-                        let rows: Vec<Vec<ClickHouseValue>> = row_data
-                            .into_iter()
-                            .map(|PendingRow { operation, lsn, cells }| {
-                                let mut values: Vec<ClickHouseValue> =
-                                    cells.into_iter().map(cell_to_clickhouse_value).collect();
-                                values.push(ClickHouseValue::String(operation.to_string()));
-                                values.push(cdc_lsn_to_clickhouse_value(lsn));
-                                values
-                            })
-                            .collect();
-
-                        client
-                            .insert_rows(
-                                &ch_table_name,
-                                rows,
-                                &nullable_flags,
-                                max_bytes,
-                                "streaming",
-                            )
-                            .await
-                    });
-                }
-
-                while let Some(result) = join_set.join_next().await {
-                    result.map_err(|e| {
-                        etl_error!(ErrorKind::ApplyWorkerPanic, "insert task failed", e.to_string())
-                    })??;
-                }
-            }
+            self.flush_pending_rows(pending).await?;
 
             // Process Relation events (schema changes) sequentially.
             while let Some(Event::Relation(_)) = event_iter.peek() {
@@ -684,6 +629,59 @@
 
         Ok(())
     }
+
+    /// Encodes the accumulated `PendingRow` batches and inserts them into
+    /// ClickHouse, one `JoinSet` task per table. No-op if `pending` is empty.
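+    /// (One `JoinSet` task per table keeps that table's rows in a single
+    /// ordered INSERT batch while unrelated tables proceed concurrently.)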
+    ///
+    /// All `ensure_table_exists` calls run sequentially before any insert is
+    /// spawned, so a schema-resolution failure aborts the whole pass without
+    /// any partial-write side effects.
+    async fn flush_pending_rows(
+        &self,
+        pending: HashMap<TableId, (ReplicatedTableSchema, Vec<PendingRow>)>,
+    ) -> EtlResult<()> {
+        if pending.is_empty() {
+            return Ok(());
+        }
+
+        let mut prepared: Vec<(String, Arc<[bool]>, Vec<PendingRow>)> =
+            Vec::with_capacity(pending.len());
+        for (_, (schema, rows)) in pending {
+            let (ch_table_name, nullable_flags) = self.ensure_table_exists(&schema).await?;
+            prepared.push((ch_table_name, nullable_flags, rows));
+        }
+
+        let mut join_set: JoinSet<EtlResult<()>> = JoinSet::new();
+        for (ch_table_name, nullable_flags, rows) in prepared {
+            let client = self.client.clone();
+            let max_bytes = self.inserter_config.max_bytes_per_insert;
+
+            join_set.spawn(async move {
+                let rows: Vec<Vec<ClickHouseValue>> = rows
+                    .into_iter()
+                    .map(|PendingRow { operation, lsn, cells }| {
+                        let mut values: Vec<ClickHouseValue> =
+                            cells.into_iter().map(cell_to_clickhouse_value).collect();
+                        values.push(ClickHouseValue::String(operation.to_string()));
+                        values.push(cdc_lsn_to_clickhouse_value(lsn));
+                        values
+                    })
+                    .collect();
+
+                client
+                    .insert_rows(&ch_table_name, rows, &nullable_flags, max_bytes, "streaming")
+                    .await
+            });
+        }
+
+        while let Some(result) = join_set.join_next().await {
+            result.map_err(|e| {
+                etl_error!(ErrorKind::ApplyWorkerPanic, "insert task failed", e.to_string())
+            })??;
+        }
+
+        Ok(())
+    }
 }
 
 /// Expands a key-only delete row to full column width for RowBinary encoding.
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index bf844baf6..5f9b4c30e 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -1,5 +1,3 @@
-use std::fmt;
-
 use chrono::NaiveDate;
 use etl::{
     error::{ErrorKind, EtlResult},
     types::{ArrayCell, Cell},
 };
@@ -54,18 +52,13 @@ pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue {
         Cell::F32(v) => ClickHouseValue::Float32(v),
         Cell::F64(v) => ClickHouseValue::Float64(v),
         Cell::Numeric(n) => ClickHouseValue::String(n.to_string()),
-        Cell::Date(d) => {
-            let days =
-                d.signed_duration_since(unix_epoch()).num_days().clamp(0, i64::from(u16::MAX))
-                    as u16;
-            ClickHouseValue::Date(days)
-        }
+        Cell::Date(d) => ClickHouseValue::Date(date_to_days(d)),
         Cell::Time(t) => ClickHouseValue::String(t.to_string()),
         Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()),
         Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()),
         Cell::Uuid(u) => ClickHouseValue::Uuid(*u.as_bytes()),
         Cell::Json(j) => ClickHouseValue::String(j.to_string()),
-        Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(b)),
+        Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(&b)),
         Cell::String(s) => ClickHouseValue::String(s),
         Cell::Array(array_cell) => {
             ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell))
         }
     }
 }
 
+/// Converts an [`ArrayCell`] to a flat `Vec<ClickHouseValue>`, mapping each
+/// `Some(x)` to the matching scalar variant and each `None` to
+/// [`ClickHouseValue::Null`]. Per-element conversions mirror
+/// [`cell_to_clickhouse_value`].
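+/// (Illustrative: `ArrayCell::I32(vec![Some(7), None])` becomes
+/// `[ClickHouseValue::Int32(7), ClickHouseValue::Null]`.)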
fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> Vec<ClickHouseValue> {
     match array_cell {
-        ArrayCell::Bool(v) => {
-            v.into_iter().map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Bool)).collect()
-        }
-        ArrayCell::String(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::String))
-            .collect(),
-        ArrayCell::I16(v) => {
-            v.into_iter().map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int16)).collect()
-        }
-        ArrayCell::I32(v) => {
-            v.into_iter().map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int32)).collect()
+        ArrayCell::Bool(v) => map_array(v, ClickHouseValue::Bool),
+        ArrayCell::String(v) => map_array(v, ClickHouseValue::String),
+        ArrayCell::I16(v) => map_array(v, ClickHouseValue::Int16),
+        ArrayCell::I32(v) => map_array(v, ClickHouseValue::Int32),
+        ArrayCell::I64(v) => map_array(v, ClickHouseValue::Int64),
+        ArrayCell::U32(v) => map_array(v, ClickHouseValue::UInt32),
+        ArrayCell::F32(v) => map_array(v, ClickHouseValue::Float32),
+        ArrayCell::F64(v) => map_array(v, ClickHouseValue::Float64),
+        ArrayCell::Numeric(v) => map_array(v, |n| ClickHouseValue::String(n.to_string())),
+        ArrayCell::Date(v) => map_array(v, |d| ClickHouseValue::Date(date_to_days(d))),
+        ArrayCell::Time(v) => map_array(v, |t| ClickHouseValue::String(t.to_string())),
+        ArrayCell::Timestamp(v) => {
+            map_array(v, |dt| ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()))
         }
-        ArrayCell::I64(v) => {
-            v.into_iter().map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Int64)).collect()
+        ArrayCell::TimestampTz(v) => {
+            map_array(v, |dt| ClickHouseValue::DateTime64(dt.timestamp_micros()))
         }
-        ArrayCell::U32(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::UInt32))
-            .collect(),
-        ArrayCell::F32(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float32))
-            .collect(),
-        ArrayCell::F64(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, ClickHouseValue::Float64))
-            .collect(),
-        ArrayCell::Numeric(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, |n| ClickHouseValue::String(n.to_string())))
-            .collect(),
-        ArrayCell::Date(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |d| {
-                    let days = d
-                        .signed_duration_since(unix_epoch())
-                        .num_days()
-                        .clamp(0, i64::from(u16::MAX)) as u16;
-                    ClickHouseValue::Date(days)
-                })
-            })
-            .collect(),
-        ArrayCell::Time(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, |t| ClickHouseValue::String(t.to_string())))
-            .collect(),
-        ArrayCell::Timestamp(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |dt| {
-                    ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros())
-                })
-            })
-            .collect(),
-        ArrayCell::TimestampTz(v) => v
-            .into_iter()
-            .map(|o| {
-                o.map_or(ClickHouseValue::Null, |dt| {
-                    ClickHouseValue::DateTime64(dt.timestamp_micros())
-                })
-            })
-            .collect(),
-        ArrayCell::Uuid(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, |u| ClickHouseValue::Uuid(*u.as_bytes())))
-            .collect(),
-        ArrayCell::Json(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, |j| ClickHouseValue::String(j.to_string())))
-            .collect(),
-        ArrayCell::Bytes(v) => v
-            .into_iter()
-            .map(|o| o.map_or(ClickHouseValue::Null, |b| ClickHouseValue::String(bytes_to_hex(b))))
-            .collect(),
+        ArrayCell::Uuid(v) => map_array(v, |u| ClickHouseValue::Uuid(*u.as_bytes())),
+        ArrayCell::Json(v) => map_array(v, |j| ClickHouseValue::String(j.to_string())),
+        ArrayCell::Bytes(v) =>
map_array(v, |b| ClickHouseValue::String(bytes_to_hex(&b))),
     }
 }
 
+/// Maps a `Vec<Option<T>>` to `Vec<ClickHouseValue>`, applying `f` to each
+/// `Some` and substituting [`ClickHouseValue::Null`] for each `None`.
+fn map_array<T, F>(v: Vec<Option<T>>, mut f: F) -> Vec<ClickHouseValue>
+where
+    F: FnMut(T) -> ClickHouseValue,
+{
+    v.into_iter()
+        .map(|o| match o {
+            Some(t) => f(t),
+            None => ClickHouseValue::Null,
+        })
+        .collect()
+}
+
+fn date_to_days(d: NaiveDate) -> u16 {
+    d.signed_duration_since(unix_epoch()).num_days().clamp(0, i64::from(u16::MAX)) as u16
+}
+
 fn unix_epoch() -> NaiveDate {
     NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid date")
 }
 
-fn bytes_to_hex(bytes: Vec<u8>) -> String {
+/// Lowercase hex-encodes `bytes` into a fresh `String`.
+fn bytes_to_hex(bytes: &[u8]) -> String {
+    const HEX: &[u8; 16] = b"0123456789abcdef";
     let mut s = String::with_capacity(bytes.len() * 2);
-    for b in bytes {
-        use fmt::Write;
-        let _ = write!(s, "{b:02x}");
+    for &b in bytes {
+        s.push(HEX[(b >> 4) as usize] as char);
+        s.push(HEX[(b & 0x0f) as usize] as char);
     }
     s
 }
@@ -425,10 +386,10 @@ mod tests {
     #[test]
     fn hex_encoding() {
-        assert_eq!(bytes_to_hex([].to_vec()), "");
-        assert_eq!(bytes_to_hex([0x00].to_vec()), "00");
-        assert_eq!(bytes_to_hex([0xff].to_vec()), "ff");
-        assert_eq!(bytes_to_hex([0xde, 0xad, 0xbe, 0xef].to_vec()), "deadbeef");
+        assert_eq!(bytes_to_hex(&[]), "");
+        assert_eq!(bytes_to_hex(&[0x00]), "00");
+        assert_eq!(bytes_to_hex(&[0xff]), "ff");
+        assert_eq!(bytes_to_hex(&[0xde, 0xad, 0xbe, 0xef]), "deadbeef");
     }
 
     /// # GIVEN

From 3fca7421997b11bc9194618205f649a46c6fd40b Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Tue, 28 Apr 2026 17:14:04 +0900
Subject: [PATCH 59/86] Polish ClickHouse doccomments and tidy encoding helpers

---
 etl-destinations/src/clickhouse/core.rs     | 31 +++++++++++++++++----
 etl-destinations/src/clickhouse/encoding.rs | 25 ++++++-----------
 etl-destinations/src/clickhouse/schema.rs   | 14 +++++-----
 3 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index 11d85d794..1a6efbcdf 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -30,11 +30,19 @@
-/// Postgres CDC operation kind, written to the `cdc_operation` column.
+/// Postgres CDC operation kind. Written to the `cdc_operation` column as the
+/// matching uppercase string (`"INSERT"`, `"UPDATE"`, `"DELETE"`) so downstream
+/// consumers (ReplacingMergeTree dedup, materialized views, etc.) can filter
+/// or branch on operation type.
 #[derive(Copy, Clone)]
 enum CdcOperation {
+    /// New row inserted on the source.
     Insert,
+    /// Existing row updated on the source. Carries the post-update values.
     Update,
+    /// Row deleted on the source. Carries pre-delete values for the PK
+    /// columns; non-PK columns are filled in by `expand_key_row` (NULL for
+    /// nullable columns, type-appropriate zero for non-nullable).
     Delete,
 }
@@ -51,8 +60,7 @@
 struct PendingRow {
     /// CDC op kind, written into the `cdc_operation` column.
     operation: CdcOperation,
-    /// Commit LSN of the source transaction, written into the `cdc_lsn`
-    /// column.
+    /// Commit LSN of the source transaction, written into `cdc_lsn`.
     lsn: PgLsn,
     /// User column values in source schema order.
The two CDC columns
 /// (`cdc_operation`, `cdc_lsn`) are appended at encode time and are
@@ -81,6 +88,18 @@ fn expected_clickhouse_column_names(schema: &ReplicatedTableSchema) -> Vec<String>
             let mut values: Vec<ClickHouseValue> =
                 table_row.into_values().into_iter().map(cell_to_clickhouse_value).collect();
-            values.push(ClickHouseValue::String(String::from("INSERT")));
-            values.push(ClickHouseValue::UInt64(0));
+            // CDC columns: initial-copy rows are tagged as INSERT with LSN 0
+            // (sentinel meaning "this row pre-dates the streaming cursor").
+            values.push(ClickHouseValue::String(CdcOperation::Insert.to_string()));
+            values.push(cdc_lsn_to_clickhouse_value(PgLsn::from(0)));
             values
         })
         .collect();
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index 5f9b4c30e..0bb95db85 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -128,10 +128,10 @@ fn bytes_to_hex(bytes: &[u8]) -> String {
     s
 }
 
-/// Encodes a variable-length integer (LEB128) for ClickHouse string/array
-/// lengths.
+/// Encodes `v` as LEB128 for ClickHouse string/array lengths.
 pub(crate) fn rb_varint(mut v: usize, buf: &mut Vec<u8>) {
     loop {
+        // LEB128: bottom 7 bits per byte, MSB set on all but the last.
         let byte = (v & 0x7f) as u8;
         v >>= 7;
         if v == 0 {
@@ -181,21 +181,12 @@ pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlRes
         ClickHouseValue::Date(days) => buf.extend_from_slice(&days.to_le_bytes()),
         ClickHouseValue::DateTime64(micros) => buf.extend_from_slice(&micros.to_le_bytes()),
         ClickHouseValue::Uuid(bytes) => {
-            // ClickHouse RowBinary UUID = two little-endian u64 (high bits then low bits).
-            // Our bytes are in standard UUID big-endian order, so we split into two u64
-            // and write each in little-endian.
-            let high = u64::from_be_bytes(bytes[0..8].try_into().map_err(
-                |e: std::array::TryFromSliceError| {
-                    etl_error!(ErrorKind::ConversionError, "UUID high-half conversion failed", e)
-                },
-            )?);
-            let low = u64::from_be_bytes(bytes[8..16].try_into().map_err(
-                |e: std::array::TryFromSliceError| {
-                    etl_error!(ErrorKind::ConversionError, "UUID low-half conversion failed", e)
-                },
-            )?);
-            buf.extend_from_slice(&high.to_le_bytes());
-            buf.extend_from_slice(&low.to_le_bytes());
+            // ClickHouse RowBinary UUID = high u64 (LE) then low u64 (LE). Our
+            // bytes are in standard UUID big-endian order; reinterpret as a
+            // u128 to split halves cleanly.
+            let n = u128::from_be_bytes(bytes);
+            buf.extend_from_slice(&((n >> 64) as u64).to_le_bytes());
+            buf.extend_from_slice(&(n as u64).to_le_bytes());
         }
         // Array elements are always Nullable in ClickHouse: Array(Nullable(T)).
         ClickHouseValue::Array(items) => {
diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs
index 12de37943..e7c06a8ea 100644
--- a/etl-destinations/src/clickhouse/schema.rs
+++ b/etl-destinations/src/clickhouse/schema.rs
@@ -64,16 +64,16 @@ pub(crate) fn quote_identifier(identifier: &str) -> String {
     format!("\"{}\"", identifier.replace('"', "\"\""))
 }
 
-/// Converts a Postgres `public.my_table` style table name into a ClickHouse
-/// table name using the same double-underscore escaping convention used by
-/// DuckLake/Iceberg.
+/// Converts a Postgres `schema.table` name into a single-segment ClickHouse
+/// table name.
/// -/// - Schema and table are joined with `_` -/// - Any literal `_` in the schema or table name is escaped to `__` +/// Schema and table are joined with `_`; any literal `_` in either component +/// is doubled to `__` so the join character is unambiguous and the original +/// pair can be recovered by splitting on a single underscore. /// /// Examples: -/// - `public.orders` → `public_orders` -/// - `my_schema.t` → `my__schema_t` +/// - `public.orders` -> `public_orders` +/// - `my_schema.t` -> `my__schema_t` pub fn table_name_to_clickhouse_table_name(schema: &str, table: &str) -> String { let escaped_schema = schema.replace('_', "__"); let escaped_table = table.replace('_', "__"); From fce89756dcb0376654d3bc57fe4a750dc4a36da9 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 29 Apr 2026 14:16:25 +0900 Subject: [PATCH 60/86] Polish ClickHouse destination naming and metadata branch --- etl-destinations/src/clickhouse/client.rs | 4 +- etl-destinations/src/clickhouse/core.rs | 52 +++++++++++++-------- etl-destinations/src/clickhouse/encoding.rs | 11 +++-- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 6fa464039..9c14f13ac 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -8,7 +8,7 @@ use etl::{ use url::Url; use crate::clickhouse::{ - encoding::{ClickHouseValue, rb_encode_row}, + encoding::{ClickHouseValue, encode_to_row_binary}, metrics::ETL_CH_INSERT_DURATION_SECONDS, schema::{clickhouse_column_type, quote_identifier}, }; @@ -239,7 +239,7 @@ impl ClickHouseClient { while bytes < max_bytes_per_insert { let Some(row) = rows.next() else { break }; row_buf.clear(); - rb_encode_row(row, nullable_flags, &mut row_buf)?; + encode_to_row_binary(row, nullable_flags, &mut row_buf)?; insert.write_buffered(&row_buf); bytes += row_buf.len() as u64; } diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 1a6efbcdf..aee56d3ea 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -1,7 +1,6 @@ use std::{collections::HashMap, sync::Arc, time::Instant}; use etl::{ - bail, destination::{ Destination, async_result::{TruncateTableResult, WriteEventsResult, WriteTableRowsResult}, @@ -279,12 +278,16 @@ where ) .await?; } - Some(metadata) if metadata.is_applying() => { - self.recover_applying_metadata(table_id, &ch_table_name, schema, metadata).await?; + Some(metadata) => { + if metadata.is_applying() { + self.recover_applying_metadata(table_id, &ch_table_name, schema, metadata) + .await?; + } + // Otherwise the metadata is already `Applied`: this branch + // runs after `handle_relation_event` invalidated the cache, + // so no DDL is needed and we just fall through to recompute + // nullable flags below. } - // Applied metadata, cache miss after `handle_relation_event` - // invalidated the cache. No DDL needed. - Some(_applied) => {} } // Compute nullable flags from the actual ClickHouse schema. This matters after @@ -400,19 +403,21 @@ where let new_snapshot_id = new_schema.inner().snapshot_id; let new_replication_mask = new_schema.replication_mask().clone(); - let Some(metadata) = self.store.get_applied_destination_table_metadata(table_id).await? - else { - bail!( - ErrorKind::CorruptedTableSchema, - "Missing destination table metadata", - format!( - "No destination table metadata found for table {} when processing schema \ - change. 
The metadata should have been recorded during initial table \
-                    synchronization.",
-                    table_id
-                )
-            );
-        };
+        let metadata =
+            self.store.get_applied_destination_table_metadata(table_id).await?.ok_or_else(
+                || {
+                    etl_error!(
+                        ErrorKind::CorruptedTableSchema,
+                        "Missing destination table metadata",
+                        format!(
+                            "No destination table metadata found for table {} when processing \
+                             schema change. The metadata should have been recorded during initial \
+                             table synchronization.",
+                            table_id
+                        )
+                    )
+                },
+            )?;
 
         let current_snapshot_id = metadata.snapshot_id;
         let current_replication_mask = metadata.replication_mask.clone();
@@ -784,6 +789,15 @@ where
         "clickhouse"
     }
 
+    // The trait methods below intentionally do not use `?` on the inner work.
+    // Errors must reach the caller via `async_result.send(result)`, not via the
+    // outer `EtlResult<()>`; using `?` would short-circuit before `send` runs
+    // and leave the receiver waiting. The outer return value just signals
+    // "work accepted, watch the channel for completion". `AsyncResult::send`
+    // itself returns `()`, and its `Drop` impl synthesizes a "dropped without
+    // sending" error if the path ever skips `send`, so the receiver is never
+    // silently abandoned.
+
     async fn truncate_table(
         &self,
         replicated_table_schema: &ReplicatedTableSchema,
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index 0bb95db85..0636f3721 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -201,7 +201,7 @@
 /// Encodes a complete row into `buf`, selecting nullable vs non-nullable
 /// encoding per column.
-pub(crate) fn rb_encode_row(
+pub(crate) fn encode_to_row_binary(
     values: Vec<ClickHouseValue>,
     nullable_flags: &[bool],
     buf: &mut Vec<u8>,
@@ -403,9 +403,10 @@
     }
 
     #[test]
-    fn rb_encode_row_rejects_fewer_values_than_nullable_flags() {
+    fn encode_to_row_binary_rejects_fewer_values_than_nullable_flags() {
         let mut buf = vec![0xaa];
-        let result = rb_encode_row(vec![ClickHouseValue::Int32(1)], &[false, false], &mut buf);
+        let result =
+            encode_to_row_binary(vec![ClickHouseValue::Int32(1)], &[false, false], &mut buf);
 
         assert!(result.is_err(), "row width mismatch must error");
         let err = result.unwrap_err();
 
     #[test]
-    fn rb_encode_row_rejects_more_values_than_nullable_flags() {
+    fn encode_to_row_binary_rejects_more_values_than_nullable_flags() {
         let mut buf = vec![0xaa];
-        let result = rb_encode_row(
+        let result = encode_to_row_binary(
             vec![ClickHouseValue::Int32(1), ClickHouseValue::Int32(2)],
             &[false],
             &mut buf,

From f4b20cf96158bbd39dd85c212974d01c05216095 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Wed, 29 Apr 2026 15:35:02 +0900
Subject: [PATCH 61/86] Use shared try_stringify_table_name in ClickHouse
 destination

---
 etl-destinations/src/clickhouse/core.rs   | 18 ++++++------
 etl-destinations/src/clickhouse/schema.rs | 34 ++---------------------
 2 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index aee56d3ea..5dc3daf70 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -19,14 +19,14 @@
 use tokio::task::JoinSet;
 use tracing::{debug, info, warn};
 use url::Url;
 
-use crate::clickhouse::{
-    client::{ClickHouseClient, ClickHouseTableColumn},
-    encoding::{ClickHouseValue, cell_to_clickhouse_value},
-
metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}, - schema::{ - CDC_LSN_COLUMN_NAME, CDC_OPERATION_COLUMN_NAME, build_create_table_sql, - table_name_to_clickhouse_table_name, +use crate::{ + clickhouse::{ + client::{ClickHouseClient, ClickHouseTableColumn}, + encoding::{ClickHouseValue, cell_to_clickhouse_value}, + metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}, + schema::{CDC_LSN_COLUMN_NAME, CDC_OPERATION_COLUMN_NAME, build_create_table_sql}, }, + table_name::try_stringify_table_name, }; /// Postgres CDC operation kind. Written to the `cdc_operation` column as the @@ -258,9 +258,7 @@ where &self, schema: &ReplicatedTableSchema, ) -> EtlResult<(String, Arc<[bool]>)> { - let table_name = schema.name(); - let ch_table_name = - table_name_to_clickhouse_table_name(&table_name.schema, &table_name.name); + let ch_table_name = try_stringify_table_name(schema.name())?; if let Some(flags) = self.table_cache.read().get(&ch_table_name).cloned() { return Ok((ch_table_name, flags)); diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index e7c06a8ea..639d7e776 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -64,22 +64,6 @@ pub(crate) fn quote_identifier(identifier: &str) -> String { format!("\"{}\"", identifier.replace('"', "\"\"")) } -/// Converts a Postgres `schema.table` name into a single-segment ClickHouse -/// table name. -/// -/// Schema and table are joined with `_`; any literal `_` in either component -/// is doubled to `__` so the join character is unambiguous and the original -/// pair can be recovered by splitting on a single underscore. -/// -/// Examples: -/// - `public.orders` -> `public_orders` -/// - `my_schema.t` -> `my__schema_t` -pub fn table_name_to_clickhouse_table_name(schema: &str, table: &str) -> String { - let escaped_schema = schema.replace('_', "__"); - let escaped_table = table.replace('_', "__"); - format!("{escaped_schema}_{escaped_table}") -} - /// Returns the full ClickHouse type string for a column, with Nullable /// wrapping. /// @@ -129,19 +113,6 @@ mod tests { assert_eq!(quote_identifier("has\"quote"), "\"has\"\"quote\""); } - #[test] - fn table_name_escaping() { - assert_eq!(table_name_to_clickhouse_table_name("public", "orders"), "public_orders"); - assert_eq!( - table_name_to_clickhouse_table_name("my_schema", "my_table"), - "my__schema_my__table" - ); - assert_eq!( - table_name_to_clickhouse_table_name("public", "my__table"), - "public_my____table" - ); - } - #[test] fn build_create_table_sql_quotes_identifiers() { let schemas = vec![ColumnSchema { @@ -152,8 +123,9 @@ mod tests { primary_key_ordinal_position: Some(1), nullable: false, }]; - let table_name = table_name_to_clickhouse_table_name("sche\"ma", "ta\"ble"); - let sql = build_create_table_sql(&table_name, &schemas); + // Pre-encoded table name with embedded quotes to verify the SQL + // builder quotes/escapes the identifier itself. 
+ let sql = build_create_table_sql("sche\"ma_ta\"ble", &schemas); assert!( sql.contains("CREATE TABLE IF NOT EXISTS \"sche\"\"ma_ta\"\"ble\""), From 4d1de92cf550b4a52fd47168bc6d92f932746520 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 29 Apr 2026 15:45:55 +0900 Subject: [PATCH 62/86] Record DDL duration histogram for all ClickHouse DDL paths --- etl-destinations/src/clickhouse/client.rs | 22 ++++++++++++++-------- etl-destinations/src/clickhouse/core.rs | 11 ++++------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 9c14f13ac..2af4b72bb 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -9,7 +9,7 @@ use url::Url; use crate::clickhouse::{ encoding::{ClickHouseValue, encode_to_row_binary}, - metrics::ETL_CH_INSERT_DURATION_SECONDS, + metrics::{ETL_CH_DDL_DURATION_SECONDS, ETL_CH_INSERT_DURATION_SECONDS}, schema::{clickhouse_column_type, quote_identifier}, }; @@ -118,15 +118,21 @@ impl ClickHouseClient { .map_err(|e| etl_error!(ErrorKind::Unknown, "ClickHouse connectivity check failed", e)) } - /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`). - pub(crate) async fn execute_ddl(&self, sql: &str) -> EtlResult<()> { - self.inner.query(sql).execute().await.map_err(|e| { + /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`) and + /// records its duration in the `etl_ch_ddl_duration_seconds` histogram + /// labelled with `table_name`. + pub(crate) async fn execute_ddl(&self, table_name: &str, sql: &str) -> EtlResult<()> { + let ddl_start = Instant::now(); + let result = self.inner.query(sql).execute().await.map_err(|e| { etl_error!( ErrorKind::Unknown, "ClickHouse DDL failed", format!("DDL execution failed: {e}") ) - }) + }); + metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => table_name.to_string()) + .record(ddl_start.elapsed().as_secs_f64()); + result } /// Returns ClickHouse columns for a table in position order. @@ -167,13 +173,13 @@ impl ClickHouseClient { after_column: &str, ) -> EtlResult<()> { let sql = build_add_column_sql(table_name, column, after_column); - self.execute_ddl(&sql).await + self.execute_ddl(table_name, &sql).await } /// Drops a column from an existing ClickHouse table (idempotent). pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> { let sql = build_drop_column_sql(table_name, column_name); - self.execute_ddl(&sql).await + self.execute_ddl(table_name, &sql).await } /// Renames a column in an existing ClickHouse table (idempotent). @@ -188,7 +194,7 @@ impl ClickHouseClient { new_name: &str, ) -> EtlResult<()> { let sql = build_rename_column_sql(table_name, old_name, new_name); - self.execute_ddl(&sql).await + self.execute_ddl(table_name, &sql).await } /// Executes `TRUNCATE TABLE IF EXISTS` for the supplied table. 
diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 5dc3daf70..48207afa7 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -1,4 +1,4 @@ -use std::{collections::HashMap, sync::Arc, time::Instant}; +use std::{collections::HashMap, sync::Arc}; use etl::{ destination::{ @@ -23,7 +23,7 @@ use crate::{ clickhouse::{ client::{ClickHouseClient, ClickHouseTableColumn}, encoding::{ClickHouseValue, cell_to_clickhouse_value}, - metrics::{ETL_CH_DDL_DURATION_SECONDS, register_metrics}, + metrics::register_metrics, schema::{CDC_LSN_COLUMN_NAME, CDC_OPERATION_COLUMN_NAME, build_create_table_sql}, }, table_name::try_stringify_table_name, @@ -238,10 +238,7 @@ where let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let ddl = build_create_table_sql(ch_table_name, &column_schemas); - let ddl_start = Instant::now(); - self.client.execute_ddl(&ddl).await?; - metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => ch_table_name.to_string()) - .record(ddl_start.elapsed().as_secs_f64()); + self.client.execute_ddl(ch_table_name, &ddl).await?; self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; @@ -350,7 +347,7 @@ where None => { let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let ddl = build_create_table_sql(ch_table_name, &column_schemas); - self.client.execute_ddl(&ddl).await?; + self.client.execute_ddl(ch_table_name, &ddl).await?; } } From 4c9561be96e9ffcb764751982e49a9064850579a Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Wed, 29 Apr 2026 16:21:00 +0900 Subject: [PATCH 63/86] Add DDL kind label to etl_ch_ddl_duration_seconds histogram --- etl-destinations/src/clickhouse/client.rs | 45 +++++++++++++++++++---- etl-destinations/src/clickhouse/core.rs | 6 +-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index 2af4b72bb..c8ce64d7d 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -73,6 +73,28 @@ fn build_insert_rows_sql(table_name: &str) -> String { format!("INSERT INTO {table_name} FORMAT RowBinary") } +/// Kind of DDL being executed; surfaces as a `kind` label on the +/// `etl_ch_ddl_duration_seconds` histogram so per-operation latencies can be +/// distinguished (one-shot CREATE vs. online ALTER, etc.). +#[derive(Copy, Clone)] +pub(crate) enum DdlKind { + CreateTable, + AddColumn, + DropColumn, + RenameColumn, +} + +impl DdlKind { + fn as_label(self) -> &'static str { + match self { + DdlKind::CreateTable => "create_table", + DdlKind::AddColumn => "add_column", + DdlKind::DropColumn => "drop_column", + DdlKind::RenameColumn => "rename_column", + } + } +} + /// High-level ClickHouse client used by [`super::core::ClickHouseDestination`]. /// /// Wraps a [`clickhouse::Client`] and exposes typed methods for DDL, @@ -120,8 +142,13 @@ impl ClickHouseClient { /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`) and /// records its duration in the `etl_ch_ddl_duration_seconds` histogram - /// labelled with `table_name`. - pub(crate) async fn execute_ddl(&self, table_name: &str, sql: &str) -> EtlResult<()> { + /// labelled with the DDL `kind` and `table_name`. 
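+    /// (For example, the initial table creation records under
+    /// `kind="create_table"` and an online ALTER under `kind="add_column"`;
+    /// the label values come from [`DdlKind::as_label`].)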
+ pub(crate) async fn execute_ddl( + &self, + kind: DdlKind, + table_name: &str, + sql: &str, + ) -> EtlResult<()> { let ddl_start = Instant::now(); let result = self.inner.query(sql).execute().await.map_err(|e| { etl_error!( @@ -130,8 +157,12 @@ impl ClickHouseClient { format!("DDL execution failed: {e}") ) }); - metrics::histogram!(ETL_CH_DDL_DURATION_SECONDS, "table" => table_name.to_string()) - .record(ddl_start.elapsed().as_secs_f64()); + metrics::histogram!( + ETL_CH_DDL_DURATION_SECONDS, + "kind" => kind.as_label(), + "table" => table_name.to_string(), + ) + .record(ddl_start.elapsed().as_secs_f64()); result } @@ -173,13 +204,13 @@ impl ClickHouseClient { after_column: &str, ) -> EtlResult<()> { let sql = build_add_column_sql(table_name, column, after_column); - self.execute_ddl(table_name, &sql).await + self.execute_ddl(DdlKind::AddColumn, table_name, &sql).await } /// Drops a column from an existing ClickHouse table (idempotent). pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> { let sql = build_drop_column_sql(table_name, column_name); - self.execute_ddl(table_name, &sql).await + self.execute_ddl(DdlKind::DropColumn, table_name, &sql).await } /// Renames a column in an existing ClickHouse table (idempotent). @@ -194,7 +225,7 @@ impl ClickHouseClient { new_name: &str, ) -> EtlResult<()> { let sql = build_rename_column_sql(table_name, old_name, new_name); - self.execute_ddl(table_name, &sql).await + self.execute_ddl(DdlKind::RenameColumn, table_name, &sql).await } /// Executes `TRUNCATE TABLE IF EXISTS` for the supplied table. diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 48207afa7..cf46b1fb8 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -21,7 +21,7 @@ use url::Url; use crate::{ clickhouse::{ - client::{ClickHouseClient, ClickHouseTableColumn}, + client::{ClickHouseClient, ClickHouseTableColumn, DdlKind}, encoding::{ClickHouseValue, cell_to_clickhouse_value}, metrics::register_metrics, schema::{CDC_LSN_COLUMN_NAME, CDC_OPERATION_COLUMN_NAME, build_create_table_sql}, @@ -238,7 +238,7 @@ where let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let ddl = build_create_table_sql(ch_table_name, &column_schemas); - self.client.execute_ddl(ch_table_name, &ddl).await?; + self.client.execute_ddl(DdlKind::CreateTable, ch_table_name, &ddl).await?; self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; @@ -347,7 +347,7 @@ where None => { let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); let ddl = build_create_table_sql(ch_table_name, &column_schemas); - self.client.execute_ddl(ch_table_name, &ddl).await?; + self.client.execute_ddl(DdlKind::CreateTable, ch_table_name, &ddl).await?; } } From 72e32a2aa3666eee29d4cc3ccf588a5a9b0c45d5 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 11:51:48 +0900 Subject: [PATCH 64/86] Spell out 'clickhouse' in identifiers, env vars, and config keys Drops the 'ch' shorthand from ClickHouse-related metric names, K8s secret names, container env vars, YAML config keys, CLI flags, Rust constants, and local variable bindings. Matches how 'iceberg' and 'ducklake' are spelled out in the same surfaces. 
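As a sketch of the serde-level effect of the config-key rename (a minimal
reconstruction with the variant fields abridged to `url` only; the real enums
carry more fields and variants):

    #[derive(serde::Deserialize)]
    #[serde(rename_all = "snake_case")]
    enum DestinationConfig {
        #[serde(rename = "clickhouse")]
        ClickHouse { url: String },
    }

With the explicit rename, `{"clickhouse": {...}}` deserializes, while the
default snake_case key `{"click_house": {...}}` that was accepted before is
now rejected.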
--- etl-api/src/configs/destination.rs | 5 +- etl-api/src/k8s/http.rs | 24 ++-- etl-config/src/shared/destination.rs | 2 + etl-destinations/src/clickhouse/client.rs | 18 +-- etl-destinations/src/clickhouse/core.rs | 93 +++++++------ etl-destinations/src/clickhouse/metrics.rs | 8 +- etl-destinations/tests/clickhouse_pipeline.rs | 123 +++++++++--------- etl-examples/README.md | 8 +- etl-examples/src/bin/clickhouse.rs | 36 ++--- etl-replicator/configuration/base.yaml | 2 +- etl-replicator/configuration/dev.yaml | 2 +- etl-replicator/configuration/prod.yaml | 2 +- 12 files changed, 173 insertions(+), 150 deletions(-) diff --git a/etl-api/src/configs/destination.rs b/etl-api/src/configs/destination.rs index 6b0849188..066250f83 100644 --- a/etl-api/src/configs/destination.rs +++ b/etl-api/src/configs/destination.rs @@ -44,6 +44,7 @@ pub enum FullApiDestinationConfig { #[serde(skip_serializing_if = "Option::is_none")] connection_pool_size: Option, }, + #[serde(rename = "clickhouse")] ClickHouse { /// ClickHouse HTTP(S) endpoint URL. #[schema(value_type = String, example = "http://test:8123")] @@ -1124,7 +1125,7 @@ mod tests { fn full_api_destination_config_deserializes_clickhouse_url() { let json = r#" { - "click_house": { + "clickhouse": { "url": " https://example.com:8443 ", "user": "etl", "database": "analytics" @@ -1148,7 +1149,7 @@ mod tests { fn full_api_destination_config_rejects_non_http_clickhouse_url() { let json = r#" { - "click_house": { + "clickhouse": { "url": "ftp://example.com/data", "user": "etl", "database": "analytics" diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs index 6bdb4d410..6a7740227 100644 --- a/etl-api/src/k8s/http.rs +++ b/etl-api/src/k8s/http.rs @@ -22,9 +22,9 @@ use crate::{ /// Secret name suffix for the BigQuery service account key. const BQ_SECRET_NAME_SUFFIX: &str = "bq-service-account-key"; /// Secret name suffix for the ClickHouse password. -const CLICKHOUSE_SECRET_NAME_SUFFIX: &str = "ch-password"; +const CLICKHOUSE_SECRET_NAME_SUFFIX: &str = "clickhouse-password"; /// Name of the password in the ClickHouse secret and its reference. -const CLICKHOUSE_PASSWORD_NAME: &str = "ch-password"; +const CLICKHOUSE_PASSWORD_NAME: &str = "clickhouse-password"; /// Name of the service account key in the BigQuery secret and its reference. 
const BQ_SERVICE_ACCOUNT_KEY_NAME: &str = "service-account-key"; /// Secret name suffix for iceberg secrets (includes catalog token, @@ -345,13 +345,13 @@ impl K8sClient for HttpK8sClient { debug!("patching clickhouse secret"); if let Some(password) = password { - let encoded_ch_password = BASE64_STANDARD.encode(password); + let encoded_clickhouse_password = BASE64_STANDARD.encode(password); let clickhouse_secret_name = create_clickhouse_secret_name(prefix); let replicator_app_name = create_replicator_app_name(prefix); let clickhouse_secret_json = create_clickhouse_password_secret_json( &clickhouse_secret_name, &replicator_app_name, - &encoded_ch_password, + &encoded_clickhouse_password, ); let secret: Secret = serde_json::from_value(clickhouse_secret_json)?; @@ -442,9 +442,11 @@ impl K8sClient for HttpK8sClient { async fn delete_clickhouse_secret(&self, prefix: &str) -> Result<(), K8sError> { debug!("deleting clickhouse secret"); - let ch_secret_name = create_clickhouse_secret_name(prefix); + let clickhouse_secret_name = create_clickhouse_secret_name(prefix); let dp = DeleteParams::default(); - Self::handle_delete_with_404_ignore(self.secrets_api.delete(&ch_secret_name, &dp).await)?; + Self::handle_delete_with_404_ignore( + self.secrets_api.delete(&clickhouse_secret_name, &dp).await, + )?; Ok(()) } @@ -712,7 +714,7 @@ fn create_postgres_secret_json( fn create_clickhouse_password_secret_json( secret_name: &str, replicator_app_name: &str, - encoded_ch_password: &str, + encoded_clickhouse_password: &str, ) -> serde_json::Value { json!({ "apiVersion": "v1", @@ -727,7 +729,7 @@ fn create_clickhouse_password_secret_json( }, "type": "Opaque", "data": { - CLICKHOUSE_PASSWORD_NAME: encoded_ch_password, + CLICKHOUSE_PASSWORD_NAME: encoded_clickhouse_password, } }) } @@ -1107,7 +1109,7 @@ fn create_bq_secret_env_var_json(bq_secret_name: &str) -> serde_json::Value { fn create_clickhouse_secret_env_var_json(clickhouse_secret_name: &str) -> serde_json::Value { json!({ - "name": "APP_DESTINATION__CLICK_HOUSE__PASSWORD", + "name": "APP_DESTINATION__CLICKHOUSE__PASSWORD", "valueFrom": { "secretKeyRef": { "name": clickhouse_secret_name, @@ -1669,7 +1671,7 @@ mod tests { )); assert!(container_environment_has_var( &container_environment, - "APP_DESTINATION__CLICK_HOUSE__PASSWORD", + "APP_DESTINATION__CLICKHOUSE__PASSWORD", )); } @@ -1692,7 +1694,7 @@ mod tests { )); assert!(!container_environment_has_var( &container_environment, - "APP_DESTINATION__CLICK_HOUSE__PASSWORD", + "APP_DESTINATION__CLICKHOUSE__PASSWORD", )); } diff --git a/etl-config/src/shared/destination.rs b/etl-config/src/shared/destination.rs index de48e8908..b3cd5d3c0 100644 --- a/etl-config/src/shared/destination.rs +++ b/etl-config/src/shared/destination.rs @@ -48,6 +48,7 @@ pub enum DestinationConfig { #[serde(default = "default_connection_pool_size")] connection_pool_size: usize, }, + #[serde(rename = "clickhouse")] ClickHouse { /// ClickHouse HTTP(S) endpoint URL. url: Url, @@ -243,6 +244,7 @@ pub enum DestinationConfigWithoutSecrets { #[serde(default = "default_connection_pool_size")] connection_pool_size: usize, }, + #[serde(rename = "clickhouse")] ClickHouse { /// ClickHouse HTTP(S) endpoint URL. 
url: Url, diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs index c8ce64d7d..625c41fb7 100644 --- a/etl-destinations/src/clickhouse/client.rs +++ b/etl-destinations/src/clickhouse/client.rs @@ -9,7 +9,7 @@ use url::Url; use crate::clickhouse::{ encoding::{ClickHouseValue, encode_to_row_binary}, - metrics::{ETL_CH_DDL_DURATION_SECONDS, ETL_CH_INSERT_DURATION_SECONDS}, + metrics::{ETL_CLICKHOUSE_DDL_DURATION_SECONDS, ETL_CLICKHOUSE_INSERT_DURATION_SECONDS}, schema::{clickhouse_column_type, quote_identifier}, }; @@ -74,8 +74,8 @@ fn build_insert_rows_sql(table_name: &str) -> String { } /// Kind of DDL being executed; surfaces as a `kind` label on the -/// `etl_ch_ddl_duration_seconds` histogram so per-operation latencies can be -/// distinguished (one-shot CREATE vs. online ALTER, etc.). +/// `etl_clickhouse_ddl_duration_seconds` histogram so per-operation latencies +/// can be distinguished (one-shot CREATE vs. online ALTER, etc.). #[derive(Copy, Clone)] pub(crate) enum DdlKind { CreateTable, @@ -141,8 +141,8 @@ impl ClickHouseClient { } /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`) and - /// records its duration in the `etl_ch_ddl_duration_seconds` histogram - /// labelled with the DDL `kind` and `table_name`. + /// records its duration in the `etl_clickhouse_ddl_duration_seconds` + /// histogram labelled with the DDL `kind` and `table_name`. pub(crate) async fn execute_ddl( &self, kind: DdlKind, @@ -158,7 +158,7 @@ impl ClickHouseClient { ) }); metrics::histogram!( - ETL_CH_DDL_DURATION_SECONDS, + ETL_CLICKHOUSE_DDL_DURATION_SECONDS, "kind" => kind.as_label(), "table" => table_name.to_string(), ) @@ -251,8 +251,8 @@ impl ClickHouseClient { /// initial copies. /// /// The `source` label (`"copy"` or `"streaming"`) is attached to the - /// `etl_ch_insert_duration_seconds` histogram recorded after each committed - /// INSERT statement. + /// `etl_clickhouse_insert_duration_seconds` histogram recorded after each + /// committed INSERT statement. 
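+    ///
+    /// Illustrative series (hypothetical table name):
+    ///
+    /// etl_clickhouse_insert_duration_seconds{table="public_users", source="copy"}
+    /// etl_clickhouse_insert_duration_seconds{table="public_users", source="streaming"}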
pub(crate) async fn insert_rows( &self, table_name: &str, @@ -289,7 +289,7 @@ impl ClickHouseClient { ) })?; metrics::histogram!( - ETL_CH_INSERT_DURATION_SECONDS, + ETL_CLICKHOUSE_INSERT_DURATION_SECONDS, "table" => table_name.to_string(), "source" => source ) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index cf46b1fb8..792446783 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -100,7 +100,7 @@ fn expected_clickhouse_column_names(schema: &ReplicatedTableSchema) -> Vec EtlResult> { @@ -110,7 +110,7 @@ fn nullable_flags_from_clickhouse_columns( "ClickHouse table schema does not match replicated schema", format!( "table '{}' has {} columns, but {} were expected", - ch_table_name, + clickhouse_table_name, actual_columns.len(), expected_column_names.len() ) @@ -127,7 +127,7 @@ fn nullable_flags_from_clickhouse_columns( "ClickHouse table schema does not match replicated schema", format!( "table '{}' column {} is named '{}', but '{}' was expected", - ch_table_name, + clickhouse_table_name, index + 1, actual_column.name, expected_name @@ -224,21 +224,21 @@ where async fn create_table_with_metadata( &self, table_id: TableId, - ch_table_name: &str, + clickhouse_table_name: &str, schema: &ReplicatedTableSchema, snapshot_id: etl::types::SnapshotId, replication_mask: etl::types::ReplicationMask, ) -> EtlResult<()> { let metadata = DestinationTableMetadata::new_applying( - ch_table_name.to_string(), + clickhouse_table_name.to_string(), snapshot_id, replication_mask, ); self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); - let ddl = build_create_table_sql(ch_table_name, &column_schemas); - self.client.execute_ddl(DdlKind::CreateTable, ch_table_name, &ddl).await?; + let ddl = build_create_table_sql(clickhouse_table_name, &column_schemas); + self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?; self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; @@ -246,7 +246,7 @@ where } /// Ensures the ClickHouse table for the given schema exists, returning - /// `(ch_table_name, nullable_flags)`. + /// `(clickhouse_table_name, nullable_flags)`. /// /// On first encounter, executes `CREATE TABLE IF NOT EXISTS` and stores /// destination metadata with `Applied` status. 
Subsequent calls return @@ -255,10 +255,10 @@ where &self, schema: &ReplicatedTableSchema, ) -> EtlResult<(String, Arc<[bool]>)> { - let ch_table_name = try_stringify_table_name(schema.name())?; + let clickhouse_table_name = try_stringify_table_name(schema.name())?; - if let Some(flags) = self.table_cache.read().get(&ch_table_name).cloned() { - return Ok((ch_table_name, flags)); + if let Some(flags) = self.table_cache.read().get(&clickhouse_table_name).cloned() { + return Ok((clickhouse_table_name, flags)); } let table_id = schema.id(); @@ -266,7 +266,7 @@ where None => { self.create_table_with_metadata( table_id, - &ch_table_name, + &clickhouse_table_name, schema, schema.inner().snapshot_id, schema.replication_mask().clone(), @@ -275,8 +275,13 @@ where } Some(metadata) => { if metadata.is_applying() { - self.recover_applying_metadata(table_id, &ch_table_name, schema, metadata) - .await?; + self.recover_applying_metadata( + table_id, + &clickhouse_table_name, + schema, + metadata, + ) + .await?; } // Otherwise the metadata is already `Applied`: this branch // runs after `handle_relation_event` invalidated the cache, @@ -289,10 +294,10 @@ where // `ALTER TABLE ADD COLUMN`: ClickHouse scalar columns are forced to // `Nullable(T)` even when the Postgres column is `NOT NULL`, so RowBinary must // include the nullable marker byte ClickHouse expects. - let actual_columns = self.client.table_columns(&ch_table_name).await?; + let actual_columns = self.client.table_columns(&clickhouse_table_name).await?; let expected_column_names = expected_clickhouse_column_names(schema); let nullable_flags = nullable_flags_from_clickhouse_columns( - &ch_table_name, + &clickhouse_table_name, &expected_column_names, &actual_columns, )?; @@ -302,11 +307,13 @@ where let flags = { let mut guard = self.table_cache.write(); Arc::clone( - guard.entry(ch_table_name.clone()).or_insert_with(|| Arc::clone(&nullable_flags)), + guard + .entry(clickhouse_table_name.clone()) + .or_insert_with(|| Arc::clone(&nullable_flags)), ) }; - Ok((ch_table_name, flags)) + Ok((clickhouse_table_name, flags)) } /// Re-runs an interrupted DDL idempotently and transitions metadata to @@ -316,7 +323,7 @@ where async fn recover_applying_metadata( &self, table_id: TableId, - ch_table_name: &str, + clickhouse_table_name: &str, schema: &ReplicatedTableSchema, metadata: DestinationTableMetadata, ) -> EtlResult<()> { @@ -342,12 +349,12 @@ where metadata.replication_mask.clone(), ); let diff = old_schema.diff(schema); - self.apply_schema_diff(ch_table_name, &diff, &old_schema).await?; + self.apply_schema_diff(clickhouse_table_name, &diff, &old_schema).await?; } None => { let column_schemas: Vec<_> = schema.column_schemas().cloned().collect(); - let ddl = build_create_table_sql(ch_table_name, &column_schemas); - self.client.execute_ddl(DdlKind::CreateTable, ch_table_name, &ddl).await?; + let ddl = build_create_table_sql(clickhouse_table_name, &column_schemas); + self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?; } } @@ -356,8 +363,8 @@ where } async fn truncate_table_inner(&self, schema: &ReplicatedTableSchema) -> EtlResult<()> { - let (ch_table_name, _) = self.ensure_table_exists(schema).await?; - self.client.truncate_table(&ch_table_name).await + let (clickhouse_table_name, _) = self.ensure_table_exists(schema).await?; + self.client.truncate_table(&clickhouse_table_name).await } async fn write_table_rows_inner( @@ -365,7 +372,7 @@ where schema: &ReplicatedTableSchema, table_rows: Vec, ) -> EtlResult<()> { - let 
(ch_table_name, nullable_flags) = self.ensure_table_exists(schema).await?; + let (clickhouse_table_name, nullable_flags) = self.ensure_table_exists(schema).await?; let rows: Vec> = table_rows .into_iter() @@ -382,7 +389,7 @@ where self.client .insert_rows( - &ch_table_name, + &clickhouse_table_name, rows, &nullable_flags, self.inserter_config.max_bytes_per_insert, @@ -449,11 +456,11 @@ where current_replication_mask.clone(), ); - let ch_table_name = &metadata.destination_table_id; + let clickhouse_table_name = &metadata.destination_table_id; // Mark as Applying before DDL changes. let updated_metadata = DestinationTableMetadata::new_applied( - ch_table_name.clone(), + clickhouse_table_name.clone(), current_snapshot_id, current_replication_mask, ) @@ -466,7 +473,9 @@ where // Compute and apply the diff. let diff = current_schema.diff(new_schema); - if let Err(err) = self.apply_schema_diff(ch_table_name, &diff, ¤t_schema).await { + if let Err(err) = + self.apply_schema_diff(clickhouse_table_name, &diff, ¤t_schema).await + { warn!( "schema change failed for table {}: {}. Manual intervention may be required.", table_id, err @@ -482,7 +491,7 @@ where // Invalidate cached nullable flags so the next write recomputes them. { let mut guard = self.table_cache.write(); - guard.remove(ch_table_name); + guard.remove(clickhouse_table_name); } info!( @@ -515,7 +524,7 @@ where /// status tracks this for diagnostic purposes. async fn apply_schema_diff( &self, - ch_table_name: &str, + clickhouse_table_name: &str, diff: &SchemaDiff, current_schema: &ReplicatedTableSchema, ) -> EtlResult<()> { @@ -530,16 +539,18 @@ where current_schema.column_schemas().last().map(|c| c.name.clone()).unwrap_or_default(); for column in &diff.columns_to_add { - self.client.add_column(ch_table_name, column, &last_user_column).await?; + self.client.add_column(clickhouse_table_name, column, &last_user_column).await?; last_user_column = column.name.clone(); } for rename in &diff.columns_to_rename { - self.client.rename_column(ch_table_name, &rename.old_name, &rename.new_name).await?; + self.client + .rename_column(clickhouse_table_name, &rename.old_name, &rename.new_name) + .await?; } for column in &diff.columns_to_remove { - self.client.drop_column(ch_table_name, &column.name).await?; + self.client.drop_column(clickhouse_table_name, &column.name).await?; } Ok(()) @@ -668,12 +679,12 @@ where let mut prepared: Vec<(String, Arc<[bool]>, Vec)> = Vec::with_capacity(pending.len()); for (_, (schema, rows)) in pending { - let (ch_table_name, nullable_flags) = self.ensure_table_exists(&schema).await?; - prepared.push((ch_table_name, nullable_flags, rows)); + let (clickhouse_table_name, nullable_flags) = self.ensure_table_exists(&schema).await?; + prepared.push((clickhouse_table_name, nullable_flags, rows)); } let mut join_set: JoinSet> = JoinSet::new(); - for (ch_table_name, nullable_flags, rows) in prepared { + for (clickhouse_table_name, nullable_flags, rows) in prepared { let client = self.client.clone(); let max_bytes = self.inserter_config.max_bytes_per_insert; @@ -690,7 +701,13 @@ where .collect(); client - .insert_rows(&ch_table_name, rows, &nullable_flags, max_bytes, "streaming") + .insert_rows( + &clickhouse_table_name, + rows, + &nullable_flags, + max_bytes, + "streaming", + ) .await }); } diff --git a/etl-destinations/src/clickhouse/metrics.rs b/etl-destinations/src/clickhouse/metrics.rs index 61be889bf..df480c04a 100644 --- a/etl-destinations/src/clickhouse/metrics.rs +++ b/etl-destinations/src/clickhouse/metrics.rs @@ -6,12 
+6,12 @@ static REGISTER_METRICS: Once = Once::new(); /// Duration of `CREATE TABLE IF NOT EXISTS` DDL operations sent to ClickHouse. /// Labels: `table`. -pub const ETL_CH_DDL_DURATION_SECONDS: &str = "etl_ch_ddl_duration_seconds"; +pub const ETL_CLICKHOUSE_DDL_DURATION_SECONDS: &str = "etl_clickhouse_ddl_duration_seconds"; /// Duration of a single RowBinary INSERT statement from first write to server /// acknowledgement. Labels: `table`, `source` (`copy` = initial table sync, /// `streaming` = CDC events). -pub const ETL_CH_INSERT_DURATION_SECONDS: &str = "etl_ch_insert_duration_seconds"; +pub const ETL_CLICKHOUSE_INSERT_DURATION_SECONDS: &str = "etl_clickhouse_insert_duration_seconds"; /// Register ClickHouse-specific metrics. /// @@ -19,14 +19,14 @@ pub const ETL_CH_INSERT_DURATION_SECONDS: &str = "etl_ch_insert_duration_seconds pub fn register_metrics() { REGISTER_METRICS.call_once(|| { describe_histogram!( - ETL_CH_DDL_DURATION_SECONDS, + ETL_CLICKHOUSE_DDL_DURATION_SECONDS, Unit::Seconds, "Duration of CREATE TABLE IF NOT EXISTS DDL operations sent to ClickHouse, labeled by \ table" ); describe_histogram!( - ETL_CH_INSERT_DURATION_SECONDS, + ETL_CLICKHOUSE_INSERT_DURATION_SECONDS, Unit::Seconds, "Duration of RowBinary INSERT statements from first write to server acknowledgement, \ labeled by table and source" diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index d8cbbb597..198cbeb52 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -109,12 +109,12 @@ const TS_2024_01_15_12_00_US: i64 = 1_705_320_000_000_000; /// Waits until ClickHouse returns at least `expected_rows` from /// `UPDATE_FLOW_SELECT`. async fn wait_for_update_flow_rows( - ch_db: &ClickHouseTestDatabase, + clickhouse_db: &ClickHouseTestDatabase, expected_rows: usize, ) -> Vec { let mut rows: Vec = Vec::with_capacity(expected_rows); for _ in 0..50 { - rows = ch_db.query(UPDATE_FLOW_SELECT).await; + rows = clickhouse_db.query(UPDATE_FLOW_SELECT).await; if rows.len() >= expected_rows { return rows; } @@ -131,12 +131,12 @@ async fn wait_for_update_flow_rows( /// Waits until ClickHouse returns at least `expected_rows` from /// `DELETE_FLOW_SELECT`. async fn wait_for_delete_flow_rows( - ch_db: &ClickHouseTestDatabase, + clickhouse_db: &ClickHouseTestDatabase, expected_rows: usize, ) -> Vec { let mut rows: Vec = Vec::with_capacity(expected_rows); for _ in 0..50 { - rows = ch_db.query(DELETE_FLOW_SELECT).await; + rows = clickhouse_db.query(DELETE_FLOW_SELECT).await; if rows.len() >= expected_rows { return rows; } @@ -153,12 +153,12 @@ async fn wait_for_delete_flow_rows( /// Waits until ClickHouse returns at least `expected_rows` from /// `RESTART_FLOW_SELECT`. async fn wait_for_restart_flow_rows( - ch_db: &ClickHouseTestDatabase, + clickhouse_db: &ClickHouseTestDatabase, expected_rows: usize, ) -> Vec { let mut rows: Vec = Vec::with_capacity(expected_rows); for _ in 0..50 { - rows = ch_db.query(RESTART_FLOW_SELECT).await; + rows = clickhouse_db.query(RESTART_FLOW_SELECT).await; if rows.len() >= expected_rows { return rows; } @@ -174,9 +174,9 @@ async fn wait_for_restart_flow_rows( /// Waits until ClickHouse returns exactly zero rows from /// `TRUNCATE_FLOW_SELECT`. 
-async fn wait_for_truncate_flow_empty(ch_db: &ClickHouseTestDatabase) { +async fn wait_for_truncate_flow_empty(clickhouse_db: &ClickHouseTestDatabase) { for _ in 0..50 { - let rows: Vec = ch_db.query(TRUNCATE_FLOW_SELECT).await; + let rows: Vec = clickhouse_db.query(TRUNCATE_FLOW_SELECT).await; if rows.is_empty() { return; } @@ -189,12 +189,12 @@ async fn wait_for_truncate_flow_empty(ch_db: &ClickHouseTestDatabase) { /// Waits until ClickHouse returns at least `expected_rows` from /// `TRUNCATE_FLOW_SELECT`. async fn wait_for_truncate_flow_rows( - ch_db: &ClickHouseTestDatabase, + clickhouse_db: &ClickHouseTestDatabase, expected_rows: usize, ) -> Vec { let mut rows: Vec = Vec::with_capacity(expected_rows); for _ in 0..50 { - rows = ch_db.query(TRUNCATE_FLOW_SELECT).await; + rows = clickhouse_db.query(TRUNCATE_FLOW_SELECT).await; if rows.len() >= expected_rows { return rows; } @@ -356,10 +356,10 @@ async fn all_types_table_copy() { .expect("Failed to insert row 2"); // --- WHEN: pipeline copies data to ClickHouse --- - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -377,7 +377,7 @@ async fn all_types_table_copy() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: every column round-trips correctly --- - let rows: Vec = ch_db.query(ALL_TYPES_SELECT).await; + let rows: Vec = clickhouse_db.query(ALL_TYPES_SELECT).await; assert_eq!(rows.len(), 2, "expected 2 rows in ClickHouse"); @@ -486,10 +486,10 @@ async fn updates_are_streamed_to_clickhouse() { .expect("Failed to insert initial update_flow row"); // --- WHEN: pipeline copies data, then an UPDATE is streamed --- - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -513,7 +513,7 @@ async fn updates_are_streamed_to_clickhouse() { .await .expect("Failed to update update_flow row"); - let rows = wait_for_update_flow_rows(&ch_db, 2).await; + let rows = wait_for_update_flow_rows(&clickhouse_db, 2).await; pipeline.shutdown_and_wait().await.unwrap(); @@ -641,10 +641,10 @@ async fn boundary_values_table_copy() { .expect("Failed to insert row 4 (multi-byte UTF-8)"); // --- WHEN: pipeline copies data to ClickHouse --- - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -662,7 +662,7 @@ async fn boundary_values_table_copy() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: ClickHouse data matches Postgres exactly --- - let rows: Vec = ch_db.query(BOUNDARY_VALUES_SELECT).await; + let rows: Vec = clickhouse_db.query(BOUNDARY_VALUES_SELECT).await; assert_eq!(rows.len(), 
4, "expected 4 rows in ClickHouse"); // Row 1: NULL scalars stay NULL, empty arrays stay empty. @@ -775,10 +775,10 @@ async fn deletes_are_streamed_to_clickhouse() { // --- WHEN: pipeline copies data, then a DELETE is streamed --- - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -799,7 +799,7 @@ async fn deletes_are_streamed_to_clickhouse() { .await .expect("Failed to delete delete_flow row"); - let rows = wait_for_delete_flow_rows(&ch_db, 3).await; + let rows = wait_for_delete_flow_rows(&clickhouse_db, 3).await; pipeline.shutdown_and_wait().await.unwrap(); @@ -878,10 +878,10 @@ async fn pipeline_restart_resumes_streaming() { .await .expect("Failed to insert initial restart_flow row"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -899,13 +899,13 @@ async fn pipeline_restart_resumes_streaming() { pipeline.shutdown_and_wait().await.unwrap(); // Verify first run produced exactly one row. - let rows: Vec = ch_db.query(RESTART_FLOW_SELECT).await; + let rows: Vec = clickhouse_db.query(RESTART_FLOW_SELECT).await; assert_eq!(rows.len(), 1, "first run should copy exactly one row"); assert_eq!(rows[0].id, 1); assert_eq!(rows[0].value, "before_restart"); // --- WHEN: rebuild destination and pipeline, then stream a new insert --- - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let mut pipeline = create_pipeline( &database.config, @@ -925,7 +925,7 @@ async fn pipeline_restart_resumes_streaming() { .await .expect("Failed to insert post-restart row"); - let rows = wait_for_restart_flow_rows(&ch_db, 2).await; + let rows = wait_for_restart_flow_rows(&clickhouse_db, 2).await; pipeline.shutdown_and_wait().await.unwrap(); @@ -997,10 +997,10 @@ async fn truncate_clears_table_and_accepts_new_inserts() { .await .expect("Failed to insert truncate_flow rows"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -1017,7 +1017,7 @@ async fn truncate_clears_table_and_accepts_new_inserts() { table_ready.notified().await; // Verify both rows arrived from table copy. 
- let rows: Vec = ch_db.query(TRUNCATE_FLOW_SELECT).await; + let rows: Vec = clickhouse_db.query(TRUNCATE_FLOW_SELECT).await; assert_eq!(rows.len(), 2, "table copy should produce two rows"); // --- WHEN: truncate, then insert a new row --- @@ -1026,7 +1026,7 @@ async fn truncate_clears_table_and_accepts_new_inserts() { .await .expect("Failed to truncate table in Postgres"); - wait_for_truncate_flow_empty(&ch_db).await; + wait_for_truncate_flow_empty(&clickhouse_db).await; database .run_sql(&format!( @@ -1036,7 +1036,7 @@ async fn truncate_clears_table_and_accepts_new_inserts() { .await .expect("Failed to insert post-truncate row"); - let rows = wait_for_truncate_flow_rows(&ch_db, 1).await; + let rows = wait_for_truncate_flow_rows(&clickhouse_db, 1).await; pipeline.shutdown_and_wait().await.unwrap(); @@ -1104,10 +1104,10 @@ async fn intermediate_flush_preserves_all_rows() { .await .expect("Failed to insert flush_split rows"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination_with_config( + let destination = clickhouse_db.build_destination_with_config( store.clone(), ClickHouseInserterConfig { // 1 byte -- forces a new INSERT after every row. @@ -1132,7 +1132,7 @@ async fn intermediate_flush_preserves_all_rows() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: all rows arrive despite being split across many INSERTs --- - let rows: Vec = ch_db.query(FLUSH_SPLIT_SELECT).await; + let rows: Vec = clickhouse_db.query(FLUSH_SPLIT_SELECT).await; assert_eq!(rows.len(), row_count, "all rows must survive intermediate flush splits"); for (i, r) in rows.iter().enumerate() { @@ -1207,10 +1207,10 @@ async fn multiple_tables_receive_independent_writes() { .await .expect("Failed to insert into multi_b"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_a_ready = store.notify_on_table_state_type(table_a_id, TableReplicationPhaseType::Ready).await; @@ -1260,8 +1260,8 @@ async fn multiple_tables_receive_independent_writes() { let mut rows_a: Vec = Vec::with_capacity(2); let mut rows_b: Vec = Vec::with_capacity(2); for _ in 0..50 { - rows_a = ch_db.query(select_a).await; - rows_b = ch_db.query(select_b).await; + rows_a = clickhouse_db.query(select_a).await; + rows_b = clickhouse_db.query(select_b).await; if rows_a.len() >= 2 && rows_b.len() >= 2 { break; } @@ -1358,10 +1358,10 @@ async fn sequential_transactions_preserve_commit_order() { .await .expect("Failed to insert initial tx_order row"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -1399,7 +1399,7 @@ async fn sequential_transactions_preserve_commit_order() { // Poll until all three rows arrive. 
let mut rows: Vec = Vec::with_capacity(3); for _ in 0..50 { - rows = ch_db.query(TX_ORDER_SELECT).await; + rows = clickhouse_db.query(TX_ORDER_SELECT).await; if rows.len() >= 3 { break; } @@ -1567,10 +1567,10 @@ async fn delete_with_default_replica_identity() { .await .expect("Failed to insert rows"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -1615,7 +1615,7 @@ async fn delete_with_default_replica_identity() { // Poll for 4 rows: 2 copied INSERTs + DELETE tombstone + new INSERT. let mut rows: Vec = Vec::new(); for _ in 0..50 { - rows = ch_db.query(DEFAULT_IDENTITY_DELETE_SELECT).await; + rows = clickhouse_db.query(DEFAULT_IDENTITY_DELETE_SELECT).await; if rows.len() >= 4 { break; } @@ -1743,10 +1743,10 @@ async fn exclusive_large_batch_table_copy() { .expect("Failed to insert large_batch rows"); } - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -1765,7 +1765,7 @@ async fn exclusive_large_batch_table_copy() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: all rows arrive, spot-check a sample --- - let rows: Vec = ch_db.query(LARGE_BATCH_SELECT).await; + let rows: Vec = clickhouse_db.query(LARGE_BATCH_SELECT).await; assert_eq!(rows.len(), row_count, "all 1024 rows must arrive"); // Spot-check: first, last, powers of two, and a few interior points. @@ -1854,7 +1854,7 @@ async fn schema_change_add_column() { init_test_tracing(); install_crypto_provider(); - let ch_table_name = "test_schema__add__col"; + let clickhouse_table_name = "test_schema__add__col"; // --- GIVEN: table with one row, copied to ClickHouse --- let database = spawn_source_database().await; @@ -1883,10 +1883,10 @@ async fn schema_change_add_column() { .await .expect("Failed to insert Alice"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -1903,7 +1903,7 @@ async fn schema_change_add_column() { table_ready.notified().await; // Verify initial state. - let initial_columns = ch_db.column_names(ch_table_name).await; + let initial_columns = clickhouse_db.column_names(clickhouse_table_name).await; assert_eq!(initial_columns, vec!["id", "name", "age"]); let initial_metadata = store @@ -1947,7 +1947,7 @@ async fn schema_change_add_column() { for _ in 0..50 { // The SELECT will fail if the email column doesn't exist yet, so // catch errors and retry. 
- if let Ok(r) = ch_db.db_client().query(select).fetch_all::().await + if let Ok(r) = clickhouse_db.db_client().query(select).fetch_all::().await && r.len() >= 2 { rows = r; @@ -1964,10 +1964,10 @@ async fn schema_change_add_column() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: ClickHouse has the new columns, both rows present --- - let final_columns = ch_db.column_names(ch_table_name).await; + let final_columns = clickhouse_db.column_names(clickhouse_table_name).await; assert_eq!(final_columns, vec!["id", "name", "age", "email", "score"]); - let final_column_types = ch_db.column_types(ch_table_name).await; + let final_column_types = clickhouse_db.column_types(clickhouse_table_name).await; assert_eq!( final_column_types, vec![ @@ -2053,7 +2053,7 @@ async fn schema_change_add_drop_rename() { init_test_tracing(); install_crypto_provider(); - let ch_table_name = "test_schema__multi"; + let clickhouse_table_name = "test_schema__multi"; // --- GIVEN: table with one row, copied to ClickHouse --- let database = spawn_source_database().await; @@ -2082,10 +2082,10 @@ async fn schema_change_add_drop_rename() { .await .expect("Failed to insert Alice"); - let ch_db = setup_clickhouse_database().await; + let clickhouse_db = setup_clickhouse_database().await; let store = NotifyingStore::new(); let pipeline_id: PipelineId = random(); - let destination = ch_db.build_destination(store.clone()); + let destination = clickhouse_db.build_destination(store.clone()); let table_ready = store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await; @@ -2102,7 +2102,7 @@ async fn schema_change_add_drop_rename() { table_ready.notified().await; // Verify initial schema. - let initial_columns = ch_db.column_names(ch_table_name).await; + let initial_columns = clickhouse_db.column_names(clickhouse_table_name).await; assert_eq!(initial_columns, vec!["id", "name", "age", "status"]); let initial_metadata = store @@ -2151,7 +2151,8 @@ async fn schema_change_add_drop_rename() { ); let mut rows: Vec = Vec::new(); for _ in 0..50 { - if let Ok(r) = ch_db.db_client().query(select).fetch_all::().await + if let Ok(r) = + clickhouse_db.db_client().query(select).fetch_all::().await && r.len() >= 2 { rows = r; @@ -2168,7 +2169,7 @@ async fn schema_change_add_drop_rename() { pipeline.shutdown_and_wait().await.unwrap(); // --- THEN: ClickHouse schema reflects all changes --- - let final_columns = ch_db.column_names(ch_table_name).await; + let final_columns = clickhouse_db.column_names(clickhouse_table_name).await; assert_eq!(final_columns, vec!["id", "full_name", "status", "email"]); assert_eq!(rows.len(), 2, "expected Alice + Bob"); diff --git a/etl-examples/README.md b/etl-examples/README.md index aa72955f9..ca44c2b5e 100644 --- a/etl-examples/README.md +++ b/etl-examples/README.md @@ -116,9 +116,9 @@ cargo run -p etl-examples --bin clickhouse -- \ --db-name postgres \ --db-username postgres \ --db-password password \ - --ch-url http://localhost:8123 \ - --ch-user default \ - --ch-database default \ + --clickhouse-url http://localhost:8123 \ + --clickhouse-user default \ + --clickhouse-database default \ --publication my_pub ``` @@ -132,7 +132,7 @@ Table names are derived from the Postgres schema and table name using double-und escaping (e.g. `public.orders` → `public_orders`, `my_schema.t` → `my__schema_t`). For HTTPS connections, provide an `https://` URL — TLS is handled automatically using -webpki root certificates. 
Use `--ch-password` if your ClickHouse instance requires +webpki root certificates. Use `--clickhouse-password` if your ClickHouse instance requires authentication. ### Example configuration diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index aa0eed567..1afe70a76 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -25,13 +25,13 @@ Usage: --db-name postgres \ --db-username postgres \ --db-password password \ - --ch-url http://localhost:8123 \ - --ch-user default \ - --ch-database default \ + --clickhouse-url http://localhost:8123 \ + --clickhouse-user default \ + --clickhouse-database default \ --publication my_pub For HTTPS connections, provide an `https://` URL — TLS is handled automatically -using webpki root certificates. Use `--ch-password` if your ClickHouse instance +using webpki root certificates. Use `--clickhouse-password` if your ClickHouse instance requires authentication. */ @@ -75,7 +75,7 @@ struct AppArgs { db_args: DbArgs, /// ClickHouse destination parameters #[clap(flatten)] - ch_args: ChArgs, + clickhouse_args: ClickHouseArgs, /// Postgres publication name (must be created beforehand with CREATE /// PUBLICATION) #[arg(long)] @@ -104,19 +104,19 @@ struct DbArgs { /// ClickHouse destination configuration. #[derive(Debug, Args)] -struct ChArgs { +struct ClickHouseArgs { /// ClickHouse HTTP(S) endpoint (e.g. http://localhost:8123 or https://host:8443) #[arg(long)] - ch_url: String, + clickhouse_url: String, /// ClickHouse user name #[arg(long)] - ch_user: String, + clickhouse_user: String, /// ClickHouse user password (optional) #[arg(long)] - ch_password: Option, + clickhouse_password: Option, /// ClickHouse target database #[arg(long)] - ch_database: String, + clickhouse_database: String, /// Maximum time to wait for a batch to fill in milliseconds (lower values = /// lower latency, less throughput) #[arg(long, default_value = "5000")] @@ -164,7 +164,7 @@ async fn main_impl() -> Result<(), Box> { set_log_level(); init_tracing(); - // Install required crypto provider for TLS (used when ch_url is https://) + // Install required crypto provider for TLS (used when clickhouse_url is https://) install_crypto_provider(); let args = AppArgs::parse(); @@ -193,12 +193,12 @@ async fn main_impl() -> Result<(), Box> { publication_name: args.publication, pg_connection: pg_connection_config, batch: BatchConfig { - max_fill_ms: args.ch_args.max_batch_fill_duration_ms, + max_fill_ms: args.clickhouse_args.max_batch_fill_duration_ms, memory_budget_ratio: BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO, }, table_error_retry_delay_ms: 10000, table_error_retry_max_attempts: 5, - max_table_sync_workers: args.ch_args.max_table_sync_workers, + max_table_sync_workers: args.clickhouse_args.max_table_sync_workers, memory_refresh_interval_ms: 100, memory_backpressure: Some(MemoryBackpressureConfig::default()), table_sync_copy: TableSyncCopyConfig::default(), @@ -218,16 +218,16 @@ async fn main_impl() -> Result<(), Box> { }; let budget = (total_memory as f64 * f64::from(BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO)) as u64; - (budget / u64::from(args.ch_args.max_table_sync_workers)).max(1) + (budget / u64::from(args.clickhouse_args.max_table_sync_workers)).max(1) }; // Initialize the ClickHouse destination. // Tables are created automatically as append-only MergeTree tables. 
let clickhouse_destination = ClickHouseDestination::new( - Url::parse(&args.ch_args.ch_url)?, - args.ch_args.ch_user, - args.ch_args.ch_password, - args.ch_args.ch_database, + Url::parse(&args.clickhouse_args.clickhouse_url)?, + args.clickhouse_args.clickhouse_user, + args.clickhouse_args.clickhouse_password, + args.clickhouse_args.clickhouse_database, ClickHouseInserterConfig { max_bytes_per_insert }, store.clone(), )?; diff --git a/etl-replicator/configuration/base.yaml b/etl-replicator/configuration/base.yaml index 14fec2eea..1618d4c8f 100644 --- a/etl-replicator/configuration/base.yaml +++ b/etl-replicator/configuration/base.yaml @@ -2,7 +2,7 @@ application: host: "[::]" port: 8080 destination: - click_house: + clickhouse: url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 user: default password: password diff --git a/etl-replicator/configuration/dev.yaml b/etl-replicator/configuration/dev.yaml index 14fec2eea..1618d4c8f 100644 --- a/etl-replicator/configuration/dev.yaml +++ b/etl-replicator/configuration/dev.yaml @@ -2,7 +2,7 @@ application: host: "[::]" port: 8080 destination: - click_house: + clickhouse: url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 user: default password: password diff --git a/etl-replicator/configuration/prod.yaml b/etl-replicator/configuration/prod.yaml index 14fec2eea..1618d4c8f 100644 --- a/etl-replicator/configuration/prod.yaml +++ b/etl-replicator/configuration/prod.yaml @@ -2,7 +2,7 @@ application: host: "[::]" port: 8080 destination: - click_house: + clickhouse: url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 user: default password: password From cff36a71cf356476e9f6e6aa4b05f5abf9210309 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 13:40:22 +0900 Subject: [PATCH 65/86] Cap ClickHouse INSERT size at a fixed 64 MiB Replaces the host-memory-derived max_bytes_per_insert in both the example and the replicator with a fixed cap on ClickHouseInserterConfig (DEFAULT_MAX_BYTES_PER_INSERT). The previous formula reused BatchConfig::memory_budget_ratio (designed for incoming batch buffers) to size outgoing INSERT buffers, even though both kinds of buffer can be near-full at the same time, and it used different ratios in the example (0.2) and the replicator (0.85). For typical CDC payloads the fixed cap keeps each INSERT well within ClickHouse's recommended bulk-insert range of 10k-100k rows. Both call sites now use ClickHouseInserterConfig::default(). --- Cargo.lock | 2 -- etl-destinations/src/clickhouse/core.rs | 25 +++++++++++++++++++------ etl-examples/Cargo.toml | 1 - etl-examples/src/bin/clickhouse.rs | 18 +----------------- etl-replicator/Cargo.toml | 1 - etl-replicator/src/core.rs | 25 ++----------------------- 6 files changed, 22 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 942cbea19..9e6679d7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1976,7 +1976,6 @@ dependencies = [ "etl-destinations", "etl-telemetry", "rustls", - "sysinfo", "tokio", "tracing", "tracing-subscriber", @@ -2017,7 +2016,6 @@ dependencies = [ "serde", "serde_json", "sqlx", - "sysinfo", "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 792446783..565f8f9da 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -145,17 +145,30 @@ fn nullable_flags_from_clickhouse_columns( /// `write_events` call. 
/// /// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is -/// called; these limits prevent unbounded memory use for very large batches +/// called; this limit prevents unbounded memory use for very large batches /// (e.g. initial copy). pub struct ClickHouseInserterConfig { - /// Start a new INSERT after this many uncompressed bytes. - /// - /// Derive this from `BatchConfig::memory_budget_ratio * total_memory / - /// max_table_sync_workers` (the same formula used by - /// `BatchBudget::ideal_batch_size_bytes`). + /// Start a new INSERT after this many uncompressed bytes. Fixed cap + /// because incoming and outgoing buffers can both be near-full at once; + /// could be made tunable later if needed. pub max_bytes_per_insert: u64, } +impl ClickHouseInserterConfig { + /// Default per-INSERT byte cap. 64 MiB lands in the upper end of + /// ClickHouse's recommended bulk-insert range (10k - 100k rows per + /// INSERT) for typical CDC payload widths. + /// + /// See . + pub const DEFAULT_MAX_BYTES_PER_INSERT: u64 = 64 * 1024 * 1024; +} + +impl Default for ClickHouseInserterConfig { + fn default() -> Self { + Self { max_bytes_per_insert: Self::DEFAULT_MAX_BYTES_PER_INSERT } + } +} + /// CDC-capable ClickHouse destination that replicates Postgres tables. /// /// Uses append-only MergeTree tables with two CDC columns (`cdc_operation`, diff --git a/etl-examples/Cargo.toml b/etl-examples/Cargo.toml index b8a256f5f..25e8878a3 100644 --- a/etl-examples/Cargo.toml +++ b/etl-examples/Cargo.toml @@ -24,7 +24,6 @@ etl-telemetry = { workspace = true } clap = { workspace = true, default-features = true, features = ["std", "derive"] } rustls = { workspace = true, features = ["aws-lc-rs", "logging"] } -sysinfo = { workspace = true, features = ["system"] } tokio = { workspace = true, features = ["macros", "signal"] } tracing = { workspace = true, default-features = true } tracing-subscriber = { workspace = true, default-features = true, features = ["env-filter"] } diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index 1afe70a76..c03627da6 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -48,7 +48,6 @@ use etl::{ store::both::memory::MemoryStore, }; use etl_destinations::clickhouse::{ClickHouseDestination, ClickHouseInserterConfig}; -use sysinfo::MemoryRefreshKind; use tokio::signal; use tracing::{error, info}; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; @@ -206,21 +205,6 @@ async fn main_impl() -> Result<(), Box> { max_copy_connections_per_table: PipelineConfig::DEFAULT_MAX_COPY_CONNECTIONS_PER_TABLE, }; - // Compute max_bytes_per_insert using the same formula as - // BatchBudget::ideal_batch_size_bytes: total_memory * memory_budget_ratio - // / max_table_sync_workers - let max_bytes_per_insert = { - let mut sys = sysinfo::System::new(); - sys.refresh_memory_specifics(MemoryRefreshKind::nothing().with_ram()); - let total_memory = match sys.cgroup_limits() { - Some(cgroup) => cgroup.total_memory, - None => sys.total_memory(), - }; - let budget = - (total_memory as f64 * f64::from(BatchConfig::DEFAULT_MEMORY_BUDGET_RATIO)) as u64; - (budget / u64::from(args.clickhouse_args.max_table_sync_workers)).max(1) - }; - // Initialize the ClickHouse destination. // Tables are created automatically as append-only MergeTree tables. 
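+    // ClickHouseInserterConfig::default() applies the fixed
+    // DEFAULT_MAX_BYTES_PER_INSERT cap (64 MiB of uncompressed RowBinary
+    // per INSERT statement).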
let clickhouse_destination = ClickHouseDestination::new( @@ -228,7 +212,7 @@ async fn main_impl() -> Result<(), Box> { args.clickhouse_args.clickhouse_user, args.clickhouse_args.clickhouse_password, args.clickhouse_args.clickhouse_database, - ClickHouseInserterConfig { max_bytes_per_insert }, + ClickHouseInserterConfig::default(), store.clone(), )?; diff --git a/etl-replicator/Cargo.toml b/etl-replicator/Cargo.toml index f82ecaf70..dcfb9b716 100644 --- a/etl-replicator/Cargo.toml +++ b/etl-replicator/Cargo.toml @@ -26,7 +26,6 @@ sentry = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } sqlx = { workspace = true, features = ["runtime-tokio-rustls", "postgres", "migrate"] } -sysinfo = { workspace = true, features = ["system"] } tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } tracing = { workspace = true, default-features = true } diff --git a/etl-replicator/src/core.rs b/etl-replicator/src/core.rs index 1fb7384d5..3f1d80501 100644 --- a/etl-replicator/src/core.rs +++ b/etl-replicator/src/core.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use etl::{ - config::{IcebergConfig, MemoryBackpressureConfig}, + config::IcebergConfig, destination::Destination, pipeline::Pipeline, store::{ @@ -24,7 +24,6 @@ use etl_destinations::{ }, }; use secrecy::ExposeSecret; -use sysinfo::{MemoryRefreshKind, RefreshKind, System}; use tokio::signal::unix::{SignalKind, signal}; use tracing::{error, info, warn}; @@ -201,32 +200,12 @@ pub(crate) async fn start_replicator_with_config( start_pipeline(pipeline).await?; } DestinationConfig::ClickHouse { url, user, password, database } => { - let mut sys = System::new_with_specifics( - RefreshKind::nothing().with_memory(MemoryRefreshKind::everything()), - ); - sys.refresh_memory_specifics(sysinfo::MemoryRefreshKind::nothing().with_ram()); - let total_memory_bytes = match sys.cgroup_limits() { - Some(cgroup) => cgroup.total_memory, - None => sys.total_memory(), - }; - let max_bytes_per_insert = (total_memory_bytes as f64 - * replicator_config - .pipeline - .memory_backpressure - .as_ref() - .map_or(MemoryBackpressureConfig::default().activate_threshold, |config| { - config.activate_threshold - }) as f64 - / replicator_config.pipeline.max_table_sync_workers as f64) - as u64; - - let inserter_config = ClickHouseInserterConfig { max_bytes_per_insert }; let destination = ClickHouseDestination::new( url.clone(), user, password.as_ref().map(|p| p.expose_secret().to_string()), database, - inserter_config, + ClickHouseInserterConfig::default(), state_store.clone(), )?; From f50d064ecd1787cfbc56d65a801415eac2a0038d Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 14:21:55 +0900 Subject: [PATCH 66/86] Revert MemorySnapshot to private after dropping external sysinfo probes --- etl/src/concurrency/memory_monitor.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/etl/src/concurrency/memory_monitor.rs b/etl/src/concurrency/memory_monitor.rs index acd7d8045..b595abdb2 100644 --- a/etl/src/concurrency/memory_monitor.rs +++ b/etl/src/concurrency/memory_monitor.rs @@ -24,14 +24,14 @@ use crate::{ }; /// Represents a memory snapshot. #[derive(Debug, Clone, Copy)] -pub struct MemorySnapshot { +struct MemorySnapshot { used: u64, total: u64, } impl MemorySnapshot { /// Refreshes memory readings from the operating system. 
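/// Prefers cgroup limits when present (e.g. in containers) and falls back
/// to host-wide totals otherwise.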
- pub fn from_system(system: &mut sysinfo::System) -> Self { + fn from_system(system: &mut sysinfo::System) -> Self { system.refresh_memory_specifics(sysinfo::MemoryRefreshKind::nothing().with_ram()); match system.cgroup_limits() { From 6154433405ba0fcef448c80c390014256dc51320 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 14:33:46 +0900 Subject: [PATCH 67/86] Tighten visibility of ClickHouse metrics and schema helpers --- etl-destinations/src/clickhouse/metrics.rs | 7 ++++--- etl-destinations/src/clickhouse/schema.rs | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/etl-destinations/src/clickhouse/metrics.rs b/etl-destinations/src/clickhouse/metrics.rs index df480c04a..55188f424 100644 --- a/etl-destinations/src/clickhouse/metrics.rs +++ b/etl-destinations/src/clickhouse/metrics.rs @@ -6,17 +6,18 @@ static REGISTER_METRICS: Once = Once::new(); /// Duration of `CREATE TABLE IF NOT EXISTS` DDL operations sent to ClickHouse. /// Labels: `table`. -pub const ETL_CLICKHOUSE_DDL_DURATION_SECONDS: &str = "etl_clickhouse_ddl_duration_seconds"; +pub(super) const ETL_CLICKHOUSE_DDL_DURATION_SECONDS: &str = "etl_clickhouse_ddl_duration_seconds"; /// Duration of a single RowBinary INSERT statement from first write to server /// acknowledgement. Labels: `table`, `source` (`copy` = initial table sync, /// `streaming` = CDC events). -pub const ETL_CLICKHOUSE_INSERT_DURATION_SECONDS: &str = "etl_clickhouse_insert_duration_seconds"; +pub(super) const ETL_CLICKHOUSE_INSERT_DURATION_SECONDS: &str = + "etl_clickhouse_insert_duration_seconds"; /// Register ClickHouse-specific metrics. /// /// Safe to call multiple times — registration happens only once. -pub fn register_metrics() { +pub(super) fn register_metrics() { REGISTER_METRICS.call_once(|| { describe_histogram!( ETL_CLICKHOUSE_DDL_DURATION_SECONDS, diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index 639d7e776..22902e3fd 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -10,7 +10,7 @@ pub(crate) const CDC_LSN_COLUMN_NAME: &str = "cdc_lsn"; /// The returned string does not include `Nullable(...)` wrapping — callers are /// responsible for applying that when the column is nullable. Arrays always use /// `Array(Nullable(T))` since Postgres array elements are nullable. -pub fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { +fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { match typ { &Type::BOOL => "Boolean", &Type::CHAR | &Type::BPCHAR | &Type::VARCHAR | &Type::NAME | &Type::TEXT => "String", @@ -69,7 +69,7 @@ pub(crate) fn quote_identifier(identifier: &str) -> String { /// /// When `force_nullable` is true (ALTER TABLE ADD), all scalar columns become /// Nullable since ClickHouse cannot backfill existing rows. -pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> String { +pub(super) fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> String { if is_array_type(&col.typ) { let elem = postgres_array_element_clickhouse_sql(&col.typ); format!("Array(Nullable({elem}))") @@ -83,7 +83,7 @@ pub fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -> Strin /// /// Appends `cdc_operation String` and `cdc_lsn UInt64` as trailing non-nullable /// columns. Uses `MergeTree()` with `ORDER BY tuple()`. 
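///
/// Sketch of the generated statement for a hypothetical two-column source
/// table (column types and identifier quoting are illustrative; the exact
/// forms come from the helpers above):
///
/// CREATE TABLE IF NOT EXISTS public_users (
///     id Int32,
///     name Nullable(String),
///     cdc_operation String,
///     cdc_lsn UInt64
/// ) ENGINE = MergeTree()
/// ORDER BY tuple()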
-pub fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String { +pub(super) fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String { let mut cols = Vec::with_capacity(column_schemas.len() + 2); for col in column_schemas { From cd0cb3e0dcf7d1264122909a4cab56b90cd03053 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 14:41:24 +0900 Subject: [PATCH 68/86] Error on partial update rows in ClickHouse instead of skipping --- etl-destinations/src/clickhouse/core.rs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 565f8f9da..ec6d62c7b 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -604,8 +604,19 @@ where } Event::Update(update) => { let UpdatedTableRow::Full(table_row) = update.updated_table_row else { - warn!("skipping partial update row for ClickHouse"); - continue; + return Err(etl_error!( + ErrorKind::InvalidState, + "ClickHouse update requires a full new row image", + format!( + "Table '{}' emitted a partial update row: some column values \ + could not be reconstructed. Writing it would record NULL for \ + the missing columns and misrepresent the source. Configuring \ + the source so that all column values are available in the \ + new- or old-row image (e.g. REPLICA IDENTITY FULL) prevents \ + this.", + update.replicated_table_schema.name() + ) + )); }; let table_id = update.replicated_table_schema.id(); let entry = pending From 5ff2ba67bbd20ed46d232e4813c21c2a4a372896 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Thu, 30 Apr 2026 15:04:41 +0900 Subject: [PATCH 69/86] Reject unsupported replica identities for ClickHouse --- etl-destinations/src/clickhouse/core.rs | 88 ++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 2 deletions(-) diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index ec6d62c7b..0cfbbd018 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -10,8 +10,8 @@ use etl::{ state::destination_metadata::{DestinationTableMetadata, DestinationTableSchemaStatus}, store::{schema::SchemaStore, state::StateStore}, types::{ - Cell, Event, OldTableRow, PgLsn, ReplicatedTableSchema, SchemaDiff, TableId, TableRow, - Type, UpdatedTableRow, is_array_type, + Cell, Event, IdentityType, OldTableRow, PgLsn, ReplicatedTableSchema, SchemaDiff, TableId, + TableRow, Type, UpdatedTableRow, is_array_type, }, }; use parking_lot::RwLock; @@ -268,6 +268,7 @@ where &self, schema: &ReplicatedTableSchema, ) -> EtlResult<(String, Arc<[bool]>)> { + validate_replica_identity_for_clickhouse(schema)?; let clickhouse_table_name = try_stringify_table_name(schema.name())?; if let Some(flags) = self.table_cache.read().get(&clickhouse_table_name).cloned() { @@ -414,6 +415,8 @@ where /// Handles a schema change event (Relation) by computing the diff and /// applying ALTER TABLE statements. async fn handle_relation_event(&self, new_schema: &ReplicatedTableSchema) -> EtlResult<()> { + validate_replica_identity_for_clickhouse(new_schema)?; + let table_id = new_schema.id(); let new_snapshot_id = new_schema.inner().snapshot_id; let new_replication_mask = new_schema.replication_mask().clone(); @@ -746,11 +749,43 @@ where } } +/// Rejects replica identities the ClickHouse destination cannot represent. 
+/// +/// `expand_key_row` assumes the key-only old-row image carries primary-key +/// values, so the row identity must match the primary key. `Full` is also +/// fine because it bypasses `expand_key_row` entirely. `AlternativeKey` +/// (a non-PK unique index) and `Missing` would either land identity values +/// in the wrong PK slots or leave us without enough data to write a +/// well-formed tombstone. +fn validate_replica_identity_for_clickhouse( + replicated_table_schema: &ReplicatedTableSchema, +) -> EtlResult<()> { + match replicated_table_schema.identity_type() { + IdentityType::PrimaryKey | IdentityType::Full => Ok(()), + identity_type => Err(etl_error!( + ErrorKind::SourceSchemaError, + "ClickHouse requires primary-key or full replica identity", + format!( + "Table '{}' uses replica identity {:?}. ClickHouse needs the source row identity \ + to match the primary key (so DELETE tombstones land in the right PK slots) or to \ + carry the full row image. Configure REPLICA IDENTITY DEFAULT (when the PK is the \ + natural identity) or REPLICA IDENTITY FULL.", + replicated_table_schema.name(), + identity_type + ) + )), + } +} + /// Expands a key-only delete row to full column width for RowBinary encoding. /// /// PK columns keep their real values. Non-PK columns get `Cell::Null` if /// nullable, or a type-appropriate zero value if non-nullable (since RowBinary /// rejects NULL for non-nullable columns). +/// +/// Caller must ensure the source replica identity is `PrimaryKey` (or `Full`, +/// in which case this function isn't invoked) -- see +/// [`validate_replica_identity_for_clickhouse`]. fn expand_key_row(key_row: TableRow, schema: &ReplicatedTableSchema) -> TableRow { let key_cells = key_row.into_values(); let mut key_iter = key_cells.into_iter(); @@ -868,12 +903,61 @@ where #[cfg(test)] mod tests { + use etl::types::{ColumnSchema, IdentityMask, ReplicationMask, TableName, TableSchema}; + use super::*; fn clickhouse_column(name: &str, type_name: &str) -> ClickHouseTableColumn { ClickHouseTableColumn { name: name.to_string(), type_name: type_name.to_string() } } + fn replicated_schema(identity_type: IdentityType) -> ReplicatedTableSchema { + let table_schema = Arc::new(TableSchema::new( + TableId::new(1), + TableName::new("public".to_string(), "users".to_string()), + vec![ + ColumnSchema::new("id".to_string(), Type::INT4, -1, 1, Some(1), false), + ColumnSchema::new("name".to_string(), Type::TEXT, -1, 2, None, true), + ], + )); + let replication_mask = ReplicationMask::all(&table_schema); + let identity_mask = match identity_type { + IdentityType::Full => IdentityMask::from_bytes(vec![1, 1]), + IdentityType::PrimaryKey => IdentityMask::from_bytes(vec![1, 0]), + IdentityType::AlternativeKey => IdentityMask::from_bytes(vec![0, 1]), + IdentityType::Missing => IdentityMask::from_bytes(vec![0, 0]), + }; + ReplicatedTableSchema::from_masks(table_schema, replication_mask, identity_mask) + } + + #[test] + fn validate_replica_identity_for_clickhouse_accepts_primary_key() { + validate_replica_identity_for_clickhouse(&replicated_schema(IdentityType::PrimaryKey)) + .unwrap(); + } + + #[test] + fn validate_replica_identity_for_clickhouse_accepts_full() { + validate_replica_identity_for_clickhouse(&replicated_schema(IdentityType::Full)).unwrap(); + } + + #[test] + fn validate_replica_identity_for_clickhouse_rejects_alternative_key() { + let err = validate_replica_identity_for_clickhouse(&replicated_schema( + IdentityType::AlternativeKey, + )) + .unwrap_err(); + assert_eq!(err.kind(), 
ErrorKind::SourceSchemaError); + } + + #[test] + fn validate_replica_identity_for_clickhouse_rejects_missing() { + let err = + validate_replica_identity_for_clickhouse(&replicated_schema(IdentityType::Missing)) + .unwrap_err(); + assert_eq!(err.kind(), ErrorKind::SourceSchemaError); + } + #[test] fn cdc_lsn_value_preserves_full_u64_range() { let value = cdc_lsn_to_clickhouse_value(PgLsn::from(u64::MAX)); From c66924c6e6afbe7cac64801d60ebfb1d8cf9027d Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 1 May 2026 12:50:14 +0900 Subject: [PATCH 70/86] Update etl-api/src/configs/destination.rs Co-authored-by: Victor Farazdagi --- etl-api/src/configs/destination.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/etl-api/src/configs/destination.rs b/etl-api/src/configs/destination.rs index 110ce48db..4a1bbf624 100644 --- a/etl-api/src/configs/destination.rs +++ b/etl-api/src/configs/destination.rs @@ -478,10 +478,9 @@ impl Encrypt for StoredDestinationConfig { }) } Self::ClickHouse { url, user, password, database } => { - let encrypted_password = match password { - Some(p) => Some(encrypt_text(p.expose_secret().to_owned(), encryption_key)?), - None => None, - }; + let encrypted_password = password + .map(|p| encrypt_text(p.expose_secret().to_owned(), encryption_key)) + .transpose()?; Ok(EncryptedStoredDestinationConfig::ClickHouse { url, From e9168b914eb31b7943fff4f3921f07b43ff84e98 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 1 May 2026 12:54:29 +0900 Subject: [PATCH 71/86] Use map().transpose() for ClickHouse password decryption --- etl-api/src/configs/destination.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/etl-api/src/configs/destination.rs b/etl-api/src/configs/destination.rs index 4a1bbf624..f27d215b7 100644 --- a/etl-api/src/configs/destination.rs +++ b/etl-api/src/configs/destination.rs @@ -722,12 +722,10 @@ impl Decrypt for EncryptedStoredDestinationConfig { } }, EncryptedStoredDestinationConfig::ClickHouse { url, user, password, database } => { - let password = match password { - Some(p) => { - Some(SerializableSecretString::from(decrypt_text(p, encryption_key)?)) - } - None => None, - }; + let password = password + .map(|p| decrypt_text(p, encryption_key)) + .transpose()? + .map(SerializableSecretString::from); Ok(StoredDestinationConfig::ClickHouse { url, user, password, database }) } From d2a68210f8e8678f72025b17dbf960fa05ccbfd1 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Fri, 1 May 2026 13:07:42 +0900 Subject: [PATCH 72/86] Drop redundant set_log_level() from ClickHouse example --- etl-examples/src/bin/clickhouse.rs | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/etl-examples/src/bin/clickhouse.rs b/etl-examples/src/bin/clickhouse.rs index c03627da6..4b3973b35 100644 --- a/etl-examples/src/bin/clickhouse.rs +++ b/etl-examples/src/bin/clickhouse.rs @@ -149,18 +149,8 @@ fn init_tracing() { .init(); } -/// Set default log level if RUST_LOG environment variable is not set. -fn set_log_level() { - if std::env::var("RUST_LOG").is_err() { - unsafe { - std::env::set_var("RUST_LOG", "info"); - } - } -} - /// Main implementation containing all pipeline setup and execution logic. 
 async fn main_impl() -> Result<(), Box<dyn std::error::Error>> {
-    set_log_level();
     init_tracing();
 
     // Install required crypto provider for TLS (used when clickhouse_url is https://)

From 51cd46ee71b2404c4b5f1bf332bfaef5e628b166 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 13:09:46 +0900
Subject: [PATCH 73/86] Drop Arc around ClickHouseInserterConfig

---
 etl-destinations/src/clickhouse/core.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index 19eb4af62..8fe7485fa 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -147,6 +147,7 @@ fn nullable_flags_from_clickhouse_columns(
 /// The upstream `BatchConfig::max_fill_ms` controls when `write_events` is
 /// called; this limit prevents unbounded memory use for very large batches
 /// (e.g. initial copy).
+#[derive(Copy, Clone)]
 pub struct ClickHouseInserterConfig {
     /// Start a new INSERT after this many uncompressed bytes. Fixed cap
     /// because incoming and outgoing buffers can both be near-full at once;
@@ -180,7 +181,7 @@ pub struct ClickHouseDestination<S> {
     client: ClickHouseClient,
     /// Per-INSERT byte budget; gates intermediate flushes within a single
     /// `write_table_rows` / `write_events` call.
-    inserter_config: Arc<ClickHouseInserterConfig>,
+    inserter_config: ClickHouseInserterConfig,
     /// Schema/state store used to persist destination table metadata
     /// (Applying / Applied) and to look up replicated schemas.
     store: Arc<S>,
@@ -213,7 +214,7 @@ where
         register_metrics();
         Ok(Self {
            client: ClickHouseClient::new(url, user, password, database),
-            inserter_config: Arc::new(inserter_config),
+            inserter_config,
             store: Arc::new(store),
             table_cache: Arc::new(RwLock::new(HashMap::new())),
         })
     }

From d21f04b637a840d44454b366dfe4e124f7d9ceff Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 13:31:56 +0900
Subject: [PATCH 74/86] Use FIRST placement when ClickHouse ADD COLUMN has no anchor

---
 etl-destinations/src/clickhouse/client.rs | 44 +++++++++++++++++------
 etl-destinations/src/clickhouse/core.rs   | 18 ++++++----
 2 files changed, 45 insertions(+), 17 deletions(-)

diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index 65440c505..fdec3ef5b 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -29,20 +29,31 @@ pub(crate) struct ClickHouseTableColumn {
     pub(crate) type_name: String,
 }
 
+/// Returns the placement clause for an `ADD COLUMN` statement.
+///
+/// `None` means the destination table has no user columns to anchor on, so
+/// the new column goes at the front via `FIRST` (which still places it
+/// before the trailing CDC columns).
+fn add_column_placement_clause(after_column: Option<&str>) -> String {
+    match after_column {
+        Some(anchor) => format!("AFTER {}", quote_identifier(anchor)),
+        None => "FIRST".to_owned(),
+    }
+}
+
 /// Builds the SQL used to add a column to a ClickHouse table.
fn build_add_column_sql( table_name: &str, column: &etl::types::ColumnSchema, - after_column: &str, + after_column: Option<&str>, ) -> String { let col_type = clickhouse_column_type(column, true); let table_name = quote_identifier(table_name); let column_name = quote_identifier(&column.name); - let after_column = quote_identifier(after_column); + let placement = add_column_placement_clause(after_column); format!( - "ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS {column_name} {col_type} AFTER \ - {after_column}" + "ALTER TABLE {table_name} ADD COLUMN IF NOT EXISTS {column_name} {col_type} {placement}" ) } @@ -193,15 +204,16 @@ impl ClickHouseClient { /// New columns are always Nullable since ClickHouse cannot backfill /// existing rows with a NOT NULL default. /// - /// `after_column` controls placement: the new column is inserted AFTER - /// the named column. This is critical because RowBinary encoding is - /// positional -- new user columns must appear before the CDC columns - /// (`cdc_operation`, `cdc_lsn`), not appended after them. + /// `after_column` controls placement: `Some(name)` inserts the new column + /// immediately AFTER `name`, `None` inserts it FIRST (used when the table + /// has no user columns yet). Either way the new column lands before the + /// trailing CDC columns (`cdc_operation`, `cdc_lsn`), which is required + /// because RowBinary encoding is positional. pub(crate) async fn add_column( &self, table_name: &str, column: &etl::types::ColumnSchema, - after_column: &str, + after_column: Option<&str>, ) -> EtlResult<()> { let sql = build_add_column_sql(table_name, column, after_column); self.execute_ddl(DdlKind::AddColumn, table_name, &sql).await @@ -320,7 +332,7 @@ mod tests { #[test] fn add_column_sql_quotes_identifiers() { let column = column_schema("new\"column"); - let sql = build_add_column_sql("table\"name", &column, "old\"column"); + let sql = build_add_column_sql("table\"name", &column, Some("old\"column")); assert_eq!( sql, @@ -329,6 +341,18 @@ mod tests { ); } + #[test] + fn add_column_sql_uses_first_when_anchor_is_none() { + let column = column_schema("only_col"); + let sql = build_add_column_sql("test_table", &column, None); + + assert_eq!( + sql, + "ALTER TABLE \"test_table\" ADD COLUMN IF NOT EXISTS \"only_col\" Nullable(Int32) \ + FIRST" + ); + } + #[test] fn drop_column_sql_quotes_identifiers() { let sql = build_drop_column_sql("table\"name", "old\"column"); diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index 8fe7485fa..fffca6d72 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -549,15 +549,19 @@ where return Ok(()); } - // Track the last user column name for AFTER placement. New columns are - // inserted after this column, and each added column becomes the new - // anchor for the next. - let mut last_user_column: String = - current_schema.column_schemas().last().map(|c| c.name.clone()).unwrap_or_default(); + // Track the last user column name for AFTER placement. New columns + // are inserted after this column, and each added column becomes the + // new anchor for the next. `None` (no user columns in the current + // schema) falls through to `FIRST` placement inside `add_column`, + // which still keeps the new column before the trailing CDC columns. 
+        let mut last_user_column: Option<String> =
+            current_schema.column_schemas().last().map(|c| c.name.clone());
 
         for column in &diff.columns_to_add {
-            self.client.add_column(clickhouse_table_name, column, &last_user_column).await?;
-            last_user_column = column.name.clone();
+            self.client
+                .add_column(clickhouse_table_name, column, last_user_column.as_deref())
+                .await?;
+            last_user_column = Some(column.name.clone());
         }
 
         for rename in &diff.columns_to_rename {

From 81fbd106247180c0f97a0eb276a0327e80986b7f Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 13:52:19 +0900
Subject: [PATCH 75/86] fix(replicator): handle ClickHouse in destination_name

The merge from main brought `etl-replicator/src/init/destination.rs`, whose
`destination_name` match did not include the `ClickHouse` variant that
exists on this branch. Build broke with E0004 non-exhaustive patterns. Add
the missing arm.
---
 etl-replicator/src/init/destination.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/etl-replicator/src/init/destination.rs b/etl-replicator/src/init/destination.rs
index a24aee647..6dc8fb36d 100644
--- a/etl-replicator/src/init/destination.rs
+++ b/etl-replicator/src/init/destination.rs
@@ -1,7 +1,8 @@
 use etl::{destination::Destination, store::both::postgres::PostgresStore};
 use etl_config::shared::{DestinationConfig, IcebergConfig};
 use etl_destinations::{
-    bigquery::BigQueryDestination, ducklake::DuckLakeDestination, iceberg::IcebergDestination,
+    bigquery::BigQueryDestination, clickhouse::ClickHouseDestination,
+    ducklake::DuckLakeDestination, iceberg::IcebergDestination,
 };
 
 use crate::error_reporting::ErrorReportingStateStore;
@@ -16,5 +17,8 @@ pub(crate) fn destination_name(destination_config: &DestinationConfig) -> &'stat
             config: IcebergConfig::Supabase { .. } | IcebergConfig::Rest { .. },
         } => IcebergDestination::<PostgresStore>::name(),
         DestinationConfig::Ducklake { .. } => DuckLakeDestination::<PostgresStore>::name(),
+        DestinationConfig::ClickHouse { .. } => {
+            ClickHouseDestination::<PostgresStore>::name()
+        }
     }
 }

From cd0cb3e0dcf7d1264122909a4cab56b90cd03053 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 18:15:08 +0900
Subject: [PATCH 76/86] Construct ClickHouse password Secret directly without json round-trip
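
`string_data` holds the raw secret and the Kubernetes API server
base64-encodes it into `data` on write, which is what makes the manual
encode and serde_json round-trip unnecessary. A scratch check of the
encoding the API server now performs for us (illustrative only, with a
made-up value; `BASE64_STANDARD` is the same engine the old code used):

    use base64::{Engine, prelude::BASE64_STANDARD};

    fn main() {
        // What the old code pre-encoded by hand and stored under `data`:
        let raw = "s3cr3t";
        assert_eq!(BASE64_STANDARD.encode(raw), "czNjcjN0");
        // The new code hands `raw` to `string_data` and lets the API
        // server perform the same encoding server-side.
    }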
---
 etl-api/src/k8s/http.rs | 56 ++++++++++++++++++++++-------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/etl-api/src/k8s/http.rs b/etl-api/src/k8s/http.rs
index 99e164f5c..dbb5e2fbb 100644
--- a/etl-api/src/k8s/http.rs
+++ b/etl-api/src/k8s/http.rs
@@ -1,10 +1,15 @@
+use std::collections::BTreeMap;
+
 use async_trait::async_trait;
 use base64::{Engine, prelude::BASE64_STANDARD};
 use chrono::Utc;
 use etl_config::Environment;
-use k8s_openapi::api::{
-    apps::v1::StatefulSet,
-    core::v1::{ConfigMap, Pod, Secret},
+use k8s_openapi::{
+    api::{
+        apps::v1::StatefulSet,
+        core::v1::{ConfigMap, Pod, Secret},
+    },
+    apimachinery::pkg::apis::meta::v1::ObjectMeta,
 };
 use kube::{
     Client,
@@ -345,15 +350,13 @@
         debug!("patching clickhouse secret");
 
         if let Some(password) = password {
-            let encoded_clickhouse_password = BASE64_STANDARD.encode(password);
             let clickhouse_secret_name = create_clickhouse_secret_name(prefix);
             let replicator_app_name = create_replicator_app_name(prefix);
-            let clickhouse_secret_json = create_clickhouse_password_secret_json(
+            let secret = create_clickhouse_password_secret(
                 &clickhouse_secret_name,
                 &replicator_app_name,
-                &encoded_clickhouse_password,
+                password,
             );
-            let secret: Secret = serde_json::from_value(clickhouse_secret_json)?;
 
             // We are forcing the update since we are the field manager that should own the
             // fields. If there is an override (likely during an incident or
@@ -711,27 +714,28 @@
 fn create_postgres_secret_json(
     })
 }
 
-fn create_clickhouse_password_secret_json(
+fn create_clickhouse_password_secret(
     secret_name: &str,
     replicator_app_name: &str,
-    encoded_clickhouse_password: &str,
-) -> serde_json::Value {
-    json!({
-        "apiVersion": "v1",
-        "kind": "Secret",
-        "metadata": {
-            "name": secret_name,
-            "namespace": DATA_PLANE_NAMESPACE,
-            "labels": {
-                "etl.supabase.com/app-name": replicator_app_name,
-                "etl.supabase.com/app-type": REPLICATOR_APP_LABEL,
-            }
-        },
-        "type": "Opaque",
-        "data": {
-            CLICKHOUSE_PASSWORD_NAME: encoded_clickhouse_password,
-        }
-    })
+    clickhouse_password: &str,
+) -> Secret {
+    Secret {
+        metadata: ObjectMeta {
+            name: Some(secret_name.to_owned()),
+            namespace: Some(DATA_PLANE_NAMESPACE.to_owned()),
+            labels: Some(BTreeMap::from([
+                ("etl.supabase.com/app-name".to_owned(), replicator_app_name.to_owned()),
+                ("etl.supabase.com/app-type".to_owned(), REPLICATOR_APP_LABEL.to_owned()),
+            ])),
+            ..ObjectMeta::default()
+        },
+        type_: Some("Opaque".to_owned()),
+        string_data: Some(BTreeMap::from([(
+            CLICKHOUSE_PASSWORD_NAME.to_owned(),
+            clickhouse_password.to_owned(),
+        )])),
+        ..Secret::default()
+    }
 }
 
 fn create_bq_service_account_key_secret_json(

From 55ca2b5eab6e9c2d47294955a3e702da100cfee7 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 19:06:17 +0900
Subject: [PATCH 77/86] Lower 'delete event has no row data' log to debug

---
 etl-destinations/src/clickhouse/core.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index fffca6d72..91a114734 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -638,7 +638,7 @@
             Event::Delete(delete) => {
                 let Some(old_table_row) = delete.old_table_row else {
-                    info!("delete event has no row data, skipping");
+                    debug!("delete event has no row data, skipping");
                     continue;
                 };
                 let old_row = match old_table_row {

From 79cd26e5f52b2fb837872d7f81f596f66b4d526f Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Fri, 1 May 2026 19:10:38 +0900
Subject: [PATCH 78/86] Pass column iterator to build_create_table_sql to drop intermediate Vec

---
 etl-destinations/src/clickhouse/core.rs   |  6 ++----
 etl-destinations/src/clickhouse/schema.rs | 13 +++++++++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index 91a114734..c987fd3ac 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -250,8 +250,7 @@ where
         );
         self.store.store_destination_table_metadata(table_id, metadata.clone()).await?;
 
-        let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
-        let ddl = build_create_table_sql(clickhouse_table_name, &column_schemas);
+        let ddl = build_create_table_sql(clickhouse_table_name, schema.column_schemas());
         self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?;
 
         self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?;
@@ -367,8 +366,7 @@
                 self.apply_schema_diff(clickhouse_table_name, &diff, &old_schema).await?;
             }
             None => {
-                let column_schemas: Vec<_> = schema.column_schemas().cloned().collect();
-                let ddl = build_create_table_sql(clickhouse_table_name, &column_schemas);
+                let ddl = build_create_table_sql(clickhouse_table_name, schema.column_schemas());
                 self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?;
             }
         }
diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs
index 12118abf1..4f1d1e1f8 100644
--- a/etl-destinations/src/clickhouse/schema.rs
+++ b/etl-destinations/src/clickhouse/schema.rs
@@ -83,10 +83,15 @@ pub(super) fn clickhouse_column_type(col: &ColumnSchema, force_nullable: bool) -
 ///
 /// Appends `cdc_operation String` and `cdc_lsn UInt64` as trailing non-nullable
 /// columns. Uses `MergeTree()` with `ORDER BY tuple()`.
-pub(super) fn build_create_table_sql(table_name: &str, column_schemas: &[ColumnSchema]) -> String {
-    let mut cols = Vec::with_capacity(column_schemas.len() + 2);
-
-    for col in column_schemas {
+pub(super) fn build_create_table_sql<'a, I>(table_name: &str, column_schemas: I) -> String
+where
+    I: IntoIterator<Item = &'a ColumnSchema>,
+    I::IntoIter: ExactSizeIterator,
+{
+    let iter = column_schemas.into_iter();
+    let mut cols = Vec::with_capacity(iter.len() + 2);
+
+    for col in iter {
         let col_type = clickhouse_column_type(col, false);
         cols.push(format!("    {} {}", quote_identifier(&col.name), col_type));
     }

From cd486b8e1499a02ed5f61efe24c816ab58dd0def Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 4 May 2026 11:53:11 +0900
Subject: [PATCH 79/86] test(ci): include clickhouse_pipeline in shared-pg test group

The `clickhouse_pipeline::*` integration tests call
`spawn_source_database()` and so share the source Postgres cluster, but
they were missing from the `shared-pg` filter in `.config/nextest.toml`
and `xtask/src/commands/nextest.rs`.
Under `cargo xtask nextest run` they were therefore landing in the non-pg lane and running in parallel against whichever cluster `TESTS_DATABASE_PORT` happens to point at, defeating the `max-threads = 1` serialization the test group is supposed to provide and risking timing-sensitive contention under load. Add `clickhouse_pipeline` to both filter copies. --- .config/nextest.toml | 2 +- xtask/src/commands/nextest.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.config/nextest.toml b/.config/nextest.toml index 02c759624..bfe02928f 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -10,5 +10,5 @@ test-threads = "num-cpus" max-threads = 1 [[profile.default.overrides]] -filter = "test(exclusive_) | binary_id(etl::main) | (binary_id(etl-destinations::main) & test(/^(bigquery_pipeline|ducklake_destination|ducklake_pipeline|iceberg_destination)::/)) | (binary_id(etl-destinations) & test(/ducklake::core::tests::postgres_backed::/))" +filter = "test(exclusive_) | binary_id(etl::main) | (binary_id(etl-destinations::main) & test(/^(bigquery_pipeline|clickhouse_pipeline|ducklake_destination|ducklake_pipeline|iceberg_destination)::/)) | (binary_id(etl-destinations) & test(/ducklake::core::tests::postgres_backed::/))" test-group = "shared-pg" diff --git a/xtask/src/commands/nextest.rs b/xtask/src/commands/nextest.rs index fa6bfddea..f98a802c0 100644 --- a/xtask/src/commands/nextest.rs +++ b/xtask/src/commands/nextest.rs @@ -14,7 +14,7 @@ use clap::{Args, ValueEnum}; const SHARED_PG_FILTER: &str = "\ test(exclusive_) | binary_id(etl::main) | (binary_id(etl-destinations::main) & \ - test(/^(bigquery_pipeline|ducklake_destination|ducklake_pipeline|iceberg_destination)::/)) | \ + test(/^(bigquery_pipeline|clickhouse_pipeline|ducklake_destination|ducklake_pipeline|iceberg_destination)::/)) | \ (binary_id(etl-destinations) & test(/ducklake::core::tests::postgres_backed::/))"; use super::shared::{DEFAULT_BASE_PORT, DEFAULT_PG_SHARD_COUNT}; From db30d035b88d0b39a201f79e43140a44da3a32e9 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 4 May 2026 12:21:35 +0900 Subject: [PATCH 80/86] fix(clickhouse): map Postgres date to Date32 and error on out-of-range ClickHouse `Date` is a UInt16 day offset from 1970-01-01 and only covers 1970-01-01..=2149-06-06. Postgres `date` has a much wider range, and the previous `date_to_days(...).clamp(...)` silently turned pre-1970 dates into 1970-01-01 and far-future dates into the ClickHouse maximum, corrupting historical and edge-case data without surfacing anything to the operator. Switch the schema mapping for `Type::DATE` and `Type::DATE_ARRAY` to ClickHouse `Date32`, swap `ClickHouseValue::Date(u16)` for `ClickHouseValue::Date32(i32)`, and convert dates through a fallible `date_to_date32_days` that returns `ConversionError` when the value falls outside `Date32`'s `1900-01-01..=2299-12-31` range. Failing the batch is strictly better than silent corruption. `cell_to_clickhouse_value` and `array_cell_to_clickhouse_values` now return `EtlResult` so the conversion error can propagate; both RowBinary call sites in `core.rs` thread the error up via `?`. 
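
The two Date32 boundary offsets can be re-derived with chrono as a
scratch check (illustrative only, not part of the change):

    use chrono::NaiveDate;

    fn main() {
        let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
        let min = NaiveDate::from_ymd_opt(1900, 1, 1).unwrap();
        let max = NaiveDate::from_ymd_opt(2299, 12, 31).unwrap();
        // Signed day offsets at the Date32 boundaries:
        assert_eq!(min.signed_duration_since(epoch).num_days(), -25567);
        assert_eq!(max.signed_duration_since(epoch).num_days(), 120529);
    }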
---
 etl-destinations/src/clickhouse/core.rs     |  21 ++--
 etl-destinations/src/clickhouse/encoding.rs | 127 ++++++++++++++++----
 etl-destinations/src/clickhouse/schema.rs   |   6 +-
 3 files changed, 118 insertions(+), 36 deletions(-)

diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs
index c987fd3ac..b3b6b36af 100644
--- a/etl-destinations/src/clickhouse/core.rs
+++ b/etl-destinations/src/clickhouse/core.rs
@@ -390,15 +390,18 @@ where
         let rows: Vec<Vec<ClickHouseValue>> = table_rows
             .into_iter()
             .map(|table_row| {
-                let mut values: Vec<ClickHouseValue> =
-                    table_row.into_values().into_iter().map(cell_to_clickhouse_value).collect();
+                let mut values: Vec<ClickHouseValue> = table_row
+                    .into_values()
+                    .into_iter()
+                    .map(cell_to_clickhouse_value)
+                    .collect::<EtlResult<_>>()?;
                 // CDC columns: initial-copy rows are tagged as INSERT with LSN 0
                 // (sentinel meaning "this row pre-dates the streaming cursor").
                 values.push(ClickHouseValue::String(CdcOperation::Insert.to_string()));
                 values.push(cdc_lsn_to_clickhouse_value(PgLsn::from(0)));
-                values
+                Ok(values)
             })
-            .collect();
+            .collect::<EtlResult<_>>()?;
 
         self.client
             .insert_rows(
@@ -722,13 +725,15 @@ where
         let rows: Vec<Vec<ClickHouseValue>> = rows
             .into_iter()
             .map(|PendingRow { operation, lsn, cells }| {
-                let mut values: Vec<ClickHouseValue> =
-                    cells.into_iter().map(cell_to_clickhouse_value).collect();
+                let mut values: Vec<ClickHouseValue> = cells
+                    .into_iter()
+                    .map(cell_to_clickhouse_value)
+                    .collect::<EtlResult<_>>()?;
                 values.push(ClickHouseValue::String(operation.to_string()));
                 values.push(cdc_lsn_to_clickhouse_value(lsn));
-                values
+                Ok(values)
             })
-            .collect();
+            .collect::<EtlResult<_>>()?;
 
         client
             .insert_rows(
diff --git a/etl-destinations/src/clickhouse/encoding.rs b/etl-destinations/src/clickhouse/encoding.rs
index 8d69a1d59..92689dca7 100644
--- a/etl-destinations/src/clickhouse/encoding.rs
+++ b/etl-destinations/src/clickhouse/encoding.rs
@@ -29,8 +29,10 @@ pub(crate) enum ClickHouseValue {
     Float64(f64),
     /// TEXT, NUMERIC (string), TIME (string), JSON, BYTEA (hex-encoded)
     String(String),
-    /// Days since Unix epoch (ClickHouse `Date` on wire = UInt16 LE)
-    Date(u16),
+    /// Days from Unix epoch (ClickHouse `Date32` on wire = Int32 LE). The
+    /// signed offset lets us represent pre-1970 dates that ClickHouse `Date`
+    /// (UInt16) would have silently clamped to 1970-01-01.
+    Date32(i32),
     /// Microseconds since Unix epoch (ClickHouse `DateTime64(6)` on wire =
     /// Int64 LE)
     DateTime64(i64),
@@ -41,8 +43,12 @@ pub(crate) enum ClickHouseValue {
 }
 
 /// Converts a [`Cell`] to a [`ClickHouseValue`], consuming it (no clone).
-pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> ClickHouseValue {
-    match cell {
+///
+/// Returns [`ErrorKind::ConversionError`] when a value cannot be represented
+/// in ClickHouse without loss, e.g. a Postgres `date` outside ClickHouse
+/// `Date32`'s `1900-01-01..=2299-12-31` range.
+pub(crate) fn cell_to_clickhouse_value(cell: Cell) -> EtlResult<ClickHouseValue> {
+    Ok(match cell {
         Cell::Null => ClickHouseValue::Null,
         Cell::Bool(b) => ClickHouseValue::Bool(b),
         Cell::I16(v) => ClickHouseValue::Int16(v),
@@ -52,7 +58,7 @@
         Cell::F32(v) => ClickHouseValue::Float32(v),
         Cell::F64(v) => ClickHouseValue::Float64(v),
         Cell::Numeric(n) => ClickHouseValue::String(n.to_string()),
-        Cell::Date(d) => ClickHouseValue::Date(date_to_days(d)),
+        Cell::Date(d) => ClickHouseValue::Date32(date_to_date32_days(d)?),
         Cell::Time(t) => ClickHouseValue::String(t.to_string()),
         Cell::Timestamp(dt) => ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()),
         Cell::TimestampTz(dt) => ClickHouseValue::DateTime64(dt.timestamp_micros()),
@@ -61,17 +67,17 @@
         Cell::Bytes(b) => ClickHouseValue::String(bytes_to_hex(&b)),
         Cell::String(s) => ClickHouseValue::String(s),
         Cell::Array(array_cell) => {
-            ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell))
+            ClickHouseValue::Array(array_cell_to_clickhouse_values(array_cell)?)
         }
-    }
+    })
 }
 
 /// Converts an [`ArrayCell`] to a flat `Vec<ClickHouseValue>`, mapping each
 /// `Some(x)` to the matching scalar variant and each `None` to
 /// [`ClickHouseValue::Null`]. Per-element conversions mirror
 /// [`cell_to_clickhouse_value`].
-fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> Vec<ClickHouseValue> {
-    match array_cell {
+fn array_cell_to_clickhouse_values(array_cell: ArrayCell) -> EtlResult<Vec<ClickHouseValue>> {
+    Ok(match array_cell {
         ArrayCell::Bool(v) => map_array(v, ClickHouseValue::Bool),
         ArrayCell::String(v) => map_array(v, ClickHouseValue::String),
         ArrayCell::I16(v) => map_array(v, ClickHouseValue::Int16),
@@ -81,7 +87,9 @@
         ArrayCell::F32(v) => map_array(v, ClickHouseValue::Float32),
         ArrayCell::F64(v) => map_array(v, ClickHouseValue::Float64),
         ArrayCell::Numeric(v) => map_array(v, |n| ClickHouseValue::String(n.to_string())),
-        ArrayCell::Date(v) => map_array(v, |d| ClickHouseValue::Date(date_to_days(d))),
+        ArrayCell::Date(v) => {
+            try_map_array(v, |d| Ok(ClickHouseValue::Date32(date_to_date32_days(d)?)))?
+        }
         ArrayCell::Time(v) => map_array(v, |t| ClickHouseValue::String(t.to_string())),
         ArrayCell::Timestamp(v) => {
            map_array(v, |dt| ClickHouseValue::DateTime64(dt.and_utc().timestamp_micros()))
@@ -92,7 +100,7 @@
         ArrayCell::Uuid(v) => map_array(v, |u| ClickHouseValue::Uuid(*u.as_bytes())),
         ArrayCell::Json(v) => map_array(v, |j| ClickHouseValue::String(j.to_string())),
         ArrayCell::Bytes(v) => map_array(v, |b| ClickHouseValue::String(bytes_to_hex(&b))),
-    }
+    })
 }
 
 /// Maps a `Vec<Option<T>>` to `Vec<ClickHouseValue>`, applying `f` to each
@@ -109,14 +117,50 @@ where
         .collect()
 }
 
-fn date_to_days(d: NaiveDate) -> u16 {
-    d.signed_duration_since(unix_epoch()).num_days().clamp(0, i64::from(u16::MAX)) as u16
+/// Fallible variant of [`map_array`] for element converters that can fail.
+fn try_map_array<T, F>(v: Vec<Option<T>>, mut f: F) -> EtlResult<Vec<ClickHouseValue>>
+where
+    F: FnMut(T) -> EtlResult<ClickHouseValue>,
+{
+    v.into_iter()
+        .map(|o| match o {
+            Some(t) => f(t),
+            None => Ok(ClickHouseValue::Null),
+        })
+        .collect()
+}
+
+/// Converts a [`NaiveDate`] to a ClickHouse `Date32` day offset (signed days
+/// from 1970-01-01).
+///
+/// Returns [`ErrorKind::ConversionError`] when the date falls outside
+/// ClickHouse `Date32`'s `1900-01-01..=2299-12-31` range. Silent clamping
+/// would corrupt historical or far-future values, so we fail the batch
+/// instead.
+fn date_to_date32_days(d: NaiveDate) -> EtlResult<i32> {
+    if d < date32_min() || d > date32_max() {
+        return Err(etl_error!(
+            ErrorKind::ConversionError,
+            "date out of ClickHouse Date32 range",
+            format!("{d} is outside the supported range {}..={}", date32_min(), date32_max())
+        ));
+    }
+    // The bounds check above guarantees the day count fits in i32.
+    Ok(d.signed_duration_since(unix_epoch()).num_days() as i32)
 }
 
 fn unix_epoch() -> NaiveDate {
     NaiveDate::from_ymd_opt(1970, 1, 1).expect("valid date")
 }
 
+fn date32_min() -> NaiveDate {
+    NaiveDate::from_ymd_opt(1900, 1, 1).expect("valid date")
+}
+
+fn date32_max() -> NaiveDate {
+    NaiveDate::from_ymd_opt(2299, 12, 31).expect("valid date")
+}
+
 /// Lowercase hex-encodes `bytes` into a fresh `String`.
 fn bytes_to_hex(bytes: &[u8]) -> String {
     const HEX: &[u8; 16] = b"0123456789abcdef";
@@ -178,7 +222,7 @@ pub(crate) fn rb_encode_value(val: ClickHouseValue, buf: &mut Vec<u8>) -> EtlRes
             rb_varint(s.len(), buf);
             buf.extend_from_slice(s.as_bytes());
         }
-        ClickHouseValue::Date(days) => buf.extend_from_slice(&days.to_le_bytes()),
+        ClickHouseValue::Date32(days) => buf.extend_from_slice(&days.to_le_bytes()),
         ClickHouseValue::DateTime64(micros) => buf.extend_from_slice(&micros.to_le_bytes()),
         ClickHouseValue::Uuid(bytes) => {
             // ClickHouse RowBinary UUID = high u64 (LE) then low u64 (LE). Our
@@ -238,23 +282,29 @@ mod tests {
 
     #[test]
     fn cell_to_clickhouse_value_null() {
-        assert!(matches!(cell_to_clickhouse_value(Cell::Null), ClickHouseValue::Null));
+        assert!(matches!(cell_to_clickhouse_value(Cell::Null).unwrap(), ClickHouseValue::Null));
     }
 
     #[test]
     fn cell_to_clickhouse_value_bool() {
-        assert!(matches!(cell_to_clickhouse_value(Cell::Bool(true)), ClickHouseValue::Bool(true)));
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Bool(true)).unwrap(),
+            ClickHouseValue::Bool(true)
+        ));
     }
 
     #[test]
     fn cell_to_clickhouse_value_i32() {
-        assert!(matches!(cell_to_clickhouse_value(Cell::I32(42)), ClickHouseValue::Int32(42)));
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::I32(42)).unwrap(),
+            ClickHouseValue::Int32(42)
+        ));
     }
 
     #[test]
     fn cell_to_clickhouse_value_string() {
         if let ClickHouseValue::String(s) =
-            cell_to_clickhouse_value(Cell::String("hello".to_owned()))
+            cell_to_clickhouse_value(Cell::String("hello".to_owned())).unwrap()
         {
             assert_eq!(s, "hello");
         } else {
@@ -265,17 +315,40 @@
     #[test]
     fn cell_to_clickhouse_value_date() {
         let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap();
-        assert!(matches!(cell_to_clickhouse_value(Cell::Date(epoch)), ClickHouseValue::Date(0)));
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(epoch)).unwrap(),
+            ClickHouseValue::Date32(0)
+        ));
 
         let day1 = NaiveDate::from_ymd_opt(1970, 1, 2).unwrap();
-        assert!(matches!(cell_to_clickhouse_value(Cell::Date(day1)), ClickHouseValue::Date(1)));
+        assert!(matches!(
+            cell_to_clickhouse_value(Cell::Date(day1)).unwrap(),
+            ClickHouseValue::Date32(1)
+        ));
+
+        // Pre-1970 dates round-trip through Date32 as a negative offset rather
+        // than being silently clamped to the epoch.
+ let pre_epoch = NaiveDate::from_ymd_opt(1969, 12, 31).unwrap(); + assert!(matches!( + cell_to_clickhouse_value(Cell::Date(pre_epoch)).unwrap(), + ClickHouseValue::Date32(-1) + )); + } + + #[test] + fn cell_to_clickhouse_value_date_out_of_range_errors() { + let too_old = NaiveDate::from_ymd_opt(1899, 12, 31).unwrap(); + assert!(cell_to_clickhouse_value(Cell::Date(too_old)).is_err()); + + let too_new = NaiveDate::from_ymd_opt(2300, 1, 1).unwrap(); + assert!(cell_to_clickhouse_value(Cell::Date(too_new)).is_err()); } #[test] fn cell_to_clickhouse_value_timestamp() { let epoch = chrono::DateTime::from_timestamp(0, 0).unwrap().naive_utc(); assert!(matches!( - cell_to_clickhouse_value(Cell::Timestamp(epoch)), + cell_to_clickhouse_value(Cell::Timestamp(epoch)).unwrap(), ClickHouseValue::DateTime64(0) )); } @@ -284,7 +357,7 @@ mod tests { fn cell_to_clickhouse_value_uuid() { let u = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000").unwrap(); let expected_bytes = *u.as_bytes(); - if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)) { + if let ClickHouseValue::Uuid(bytes) = cell_to_clickhouse_value(Cell::Uuid(u)).unwrap() { assert_eq!(bytes, expected_bytes); } else { panic!("expected Uuid variant"); @@ -294,7 +367,7 @@ mod tests { #[test] fn cell_to_clickhouse_value_bytes_hex() { let bytes = vec![0xde, 0xad, 0xbe, 0xef]; - if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)) { + if let ClickHouseValue::String(s) = cell_to_clickhouse_value(Cell::Bytes(bytes)).unwrap() { assert_eq!(s, "deadbeef"); } else { panic!("expected String variant"); @@ -322,8 +395,12 @@ mod tests { assert_eq!(buf, [2, b'h', b'i']); // varint(2) + bytes buf.clear(); - rb_encode_value(ClickHouseValue::Date(1), &mut buf).unwrap(); - assert_eq!(buf, 1u16.to_le_bytes()); + rb_encode_value(ClickHouseValue::Date32(1), &mut buf).unwrap(); + assert_eq!(buf, 1i32.to_le_bytes()); + + buf.clear(); + rb_encode_value(ClickHouseValue::Date32(-1), &mut buf).unwrap(); + assert_eq!(buf, (-1i32).to_le_bytes()); } #[test] diff --git a/etl-destinations/src/clickhouse/schema.rs b/etl-destinations/src/clickhouse/schema.rs index 4f1d1e1f8..cae5b3aa3 100644 --- a/etl-destinations/src/clickhouse/schema.rs +++ b/etl-destinations/src/clickhouse/schema.rs @@ -20,7 +20,7 @@ fn postgres_column_type_to_clickhouse_sql(typ: &Type) -> &'static str { &Type::FLOAT4 => "Float32", &Type::FLOAT8 => "Float64", &Type::NUMERIC => "String", - &Type::DATE => "Date", + &Type::DATE => "Date32", &Type::TIME => "String", &Type::TIMESTAMP => "DateTime64(6)", &Type::TIMESTAMPTZ => "DateTime64(6, 'UTC')", @@ -47,7 +47,7 @@ fn postgres_array_element_clickhouse_sql(typ: &Type) -> &'static str { &Type::FLOAT4_ARRAY => "Float32", &Type::FLOAT8_ARRAY => "Float64", &Type::NUMERIC_ARRAY => "String", - &Type::DATE_ARRAY => "Date", + &Type::DATE_ARRAY => "Date32", &Type::TIME_ARRAY => "String", &Type::TIMESTAMP_ARRAY => "DateTime64(6)", &Type::TIMESTAMPTZ_ARRAY => "DateTime64(6, 'UTC')", @@ -156,7 +156,7 @@ mod tests { assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::FLOAT4), "Float32"); assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::FLOAT8), "Float64"); assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::NUMERIC), "String"); - assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::DATE), "Date"); + assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::DATE), "Date32"); assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::TIME), "String"); 
assert_eq!(postgres_column_type_to_clickhouse_sql(&Type::TIMESTAMP), "DateTime64(6)"); assert_eq!( From 6e54249b94d275d52cb07619d64a8b7dbeab6285 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 4 May 2026 12:35:49 +0900 Subject: [PATCH 81/86] test(clickhouse): cover pre-1970 and far-future dates in pipeline copy Add `pre_1970_and_far_future_dates_round_trip`, an integration test that runs the pipeline against a Postgres `date not null` column populated with the Date32 boundaries (1900-01-01, 1969-12-31, 1970-01-01, 2024-01-15, 2299-12-31) and asserts each value lands in ClickHouse as the expected signed day offset. This pins the new Date32 behavior so the previous silent-clamp regression cannot return. Update `AllTypesRow.date_col` from `u16` to `i32` to match the `Date32` schema the destination now produces. --- etl-destinations/tests/clickhouse_pipeline.rs | 120 +++++++++++++++++- etl-destinations/tests/support/clickhouse.rs | 15 ++- 2 files changed, 131 insertions(+), 4 deletions(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index faa821442..19d04910b 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -24,7 +24,7 @@ use rand::random; use tokio::time::sleep; use url::Url; -use crate::support::clickhouse::{AllTypesRow, BoundaryValuesRow}; +use crate::support::clickhouse::{AllTypesRow, BoundaryValuesRow, DateBoundariesRow}; /// Ensures the rustls crypto provider is only installed once across all tests. static INIT_CRYPTO: Once = Once::new(); @@ -101,7 +101,7 @@ const TRUNCATE_FLOW_SELECT: &str = concat!( /// round-trip). /// /// Python: `(date(2024, 1, 15) - date(1970, 1, 1)).days` = 19737 -const DATE_2024_01_15_DAYS: u16 = 19737; +const DATE_2024_01_15_DAYS: i32 = 19737; /// Microseconds from epoch for `2024-01-15 12:00:00 UTC`. const TS_2024_01_15_12_00_US: i64 = 1_705_320_000_000_000; @@ -713,6 +713,122 @@ async fn boundary_values_table_copy() { ); } +/// Days from 1970-01-01 to 1900-01-01 (ClickHouse `Date32` minimum). +/// +/// Python: `(date(1900, 1, 1) - date(1970, 1, 1)).days` = -25567. +const DATE_1900_01_01_DAYS: i32 = -25567; + +/// Days from 1970-01-01 to 1969-12-31 (just before the Unix epoch). +const DATE_1969_12_31_DAYS: i32 = -1; + +/// Days from 1970-01-01 to 2299-12-31 (ClickHouse `Date32` maximum). +/// +/// Python: `(date(2299, 12, 31) - date(1970, 1, 1)).days` = 120529. +const DATE_2299_12_31_DAYS: i32 = 120529; + +const DATE_BOUNDARIES_SELECT: &str = concat!( + "SELECT id, date_col, cdc_operation ", + "FROM \"test_date__boundaries\" ", + "ORDER BY id", +); + +/// Tests that Postgres `date` values outside the Unix epoch (pre-1970 and +/// far-future) round-trip through ClickHouse `Date32` as signed day offsets, +/// rather than being silently clamped as the previous `Date` (UInt16) mapping +/// would have done. +/// +/// # GIVEN +/// +/// A Postgres table with a single non-null `date` column populated with the +/// `Date32` boundary values: `1900-01-01` (minimum), `1969-12-31` (just before +/// epoch), `1970-01-01` (epoch), `2024-01-15` (typical), and `2299-12-31` +/// (maximum). +/// +/// # WHEN +/// +/// The pipeline copies the rows to ClickHouse. +/// +/// # THEN +/// +/// Each `date_col` lands in ClickHouse as the matching signed day offset: +/// negative for pre-1970 dates, zero at the epoch, and a large positive value +/// for the far-future date. No value is clamped. 
+#[tokio::test(flavor = "multi_thread")]
+async fn pre_1970_and_far_future_dates_round_trip() {
+    init_test_tracing();
+    install_crypto_provider();
+
+    // --- GIVEN: Postgres source with date boundary rows ---
+
+    let database = spawn_source_database().await;
+    let table_name = test_table_name("date_boundaries");
+
+    let table_id = database
+        .create_table(table_name.clone(), true, &[("date_col", "date not null")])
+        .await
+        .expect("Failed to create date_boundaries table");
+
+    let publication_name = "test_pub_ch_date_boundaries";
+    database
+        .create_publication(publication_name, std::slice::from_ref(&table_name))
+        .await
+        .expect("Failed to create publication");
+
+    for date_literal in ["1900-01-01", "1969-12-31", "1970-01-01", "2024-01-15", "2299-12-31"] {
+        database
+            .run_sql(&format!(
+                "INSERT INTO {} (date_col) VALUES (DATE '{}')",
+                table_name.as_quoted_identifier(),
+                date_literal,
+            ))
+            .await
+            .unwrap_or_else(|_| panic!("Failed to insert {date_literal}"));
+    }
+
+    // --- WHEN: pipeline copies data to ClickHouse ---
+    let clickhouse_db = setup_clickhouse_database().await;
+    let store = NotifyingStore::new();
+    let pipeline_id: PipelineId = random();
+    let destination = clickhouse_db.build_destination(store.clone());
+
+    let table_ready =
+        store.notify_on_table_state_type(table_id, TableReplicationPhaseType::Ready).await;
+
+    let mut pipeline = create_pipeline(
+        &database.config,
+        pipeline_id,
+        publication_name.to_owned(),
+        store.clone(),
+        destination,
+    );
+
+    pipeline.start().await.unwrap();
+    table_ready.notified().await;
+    pipeline.shutdown_and_wait().await.unwrap();
+
+    // --- THEN: each date encodes as the expected signed day offset ---
+    let rows: Vec<DateBoundariesRow> = clickhouse_db.query(DATE_BOUNDARIES_SELECT).await;
+    assert_eq!(rows.len(), 5, "expected 5 rows in ClickHouse");
+
+    assert_eq!(
+        rows[0].date_col, DATE_1900_01_01_DAYS,
+        "1900-01-01 must encode as the Date32 minimum offset, not be clamped"
+    );
+    assert_eq!(
+        rows[1].date_col, DATE_1969_12_31_DAYS,
+        "1969-12-31 must encode as -1 (one day before the Unix epoch)"
+    );
+    assert_eq!(rows[2].date_col, 0, "1970-01-01 must encode as 0 (the Unix epoch)");
+    assert_eq!(
+        rows[3].date_col, DATE_2024_01_15_DAYS,
+        "2024-01-15 must encode as 19737 (typical post-epoch value)"
+    );
+    assert_eq!(
+        rows[4].date_col, DATE_2299_12_31_DAYS,
+        "2299-12-31 must encode as the Date32 maximum offset, not be clamped"
+    );
+}
+
 /// Tests that DELETE events are streamed to ClickHouse after initial table
 /// copy.
 ///
diff --git a/etl-destinations/tests/support/clickhouse.rs b/etl-destinations/tests/support/clickhouse.rs
index 72758ea92..acced45df 100644
--- a/etl-destinations/tests/support/clickhouse.rs
+++ b/etl-destinations/tests/support/clickhouse.rs
@@ -3,7 +3,7 @@
 /// A row read back from the ClickHouse `all_types_encoding` test table.
 ///
 /// Column-to-type mapping:
-/// - `Date` -> `u16` (days since 1970-01-01 in RowBinary)
+/// - `Date32` -> `i32` (signed days from 1970-01-01 in RowBinary)
 /// - `DateTime64(6)` -> `i64` (microseconds since epoch in RowBinary)
 /// - `UUID` -> `String` (via `toString()` in the SELECT query)
 /// - `Array(Nullable(T))` -> `Vec<Option<T>>`
@@ -21,7 +21,7 @@ pub struct AllTypesRow {
     pub boolean_col: bool,
     pub text_col: String,
     pub varchar_col: String,
-    pub date_col: u16, // Date -> days since epoch
+    pub date_col: i32, // Date32 -> signed days from 1970-01-01
     pub timestamp_col: i64, // DateTime64(6) -> microseconds
     pub timestamptz_col: i64, // DateTime64(6,'UTC') -> microseconds
     pub time_col: String,
@@ -51,3 +51,14 @@ pub struct BoundaryValuesRow {
     pub text_array_col: Vec<Option<String>>,
     pub cdc_operation: String,
 }
+
+/// A row read back from a ClickHouse table with a single `Date32` column,
+/// used to verify Postgres `date` round-tripping for values outside the Unix
+/// epoch (pre-1970 and far-future). The `date_col` is the signed day offset
+/// from 1970-01-01.
+#[derive(clickhouse::Row, serde::Deserialize, Debug)]
+pub struct DateBoundariesRow {
+    pub id: i64,
+    pub date_col: i32,
+    pub cdc_operation: String,
+}

From 1fe9f2b713c879d42eeebca2c37e14ee26d1e539 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 4 May 2026 12:40:31 +0900
Subject: [PATCH 82/86] test(clickhouse): rename Date32 boundary constants and
 inline -1

Address review on the date-boundaries test:

- Rename `DATE_1900_01_01_DAYS` -> `DATE32_MIN_DAYS_FROM_UNIX_EPOCH` and
  `DATE_2299_12_31_DAYS` -> `DATE32_MAX_DAYS_FROM_UNIX_EPOCH` so the
  constant names describe their semantic role (the Date32 boundaries)
  rather than the underlying date.
- Drop `DATE_1969_12_31_DAYS = -1`; the literal `-1` reads more clearly
  inline with a description in the assertion message.
---
 etl-destinations/tests/clickhouse_pipeline.rs | 22 ++++++++-----------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs
index 19d04910b..eaa7fd012 100644
--- a/etl-destinations/tests/clickhouse_pipeline.rs
+++ b/etl-destinations/tests/clickhouse_pipeline.rs
@@ -713,18 +713,17 @@ async fn boundary_values_table_copy() {
     );
 }
 
-/// Days from 1970-01-01 to 1900-01-01 (ClickHouse `Date32` minimum).
+/// Signed day offset from the Unix epoch to ClickHouse `Date32`'s minimum
+/// representable date, `1900-01-01`.
 ///
 /// Python: `(date(1900, 1, 1) - date(1970, 1, 1)).days` = -25567.
-const DATE_1900_01_01_DAYS: i32 = -25567;
+const DATE32_MIN_DAYS_FROM_UNIX_EPOCH: i32 = -25567;
 
-/// Days from 1970-01-01 to 1969-12-31 (just before the Unix epoch).
-const DATE_1969_12_31_DAYS: i32 = -1;
-
-/// Days from 1970-01-01 to 2299-12-31 (ClickHouse `Date32` maximum).
+/// Signed day offset from the Unix epoch to ClickHouse `Date32`'s maximum
+/// representable date, `2299-12-31`.
 ///
 /// Python: `(date(2299, 12, 31) - date(1970, 1, 1)).days` = 120529.
-const DATE_2299_12_31_DAYS: i32 = 120529;
+const DATE32_MAX_DAYS_FROM_UNIX_EPOCH: i32 = 120529;
 
 const DATE_BOUNDARIES_SELECT: &str = concat!(
     "SELECT id, date_col, cdc_operation ",
@@ -811,20 +810,17 @@
     assert_eq!(rows.len(), 5, "expected 5 rows in ClickHouse");
 
     assert_eq!(
-        rows[0].date_col, DATE_1900_01_01_DAYS,
+        rows[0].date_col, DATE32_MIN_DAYS_FROM_UNIX_EPOCH,
         "1900-01-01 must encode as the Date32 minimum offset, not be clamped"
     );
-    assert_eq!(
-        rows[1].date_col, DATE_1969_12_31_DAYS,
-        "1969-12-31 must encode as -1 (one day before the Unix epoch)"
-    );
+    assert_eq!(rows[1].date_col, -1, "1969-12-31 must encode as one day before the Unix epoch");
     assert_eq!(rows[2].date_col, 0, "1970-01-01 must encode as 0 (the Unix epoch)");
     assert_eq!(
         rows[3].date_col, DATE_2024_01_15_DAYS,
         "2024-01-15 must encode as 19737 (typical post-epoch value)"
     );
     assert_eq!(
-        rows[4].date_col, DATE_2299_12_31_DAYS,
+        rows[4].date_col, DATE32_MAX_DAYS_FROM_UNIX_EPOCH,
         "2299-12-31 must encode as the Date32 maximum offset, not be clamped"
     );
 }

From 4323ff649ec19ff1373fd230f29798167999e940 Mon Sep 17 00:00:00 2001
From: Jordan McQueen
Date: Mon, 4 May 2026 13:05:33 +0900
Subject: [PATCH 83/86] Drop high-cardinality table label from ClickHouse
 DDL/INSERT histograms

---
 etl-destinations/src/clickhouse/client.rs | 19 ++++++-------------
 etl-destinations/src/clickhouse/core.rs   |  4 ++--
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/etl-destinations/src/clickhouse/client.rs b/etl-destinations/src/clickhouse/client.rs
index fdec3ef5b..1dfdd3328 100644
--- a/etl-destinations/src/clickhouse/client.rs
+++ b/etl-destinations/src/clickhouse/client.rs
@@ -154,12 +154,7 @@
     /// Executes a DDL statement (e.g. `CREATE TABLE IF NOT EXISTS …`) and
     /// records its duration in the `etl_clickhouse_ddl_duration_seconds`
-    /// histogram labelled with the DDL `kind` and `table_name`.
-    pub(crate) async fn execute_ddl(
-        &self,
-        kind: DdlKind,
-        table_name: &str,
-        sql: &str,
-    ) -> EtlResult<()> {
+    /// histogram labelled with the DDL `kind`.
+    pub(crate) async fn execute_ddl(&self, kind: DdlKind, sql: &str) -> EtlResult<()> {
         let ddl_start = Instant::now();
         let result = self.inner.query(sql).execute().await.map_err(|e| {
             etl_error!(
@@ -171,7 +166,6 @@
         metrics::histogram!(
             ETL_CLICKHOUSE_DDL_DURATION_SECONDS,
             "kind" => kind.as_label(),
-            "table" => table_name.to_owned(),
         )
         .record(ddl_start.elapsed().as_secs_f64());
         result
@@ -216,13 +210,13 @@
         after_column: Option<&str>,
     ) -> EtlResult<()> {
         let sql = build_add_column_sql(table_name, column, after_column);
-        self.execute_ddl(DdlKind::AddColumn, table_name, &sql).await
+        self.execute_ddl(DdlKind::AddColumn, &sql).await
     }
 
     /// Drops a column from an existing ClickHouse table (idempotent).
     pub(crate) async fn drop_column(&self, table_name: &str, column_name: &str) -> EtlResult<()> {
         let sql = build_drop_column_sql(table_name, column_name);
-        self.execute_ddl(DdlKind::DropColumn, table_name, &sql).await
+        self.execute_ddl(DdlKind::DropColumn, &sql).await
     }
 
     /// Renames a column in an existing ClickHouse table (idempotent).
@@ -237,7 +231,7 @@
         new_name: &str,
     ) -> EtlResult<()> {
         let sql = build_rename_column_sql(table_name, old_name, new_name);
-        self.execute_ddl(DdlKind::RenameColumn, table_name, &sql).await
+        self.execute_ddl(DdlKind::RenameColumn, &sql).await
     }
 
     /// Executes `TRUNCATE TABLE IF EXISTS` for the supplied table.
@@ -302,8 +296,7 @@ impl ClickHouseClient { })?; metrics::histogram!( ETL_CLICKHOUSE_INSERT_DURATION_SECONDS, - "table" => table_name.to_owned(), - "source" => source + "source" => source, ) .record(insert_start.elapsed().as_secs_f64()); } diff --git a/etl-destinations/src/clickhouse/core.rs b/etl-destinations/src/clickhouse/core.rs index b3b6b36af..e9353f524 100644 --- a/etl-destinations/src/clickhouse/core.rs +++ b/etl-destinations/src/clickhouse/core.rs @@ -251,7 +251,7 @@ where self.store.store_destination_table_metadata(table_id, metadata.clone()).await?; let ddl = build_create_table_sql(clickhouse_table_name, schema.column_schemas()); - self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?; + self.client.execute_ddl(DdlKind::CreateTable, &ddl).await?; self.store.store_destination_table_metadata(table_id, metadata.to_applied()).await?; @@ -367,7 +367,7 @@ where } None => { let ddl = build_create_table_sql(clickhouse_table_name, schema.column_schemas()); - self.client.execute_ddl(DdlKind::CreateTable, clickhouse_table_name, &ddl).await?; + self.client.execute_ddl(DdlKind::CreateTable, &ddl).await?; } } From 4e3ebe4cc5d24f3c75b82c75976bb8ccf98e2b60 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 4 May 2026 13:09:32 +0900 Subject: [PATCH 84/86] Update ClickHouse metric descriptions to match actual labels --- etl-destinations/src/clickhouse/metrics.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/etl-destinations/src/clickhouse/metrics.rs b/etl-destinations/src/clickhouse/metrics.rs index 55188f424..be287129e 100644 --- a/etl-destinations/src/clickhouse/metrics.rs +++ b/etl-destinations/src/clickhouse/metrics.rs @@ -4,13 +4,14 @@ use metrics::{Unit, describe_histogram}; static REGISTER_METRICS: Once = Once::new(); -/// Duration of `CREATE TABLE IF NOT EXISTS` DDL operations sent to ClickHouse. -/// Labels: `table`. +/// Duration of DDL operations sent to ClickHouse. +/// Labels: `kind` (`create_table`, `add_column`, `drop_column`, +/// `rename_column`). pub(super) const ETL_CLICKHOUSE_DDL_DURATION_SECONDS: &str = "etl_clickhouse_ddl_duration_seconds"; /// Duration of a single RowBinary INSERT statement from first write to server -/// acknowledgement. Labels: `table`, `source` (`copy` = initial table sync, -/// `streaming` = CDC events). +/// acknowledgement. Labels: `source` (`copy` = initial table sync, `streaming` +/// = CDC events). pub(super) const ETL_CLICKHOUSE_INSERT_DURATION_SECONDS: &str = "etl_clickhouse_insert_duration_seconds"; @@ -22,15 +23,14 @@ pub(super) fn register_metrics() { describe_histogram!( ETL_CLICKHOUSE_DDL_DURATION_SECONDS, Unit::Seconds, - "Duration of CREATE TABLE IF NOT EXISTS DDL operations sent to ClickHouse, labeled by \ - table" + "Duration of DDL operations sent to ClickHouse, labeled by kind" ); describe_histogram!( ETL_CLICKHOUSE_INSERT_DURATION_SECONDS, Unit::Seconds, "Duration of RowBinary INSERT statements from first write to server acknowledgement, \ - labeled by table and source" + labeled by source" ); }); } From 999247e400f6c5fef5efa85fe85ce1fb34bcded1 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 4 May 2026 13:23:53 +0900 Subject: [PATCH 85/86] chore(replicator): drop sample configuration yaml files `etl-replicator/configuration/{base,dev,prod}.yaml` are not on main and only ever held placeholder credentials (`password: password`) plus Kubernetes service URLs. 
The deployment model is explicitly to provide configuration at runtime via `APP_CONFIG_DIR` (see Dockerfile and README), so committed dummy configs serve no purpose and risk normalising checked-in placeholder secrets. --- etl-replicator/configuration/base.yaml | 21 --------------------- etl-replicator/configuration/dev.yaml | 21 --------------------- etl-replicator/configuration/prod.yaml | 21 --------------------- 3 files changed, 63 deletions(-) delete mode 100644 etl-replicator/configuration/base.yaml delete mode 100644 etl-replicator/configuration/dev.yaml delete mode 100644 etl-replicator/configuration/prod.yaml diff --git a/etl-replicator/configuration/base.yaml b/etl-replicator/configuration/base.yaml deleted file mode 100644 index 1618d4c8f..000000000 --- a/etl-replicator/configuration/base.yaml +++ /dev/null @@ -1,21 +0,0 @@ -application: - host: "[::]" - port: 8080 -destination: - clickhouse: - url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 - user: default - password: password - database: mydb -pipeline: - id: 42 - publication_name: my_pub - pg_connection: - host: postgres.etl-data-plane.svc.cluster.local - port: 5432 - name: mydb - username: postgres - password: password - tls: - trusted_root_certs: "" - enabled: false diff --git a/etl-replicator/configuration/dev.yaml b/etl-replicator/configuration/dev.yaml deleted file mode 100644 index 1618d4c8f..000000000 --- a/etl-replicator/configuration/dev.yaml +++ /dev/null @@ -1,21 +0,0 @@ -application: - host: "[::]" - port: 8080 -destination: - clickhouse: - url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 - user: default - password: password - database: mydb -pipeline: - id: 42 - publication_name: my_pub - pg_connection: - host: postgres.etl-data-plane.svc.cluster.local - port: 5432 - name: mydb - username: postgres - password: password - tls: - trusted_root_certs: "" - enabled: false diff --git a/etl-replicator/configuration/prod.yaml b/etl-replicator/configuration/prod.yaml deleted file mode 100644 index 1618d4c8f..000000000 --- a/etl-replicator/configuration/prod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -application: - host: "[::]" - port: 8080 -destination: - clickhouse: - url: http://clickhouse.etl-data-plane.svc.cluster.local:8123 - user: default - password: password - database: mydb -pipeline: - id: 42 - publication_name: my_pub - pg_connection: - host: postgres.etl-data-plane.svc.cluster.local - port: 5432 - name: mydb - username: postgres - password: password - tls: - trusted_root_certs: "" - enabled: false From a314b477a1f3a48252809296f761722d7a003d10 Mon Sep 17 00:00:00 2001 From: Jordan McQueen Date: Mon, 4 May 2026 13:26:54 +0900 Subject: [PATCH 86/86] test(clickhouse): switch DefaultIdentityDeleteRow.date_col to i32 Missed in the Date -> Date32 migration. CI surfaced it as `SchemaMismatch: attempting to (de)serialize ClickHouse type Date32 as u16` when `delete_with_default_replica_identity` ran. Match the row struct to the new schema. --- etl-destinations/tests/clickhouse_pipeline.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etl-destinations/tests/clickhouse_pipeline.rs b/etl-destinations/tests/clickhouse_pipeline.rs index eaa7fd012..e5265236e 100644 --- a/etl-destinations/tests/clickhouse_pipeline.rs +++ b/etl-destinations/tests/clickhouse_pipeline.rs @@ -1553,7 +1553,7 @@ struct DefaultIdentityDeleteRow { boolean_col: bool, text_col: String, varchar_col: String, - date_col: u16, + date_col: i32, timestamp_col: i64, timestamptz_col: i64, time_col: String,