Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 11 additions & 28 deletions duckdb-vortex-partitioned/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,45 +1,28 @@
#!/bin/bash
#
# Benchmark driver for the "DuckDB (Vortex, partitioned)" setup.
#
# Steps: install the DuckDB CLI and its vortex extension, download the 100
# partitioned parquet files, convert each one to a .vortex file, then create
# the database objects from create.sql. Expects create.sql to be in the
# current directory.

set -Eeuo pipefail

# Install
# Give HOME a value when it is unset so the PATH line below is safe under `set -u`.
export HOME=${HOME:=~}
# --fail keeps an HTTP error page from being piped into sh.
curl --fail https://install.duckdb.org | sh
export PATH="${HOME}/.duckdb/cli/latest:${PATH}"

duckdb -c "INSTALL vortex;"

# Load the data
# --continue lets a re-run skip or resume files that are already on disk.
seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue --quiet https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'

# Convert parquet files to vortex partitioned
# Each partition is converted exactly once; existing .vortex files are kept.
# EventDate and EventTime are written as stored in the parquet files:
# create.sql converts EventDate in the view and defines toDateTime() for
# EventTime, so converting them here as well would conflict with it.
echo -n "Load time: "
seq 0 99 | command time -f '%e' xargs -P"$(nproc)" -I{} bash -c '
if [ ! -f "hits_{}.vortex" ]; then
duckdb -c "LOAD vortex;" -c "COPY (SELECT * REPLACE (URL::VARCHAR AS URL, Title::VARCHAR AS Title, Referer::VARCHAR AS Referer) FROM '"'"'hits_{}.parquet'"'"') TO '"'"'hits_{}.vortex'"'"' (FORMAT vortex);"
fi
'

# Create view and macro
# create.sql is the only place that defines the hits view; creating the view
# here too would make its CREATE VIEW fail and abort the script under `set -e`.
echo -n "Load time: "
command time -f '%e' duckdb hits-partitioned.db -f create.sql

# Run the queries
echo 'partitioned'
Expand Down
6 changes: 6 additions & 0 deletions duckdb-vortex-partitioned/create.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Load the vortex extension before read_vortex is used below.
LOAD vortex;

-- Expose every file matching hits_*.vortex as a single view named hits,
-- with the EventDate column replaced by make_date(EventDate).
-- NOTE(review): presumably EventDate is stored as a day number -- confirm.
CREATE VIEW hits AS
SELECT * REPLACE (make_date(EventDate) AS EventDate)
FROM read_vortex('hits_*.vortex');
-- Helper for queries that need EventTime as a timestamp: multiplies its
-- argument by 1000 and passes it to epoch_ms.
-- NOTE(review): assumes t is in epoch seconds -- confirm.
CREATE MACRO toDateTime(t) AS epoch_ms(t * 1000);
4 changes: 2 additions & 2 deletions duckdb-vortex-partitioned/queries.sql
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase
SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
SELECT UserID, extract(minute FROM toDateTime(EventTime)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
SELECT UserID FROM hits WHERE UserID = 435090932899640449;
SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
Expand All @@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
SELECT DATE_TRUNC('minute', toDateTime(EventTime)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', toDateTime(EventTime)) ORDER BY DATE_TRUNC('minute', toDateTime(EventTime)) LIMIT 10 OFFSET 1000;
92 changes: 46 additions & 46 deletions duckdb-vortex-partitioned/results/c6a.4xlarge.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"system": "DuckDB (Vortex, partitioned)",
"date": "2025-08-06",
"date": "2026-01-26",
"machine": "c6a.4xlarge",
"cluster_size": 1,
"proprietary": "no",
Expand All @@ -10,52 +10,52 @@

"tags": ["Rust", "column-oriented", "embedded", "stateless", "lukewarm-cold-run"],

"load_time": 742.26,
"data_size": 15961049404,
"load_time": 306.10,
"data_size": 62297495012,

"result": [
[0.184,0.013,0.003],
[0.523,0.014,0.014],
[1.610,0.035,0.035],
[3.435,0.049,0.052],
[3.466,0.329,0.332],
[4.172,0.297,0.292],
[0.181,0.022,0.020],
[0.567,0.019,0.018],
[4.334,0.415,0.405],
[4.319,0.561,0.558],
[2.784,0.097,0.091],
[3.485,0.124,0.107],
[4.721,0.307,0.316],
[7.135,0.672,0.675],
[4.478,0.343,0.342],
[2.783,0.392,0.387],
[7.061,0.852,0.847],
[6.856,0.740,0.628],
[9.200,1.517,1.505],
[1.971,0.038,0.031],
[33.849,0.556,0.530],
[36.486,0.679,0.636],
[40.129,1.065,1.072],
[7.566,0.392,0.382],
[1.880,0.122,0.062],
[4.896,0.096,0.098],
[1.791,0.126,0.031],
[34.787,0.863,0.790],
[28.059,9.317,9.314],
[0.717,0.033,0.033],
[7.835,0.279,0.317],
[13.900,0.420,0.403],
[10.751,1.919,1.892],
[34.222,2.047,1.980],
[34.208,2.288,2.131],
[1.861,0.511,0.506],
[0.258,0.025,0.024],
[0.840,0.012,0.021],
[1.098,0.024,0.018],
[1.265,0.063,0.053],
[0.815,0.022,0.008],
[0.889,0.010,0.010],
[0.833,0.032,0.012]
[0.081,0.018,0.006],
[0.079,0.027,0.024],
[0.667,0.038,0.041],
[1.835,0.057,0.055],
[1.839,0.256,0.257],
[9.272,0.599,0.610],
[0.093,0.017,0.018],
[0.102,0.031,0.026],
[2.230,0.408,0.412],
[2.011,0.556,0.555],
[7.812,0.290,0.288],
[7.884,0.305,0.301],
[9.172,0.608,0.575],
[10.977,0.967,0.951],
[9.324,0.648,0.621],
[1.305,0.358,0.357],
[10.873,1.085,1.112],
[10.675,0.880,0.917],
[12.196,1.869,1.859],
[0.723,0.037,0.036],
[15.900,0.729,0.709],
[21.164,0.877,0.884],
[29.683,1.692,1.686],
[43.726,13.442,1.409],
[0.870,1.021,1.554],
[9.743,0.394,0.334],
[1.740,1.337,0.620],
[15.907,0.733,0.719],
[13.537,6.344,6.282],
[0.118,0.038,0.037],
[11.198,0.750,0.751],
[13.790,0.897,0.902],
[5.408,1.860,1.927],
[15.674,2.265,2.199],
[15.707,3.102,3.082],
[0.671,0.548,0.552],
[0.105,0.027,0.029],
[0.089,0.016,0.016],
[0.534,0.021,0.022],
[0.645,0.063,0.063],
[0.330,0.013,0.012],
[0.457,0.014,0.014],
[0.429,0.018,0.018]
]
}
4 changes: 4 additions & 0 deletions duckdb-vortex-partitioned/run.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/bin/bash

set -Eeuo pipefail

TRIES=3

cat queries.sql | while read -r query; do
Expand All @@ -9,6 +11,8 @@ cat queries.sql | while read -r query; do
echo "$query";
cli_params=()
cli_params+=("-c")
cli_params+=("LOAD vortex;")
cli_params+=("-c")
cli_params+=(".timer on")
for i in $(seq 1 $TRIES); do
cli_params+=("-c")
Expand Down