From 7fb0a2a88a72679d59b4c1e2773d9dcbfc171e40 Mon Sep 17 00:00:00 2001 From: Justin Kim Date: Thu, 16 Apr 2026 21:03:03 +0900 Subject: [PATCH] fix(bench): Update DOOP zxing dataset URL to HuggingFace mirror The original host (pages.cs.wisc.edu/~m0riarty) returns 404 and is no longer maintained. Point at the FlowLog VLDB 2026 artifact mirror on HuggingFace (NemoYuu/flowlog_benchmark), which the upstream project now uses for dataset distribution. Also harden the script: - Honour DOOP_ZXING_URL to override the source without editing the file - Use curl --fail so HTTP errors abort the script - Validate the archive with unzip -tq before extraction, so a future broken mirror surfaces a clear error instead of a misleading "cannot find zipfile directory" failure - Move tmpdir/zip cleanup into a trap so partial downloads are removed even when the script exits early --- bench/data/doop/download.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/bench/data/doop/download.sh b/bench/data/doop/download.sh index 9b18cc8c..5208b7c2 100755 --- a/bench/data/doop/download.sh +++ b/bench/data/doop/download.sh @@ -1,15 +1,26 @@ #!/bin/bash # Download DOOP (zxing) benchmark dataset -# Source: FlowLog artifact (VLDB 2026) +# Source: FlowLog VLDB 2026 artifact (mirrored on HuggingFace). +# The original host (pages.cs.wisc.edu/~m0riarty) is no longer available. set -e SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" -URL="https://pages.cs.wisc.edu/~m0riarty/dataset/csv/zxing.zip" +URL="${DOOP_ZXING_URL:-https://huggingface.co/datasets/NemoYuu/flowlog_benchmark/resolve/main/dataset/csv/zxing.zip}" TMPZIP="/tmp/zxing_doop_$$.zip" TMPDIR="/tmp/zxing_doop_$$" -echo "Downloading zxing dataset (~112 MB)..." -curl -L "$URL" -o "$TMPZIP" +cleanup() { rm -rf "$TMPZIP" "$TMPDIR"; } +trap cleanup EXIT + +echo "Downloading zxing dataset (~112 MB) from $URL ..." +curl --fail -L "$URL" -o "$TMPZIP" + +# Fail fast if the server returned HTML (e.g. 404 page) instead of a zip. +if ! unzip -tq "$TMPZIP" > /dev/null 2>&1; then + echo "ERROR: downloaded file is not a valid zip archive." >&2 + echo " Check connectivity and DOOP_ZXING_URL." >&2 + exit 1 +fi echo "Extracting required CSV files..." mkdir -p "$TMPDIR" @@ -28,5 +39,4 @@ for f in $REQUIRED; do cp "$TMPDIR/zxing/${f}.csv" "$SCRIPT_DIR/" done -rm -rf "$TMPDIR" "$TMPZIP" echo "Done. 34 CSV files copied to $SCRIPT_DIR/"