scverse · ehsanestaji · May 21, 2026 · May 21, 2026
diff --git a/.gitignore b/.gitignore
@@ -35,3 +35,4 @@ __pycache__/
 benchmark/benchmarks/data
 benchmarks/benchmarks/data
 benchmarks/pkgs
+benchmarks/results
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -74,3 +74,44 @@ All benchmarks:
 ### View in the browser:
 
 You can view the benchmarks in the browser with `asv publish` followed by `asv preview`. If you want to include benchmarks of a local branch, I think you'll have to add that branch to the `"branches"` list in `asv.conf.json`.
+
+## Dask Chunk Default Exploration
+
+Issue [#2036](https://github.com/scverse/anndata/issues/2036) needs benchmark data for choosing more sensible virtual Dask chunk defaults when reading chunked HDF5/Zarr arrays lazily. The script below creates dense `X` arrays with controlled on-disk chunks, reads them through `anndata.experimental.read_elem_lazy`, runs a small set of Dask and Scanpy-style workloads, and writes one CSV row per grid point. Rows include runtime package versions, store size, task count, elapsed time, and coarse process/worker memory readings.
+
+Run a small local smoke benchmark:
+
+```bash
+uv run --group test-min python benchmarks/scripts/dask_chunk_grid.py \
+  --shape 1000,250 \
+  --store-types h5ad zarr \
+  --on-disk-chunks 100,250 \
+  --dask-chunks default \
+  --dask-chunks 500,-1 \
+  --workers 1 \
+  --threads-per-worker 1 \
+  --workloads sum_axis0 normalize_log1p_slice scanpy_normalize_log1p \
+  --repeats 1 \
+  --force
+```
+
+Run a larger grid for analysis:
+
+```bash
+uv run --group test-min python benchmarks/scripts/dask_chunk_grid.py \
+  --shape 12000,3000 \
+  --store-types h5ad zarr \
+  --on-disk-chunks 256,1024 \
+  --on-disk-chunks 1024,1024 \
+  --dask-chunks default \
+  --dask-chunks 1024,-1 \
+  --dask-chunks 4096,-1 \
+  --workers 1 \
+  --workers 4 \
+  --threads-per-worker 1 \
+  --workloads sum_axis0 sum_axis1 normalize_log1p_slice scanpy_normalize_log1p \
+  --repeats 3 \
+  --force
+```
+
+By default, results are written to `benchmarks/results/dask_chunk_grid.csv`. Use `benchmarks/notebooks/dask_chunk_grid_analysis.ipynb` to compare elapsed time, task counts, and coarse memory readings across the grid.
diff --git a/benchmarks/notebooks/dask_chunk_grid_analysis.ipynb b/benchmarks/notebooks/dask_chunk_grid_analysis.ipynb
@@ -0,0 +1,194 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7fb27b941602401d91542211134fc71a",
+   "metadata": {},
+   "source": [
+    "# Dask Chunk Grid Analysis\n",
+    "\n",
+    "This notebook summarizes CSV output from `benchmarks/scripts/dask_chunk_grid.py` for issue #2036. It is intentionally small: run the benchmark script first, then use the tables and plots below to compare on-disk chunks, virtual Dask chunks, worker settings, task counts, storage overhead, and coarse memory readings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acae54e37e7d407bbb7b55eff062a284",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from __future__ import annotations\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "results_path = Path(\"../results/dask_chunk_grid.csv\")\n",
+    "df = pd.read_csv(results_path)\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a63283cbaf04dbcab1f6479b197f3a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "metadata_cols = [\n",
+    "    \"run_started_at\",\n",
+    "    \"python_version\",\n",
+    "    \"platform\",\n",
+    "    \"anndata_version\",\n",
+    "    \"numpy_version\",\n",
+    "    \"h5py_version\",\n",
+    "    \"zarr_version\",\n",
+    "    \"dask_version\",\n",
+    "    \"distributed_version\",\n",
+    "    \"scanpy_version\",\n",
+    "]\n",
+    "\n",
+    "df[metadata_cols].drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8dd0d8092fe74a7c96281538738b07e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "group_cols = [\n",
+    "    \"store_type\",\n",
+    "    \"zarr_format\",\n",
+    "    \"zarr_shards\",\n",
+    "    \"shape\",\n",
+    "    \"on_disk_chunks\",\n",
+    "    \"dask_chunks_arg\",\n",
+    "    \"workers\",\n",
+    "    \"threads_per_worker\",\n",
+    "    \"processes\",\n",
+    "    \"workload\",\n",
+    "]\n",
+    "\n",
+    "summary = (\n",
+    "    df\n",
+    "    .groupby(group_cols, dropna=False)\n",
+    "    .agg(\n",
+    "        elapsed_median_s=(\"elapsed_s\", \"median\"),\n",
+    "        elapsed_min_s=(\"elapsed_s\", \"min\"),\n",
+    "        task_count=(\"task_count\", \"median\"),\n",
+    "        dataset_nbytes=(\"dataset_nbytes\", \"median\"),\n",
+    "        store_nbytes=(\"store_nbytes\", \"median\"),\n",
+    "        worker_rss_after_mb=(\"worker_rss_after_mb\", \"median\"),\n",
+    "        runs=(\"elapsed_s\", \"size\"),\n",
+    "    )\n",
+    "    .reset_index()\n",
+    "    .sort_values([\"workload\", \"elapsed_median_s\"])\n",
+    ")\n",
+    "summary[\"store_overhead_ratio\"] = summary[\"store_nbytes\"] / summary[\"dataset_nbytes\"]\n",
+    "summary.head(20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72eea5119410473aa328ad9291626812",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "best_by_workload = summary.loc[\n",
+    "    summary.groupby([\"store_type\", \"workload\"])[\"elapsed_median_s\"].idxmin()\n",
+    "]\n",
+    "best_by_workload.sort_values([\"store_type\", \"workload\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8edb47106e1a46a883d545849b8ab81b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "baseline_cols = [\n",
+    "    \"store_type\",\n",
+    "    \"zarr_format\",\n",
+    "    \"zarr_shards\",\n",
+    "    \"shape\",\n",
+    "    \"on_disk_chunks\",\n",
+    "    \"workers\",\n",
+    "    \"threads_per_worker\",\n",
+    "    \"processes\",\n",
+    "    \"workload\",\n",
+    "]\n",
+    "\n",
+    "baseline = summary.loc[\n",
+    "    summary[\"dask_chunks_arg\"] == \"default\", baseline_cols + [\"elapsed_median_s\"]\n",
+    "]\n",
+    "baseline = baseline.rename(columns={\"elapsed_median_s\": \"default_elapsed_median_s\"})\n",
+    "speedups = summary.merge(baseline, on=baseline_cols, how=\"left\")\n",
+    "speedups[\"speedup_vs_default\"] = (\n",
+    "    speedups[\"default_elapsed_median_s\"] / speedups[\"elapsed_median_s\"]\n",
+    ")\n",
+    "speedups.sort_values([\"workload\", \"speedup_vs_default\"], ascending=[True, False]).head(\n",
+    "    30\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10185d26023b46108eb7d9f57d49d2b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "for workload, workload_df in summary.groupby(\"workload\"):\n",
+    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
+    "    labels = []\n",
+    "    for label, label_df in workload_df.groupby([\n",
+    "        \"store_type\",\n",
+    "        \"on_disk_chunks\",\n",
+    "        \"workers\",\n",
+    "        \"threads_per_worker\",\n",
+    "    ]):\n",
+    "        label_df = label_df.sort_values(\"dask_chunks_arg\")\n",
+    "        labels.append(str(label))\n",
+    "        ax.plot(\n",
+    "            label_df[\"dask_chunks_arg\"],\n",
+    "            label_df[\"elapsed_median_s\"],\n",
+    "            marker=\"o\",\n",
+    "            label=str(label),\n",
+    "        )\n",
+    "    ax.set_title(workload)\n",
+    "    ax.set_xlabel(\"Dask chunks argument\")\n",
+    "    ax.set_ylabel(\"Median elapsed seconds\")\n",
+    "    ax.tick_params(axis=\"x\", rotation=45)\n",
+    "    ax.legend(fontsize=\"small\", bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n",
+    "    fig.tight_layout()\n",
+    "    plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}