Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,4 @@ __pycache__/
benchmark/benchmarks/data
benchmarks/benchmarks/data
benchmarks/pkgs
benchmarks/results
41 changes: 41 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,44 @@ All benchmarks:
### View in the browser:

You can view the benchmarks in the browser with `asv publish` followed by `asv preview`. If you want to include benchmarks of a local branch, I think you'll have to add that branch to the `"branches"` list in `asv.conf.json`.

## Dask Chunk Default Exploration

Issue [#2036](https://github.com/scverse/anndata/issues/2036) needs benchmark data for choosing more sensible virtual Dask chunk defaults when reading chunked HDF5/Zarr arrays lazily. The script below creates dense `X` arrays with controlled on-disk chunks, reads them through `anndata.experimental.read_elem_lazy`, runs a small set of Dask and Scanpy-style workloads, and writes one CSV row per grid point. Rows include runtime package versions, store size, task count, elapsed time, and coarse process/worker memory readings.

Run a small local smoke benchmark:

```bash
uv run --group test-min python benchmarks/scripts/dask_chunk_grid.py \
--shape 1000,250 \
--store-types h5ad zarr \
--on-disk-chunks 100,250 \
--dask-chunks default \
--dask-chunks 500,-1 \
--workers 1 \
--threads-per-worker 1 \
--workloads sum_axis0 normalize_log1p_slice scanpy_normalize_log1p \
--repeats 1 \
--force
```

Run a larger grid for analysis:

```bash
uv run --group test-min python benchmarks/scripts/dask_chunk_grid.py \
--shape 12000,3000 \
--store-types h5ad zarr \
--on-disk-chunks 256,1024 \
--on-disk-chunks 1024,1024 \
--dask-chunks default \
--dask-chunks 1024,-1 \
--dask-chunks 4096,-1 \
--workers 1 \
--workers 4 \
--threads-per-worker 1 \
--workloads sum_axis0 sum_axis1 normalize_log1p_slice scanpy_normalize_log1p \
--repeats 3 \
--force
```

By default, results are written to `benchmarks/results/dask_chunk_grid.csv`. Use `benchmarks/notebooks/dask_chunk_grid_analysis.ipynb` to compare elapsed time, task counts, and coarse memory readings across the grid.
194 changes: 194 additions & 0 deletions benchmarks/notebooks/dask_chunk_grid_analysis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "7fb27b941602401d91542211134fc71a",
"metadata": {},
"source": [
"# Dask Chunk Grid Analysis\n",
"\n",
"This notebook summarizes CSV output from `benchmarks/scripts/dask_chunk_grid.py` for issue #2036. It is intentionally small: run the benchmark script first, then use the tables and plots below to compare on-disk chunks, virtual Dask chunks, worker settings, task counts, storage overhead, and coarse memory readings."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acae54e37e7d407bbb7b55eff062a284",
"metadata": {},
"outputs": [],
"source": [
"from __future__ import annotations\n",
"\n",
"from pathlib import Path\n",
"\n",
"import pandas as pd\n",
"\n",
"results_path = Path(\"../results/dask_chunk_grid.csv\")\n",
"df = pd.read_csv(results_path)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9a63283cbaf04dbcab1f6479b197f3a8",
"metadata": {},
"outputs": [],
"source": [
"metadata_cols = [\n",
" \"run_started_at\",\n",
" \"python_version\",\n",
" \"platform\",\n",
" \"anndata_version\",\n",
" \"numpy_version\",\n",
" \"h5py_version\",\n",
" \"zarr_version\",\n",
" \"dask_version\",\n",
" \"distributed_version\",\n",
" \"scanpy_version\",\n",
"]\n",
"\n",
"df[metadata_cols].drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8dd0d8092fe74a7c96281538738b07e2",
"metadata": {},
"outputs": [],
"source": [
"group_cols = [\n",
" \"store_type\",\n",
" \"zarr_format\",\n",
" \"zarr_shards\",\n",
" \"shape\",\n",
" \"on_disk_chunks\",\n",
" \"dask_chunks_arg\",\n",
" \"workers\",\n",
" \"threads_per_worker\",\n",
" \"processes\",\n",
" \"workload\",\n",
"]\n",
"\n",
"summary = (\n",
" df\n",
" .groupby(group_cols, dropna=False)\n",
" .agg(\n",
" elapsed_median_s=(\"elapsed_s\", \"median\"),\n",
" elapsed_min_s=(\"elapsed_s\", \"min\"),\n",
" task_count=(\"task_count\", \"median\"),\n",
" dataset_nbytes=(\"dataset_nbytes\", \"median\"),\n",
" store_nbytes=(\"store_nbytes\", \"median\"),\n",
" worker_rss_after_mb=(\"worker_rss_after_mb\", \"median\"),\n",
" runs=(\"elapsed_s\", \"size\"),\n",
" )\n",
" .reset_index()\n",
" .sort_values([\"workload\", \"elapsed_median_s\"])\n",
")\n",
"summary[\"store_overhead_ratio\"] = summary[\"store_nbytes\"] / summary[\"dataset_nbytes\"]\n",
"summary.head(20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72eea5119410473aa328ad9291626812",
"metadata": {},
"outputs": [],
"source": [
"best_by_workload = summary.loc[\n",
" summary.groupby([\"store_type\", \"workload\"])[\"elapsed_median_s\"].idxmin()\n",
"]\n",
"best_by_workload.sort_values([\"store_type\", \"workload\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8edb47106e1a46a883d545849b8ab81b",
"metadata": {},
"outputs": [],
"source": [
"baseline_cols = [\n",
" \"store_type\",\n",
" \"zarr_format\",\n",
" \"zarr_shards\",\n",
" \"shape\",\n",
" \"on_disk_chunks\",\n",
" \"workers\",\n",
" \"threads_per_worker\",\n",
" \"processes\",\n",
" \"workload\",\n",
"]\n",
"\n",
"baseline = summary.loc[\n",
" summary[\"dask_chunks_arg\"] == \"default\", baseline_cols + [\"elapsed_median_s\"]\n",
"]\n",
"baseline = baseline.rename(columns={\"elapsed_median_s\": \"default_elapsed_median_s\"})\n",
"speedups = summary.merge(baseline, on=baseline_cols, how=\"left\")\n",
"speedups[\"speedup_vs_default\"] = (\n",
" speedups[\"default_elapsed_median_s\"] / speedups[\"elapsed_median_s\"]\n",
")\n",
"speedups.sort_values([\"workload\", \"speedup_vs_default\"], ascending=[True, False]).head(\n",
" 30\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "10185d26023b46108eb7d9f57d49d2b3",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"for workload, workload_df in summary.groupby(\"workload\"):\n",
" fig, ax = plt.subplots(figsize=(10, 5))\n",
" labels = []\n",
" for label, label_df in workload_df.groupby([\n",
" \"store_type\",\n",
" \"on_disk_chunks\",\n",
" \"workers\",\n",
" \"threads_per_worker\",\n",
" ]):\n",
" label_df = label_df.sort_values(\"dask_chunks_arg\")\n",
" labels.append(str(label))\n",
" ax.plot(\n",
" label_df[\"dask_chunks_arg\"],\n",
" label_df[\"elapsed_median_s\"],\n",
" marker=\"o\",\n",
" label=str(label),\n",
" )\n",
" ax.set_title(workload)\n",
" ax.set_xlabel(\"Dask chunks argument\")\n",
" ax.set_ylabel(\"Median elapsed seconds\")\n",
" ax.tick_params(axis=\"x\", rotation=45)\n",
" ax.legend(fontsize=\"small\", bbox_to_anchor=(1.05, 1), loc=\"upper left\")\n",
" fig.tight_layout()\n",
" plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading