OpenSTEF · egordm · May 19, 2026 · May 15, 2026 · May 15, 2026 · May 15, 2026
diff --git a/.gitignore b/.gitignore
@@ -68,6 +68,7 @@ dmypy.json
 docs/_build/
 docs/source/api/generated/
 docs/source/tutorials/
+docs/source/benchmarks/
 
 # docs/_doctrees/
 # docs/_static_gen/
@@ -138,3 +139,4 @@ liander_dataset/
 
 # Jupyter notebook cache (myst-nb execution outputs)
 .jupyter_cache/
+docs/build.zip
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -150,17 +150,20 @@ def _discover_submodules(fullname: str) -> list[str]:
 # Configure MyST for docstrings
 myst_enable_extensions = [
     "deflist",
+    "dollarmath",
     "tasklist",
     "colon_fence",
 ]
 
 # -- Notebook execution (myst-nb) -------------------------------------------
 nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]}
-nb_execution_mode = "off"  # TODO(#884): enable "cache" once tutorials are optimized for faster execution
+nb_execution_mode = "cache"
 nb_execution_timeout = 120
 nb_execution_raise_on_error = True
-# TODO(#884): backtesting notebook exceeds timeout — needs rewrite or execution split
-nb_execution_excludepatterns = ["tutorials/backtesting_openstef_with_beam*"]
+nb_execution_excludepatterns = [
+    "benchmarks/*",  # Benchmarks are too expensive to execute during docs build
+    "benchmarks/*/*",
+]
 
 # Sphinx version switcher
 config = SphinxConfig("../../pyproject.toml", globalns=globals())

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -7,12 +7,45 @@
 Examples
 ========
 
-End-to-end tutorials demonstrating OpenSTEF workflows. Each example is a runnable
+End-to-end tutorials demonstrating OpenSTEF workflows. Each tutorial is a runnable
 Jupyter notebook rendered with executed outputs.
 
+Tutorials
+---------
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   Forecasting Quickstart <tutorials/forecasting_quickstart>
+   Backtesting Quickstart <tutorials/backtesting_quickstart>
+
 .. toctree::
    :maxdepth: 1
+   :caption: Model Training
 
-   Forecasting with Presets <tutorials/forecasting_with_workflow_presets>
+   Building a Custom Pipeline <tutorials/custom_pipeline>
+   Ensemble Forecasting <tutorials/ensemble_forecasting>
    Hyperparameter Tuning <tutorials/hyperparameter_tuning_with_optuna>
-   Backtesting with BEAM <tutorials/backtesting_openstef_with_beam>
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Evaluation & Analysis
+
+   Model Explainability <tutorials/model_explainability>
+   Quantile Calibration <tutorials/quantile_calibration>
+
+
+Benchmarks
+----------
+
+Compare models on real energy data. These notebooks are **not executed** during
+the docs build — run them locally instead.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Benchmarking
+
+   Benchmarking Guide <benchmarks/README>
+   Liander 2024 <benchmarks/liander2024/README>
+   Build Your Own <benchmarks/custom/README>
diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
@@ -0,0 +1,40 @@
+<!--
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
+-->
+
+# Benchmarks
+
+End-to-end benchmarking using **BEAM** (Backtesting, Evaluation, Analysis, Metrics).
+
+BEAM replays historical data day by day, trains your model, makes forecasts, and scores them — all without data leakage.
+
+## Which notebook do I need?
+
+| I want to… | Start here |
+|---|---|
+| **See how OpenSTEF performs** (just run, no code changes) | [XGBoost & GBLinear](liander2024/run_xgboost_gblinear_benchmark) |
+| **Benchmark my own model** | [Implement a Custom Forecaster](custom/custom_forecaster) |
+| **Benchmark on my own data** | [Configure a Custom Benchmark](custom/custom_benchmark) |
+| **Score predictions I already have** | [Evaluate Existing Forecasts](custom/evaluate_existing_forecasts) |
+
+## Quick start
+
+```bash
+# Install (requires uv: https://docs.astral.sh/uv/)
+uv sync --all-extras --all-groups --all-packages
+
+# Run the built-in Liander 2024 benchmark (XGBoost + GBLinear)
+uv run python -m examples.benchmarks.liander2024.run_xgboost_gblinear_benchmark
+```
+
+## Liander 2024
+
+Pre-made benchmarks on the [Liander 2024 STEF benchmark dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark).
+No code changes needed — just run.
+
+## Build Your Own
+
+Templates for benchmarking custom models or custom data. See the
+[Build Your Own](custom/README) section for a detailed walkthrough.
diff --git a/examples/benchmarks/custom/README.md b/examples/benchmarks/custom/README.md
@@ -0,0 +1,136 @@
+<!--
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
+-->
+
+# Custom Benchmark Templates
+
+Copy this folder as a starting point for your own BEAM benchmarks.
+
+## Which file do I start with?
+
+| I want to… | Start here |
+|---|---|
+| **Benchmark my own model** | `custom_forecaster.py` — implement `BacktestForecasterMixin` |
+| **Benchmark on my own data** | `custom_benchmark.py` — extend `SimpleTargetProvider` |
+| **Score predictions I already have** | `evaluate_existing_forecasts.py` |
+
+## Files
+
+| File | Role |
+|---|---|
+| `custom_forecaster.py` | **Template: your model.** Implements the `BacktestForecasterMixin` interface (config, quantiles, fit, predict). |
+| `custom_benchmark.py` | **Template: your benchmark.** Defines where data lives, which metrics to use, and assembles the pipeline. |
+| `run_liander2024_benchmark.py` | **Entry point:** test your forecaster on the built-in Liander 2024 dataset (auto-downloaded). |
+| `run_custom_benchmark.py` | **Entry point:** run your forecaster on your own data (uses `custom_benchmark.py`). |
+| `evaluate_existing_forecasts.py` | **Entry point:** bring your own prediction parquets, skip backtesting. |
+| `compare_benchmark_runs.py` | **Entry point:** compare results from multiple runs side-by-side. |
+
+## Quick start
+
+```bash
+# Install (requires uv: https://docs.astral.sh/uv/)
+uv sync --all-extras --all-groups --all-packages
+
+# Test the example forecaster on Liander 2024
+uv run python -m examples.benchmarks.custom.run_liander2024_benchmark
+
+# Run with your custom data/targets
+uv run python -m examples.benchmarks.custom.run_custom_benchmark
+```
+
+## Creating your own
+
+### 1. Write a forecaster
+
+Copy `custom_forecaster.py` and implement two methods:
+
+- **`fit(data)`** — called periodically with recent history. Train your model here.
+- **`predict(data)`** — called every few hours. Return a `TimeSeriesDataset` with a `"load"` column and one column per quantile (e.g. `"quantile_P05"`, `"quantile_P50"`).
+
+The `data` argument is a `RestrictedHorizonVersionedTimeSeries` — it enforces no-lookahead by only exposing data available at `data.horizon`.
+
+### 2. Define a benchmark (optional)
+
+Copy `custom_benchmark.py` if you want to use **your own data**. Override `_get_measurements_path_for_target()` and `_get_weather_path_for_target()` to point to your parquet files.
+
+If you're fine with the Liander 2024 dataset, skip this step and use `create_liander2024_benchmark_runner()` directly.
+
+### 3. Run it
+
+Copy `run_custom_benchmark.py`. Register your models as forecaster factories and call `pipeline.run()`.
+
+## Evaluating pre-existing forecasts
+
+If you already have predictions, place them in this layout:
+
+```
+benchmark_results/MyForecasts/
+└── backtest/
+    └── <group_name>/                   # e.g. "solar_park"
+        └── <target_name>/              # e.g. "Within 15 kilometers of Opmeer_normalized"
+            └── predictions.parquet
+```
+
+`group_name` and `target_name` must match the values from your targets YAML. You can list them:
+
+```bash
+uv run python -c "
+from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner
+from openstef_beam.benchmarking import LocalBenchmarkStorage
+from pathlib import Path
+runner = create_custom_benchmark_runner(storage=LocalBenchmarkStorage(base_path=Path('./tmp')))
+for t in runner.target_provider.get_targets(['solar_park']):
+    print(t.group_name, '/', t.name)
+"
+```
+
+Each `predictions.parquet` must have:
+
+| Column | Type | Description |
+|---|---|---|
+| *(index)* `timestamp` | `DatetimeIndex` | The time each prediction is valid for. 15-minute intervals, tz-naive UTC. |
+| `available_at` | `datetime64` | When the prediction was generated (enables D-1 / lead-time filtering). |
+| `quantile_P05` | `float` | 5th percentile prediction. |
+| `quantile_P50` | `float` | Median prediction (**required**). |
+| `quantile_P95` | `float` | 95th percentile prediction. |
+| ... | `float` | One column per quantile, named with `Quantile(x).format()`. |
+
+Example rows:
+
+```
+timestamp (index)      available_at          quantile_P05  quantile_P50  quantile_P95
+2023-01-15 12:00:00    2023-01-14 06:00:00   0.5           1.2           2.0
+2023-01-15 12:15:00    2023-01-14 06:00:00   0.6           1.3           2.1
+```
+
+Then run:
+
+```bash
+uv run python -m examples.benchmarks.custom.evaluate_existing_forecasts
+```
+
+Results are written to `./benchmark_results/`. Each model gets its own subfolder with backtest predictions, evaluation scores, and analysis plots.
+
+## Comparing results
+
+After running at least two models, generate side-by-side comparison plots (global, per-group, per-target). The script automatically detects which targets are available in all runs:
+
+```bash
+uv run python -m examples.benchmarks.custom.compare_benchmark_runs
+```
+
+Output (HTML plots) is saved to `./benchmark_results_comparison/liander2024/`.
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+Implement a Custom Forecaster <custom_forecaster>
+Configure a Custom Benchmark <custom_benchmark>
+Run on Liander 2024 Data <run_liander2024_benchmark>
+Run on Your Own Data <run_custom_benchmark>
+Evaluate Existing Forecasts <evaluate_existing_forecasts>
+Compare Multiple Runs <compare_benchmark_runs>
+```
diff --git a/...s/benchmarks/custom_benchmark/__init__.py → examples/benchmarks/custom/__init__.py b/...s/benchmarks/custom_benchmark/__init__.py → examples/benchmarks/custom/__init__.py
diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb b/examples/benchmarks/custom/compare_benchmark_runs.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "eb06ddbf",
+   "metadata": {},
+   "source": [
+    "# Compare Benchmark Runs\n",
+    "\n",
+    "Generate side-by-side comparison plots from multiple benchmark runs.\n",
+    "Uses [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)\n",
+    "to produce global, per-group, and per-target HTML visualizations.\n",
+    "\n",
+    "**Prerequisites:** Run at least two models first (e.g. via `run_liander2024_benchmark.py`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16bad08a",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Compare benchmark results from different runs on the Liander 2024 dataset.\n",
+    "\n",
+    "Usage:\n",
+    "    1. First run at least two models with run_liander2024_benchmark.py\n",
+    "       (e.g. ExampleBaseline and GBLinear).\n",
+    "    2. Then run this script to generate side-by-side comparison plots.\n",
+    "\n",
+    "Output is saved to ./benchmark_results_comparison/liander2024/.\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51bc5fb2",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Point at the result directories from your benchmark runs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3148cff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from pathlib import Path\n",
+    "from typing import cast\n",
+    "\n",
+    "from openstef_beam.analysis.models import RunName\n",
+    "from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage\n",
+    "from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n",
+    "from openstef_beam.benchmarking.storage import BenchmarkStorage\n",
+    "\n",
+    "# One storage per run — keys are human-readable labels shown in comparison plots.\n",
+    "run_storages: dict[RunName, BenchmarkStorage] = {\n",
+    "    \"ExampleBaseline\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/ExampleBaseline\")),\n",
+    "    \"GBLinear\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/GBLinear\")),\n",
+    "}\n",
+    "\n",
+    "# Check that results exist.\n",
+    "for name, storage in run_storages.items():\n",
+    "    base_path = cast(LocalBenchmarkStorage, storage).base_path\n",
+    "    if not base_path.exists():\n",
+    "        msg = f\"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first.\"\n",
+    "        raise FileNotFoundError(msg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cb05b48",
+   "metadata": {},
+   "source": [
+    "## Run comparison\n",
+    "\n",
+    "The pipeline loads predictions from each run, re-evaluates them, and produces\n",
+    "comparison visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1548b09a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reuse the Liander 2024 target provider.\n",
+    "OUTPUT_PATH = Path(\"./benchmark_results_comparison/liander2024\")\n",
+    "target_provider = create_liander2024_benchmark_runner(\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    ").target_provider\n",
+    "\n",
+    "# Run the comparison — generates global, group, and per-target HTML plots.\n",
+    "comparison = BenchmarkComparisonPipeline(\n",
+    "    analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    "    target_provider=target_provider,\n",
+    ")\n",
+    "comparison.run(run_data=run_storages)"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license b/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>

		SPDX-License-Identifier: MPL-2.0