diff --git a/.gitignore b/.gitignore
index 790e5bc66..710a26c21 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,6 +68,7 @@ dmypy.json
 docs/_build/
 docs/source/api/generated/
 docs/source/tutorials/
+docs/source/benchmarks/
 
 # docs/_doctrees/
 # docs/_static_gen/
@@ -138,3 +139,4 @@ liander_dataset/
 
 # Jupyter notebook cache (myst-nb execution outputs)
 .jupyter_cache/
+docs/build.zip
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 45fb908ea..efd6e9909 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -150,17 +150,20 @@ def _discover_submodules(fullname: str) -> list[str]:
 # Configure MyST for docstrings
 myst_enable_extensions = [
     "deflist",
+    "dollarmath",
     "tasklist",
     "colon_fence",
 ]
 
 # -- Notebook execution (myst-nb) -------------------------------------------
 nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]}
-nb_execution_mode = "off"  # TODO(#884): enable "cache" once tutorials are optimized for faster execution
+nb_execution_mode = "cache"
 nb_execution_timeout = 120
 nb_execution_raise_on_error = True
-# TODO(#884): backtesting notebook exceeds timeout — needs rewrite or execution split
-nb_execution_excludepatterns = ["tutorials/backtesting_openstef_with_beam*"]
+nb_execution_excludepatterns = [
+    "benchmarks/*",  # Benchmarks are too expensive to execute during docs build
+    "benchmarks/*/*",
+]
 
 # Sphinx version switcher
 config = SphinxConfig("../../pyproject.toml", globalns=globals())
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index d184ef7b0..dd5f7ba51 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -7,12 +7,45 @@
 Examples
 ========
 
-End-to-end tutorials demonstrating OpenSTEF workflows. Each example is a runnable
+End-to-end tutorials demonstrating OpenSTEF workflows. Each tutorial is a runnable
 Jupyter notebook rendered with executed outputs.
 
+Tutorials
+---------
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   Forecasting Quickstart <tutorials/forecasting_quickstart>
+   Backtesting Quickstart <tutorials/backtesting_quickstart>
+
 .. toctree::
    :maxdepth: 1
+   :caption: Model Training
 
-   Forecasting with Presets <tutorials/forecasting_with_workflow_presets>
+   Building a Custom Pipeline <tutorials/custom_pipeline>
+   Ensemble Forecasting <tutorials/ensemble_forecasting>
    Hyperparameter Tuning <tutorials/hyperparameter_tuning_with_optuna>
-   Backtesting with BEAM <tutorials/backtesting_openstef_with_beam>
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Evaluation & Analysis
+
+   Model Explainability <tutorials/model_explainability>
+   Quantile Calibration <tutorials/quantile_calibration>
+
+
+Benchmarks
+----------
+
+Compare models on real energy data. These notebooks are **not executed** during
+the docs build — run them locally.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Benchmarking
+
+   Benchmarking Guide <benchmarks/README>
+   Liander 2024 <benchmarks/liander2024/README>
+   Build Your Own <benchmarks/custom/README>
diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md
new file mode 100644
index 000000000..10a600dc6
--- /dev/null
+++ b/examples/benchmarks/README.md
@@ -0,0 +1,40 @@
+<!--
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
+-->
+
+# Benchmarks
+
+End-to-end benchmarking using **BEAM** (Backtesting, Evaluation, Analysis, Metrics).
+
+BEAM replays historical data day by day, trains your model, makes forecasts, and scores them — all without data leakage.
+
+## Which notebook do I need?
+
+| I want to… | Start here |
+|---|---|
+| **See how OpenSTEF performs** (just run, no code changes) | [XGBoost & GBLinear](liander2024/run_xgboost_gblinear_benchmark) |
+| **Benchmark my own model** | [Implement a Custom Forecaster](custom/custom_forecaster) |
+| **Benchmark on my own data** | [Configure a Custom Benchmark](custom/custom_benchmark) |
+| **Score predictions I already have** | [Evaluate Existing Forecasts](custom/evaluate_existing_forecasts) |
+
+## Quick start
+
+```bash
+# Install (requires uv: https://docs.astral.sh/uv/)
+uv sync --all-extras --all-groups --all-packages
+
+# Run the built-in Liander 2024 benchmark (XGBoost + GBLinear)
+uv run python -m examples.benchmarks.liander2024.run_xgboost_gblinear_benchmark
+```
+
+## Liander 2024
+
+Pre-made benchmarks on the [Liander 2024 STEF benchmark dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark).
+No code changes needed — just run.
+
+## Build Your Own
+
+Templates for benchmarking custom models or custom data. See the
+[Build Your Own](custom/README) section for a detailed walkthrough.
diff --git a/examples/benchmarks/custom/README.md b/examples/benchmarks/custom/README.md
new file mode 100644
index 000000000..a5557c226
--- /dev/null
+++ b/examples/benchmarks/custom/README.md
@@ -0,0 +1,136 @@
+<!--
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
+-->
+
+# Custom Benchmark Templates
+
+Copy this folder as a starting point for your own BEAM benchmarks.
+
+## Which file do I start with?
+
+| I want to… | Start here |
+|---|---|
+| **Benchmark my own model** | `custom_forecaster.py` — implement `BacktestForecasterMixin` |
+| **Benchmark on my own data** | `custom_benchmark.py` — extend `SimpleTargetProvider` |
+| **Score predictions I already have** | `evaluate_existing_forecasts.py` |
+
+## Files
+
+| File | Role |
+|---|---|
+| `custom_forecaster.py` | **Template: your model.** Implements the `BacktestForecasterMixin` interface (config, quantiles, fit, predict). |
+| `custom_benchmark.py` | **Template: your benchmark.** Defines where data lives, which metrics to use, and assembles the pipeline. |
+| `run_liander2024_benchmark.py` | **Entry point:** test your forecaster on the built-in Liander 2024 dataset (auto-downloaded). |
+| `run_custom_benchmark.py` | **Entry point:** run your forecaster on your own data (uses `custom_benchmark.py`). |
+| `evaluate_existing_forecasts.py` | **Entry point:** bring your own prediction parquets, skip backtesting. |
+| `compare_benchmark_runs.py` | **Entry point:** compare results from multiple runs side-by-side. |
+
+## Quick start
+
+```bash
+# Install (requires uv: https://docs.astral.sh/uv/)
+uv sync --all-extras --all-groups --all-packages
+
+# Test the example forecaster on Liander 2024
+uv run python -m examples.benchmarks.custom.run_liander2024_benchmark
+
+# Run with your custom data/targets
+uv run python -m examples.benchmarks.custom.run_custom_benchmark
+```
+
+## Creating your own
+
+### 1. Write a forecaster
+
+Copy `custom_forecaster.py` and implement two methods:
+
+- **`fit(data)`** — called periodically with recent history. Train your model here.
+- **`predict(data)`** — called every few hours. Return a `TimeSeriesDataset` with a `"load"` column and one column per quantile (e.g. `"quantile_P05"`, `"quantile_P50"`).
+
+The `data` argument is a `RestrictedHorizonVersionedTimeSeries` — it enforces no-lookahead by only exposing data available at `data.horizon`.
+
+### 2. Define a benchmark (optional)
+
+Copy `custom_benchmark.py` if you want to use **your own data**. Override `_get_measurements_path_for_target()` and `_get_weather_path_for_target()` to point to your parquet files.
+
+If you're fine with the Liander 2024 dataset, skip this step and use `create_liander2024_benchmark_runner()` directly.
+
+### 3. Run it
+
+Copy `run_custom_benchmark.py`. Register your models as forecaster factories and call `pipeline.run()`.
+
+## Evaluating pre-existing forecasts
+
+If you already have predictions, place them in this layout:
+
+```
+benchmark_results/MyForecasts/
+└── backtest/
+    └── <group_name>/                   # e.g. "solar_park"
+        └── <target_name>/              # e.g. "Within 15 kilometers of Opmeer_normalized"
+            └── predictions.parquet
+```
+
+`group_name` and `target_name` must match the values from your targets YAML. You can list them:
+
+```bash
+uv run python -c "
+from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner
+from openstef_beam.benchmarking import LocalBenchmarkStorage
+from pathlib import Path
+runner = create_custom_benchmark_runner(storage=LocalBenchmarkStorage(base_path=Path('./tmp')))
+for t in runner.target_provider.get_targets(['solar_park']):
+    print(t.group_name, '/', t.name)
+"
+```
+
+Each `predictions.parquet` must have:
+
+| Column | Type | Description |
+|---|---|---|
+| *(index)* `timestamp` | `DatetimeIndex` | The time each prediction is valid for. 15-min intervals, tz-naive UTC. |
+| `available_at` | `datetime64` | When the prediction was generated (enables D-1 / lead-time filtering). |
+| `quantile_P05` | `float` | 5th percentile prediction. |
+| `quantile_P50` | `float` | Median prediction (**required**). |
+| `quantile_P95` | `float` | 95th percentile prediction. |
+| ... | `float` | One column per quantile, named with `Quantile(x).format()`. |
+
+Example rows:
+
+```
+timestamp (index)      available_at          quantile_P05  quantile_P50  quantile_P95
+2023-01-15 12:00:00    2023-01-14 06:00:00   0.5           1.2           2.0
+2023-01-15 12:15:00    2023-01-14 06:00:00   0.6           1.3           2.1
+```
+
+Then run:
+
+```bash
+uv run python -m examples.benchmarks.custom.evaluate_existing_forecasts
+```
+
+Results are written to `./benchmark_results/`. Each model gets its own subfolder with backtest predictions, evaluation scores, and analysis plots.
+
+## Comparing results
+
+After running at least two models, generate side-by-side comparison plots (global, per-group, per-target). The script automatically detects which targets are available in all runs:
+
+```bash
+uv run python -m examples.benchmarks.custom.compare_benchmark_runs
+```
+
+Output (HTML plots) is saved to `./benchmark_results_comparison/liander2024/`.
+
+```{toctree}
+:maxdepth: 1
+:hidden:
+
+Implement a Custom Forecaster <custom_forecaster>
+Configure a Custom Benchmark <custom_benchmark>
+Run on Liander 2024 Data <run_liander2024_benchmark>
+Run on Your Own Data <run_custom_benchmark>
+Evaluate Existing Forecasts <evaluate_existing_forecasts>
+Compare Multiple Runs <compare_benchmark_runs>
+```
diff --git a/examples/benchmarks/custom_benchmark/__init__.py b/examples/benchmarks/custom/__init__.py
similarity index 100%
rename from examples/benchmarks/custom_benchmark/__init__.py
rename to examples/benchmarks/custom/__init__.py
diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb b/examples/benchmarks/custom/compare_benchmark_runs.ipynb
new file mode 100644
index 000000000..643b159a6
--- /dev/null
+++ b/examples/benchmarks/custom/compare_benchmark_runs.ipynb
@@ -0,0 +1,130 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "eb06ddbf",
+   "metadata": {},
+   "source": [
+    "# Compare Benchmark Runs\n",
+    "\n",
+    "Generate side-by-side comparison plots from multiple benchmark runs.\n",
+    "Uses [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)\n",
+    "to produce global, per-group, and per-target HTML visualizations.\n",
+    "\n",
+    "**Prerequisites:** Run at least two models first (e.g. via `run_liander2024_benchmark.py`)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16bad08a",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Compare benchmark results from different runs on the Liander 2024 dataset.\n",
+    "\n",
+    "Usage:\n",
+    "    1. First run at least two models with run_liander2024_benchmark.py\n",
+    "       (e.g. ExampleBaseline and GBLinear).\n",
+    "    2. Then run this script to generate side-by-side comparison plots.\n",
+    "\n",
+    "Output is saved to ./benchmark_results_comparison/liander2024/.\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51bc5fb2",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Point at the result directories from your benchmark runs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c3148cff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from pathlib import Path\n",
+    "from typing import cast\n",
+    "\n",
+    "from openstef_beam.analysis.models import RunName\n",
+    "from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage\n",
+    "from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n",
+    "from openstef_beam.benchmarking.storage import BenchmarkStorage\n",
+    "\n",
+    "# One storage per run — keys are human-readable labels shown in comparison plots.\n",
+    "run_storages: dict[RunName, BenchmarkStorage] = {\n",
+    "    \"ExampleBaseline\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/ExampleBaseline\")),\n",
+    "    \"GBLinear\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/GBLinear\")),\n",
+    "}\n",
+    "\n",
+    "# Check that results exist.\n",
+    "for name, storage in run_storages.items():\n",
+    "    base_path = cast(LocalBenchmarkStorage, storage).base_path\n",
+    "    if not base_path.exists():\n",
+    "        msg = f\"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first.\"\n",
+    "        raise FileNotFoundError(msg)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cb05b48",
+   "metadata": {},
+   "source": [
+    "## Run comparison\n",
+    "\n",
+    "The pipeline loads predictions from each run, re-evaluates them, and produces\n",
+    "comparison visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1548b09a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reuse the Liander 2024 target provider.\n",
+    "OUTPUT_PATH = Path(\"./benchmark_results_comparison/liander2024\")\n",
+    "target_provider = create_liander2024_benchmark_runner(\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    ").target_provider\n",
+    "\n",
+    "# Run the comparison — generates global, group, and per-target HTML plots.\n",
+    "comparison = BenchmarkComparisonPipeline(\n",
+    "    analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    "    target_provider=target_provider,\n",
+    ")\n",
+    "comparison.run(run_data=run_storages)"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license b/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/compare_liander2024_results.py b/examples/benchmarks/custom/compare_benchmark_runs.py
similarity index 67%
rename from examples/benchmarks/custom_benchmark/compare_liander2024_results.py
rename to examples/benchmarks/custom/compare_benchmark_runs.py
index f0e0ea725..ef5846375 100644
--- a/examples/benchmarks/custom_benchmark/compare_liander2024_results.py
+++ b/examples/benchmarks/custom/compare_benchmark_runs.py
@@ -1,3 +1,28 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Compare Benchmark Runs
+#
+# Generate side-by-side comparison plots from multiple benchmark runs.
+# Uses [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)
+# to produce global, per-group, and per-target HTML visualizations.
+#
+# **Prerequisites:** Run at least two models first (e.g. via `run_liander2024_benchmark.py`).
+
+# %% tags=["remove-cell"]
 """Compare benchmark results from different runs on the Liander 2024 dataset.
 
 Usage:
@@ -12,6 +37,13 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 
+# %% [markdown]
+# ## Setup
+#
+# Point at the result directories from your benchmark runs.
+
+# %%
+
 from pathlib import Path
 from typing import cast
 
@@ -34,6 +66,13 @@
         msg = f"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first."
         raise FileNotFoundError(msg)
 
+# %% [markdown]
+# ## Run comparison
+#
+# The pipeline loads predictions from each run, re-evaluates them, and produces
+# comparison visualizations.
+
+# %%
 # Reuse the Liander 2024 target provider.
 OUTPUT_PATH = Path("./benchmark_results_comparison/liander2024")
 target_provider = create_liander2024_benchmark_runner(
diff --git a/examples/benchmarks/custom/custom_benchmark.ipynb b/examples/benchmarks/custom/custom_benchmark.ipynb
new file mode 100644
index 000000000..395575269
--- /dev/null
+++ b/examples/benchmarks/custom/custom_benchmark.ipynb
@@ -0,0 +1,307 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2d7a92c5",
+   "metadata": {},
+   "source": [
+    "# Custom Benchmark Configuration\n",
+    "\n",
+    "Defines a complete benchmark: where your data lives, which metrics to compute,\n",
+    "and how to assemble the pipeline.\n",
+    "\n",
+    "**User story:** *\"I want to benchmark on my own data.\"*\n",
+    "\n",
+    "Copy this file and modify `MyTargetProvider` to point at your dataset.\n",
+    "The pipeline configuration (`create_custom_benchmark_runner`) shows all the\n",
+    "knobs: backtest schedule, evaluation windows, analysis visualizations.\n",
+    "\n",
+    "**See also:**\n",
+    "- [TargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html) — abstract interface\n",
+    "- [SimpleTargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html) — file-based implementation (what we extend here)\n",
+    "- [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — the orchestrator\n",
+    "- [EvaluationConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) — how predictions are sliced and scored\n",
+    "- [Custom Forecaster template](./custom_forecaster.ipynb) — implement your model here"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "556e06da",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Example: custom benchmark with your own target provider.\n",
+    "\n",
+    "Shows how to extend SimpleTargetProvider to load your own data and build a\n",
+    "benchmark pipeline. Uses the Liander 2024 dataset as example data source --\n",
+    "replace paths and logic with your own.\n",
+    "\n",
+    "Expected directory layout (customize via path overrides)::\n",
+    "\n",
+    "    data_dir/\n",
+    "    ├── targets.yaml                    # Target definitions\n",
+    "    ├── load_measurements/\n",
+    "    │   └── <group_name>/<name>.parquet # Measurements per target\n",
+    "    └── features/\n",
+    "        └── <group_name>/<name>.parquet # Features per target (weather, etc.)\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb46c6a9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from datetime import timedelta\n",
+    "from pathlib import Path\n",
+    "from typing import Literal, override\n",
+    "\n",
+    "from huggingface_hub import snapshot_download  # pyright: ignore[reportUnknownVariableType]\n",
+    "from pydantic import Field\n",
+    "\n",
+    "from openstef_beam.analysis import AnalysisConfig\n",
+    "from openstef_beam.analysis.visualizations import WindowedMetricVisualization\n",
+    "from openstef_beam.analysis.visualizations.grouped_target_metric_visualization import GroupedTargetMetricVisualization\n",
+    "from openstef_beam.analysis.visualizations.quantile_probability_visualization import QuantileProbabilityVisualization\n",
+    "from openstef_beam.analysis.visualizations.summary_table_visualization import SummaryTableVisualization\n",
+    "from openstef_beam.analysis.visualizations.timeseries_visualization import TimeSeriesVisualization\n",
+    "from openstef_beam.backtesting import BacktestConfig\n",
+    "from openstef_beam.benchmarking import BenchmarkPipeline, BenchmarkTarget, StrictExecutionCallback\n",
+    "from openstef_beam.benchmarking.storage.base import BenchmarkStorage\n",
+    "from openstef_beam.benchmarking.target_provider import SimpleTargetProvider\n",
+    "from openstef_beam.evaluation import EvaluationConfig, Window\n",
+    "from openstef_beam.evaluation.metric_providers import MetricProvider, RCRPSProvider, RMAEProvider\n",
+    "from openstef_core.types import AvailableAt, LeadTime, Quantile\n",
+    "\n",
+    "# Define your own target categories for filtering (must match group_name in targets.yaml)\n",
+    "type MyCategory = Literal[\"solar_park\", \"wind_park\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8d6120df",
+   "metadata": {},
+   "source": [
+    "## Target Provider\n",
+    "\n",
+    "The [`TargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html)\n",
+    "tells BEAM where your data lives and which metrics to compute.\n",
+    "Here we extend [`SimpleTargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html)\n",
+    "which handles file-based datasets with a targets YAML + parquet files.\n",
+    "\n",
+    "**CUSTOMIZE HERE:** Change path templates, category types, and metric selection."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc28efb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "class MyTargetProvider(SimpleTargetProvider[BenchmarkTarget, list[MyCategory]]):\n",
+    "    \"\"\"Custom target provider -- extend SimpleTargetProvider to load your own data.\n",
+    "\n",
+    "    Configure path templates and data flags, then override methods to customize\n",
+    "    target filtering, metrics, and file resolution.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Path templates -- adapt to your directory structure\n",
+    "    # {name} is replaced with target.name from targets.yaml\n",
+    "    targets_file_path: str = Field(default=\"liander2024_targets.yaml\", init=False)\n",
+    "    measurements_path_template: str = Field(default=\"{name}.parquet\", init=False)\n",
+    "    weather_path_template: str = Field(default=\"{name}.parquet\", init=False)\n",
+    "\n",
+    "    # Disable shared profiles and prices -- only per-target features are used\n",
+    "    # Set to True if you have shared data files (profiles.parquet, prices.parquet)\n",
+    "    use_profiles: bool = False\n",
+    "    use_prices: bool = False\n",
+    "\n",
+    "    @override\n",
+    "    def get_targets(self, filter_args: list[MyCategory] | None = None) -> list[BenchmarkTarget]:\n",
+    "        \"\"\"Load targets and optionally filter by category.\n",
+    "\n",
+    "        Returns:\n",
+    "            Filtered list of benchmark targets.\n",
+    "        \"\"\"\n",
+    "        # super().get_targets() reads targets from the YAML file\n",
+    "        targets = super().get_targets(filter_args)\n",
+    "        # Keep only targets whose group_name matches one of the filter categories\n",
+    "        if filter_args is not None:\n",
+    "            targets = [t for t in targets if t.group_name in filter_args]\n",
+    "        return targets\n",
+    "\n",
+    "    @override\n",
+    "    def get_metrics_for_target(self, target: BenchmarkTarget) -> list[MetricProvider]:\n",
+    "        \"\"\"Define which metrics to compute per target.\n",
+    "\n",
+    "        Returns:\n",
+    "            List of metric providers.\n",
+    "        \"\"\"\n",
+    "        # rMAE: deterministic accuracy at the median (lower is better)\n",
+    "        # rCRPS: probabilistic accuracy across all quantiles (lower is better)\n",
+    "        return [\n",
+    "            RMAEProvider(quantiles=[Quantile(0.5)], lower_quantile=0.01, upper_quantile=0.99),\n",
+    "            RCRPSProvider(lower_quantile=0.01, upper_quantile=0.99),\n",
+    "        ]\n",
+    "\n",
+    "    @override\n",
+    "    def _get_measurements_path_for_target(self, target: BenchmarkTarget) -> Path:\n",
+    "        \"\"\"Resolve path to load measurement parquet.\n",
+    "\n",
+    "        Liander 2024 uses: data_dir/load_measurements/<group>/<name>.parquet\n",
+    "        Change this to match your directory structure.\n",
+    "\n",
+    "        Returns:\n",
+    "            Path to the measurement parquet file.\n",
+    "        \"\"\"\n",
+    "        return self.data_dir / \"load_measurements\" / target.group_name / f\"{target.name}.parquet\"\n",
+    "\n",
+    "    @override\n",
+    "    def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path:\n",
+    "        \"\"\"Resolve path to features parquet (weather, etc.).\n",
+    "\n",
+    "        Liander 2024 uses: data_dir/weather_forecasts_versioned/<group>/<name>.parquet\n",
+    "        Change this to match your directory structure.\n",
+    "\n",
+    "        Returns:\n",
+    "            Path to the features parquet file.\n",
+    "        \"\"\"\n",
+    "        return self.data_dir / \"weather_forecasts_versioned\" / target.group_name / f\"{target.name}.parquet\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e3c1663",
+   "metadata": {},
+   "source": [
+    "## Analysis Configuration\n",
+    "\n",
+    "Choose which visualizations and summary tables BEAM generates after evaluation.\n",
+    "Add or remove providers to customize the output report."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5bffcc06",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "# --- Analysis config: which plots and tables to generate after evaluation ---\n",
+    "ANALYSIS_CONFIG = AnalysisConfig(\n",
+    "    visualization_providers=[\n",
+    "        TimeSeriesVisualization(name=\"time_series\"),\n",
+    "        WindowedMetricVisualization(\n",
+    "            name=\"rMAE_7D\",\n",
+    "            metric=(\"rMAE\", Quantile(0.5)),\n",
+    "            window=Window(lag=timedelta(hours=0), size=timedelta(days=7)),\n",
+    "        ),\n",
+    "        WindowedMetricVisualization(\n",
+    "            name=\"rCRPS_30D\",\n",
+    "            metric=\"rCRPS\",\n",
+    "            window=Window(lag=timedelta(hours=0), size=timedelta(days=30)),\n",
+    "        ),\n",
+    "        GroupedTargetMetricVisualization(name=\"rMAE_grouped\", metric=\"rMAE\", quantile=Quantile(0.5)),\n",
+    "        GroupedTargetMetricVisualization(name=\"rCRPS_grouped\", metric=\"rCRPS\"),\n",
+    "        SummaryTableVisualization(name=\"summary\"),\n",
+    "        QuantileProbabilityVisualization(name=\"quantile_probability\"),\n",
+    "    ],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e74b7c6",
+   "metadata": {},
+   "source": [
+    "## Pipeline Assembly\n",
+    "\n",
+    "Wire everything together: backtest schedule, evaluation config, analysis, and target provider.\n",
+    "See [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)\n",
+    "and [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)\n",
+    "for all available options.\n",
+    "\n",
+    "**CUSTOMIZE HERE:** Adjust `predict_interval`, `train_interval`, evaluation windows, and lead times."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a48b987",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def create_custom_benchmark_runner(\n",
+    "    storage: BenchmarkStorage,\n",
+    "    data_dir: Path | None = None,\n",
+    ") -> BenchmarkPipeline[BenchmarkTarget, list[MyCategory]]:\n",
+    "    \"\"\"Assemble a benchmark pipeline with the custom target provider.\n",
+    "\n",
+    "    Args:\n",
+    "        storage: Where to save results.\n",
+    "        data_dir: Dataset path. Downloads Liander 2024 from HuggingFace if None.\n",
+    "\n",
+    "    Returns:\n",
+    "        Ready-to-run benchmark pipeline.\n",
+    "    \"\"\"\n",
+    "    if data_dir is None:\n",
+    "        data_dir = Path(snapshot_download(repo_id=\"OpenSTEF/liander2024-stef-benchmark\", repo_type=\"dataset\"))\n",
+    "\n",
+    "    return BenchmarkPipeline[BenchmarkTarget, list[MyCategory]](\n",
+    "        # Backtest: how to replay history\n",
+    "        backtest_config=BacktestConfig(\n",
+    "            prediction_sample_interval=timedelta(minutes=15),  # Data resolution\n",
+    "            predict_interval=timedelta(hours=6),  # New forecast every 6 hours\n",
+    "            train_interval=timedelta(days=7),  # Retrain model every 7 days\n",
+    "        ),\n",
+    "        # Evaluation: how to slice and score the results\n",
+    "        evaluation_config=EvaluationConfig(\n",
+    "            available_ats=[AvailableAt.from_string(\"D-1T06:00\")],  # Day-ahead forecast at 06:00\n",
+    "            lead_times=[\n",
+    "                LeadTime.from_string(\"P1D\"),  # 1 day ahead\n",
+    "            ],  # Evaluate all lead times\n",
+    "            windows=[  # Rolling windows for metrics\n",
+    "                Window(lag=timedelta(hours=0), size=timedelta(days=7)),\n",
+    "                Window(lag=timedelta(hours=0), size=timedelta(days=30)),\n",
+    "            ],\n",
+    "        ),\n",
+    "        analysis_config=ANALYSIS_CONFIG,\n",
+    "        target_provider=MyTargetProvider(data_dir=data_dir),\n",
+    "        storage=storage,\n",
+    "        callbacks=[StrictExecutionCallback()],  # Fail fast on errors\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/custom_benchmark.ipynb.license b/examples/benchmarks/custom/custom_benchmark.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/custom_benchmark.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/example_benchmark.py b/examples/benchmarks/custom/custom_benchmark.py
similarity index 73%
rename from examples/benchmarks/custom_benchmark/example_benchmark.py
rename to examples/benchmarks/custom/custom_benchmark.py
index 655756199..62e7aee70 100644
--- a/examples/benchmarks/custom_benchmark/example_benchmark.py
+++ b/examples/benchmarks/custom/custom_benchmark.py
@@ -1,3 +1,38 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Custom Benchmark Configuration
+#
+# Defines a complete benchmark: where your data lives, which metrics to compute,
+# and how to assemble the pipeline.
+#
+# **User story:** *"I want to benchmark on my own data."*
+#
+# Copy this file and modify `MyTargetProvider` to point at your dataset.
+# The pipeline configuration (`create_custom_benchmark_runner`) shows all the
+# knobs: backtest schedule, evaluation windows, analysis visualizations.
+#
+# **See also:**
+# - [TargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html) — abstract interface
+# - [SimpleTargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html) — file-based implementation (what we extend here)
+# - [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — the orchestrator
+# - [EvaluationConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) — how predictions are sliced and scored
+# - [Custom Forecaster template](./custom_forecaster.ipynb) — implement your model here
+
+# %% tags=["remove-cell"]
 """Example: custom benchmark with your own target provider.
 
 Shows how to extend SimpleTargetProvider to load your own data and build a
@@ -18,6 +53,8 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 
+# %%
+
 from datetime import timedelta
 from pathlib import Path
 from typing import Literal, override
@@ -42,6 +79,18 @@
 # Define your own target categories for filtering (must match group_name in targets.yaml)
 type MyCategory = Literal["solar_park", "wind_park"]
 
+# %% [markdown]
+# ## Target Provider
+#
+# The [`TargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html)
+# tells BEAM where your data lives and which metrics to compute.
+# Here we extend [`SimpleTargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html)
+# which handles file-based datasets with a targets YAML + parquet files.
+#
+# **CUSTOMIZE HERE:** Change path templates, category types, and metric selection.
+
+# %%
+
 
 class MyTargetProvider(SimpleTargetProvider[BenchmarkTarget, list[MyCategory]]):
     """Custom target provider -- extend SimpleTargetProvider to load your own data.
@@ -114,6 +163,13 @@ def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path:
         return self.data_dir / "weather_forecasts_versioned" / target.group_name / f"{target.name}.parquet"
 
 
+# %% [markdown]
+# ## Analysis Configuration
+#
+# Choose which visualizations and summary tables BEAM generates after evaluation.
+# Add or remove providers to customize the output report.
+
+# %%
 # --- Analysis config: which plots and tables to generate after evaluation ---
 ANALYSIS_CONFIG = AnalysisConfig(
     visualization_providers=[
@@ -136,6 +192,19 @@ def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path:
 )
 
 
+# %% [markdown]
+# ## Pipeline Assembly
+#
+# Wire everything together: backtest schedule, evaluation config, analysis, and target provider.
+# See [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)
+# and [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)
+# for all available options.
+#
+# **CUSTOMIZE HERE:** Adjust `predict_interval`, `train_interval`, evaluation windows, and lead times.
+
+# %%
+
+
 def create_custom_benchmark_runner(
     storage: BenchmarkStorage,
     data_dir: Path | None = None,
diff --git a/examples/benchmarks/custom/custom_forecaster.ipynb b/examples/benchmarks/custom/custom_forecaster.ipynb
new file mode 100644
index 000000000..7beca1857
--- /dev/null
+++ b/examples/benchmarks/custom/custom_forecaster.ipynb
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "7c5e1252",
+   "metadata": {},
+   "source": [
+    "# Custom Forecaster Template\n",
+    "\n",
+    "Implements [`BacktestForecasterMixin`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterMixin.html)\n",
+    "— the interface BEAM needs to run any model in its backtesting/benchmarking pipeline.\n",
+    "\n",
+    "**User story:** *\"I want to benchmark my own model.\"*\n",
+    "\n",
+    "Copy this file and modify `fit()` and `predict()` to wrap your model.\n",
+    "\n",
+    "**See also:**\n",
+    "- [BacktestForecasterConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) — scheduling settings\n",
+    "- [RestrictedHorizonVersionedTimeSeries](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.RestrictedHorizonVersionedTimeSeries.html) — the data view passed to `fit()` and `predict()`\n",
+    "- [Backtesting quickstart tutorial](../../tutorials/backtesting_quickstart.ipynb) — introduction to backtesting concepts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f44c64a2",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Custom baseline: predicts a constant value (last known median) for all future timestamps.\n",
+    "\n",
+    "Implements BacktestForecasterMixin — the interface BEAM needs to run any model\n",
+    "in its backtesting/benchmarking pipeline. To create your own baseline, copy this\n",
+    "file and modify fit() and predict().\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "045ccd48",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from datetime import timedelta\n",
+    "from typing import override\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig, BacktestForecasterMixin\n",
+    "from openstef_beam.backtesting.restricted_horizon_timeseries import RestrictedHorizonVersionedTimeSeries\n",
+    "from openstef_core.datasets import TimeSeriesDataset\n",
+    "from openstef_core.types import Q, Quantile"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "067a5f9b",
+   "metadata": {},
+   "source": [
+    "## The `BacktestForecasterMixin` interface\n",
+    "\n",
+    "Your forecaster must implement:\n",
+    "- `config` — a [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) that tells BEAM how to schedule training and prediction\n",
+    "- `quantiles` — which probabilistic bands to produce (e.g. `[Q(0.05), Q(0.5), Q(0.95)]`)\n",
+    "- `fit(data)` — train your model on restricted-horizon data (no lookahead)\n",
+    "- `predict(data)` → `TimeSeriesDataset | None` — produce a forecast\n",
+    "\n",
+    "BEAM calls `fit()` at `train_interval` spacing, and `predict()` at `predict_interval` spacing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94bee2f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "class ExampleBenchmarkForecaster(BacktestForecasterMixin):\n",
+    "    \"\"\"Predicts a constant median of recent history for all future timestamps.\n",
+    "\n",
+    "    All quantile columns get the same value -- no uncertainty estimation.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, predict_quantiles: list[Quantile] | None = None) -> None:  # noqa: D107\n",
+    "        # Quantiles define the probabilistic forecast bands (e.g. P05 = 5th percentile)\n",
+    "        self._quantiles = predict_quantiles or [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n",
+    "        self._median: float = 0.0\n",
+    "\n",
+    "        # BacktestForecasterConfig tells BEAM how to schedule training and prediction\n",
+    "        self.config = BacktestForecasterConfig(\n",
+    "            requires_training=True,  # Call fit() before predict()\n",
+    "            predict_length=timedelta(days=7),  # How far ahead to forecast\n",
+    "            predict_min_length=timedelta(minutes=15),\n",
+    "            predict_context_length=timedelta(minutes=15),  # Data needed before forecast start (>0)\n",
+    "            predict_context_min_coverage=0.0,\n",
+    "            training_context_length=timedelta(days=30),  # How much history fit() sees\n",
+    "            training_context_min_coverage=0.3,  # Min 30% non-NaN data required\n",
+    "            predict_sample_interval=timedelta(minutes=15),  # Output resolution (15-min intervals)\n",
+    "        )\n",
+    "\n",
+    "    @property\n",
+    "    @override\n",
+    "    def quantiles(self) -> list[Quantile]:\n",
+    "        \"\"\"Quantiles this forecaster produces.\"\"\"\n",
+    "        return self._quantiles\n",
+    "\n",
+    "    @override\n",
+    "    def fit(self, data: RestrictedHorizonVersionedTimeSeries) -> None:\n",
+    "        \"\"\"Compute median of recent load data.\n",
+    "\n",
+    "        Args:\n",
+    "            data: Restricted-horizon view -- only sees data available at data.horizon.\n",
+    "        \"\"\"\n",
+    "        # data.horizon = the current point in time during backtesting\n",
+    "        # get_window() returns only data that was available at that point (no lookahead)\n",
+    "        training = data.get_window(\n",
+    "            start=data.horizon - self.config.training_context_length,  # 30 days before horizon\n",
+    "            end=data.horizon,\n",
+    "            available_before=data.horizon,  # Ensures no future data leaks in\n",
+    "        )\n",
+    "        # \"load\" is the target column (actual energy consumption/generation)\n",
+    "        if \"load\" in training.data.columns:\n",
+    "            self._median = float(training.data[\"load\"].median())\n",
+    "\n",
+    "    @override\n",
+    "    def predict(self, data: RestrictedHorizonVersionedTimeSeries) -> TimeSeriesDataset | None:\n",
+    "        \"\"\"Return constant median prediction for the forecast horizon.\n",
+    "\n",
+    "        Returns:\n",
+    "            Forecast with all quantiles set to the training median (this example never returns None).\n",
+    "        \"\"\"\n",
+    "        # Build a DataFrame with \"load\" + one column per quantile (e.g. \"quantile_P05\")\n",
+    "        # All values are the same constant (the median from fit())\n",
+    "        # q.format() converts Q(0.05) -> \"quantile_P05\" (the required column naming)\n",
+    "        return TimeSeriesDataset(\n",
+    "            data=pd.DataFrame(\n",
+    "                data={\"load\": self._median} | {q.format(): self._median for q in self._quantiles},\n",
+    "                index=pd.DatetimeIndex(\n",
+    "                    pd.date_range(\n",
+    "                        data.horizon,\n",
+    "                        periods=int(self.config.predict_length / self.config.predict_sample_interval),\n",
+    "                        freq=self.config.predict_sample_interval,\n",
+    "                    ),\n",
+    "                    name=\"datetime\",\n",
+    "                ),\n",
+    "            ),\n",
+    "            sample_interval=self.config.predict_sample_interval,\n",
+    "        )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/custom_forecaster.ipynb.license b/examples/benchmarks/custom/custom_forecaster.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/custom_forecaster.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/example_baseline.py b/examples/benchmarks/custom/custom_forecaster.py
similarity index 68%
rename from examples/benchmarks/custom_benchmark/example_baseline.py
rename to examples/benchmarks/custom/custom_forecaster.py
index ed3732e89..2dda468a4 100644
--- a/examples/benchmarks/custom_benchmark/example_baseline.py
+++ b/examples/benchmarks/custom/custom_forecaster.py
@@ -1,3 +1,34 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Custom Forecaster Template
+#
+# Implements [`BacktestForecasterMixin`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterMixin.html)
+# — the interface BEAM needs to run any model in its backtesting/benchmarking pipeline.
+#
+# **User story:** *"I want to benchmark my own model."*
+#
+# Copy this file and modify `fit()` and `predict()` to wrap your model.
+#
+# **See also:**
+# - [BacktestForecasterConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) — scheduling settings
+# - [RestrictedHorizonVersionedTimeSeries](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.RestrictedHorizonVersionedTimeSeries.html) — the data view passed to `fit()` and `predict()`
+# - [Backtesting quickstart tutorial](../../tutorials/backtesting_quickstart.ipynb) — introduction to backtesting concepts
+
+# %% tags=["remove-cell"]
 """Custom baseline: predicts a constant value (last known median) for all future timestamps.
 
 Implements BacktestForecasterMixin — the interface BEAM needs to run any model
@@ -9,6 +40,8 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 
+# %%
+
 from datetime import timedelta
 from typing import override
 
@@ -19,6 +52,19 @@
 from openstef_core.datasets import TimeSeriesDataset
 from openstef_core.types import Q, Quantile
 
+# %% [markdown]
+# ## The `BacktestForecasterMixin` interface
+#
+# Your forecaster must implement:
+# - `config` — a [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) that tells BEAM how to schedule training and prediction
+# - `quantiles` — which probabilistic bands to produce (e.g. `[Q(0.05), Q(0.5), Q(0.95)]`)
+# - `fit(data)` — train your model on restricted-horizon data (no lookahead)
+# - `predict(data)` → `TimeSeriesDataset | None` — produce a forecast
+#
+# BEAM calls `fit()` at `train_interval` spacing, and `predict()` at `predict_interval` spacing.
+
+# %%
+
 
 class ExampleBenchmarkForecaster(BacktestForecasterMixin):
     """Predicts a constant median of recent history for all future timestamps.
diff --git a/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb
new file mode 100644
index 000000000..15a90b2ad
--- /dev/null
+++ b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb
@@ -0,0 +1,241 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "24b1f426",
+   "metadata": {},
+   "source": [
+    "# Evaluate Existing Forecasts\n",
+    "\n",
+    "Skip backtesting entirely — bring your own prediction parquets and run only\n",
+    "evaluation + analysis.\n",
+    "\n",
+    "**User story:** *\"I already have forecasts from my own system. I just want to\n",
+    "score them with BEAM's metrics and visualizations.\"*\n",
+    "\n",
+    "**See also:**\n",
+    "- [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — auto-detects existing predictions and skips backtesting\n",
+    "- [Custom Benchmark configuration](./custom_benchmark.ipynb) — defines which targets and metrics to use\n",
+    "- [Quantile naming convention](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html) — `Quantile(x).format()` → column names\n",
+    "\n",
+    "## Expected directory layout\n",
+    "\n",
+    "```\n",
+    "benchmark_results/MyForecasts/\n",
+    "└── backtest/\n",
+    "    └── <group_name>/           # e.g. \"solar_park\"\n",
+    "        └── <target_name>/      # e.g. \"Within 15 kilometers of Opmeer_normalized\"\n",
+    "            └── predictions.parquet\n",
+    "```\n",
+    "\n",
+    "## Expected parquet format\n",
+    "\n",
+    "| Column | Type | Description |\n",
+    "|--------|------|-------------|\n",
+    "| *index* | `DatetimeIndex` (name=\"timestamp\", tz-naive UTC, 15-min) | Forecast timestamp |\n",
+    "| `available_at` | datetime | When the prediction was generated |\n",
+    "| `quantile_P05` | float | 5th percentile |\n",
+    "| `quantile_P50` | float | Median (required) |\n",
+    "| `quantile_P95` | float | 95th percentile |\n",
+    "| ... | float | One column per quantile via `Quantile(x).format()` |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "768a9cb8",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Evaluate pre-existing forecasts without running backtesting.\n",
+    "\n",
+    "If you already have forecast predictions (e.g. from your own model or an external\n",
+    "system), you can point the benchmark pipeline at them and run only the evaluation\n",
+    "and analysis steps.\n",
+    "\n",
+    "How it works:\n",
+    "  1. Place your prediction parquet files in the expected directory layout (see below).\n",
+    "  2. Run this script — the pipeline detects existing backtest output and\n",
+    "     automatically skips to evaluation + analysis.\n",
+    "\n",
+    "Expected directory layout::\n",
+    "\n",
+    "    benchmark_results/MyForecasts/\n",
+    "    └── backtest/\n",
+    "        └── <group_name>/           # e.g. \"solar_park\"\n",
+    "            └── <target_name>/      # e.g. \"Within 15 kilometers of Opmeer_normalized\"\n",
+    "                └── predictions.parquet\n",
+    "\n",
+    "Expected parquet format::\n",
+    "\n",
+    "    Index:   pd.DatetimeIndex (name=\"timestamp\", tz-naive UTC, 15-min intervals)\n",
+    "    Columns:\n",
+    "      - \"available_at\" (datetime)  — when the prediction was generated\n",
+    "      - \"quantile_P05\" (float)     — 5th percentile prediction\n",
+    "      - \"quantile_P50\" (float)     — median prediction (REQUIRED)\n",
+    "      - \"quantile_P95\" (float)     — 95th percentile prediction\n",
+    "      - ...one column per quantile, named with Quantile(x).format()\n",
+    "\n",
+    "Example row::\n",
+    "\n",
+    "    timestamp (index)      available_at          quantile_P05  quantile_P50  quantile_P95\n",
+    "    2023-01-15 12:00:00    2023-01-14 06:00:00   0.5           1.2           2.0\n",
+    "\n",
+    "You can list the expected target names and group names by checking the targets.yaml\n",
+    "in your dataset, or by running::\n",
+    "\n",
+    "    runner = create_custom_benchmark_runner(storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH))\n",
+    "    for t in runner.target_provider.get_targets([\"solar_park\"]):\n",
+    "        print(t.group_name, t.name)\n",
+    "\n",
+    "The pipeline still needs a \"forecaster factory\" to know which quantiles were used,\n",
+    "but fit() and predict() are never called. We use DummyForecaster for this.\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "990fa931",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f4f5f17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import multiprocessing\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner\n",
+    "from openstef_beam.backtesting.backtest_forecaster import DummyForecaster\n",
+    "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage\n",
+    "from openstef_core.types import Q\n",
+    "\n",
+    "_logger = logging.getLogger(__name__)\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51fbc379",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "Point at the folder containing your prediction parquets and list the quantiles\n",
+    "they were generated for."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9f94d32",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Path to the folder that contains the backtest/ directory with your parquets.\n",
+    "OUTPUT_PATH = Path(\"./benchmark_results/MyForecasts\")\n",
+    "N_PROCESSES = multiprocessing.cpu_count()\n",
+    "\n",
+    "# Quantiles your forecasts were generated for (must include 0.5 = median).\n",
+    "# Adjust this list to match whatever quantiles are in your parquet columns.\n",
+    "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb9a09e1",
+   "metadata": {},
+   "source": [
+    "## Dummy forecaster factory\n",
+    "\n",
+    "The pipeline still needs a factory to know which quantiles were used, but\n",
+    "`fit()` and `predict()` are never called — backtesting is skipped."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d47f17d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyForecaster:\n",
+    "    \"\"\"Factory that returns a DummyForecaster (backtesting is skipped).\n",
+    "\n",
+    "    DummyForecaster provides quantile info to the pipeline but never runs\n",
+    "    fit() or predict() since backtest output already exists on disk.\n",
+    "\n",
+    "    Returns:\n",
+    "        DummyForecaster with the configured quantiles.\n",
+    "    \"\"\"\n",
+    "    return DummyForecaster(predict_quantiles=PREDICTION_QUANTILES)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "09907e33",
+   "metadata": {},
+   "source": [
+    "## Run evaluation\n",
+    "\n",
+    "The pipeline reads existing parquets and runs evaluation + analysis only."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d61f1e93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    storage = LocalBenchmarkStorage(base_path=OUTPUT_PATH)\n",
+    "\n",
+    "    runner = create_custom_benchmark_runner(storage=storage)\n",
+    "\n",
+    "    runner.run(\n",
+    "        forecaster_factory=stub_factory,\n",
+    "        run_name=\"my_forecasts\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=[\"solar_park\"],\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py b/examples/benchmarks/custom/evaluate_existing_forecasts.py
similarity index 59%
rename from examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py
rename to examples/benchmarks/custom/evaluate_existing_forecasts.py
index c1a4dbaeb..348d1af46 100644
--- a/examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py
+++ b/examples/benchmarks/custom/evaluate_existing_forecasts.py
@@ -1,3 +1,54 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Evaluate Existing Forecasts
+#
+# Skip backtesting entirely — bring your own prediction parquets and run only
+# evaluation + analysis.
+#
+# **User story:** *"I already have forecasts from my own system. I just want to
+# score them with BEAM's metrics and visualizations."*
+#
+# **See also:**
+# - [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — auto-detects existing predictions and skips backtesting
+# - [Custom Benchmark configuration](./custom_benchmark.ipynb) — defines which targets and metrics to use
+# - [Quantile naming convention](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html) — `Quantile(x).format()` → column names
+#
+# ## Expected directory layout
+#
+# ```
+# benchmark_results/MyForecasts/
+# └── backtest/
+#     └── <group_name>/           # e.g. "solar_park"
+#         └── <target_name>/      # e.g. "Within 15 kilometers of Opmeer_normalized"
+#             └── predictions.parquet
+# ```
+#
+# ## Expected parquet format
+#
+# | Column | Type | Description |
+# |--------|------|-------------|
+# | *index* | `DatetimeIndex` (name="timestamp", tz-naive UTC, 15-min) | Forecast timestamp |
+# | `available_at` | datetime | When the prediction was generated |
+# | `quantile_P05` | float | 5th percentile |
+# | `quantile_P50` | float | Median (required) |
+# | `quantile_P95` | float | 95th percentile |
+# | ... | float | One column per quantile via `Quantile(x).format()` |
+
+# %% tags=["remove-cell"]
 """Evaluate pre-existing forecasts without running backtesting.
 
 If you already have forecast predictions (e.g. from your own model or an external
@@ -47,24 +98,36 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 
+import os
+
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["OPENBLAS_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
+
+# %% [markdown]
+# ## Setup
+
+# %%
 import logging
 import multiprocessing
-import os
 from pathlib import Path
 
-from examples.benchmarks.custom_benchmark.example_benchmark import create_custom_benchmark_runner
+from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner
 from openstef_beam.backtesting.backtest_forecaster import DummyForecaster
 from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage
 from openstef_core.types import Q
 
-os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["OPENBLAS_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
-
 _logger = logging.getLogger(__name__)
 
 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
 
+# %% [markdown]
+# ## Configuration
+#
+# Point at the folder containing your prediction parquets and list the quantiles
+# they were generated for.
+
+# %%
 # Path to the folder that contains the backtest/ directory with your parquets.
 OUTPUT_PATH = Path("./benchmark_results/MyForecasts")
 N_PROCESSES = multiprocessing.cpu_count()
@@ -73,6 +136,14 @@
 # Adjust this list to match whatever quantiles are in your parquet columns.
 PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]
 
+# %% [markdown]
+# ## Dummy forecaster factory
+#
+# The pipeline still needs a factory to know which quantiles were used, but
+# `fit()` and `predict()` are never called — backtesting is skipped.
+
+# %%
+
 
 def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyForecaster:
     """Factory that returns a DummyForecaster (backtesting is skipped).
@@ -86,16 +157,17 @@ def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyF
     return DummyForecaster(predict_quantiles=PREDICTION_QUANTILES)
 
 
+# %% [markdown]
+# ## Run evaluation
+#
+# The pipeline reads existing parquets and runs evaluation + analysis only.
+
+# %%
 if __name__ == "__main__":
-    # Point the storage at your results folder.
-    # The pipeline reads parquets from:
-    #   OUTPUT_PATH / backtest / <group_name> / <target_name> / predictions.parquet
     storage = LocalBenchmarkStorage(base_path=OUTPUT_PATH)
 
     runner = create_custom_benchmark_runner(storage=storage)
 
-    # Run the pipeline — backtesting is auto-skipped for every target that
-    # already has a predictions.parquet on disk.
     runner.run(
         forecaster_factory=stub_factory,
         run_name="my_forecasts",
diff --git a/examples/benchmarks/custom/run_custom_benchmark.ipynb b/examples/benchmarks/custom/run_custom_benchmark.ipynb
new file mode 100644
index 000000000..c6e10acac
--- /dev/null
+++ b/examples/benchmarks/custom/run_custom_benchmark.ipynb
@@ -0,0 +1,204 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "f7f9fc26",
+   "metadata": {},
+   "source": [
+    "# Run Custom Benchmark\n",
+    "\n",
+    "Entry point: run your custom forecaster on your own data using the pipeline\n",
+    "configured in [`custom_benchmark.py`](./custom_benchmark.ipynb).\n",
+    "\n",
+    "**See also:**\n",
+    "- [Custom Forecaster template](./custom_forecaster.ipynb) — define your model\n",
+    "- [Custom Benchmark configuration](./custom_benchmark.ipynb) — configure targets and metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "769fc12e",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Run the custom benchmark: example baseline vs OpenSTEF GBLinear.\n",
+    "\n",
+    "Uses the custom benchmark pipeline from custom_benchmark.py (which extends\n",
+    "SimpleTargetProvider) instead of the built-in Liander 2024 runner.\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "# Prevent thread contention when running multiple targets in parallel\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b474c24a",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65322a90",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import logging\n",
+    "import multiprocessing\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from examples.benchmarks.custom.custom_benchmark import MyCategory, create_custom_benchmark_runner\n",
+    "from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster\n",
+    "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8298b0b",
+   "metadata": {},
+   "source": [
+    "## Configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a517fc9",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "OUTPUT_PATH = Path(\"./benchmark_results\")\n",
+    "N_PROCESSES = multiprocessing.cpu_count()\n",
+    "\n",
+    "# Optional: filter to specific target categories (None = run all)\n",
+    "BENCHMARK_FILTER: list[MyCategory] | None = [\"solar_park\"]\n",
+    "\n",
+    "# Quantiles define the probabilistic forecast bands\n",
+    "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n",
+    "\n",
+    "# --- GBLinear config ---\n",
+    "# Map column names in your data to what OpenSTEF expects\n",
+    "gblinear_config = ForecastingWorkflowConfig(\n",
+    "    model_id=\"custom_benchmark_\",\n",
+    "    run_name=None,\n",
+    "    model=\"gblinear\",\n",
+    "    horizons=[LeadTime.from_string(\"P3D\")],\n",
+    "    quantiles=PREDICTION_QUANTILES,\n",
+    "    model_reuse_enable=True,\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    wind_speed_column=\"wind_speed_80m\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    energy_price_column=\"EPEX_NL\",\n",
+    "    rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b965092",
+   "metadata": {},
+   "source": [
+    "## Forecaster factory"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "486c3ec8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "# --- Example baseline factory ---\n",
+    "def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:\n",
+    "    \"\"\"Create an example forecaster for a benchmark target.\n",
+    "\n",
+    "    Returns:\n",
+    "        Configured ExampleBenchmarkForecaster instance.\n",
+    "    \"\"\"\n",
+    "    return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "51c6bd6a",
+   "metadata": {},
+   "source": [
+    "## Run benchmark\n",
+    "\n",
+    "Run the custom baseline and GBLinear on your data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a5f7bbe2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    # 1. Run example baseline using the custom benchmark pipeline\n",
+    "    create_custom_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"ExampleBaseline\"),\n",
+    "    ).run(\n",
+    "        forecaster_factory=example_factory,\n",
+    "        run_name=\"example_baseline\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )\n",
+    "\n",
+    "    # 2. Run GBLinear using the same custom pipeline\n",
+    "    create_custom_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"GBLinear\"),\n",
+    "    ).run(\n",
+    "        forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
+    "            workflow_config=gblinear_config,\n",
+    "            cache_dir=OUTPUT_PATH / \"cache\",\n",
+    "        ),\n",
+    "        run_name=\"gblinear\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/run_custom_benchmark.ipynb.license b/examples/benchmarks/custom/run_custom_benchmark.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/run_custom_benchmark.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/run_benchmark.py b/examples/benchmarks/custom/run_custom_benchmark.py
similarity index 74%
rename from examples/benchmarks/custom_benchmark/run_benchmark.py
rename to examples/benchmarks/custom/run_custom_benchmark.py
index e744cf5b9..2e0aa3739 100644
--- a/examples/benchmarks/custom_benchmark/run_benchmark.py
+++ b/examples/benchmarks/custom/run_custom_benchmark.py
@@ -1,3 +1,29 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Run Custom Benchmark
+#
+# Entry point: run your custom forecaster on your own data using the pipeline
+# configured in [`custom_benchmark.py`](./custom_benchmark.ipynb).
+#
+# **See also:**
+# - [Custom Forecaster template](./custom_forecaster.ipynb) — define your model
+# - [Custom Benchmark configuration](./custom_benchmark.ipynb) — configure targets and metrics
+
+# %% tags=["remove-cell"]
 """Run the custom benchmark: example baseline vs OpenSTEF GBLinear.
 
 Uses the custom benchmark pipeline from example_benchmark.py (which extends
@@ -15,12 +41,17 @@
 os.environ["OPENBLAS_NUM_THREADS"] = "1"
 os.environ["MKL_NUM_THREADS"] = "1"
 
+# %% [markdown]
+# ## Setup
+
+# %%
+
 import logging
 import multiprocessing
 from pathlib import Path
 
-from examples.benchmarks.custom_benchmark.example_baseline import ExampleBenchmarkForecaster
-from examples.benchmarks.custom_benchmark.example_benchmark import MyCategory, create_custom_benchmark_runner
+from examples.benchmarks.custom.custom_benchmark import MyCategory, create_custom_benchmark_runner
+from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster
 from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage
 from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster
 from openstef_core.types import LeadTime, Q
@@ -28,6 +59,10 @@
 
 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
 
+# %% [markdown]
+# ## Configuration
+
+# %%
 OUTPUT_PATH = Path("./benchmark_results")
 N_PROCESSES = multiprocessing.cpu_count()
 
@@ -56,6 +91,12 @@
 )
 
 
+# %% [markdown]
+# ## Forecaster factory
+
+# %%
+
+
 # --- Example baseline factory ---
 def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:
     """Create an example forecaster for a benchmark target.
@@ -66,6 +107,12 @@ def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> Exa
     return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)
 
 
+# %% [markdown]
+# ## Run benchmark
+#
+# Run the custom baseline and GBLinear on your data.
+
+# %%
 if __name__ == "__main__":
     # 1. Run example baseline using the custom benchmark pipeline
     create_custom_benchmark_runner(
diff --git a/examples/benchmarks/custom/run_liander2024_benchmark.ipynb b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb
new file mode 100644
index 000000000..6f7993f85
--- /dev/null
+++ b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb
@@ -0,0 +1,216 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "05ca0933",
+   "metadata": {},
+   "source": [
+    "# Run Liander 2024 Benchmark (Custom Forecaster)\n",
+    "\n",
+    "Entry point: test your custom forecaster on the built-in\n",
+    "[Liander 2024 dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)\n",
+    "(auto-downloaded from HuggingFace).\n",
+    "\n",
+    "Uses [`create_liander2024_benchmark_runner()`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.benchmarks.liander2024.html)\n",
+    "which pre-configures backtest settings, evaluation windows, metrics, and target definitions.\n",
+    "\n",
+    "**See also:** [Custom Forecaster template](./custom_forecaster.ipynb) — define your model here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8e9d860",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "\"\"\"Example: run the built-in Liander 2024 benchmark with a custom baseline and GBLinear.\n",
+    "\n",
+    "Uses create_liander2024_benchmark_runner() which pre-configures everything:\n",
+    "backtest settings, evaluation windows, metrics, analysis plots, and target\n",
+    "definitions. Data is auto-downloaded from HuggingFace.\n",
+    "\"\"\"\n",
+    "\n",
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1aa6b9e9",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "60a8b1f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import logging\n",
+    "import multiprocessing\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster\n",
+    "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage, StrictExecutionCallback\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae1d69fc",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "Define output paths, quantiles, and the GBLinear model config."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e115ded",
+   "metadata": {
+    "lines_to_next_cell": 2
+   },
+   "outputs": [],
+   "source": [
+    "OUTPUT_PATH = Path(\"./benchmark_results\")\n",
+    "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", str(multiprocessing.cpu_count())))\n",
+    "\n",
+    "# Optional: filter to specific target categories (None = run all)\n",
+    "BENCHMARK_FILTER: list[Liander2024Category] | None = None\n",
+    "\n",
+    "# Quantiles define the probabilistic forecast bands\n",
+    "# Q(0.05) = 5th percentile, Q(0.5) = median, Q(0.95) = 95th percentile\n",
+    "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n",
+    "\n",
+    "# --- GBLinear model config ---\n",
+    "# Map column names in your data to what OpenSTEF expects\n",
+    "gblinear_config = ForecastingWorkflowConfig(\n",
+    "    model_id=\"liander_benchmark_\",\n",
+    "    run_name=None,\n",
+    "    model=\"gblinear\",\n",
+    "    horizons=[LeadTime.from_string(\"P3D\")],\n",
+    "    quantiles=PREDICTION_QUANTILES,\n",
+    "    model_reuse_enable=True,\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    wind_speed_column=\"wind_speed_80m\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    energy_price_column=\"EPEX_NL\",\n",
+    "    rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96e99ff0",
+   "metadata": {},
+   "source": [
+    "## Forecaster factory\n",
+    "\n",
+    "The benchmark pipeline calls this function once per target. Return your custom forecaster."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfa35952",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:\n",
+    "    \"\"\"Create the example baseline forecaster.\n",
+    "\n",
+    "    Returns:\n",
+    "        Configured ExampleBenchmarkForecaster instance.\n",
+    "    \"\"\"\n",
+    "    return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9ed22818",
+   "metadata": {},
+   "source": [
+    "## Run benchmark\n",
+    "\n",
+    "Run the custom baseline and GBLinear on all Liander 2024 targets.\n",
+    "Results are saved to `./benchmark_results/<model_name>/`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d62c5aef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    # 1. Run custom baseline on Liander 2024\n",
+    "    # create_liander2024_benchmark_runner() sets up everything: data download, configs, metrics\n",
+    "    # LocalBenchmarkStorage writes results as parquet files to disk\n",
+    "    create_liander2024_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"ExampleBaseline\"),\n",
+    "        callbacks=[StrictExecutionCallback()],  # Fail fast on errors\n",
+    "    ).run(\n",
+    "        forecaster_factory=example_factory,  # Your model factory (called per target)\n",
+    "        run_name=\"example_baseline\",  # Label for this run\n",
+    "        n_processes=N_PROCESSES,  # Parallel targets\n",
+    "        filter_args=BENCHMARK_FILTER,  # None = all categories\n",
+    "    )\n",
+    "\n",
+    "    # 2. Run GBLinear on Liander 2024\n",
+    "    # create_openstef4_preset_backtest_forecaster returns a factory that wraps OpenSTEF models\n",
+    "    create_liander2024_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"GBLinear\"),\n",
+    "        callbacks=[StrictExecutionCallback()],\n",
+    "    ).run(\n",
+    "        forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
+    "            workflow_config=gblinear_config,\n",
+    "            cache_dir=OUTPUT_PATH / \"cache\",\n",
+    "        ),\n",
+    "        run_name=\"gblinear\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py b/examples/benchmarks/custom/run_liander2024_benchmark.py
similarity index 70%
rename from examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py
rename to examples/benchmarks/custom/run_liander2024_benchmark.py
index 0e9918c8d..bc5524d87 100644
--- a/examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py
+++ b/examples/benchmarks/custom/run_liander2024_benchmark.py
@@ -1,3 +1,31 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Run Liander 2024 Benchmark (Custom Forecaster)
+#
+# Entry point: test your custom forecaster on the built-in
+# [Liander 2024 dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)
+# (auto-downloaded from HuggingFace).
+#
+# Uses [`create_liander2024_benchmark_runner()`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.benchmarks.liander2024.html)
+# which pre-configures backtest settings, evaluation windows, metrics, and target definitions.
+#
+# **See also:** [Custom Forecaster template](./custom_forecaster.ipynb) — define your model here.
+
+# %% tags=["remove-cell"]
 """Example: run the built-in Liander 2024 benchmark with a custom baseline and GBLinear.
 
 Uses create_liander2024_benchmark_runner() which pre-configures everything:
@@ -15,11 +43,16 @@
 os.environ["OPENBLAS_NUM_THREADS"] = "1"
 os.environ["MKL_NUM_THREADS"] = "1"
 
+# %% [markdown]
+# ## Setup
+
+# %%
+
 import logging
 import multiprocessing
 from pathlib import Path
 
-from examples.benchmarks.custom_benchmark.example_baseline import ExampleBenchmarkForecaster
+from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster
 from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage, StrictExecutionCallback
 from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster
 from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner
@@ -28,8 +61,14 @@
 
 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
 
+# %% [markdown]
+# ## Configuration
+#
+# Define output paths, quantiles, and the GBLinear model config.
+
+# %%
 OUTPUT_PATH = Path("./benchmark_results")
-N_PROCESSES = multiprocessing.cpu_count()
+N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", str(multiprocessing.cpu_count())))
 
 # Optional: filter to specific target categories (None = run all)
 BENCHMARK_FILTER: list[Liander2024Category] | None = None
@@ -57,6 +96,14 @@
 )
 
 
+# %% [markdown]
+# ## Forecaster factory
+#
+# The benchmark pipeline calls this function once per target. Return your custom forecaster.
+
+# %%
+
+
 def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:
     """Create the example baseline forecaster.
 
@@ -66,6 +113,13 @@ def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> Exa
     return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)
 
 
+# %% [markdown]
+# ## Run benchmark
+#
+# Run the custom baseline and GBLinear on all Liander 2024 targets.
+# Results are saved to `./benchmark_results/<model_name>/`.
+
+# %%
 if __name__ == "__main__":
     # 1. Run custom baseline on Liander 2024
     # create_liander2024_benchmark_runner() sets up everything: data download, configs, metrics
diff --git a/examples/benchmarks/custom_benchmark/README.md b/examples/benchmarks/custom_benchmark/README.md
deleted file mode 100644
index 54db5a672..000000000
--- a/examples/benchmarks/custom_benchmark/README.md
+++ /dev/null
@@ -1,142 +0,0 @@
-<!--
-SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-
-SPDX-License-Identifier: MPL-2.0
--->
-
-# Custom Benchmark Example
-
-End-to-end examples for running and customizing OpenSTEF **BEAM** (Backtesting, Evaluation, Analysis, Metrics) benchmarks.
-
-## What is BEAM?
-
-BEAM replays historical data day by day, trains your model, makes forecasts, and scores them -- all without data leakage. It works with any model that implements the `BacktestForecasterMixin` interface.
-
-## Files
-
-| File | What it does |
-|---|---|
-| `example_baseline.py` | **Start here.** A minimal forecaster that predicts the median of recent history. Shows the `BacktestForecasterMixin` interface (`config`, `quantiles`, `fit`, `predict`). |
-| `example_benchmark.py` | Defines a custom benchmark: target provider (where data lives), metrics, and pipeline assembly. Extends `SimpleTargetProvider` directly -- adapt this when you have your own data layout. |
-| `run_liander2024_benchmark.py` | Runs the example baseline + GBLinear on the built-in **Liander 2024** dataset (auto-downloaded from HuggingFace). Good starting point if you just want to try things out. |
-| `run_benchmark.py` | Same as above but uses the custom benchmark pipeline from `example_benchmark.py`. |
-| `evaluate_existing_forecasts.py` | **Bring your own forecasts.** Points the pipeline at pre-existing prediction parquets and runs only evaluation + analysis (no backtesting). |
-| `compare_liander2024_results.py` | Compare results from multiple runs on the **Liander 2024** dataset. Auto-detects which targets are available in all runs. |
-| `compare_custom_results.py` | Compare results from multiple runs on the **custom** benchmark. Same auto-detection as above. |
-
-## Quick Start
-
-```bash
-# 1. Clone the repo
-git clone git@github.com:OpenSTEF/openstef.git -b "release/v4.0.0"
-cd openstef
-
-# 2. Install all packages (requires uv: https://docs.astral.sh/uv/)
-uv sync --all-extras --all-groups --all-packages
-```
-
-### Run the Liander 2024 benchmark
-
-Uses the built-in Liander 2024 dataset (auto-downloaded from HuggingFace). Runs the example baseline and GBLinear on all target categories.
-
-```bash
-uv run python -m examples.benchmarks.custom_benchmark.run_liander2024_benchmark
-```
-
-### Run the custom benchmark
-
-Uses the custom target provider from `example_benchmark.py` with your own pipeline config. Runs on `solar_park` targets by default.
-
-```bash
-uv run python -m examples.benchmarks.custom_benchmark.run_benchmark
-```
-
-### Evaluate pre-existing forecasts (no backtesting)
-
-If you already have predictions from your own model or external system, you can skip backtesting entirely. Place your forecast parquets in the expected directory layout and run only evaluation + analysis.
-
-#### Required directory layout
-
-```
-benchmark_results/MyForecasts/
-└── backtest/
-    └── <group_name>/                   # e.g. "solar_park"
-        └── <target_name>/              # e.g. "Within 15 kilometers of Opmeer_normalized"
-            └── predictions.parquet
-```
-
-`group_name` and `target_name` must match the values from your targets YAML. You can list them:
-
-```bash
-uv run python -c "
-from examples.benchmarks.custom_benchmark.example_benchmark import create_custom_benchmark_runner
-for t in create_custom_benchmark_runner().target_provider.get_targets(['solar_park']):
-    print(t.group_name, '/', t.name)
-"
-```
-
-#### Required parquet format
-
-Each `predictions.parquet` must have:
-
-| Column | Type | Description |
-|---|---|---|
-| *(index)* `timestamp` | `DatetimeIndex` | When each prediction is valid for. 15-min intervals, tz-naive UTC. |
-| `available_at` | `datetime64` | When the prediction was generated (enables D-1 / lead-time filtering). |
-| `quantile_P05` | `float` | 5th percentile prediction. |
-| `quantile_P50` | `float` | Median prediction (**required**). |
-| `quantile_P95` | `float` | 95th percentile prediction. |
-| ... | `float` | One column per quantile, named with `Quantile(x).format()`. |
-
-Example rows:
-
-```
-timestamp (index)      available_at          quantile_P05  quantile_P50  quantile_P95
-2023-01-15 12:00:00    2023-01-14 06:00:00   0.5           1.2           2.0
-2023-01-15 12:15:00    2023-01-14 06:00:00   0.6           1.3           2.1
-```
-
-#### Run
-
-```bash
-uv run python -m examples.benchmarks.custom_benchmark.evaluate_existing_forecasts
-```
-
-See `evaluate_existing_forecasts.py` for the full script.
-
-Results are written to `./benchmark_results/`. Each model gets its own subfolder with backtest predictions, evaluation scores, and analysis plots.
-
-### Compare results across runs
-
-After running at least two models, generate side-by-side comparison plots (global, per-group, per-target). The scripts automatically detect which targets are available in all runs.
-
-```bash
-# Compare on the Liander 2024 dataset
-uv run python -m examples.benchmarks.custom_benchmark.compare_liander2024_results
-
-# Compare on the custom benchmark
-uv run python -m examples.benchmarks.custom_benchmark.compare_custom_results
-```
-
-Comparison output (HTML plots) is saved to `./benchmark_results_comparison/`.
-
-## Creating Your Own
-
-### 1. Write a forecaster
-
-Copy `example_baseline.py` and implement two methods:
-
-- **`fit(data)`** -- called periodically with recent history. Train your model here.
-- **`predict(data)`** -- called every few hours. Return a `TimeSeriesDataset` with a `"load"` column and one column per quantile (e.g. `"quantile_P05"`, `"quantile_P50"`).
-
-The `data` argument is a `RestrictedHorizonVersionedTimeSeries` -- it enforces no-lookahead by only exposing data available at `data.horizon`. Use `data.get_window(start, end, available_before)` to retrieve slices.
-
-### 2. Define a benchmark (optional)
-
-Copy `example_benchmark.py` if you want to use **your own data**. The key class is `SimpleTargetProvider` -- override `_get_measurements_path_for_target()` and `_get_weather_path_for_target()` to point to your parquet files.
-
-If you're fine with the Liander 2024 dataset, skip this step and use `create_liander2024_benchmark_runner()` directly.
-
-### 3. Write a runner
-
-Copy `run_benchmark.py`. Register your models as forecaster factories and call `pipeline.run()`.
diff --git a/examples/benchmarks/custom_benchmark/compare_custom_results.py b/examples/benchmarks/custom_benchmark/compare_custom_results.py
deleted file mode 100644
index 692ce9b79..000000000
--- a/examples/benchmarks/custom_benchmark/compare_custom_results.py
+++ /dev/null
@@ -1,48 +0,0 @@
-"""Compare benchmark results from different runs on the custom benchmark.
-
-Usage:
-    1. First run at least two models with run_benchmark.py
-       (e.g. ExampleBaseline and GBLinear).
-    2. Then run this script to generate side-by-side comparison plots.
-
-Output is saved to ./benchmark_results_comparison/custom/.
-"""
-
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-#
-# SPDX-License-Identifier: MPL-2.0
-
-from pathlib import Path
-from typing import cast
-
-from examples.benchmarks.custom_benchmark.example_benchmark import ANALYSIS_CONFIG, create_custom_benchmark_runner
-from openstef_beam.analysis.models import RunName
-from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage
-from openstef_beam.benchmarking.storage import BenchmarkStorage
-
-# One storage per run — keys are human-readable labels shown in comparison plots.
-run_storages: dict[RunName, BenchmarkStorage] = {
-    "ExampleBaseline": LocalBenchmarkStorage(base_path=Path("./benchmark_results/ExampleBaseline")),
-    "GBLinear": LocalBenchmarkStorage(base_path=Path("./benchmark_results/GBLinear")),
-}
-
-# Check that results exist.
-for name, storage in run_storages.items():
-    base_path = cast(LocalBenchmarkStorage, storage).base_path
-    if not base_path.exists():
-        msg = f"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first."
-        raise FileNotFoundError(msg)
-
-# Reuse the custom target provider.
-OUTPUT_PATH = Path("./benchmark_results_comparison/custom")
-target_provider = create_custom_benchmark_runner(
-    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
-).target_provider
-
-# Run the comparison — generates global, group, and per-target HTML plots.
-comparison = BenchmarkComparisonPipeline(
-    analysis_config=ANALYSIS_CONFIG,
-    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
-    target_provider=target_provider,
-)
-comparison.run(run_data=run_storages, filter_args=["solar_park"])
diff --git a/examples/benchmarks/liander2024/README.md b/examples/benchmarks/liander2024/README.md
new file mode 100644
index 000000000..29dd36c9f
--- /dev/null
+++ b/examples/benchmarks/liander2024/README.md
@@ -0,0 +1,26 @@
+<!--
+SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
+-->
+
+# Liander 2024
+
+Pre-made benchmarks on the [Liander 2024 STEF benchmark dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)
+— an open dataset of Dutch energy grid measurements (solar, wind, consumption).
+
+**No code changes needed.** Pick a notebook below and run it. Data is
+auto-downloaded from HuggingFace.
+
+```bash
+# Run the XGBoost + GBLinear benchmark
+uv run python -m examples.benchmarks.liander2024.run_xgboost_gblinear_benchmark
+```
+
+```{toctree}
+:maxdepth: 1
+
+XGBoost & GBLinear <run_xgboost_gblinear_benchmark>
+Ensemble Models <run_ensemble_benchmark>
+Compare Results <compare_benchmark_runs>
+```
diff --git a/examples/examples/.gitignore b/examples/benchmarks/liander2024/__init__.py
similarity index 77%
rename from examples/examples/.gitignore
rename to examples/benchmarks/liander2024/__init__.py
index 39116e399..54f9ce2f2 100644
--- a/examples/examples/.gitignore
+++ b/examples/benchmarks/liander2024/__init__.py
@@ -1,5 +1,4 @@
 # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
-
-mlflow_tracking*
+"""Liander 2024 benchmark scripts."""
diff --git a/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb
new file mode 100644
index 000000000..4cc99e83e
--- /dev/null
+++ b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb
@@ -0,0 +1,149 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0d864501",
+   "metadata": {},
+   "source": [
+    "# Compare Benchmark Runs\n",
+    "\n",
+    "Generate side-by-side comparison plots from multiple benchmark runs on the\n",
+    "Liander 2024 dataset.\n",
+    "\n",
+    "**Prerequisites:** Run at least two models first (e.g. XGBoost + GBLinear via\n",
+    "the *XGBoost & GBLinear* notebook).\n",
+    "\n",
+    "**What this does:**\n",
+    "\n",
+    "1. Loads results from multiple model runs (each stored in its own directory)\n",
+    "2. Computes metrics across all targets using\n",
+    "   [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)\n",
+    "3. Produces comparison visualizations (boxplots, ranking tables, per-target breakdowns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8fd216c5",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ffb9eb1c",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Point at the result directories from your benchmark runs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "960f4f2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from openstef_beam.analysis.models import RunName\n",
+    "from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage\n",
+    "from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n",
+    "from openstef_beam.benchmarking.storage import BenchmarkStorage\n",
+    "\n",
+    "BASE_DIR = Path()\n",
+    "\n",
+    "OUTPUT_PATH = BASE_DIR / \"./benchmark_results_comparison\"\n",
+    "\n",
+    "BENCHMARK_DIR_GBLINEAR = BASE_DIR / \"benchmark_results\" / \"GBLinear\"\n",
+    "BENCHMARK_DIR_XGBOOST = BASE_DIR / \"benchmark_results\" / \"XGBoost\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0188f84f",
+   "metadata": {},
+   "source": [
+    "## Load run results\n",
+    "\n",
+    "Each run is identified by a name and backed by a `LocalBenchmarkStorage` that\n",
+    "points at the directory where that model's results were saved."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4dc75226",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_dirs = [\n",
+    "    BENCHMARK_DIR_GBLINEAR,\n",
+    "    BENCHMARK_DIR_XGBOOST,\n",
+    "]\n",
+    "for dir_path in check_dirs:\n",
+    "    if not dir_path.exists():\n",
+    "        msg = f\"Benchmark directory not found: {dir_path}. Make sure to run the benchmarks first.\"\n",
+    "        raise FileNotFoundError(msg)\n",
+    "\n",
+    "run_storages: dict[RunName, BenchmarkStorage] = {\n",
+    "    \"gblinear\": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_GBLINEAR),\n",
+    "    \"xgboost\": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_XGBOOST),\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a58f6240",
+   "metadata": {},
+   "source": [
+    "## Run comparison\n",
+    "\n",
+    "The pipeline loads predictions from each run, re-evaluates them with the\n",
+    "Liander 2024 analysis config, and produces comparison visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e3bdb97d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_provider = create_liander2024_benchmark_runner(\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    ").target_provider\n",
+    "\n",
+    "comparison_pipeline = BenchmarkComparisonPipeline(\n",
+    "    analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n",
+    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
+    "    target_provider=target_provider,\n",
+    ")\n",
+    "comparison_pipeline.run(run_data=run_storages)"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/liander_2024_compare_results.py b/examples/benchmarks/liander2024/compare_benchmark_runs.py
similarity index 52%
rename from examples/benchmarks/liander_2024_compare_results.py
rename to examples/benchmarks/liander2024/compare_benchmark_runs.py
index de57191fa..ffc988b99 100644
--- a/examples/benchmarks/liander_2024_compare_results.py
+++ b/examples/benchmarks/liander2024/compare_benchmark_runs.py
@@ -1,8 +1,45 @@
-"""Example for comparing benchmark results from different runs on the Liander 2024 dataset."""
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Compare Benchmark Runs
+#
+# Generate side-by-side comparison plots from multiple benchmark runs on the
+# Liander 2024 dataset.
+#
+# **Prerequisites:** Run at least two models first (e.g. XGBoost + GBLinear via
+# the *XGBoost & GBLinear* notebook).
+#
+# **What this does:**
+#
+# 1. Loads results from multiple model runs (each stored in its own directory)
+# 2. Computes metrics across all targets using
+#    [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)
+# 3. Produces comparison visualizations (boxplots, ranking tables, per-target breakdowns)
+
+# %% tags=["remove-cell"]
 # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
 
+# %% [markdown]
+# ## Setup
+#
+# Point at the result directories from your benchmark runs.
+
+# %%
 from pathlib import Path
 
 from openstef_beam.analysis.models import RunName
@@ -18,6 +55,13 @@
 BENCHMARK_DIR_GBLINEAR = BASE_DIR / "benchmark_results" / "GBLinear"
 BENCHMARK_DIR_XGBOOST = BASE_DIR / "benchmark_results" / "XGBoost"
 
+# %% [markdown]
+# ## Load run results
+#
+# Each run is identified by a name and backed by a `LocalBenchmarkStorage` that
+# points at the directory where that model's results were saved.
+
+# %%
 check_dirs = [
     BENCHMARK_DIR_GBLINEAR,
     BENCHMARK_DIR_XGBOOST,
@@ -32,6 +76,13 @@
     "xgboost": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_XGBOOST),
 }
 
+# %% [markdown]
+# ## Run comparison
+#
+# The pipeline loads predictions from each run, re-evaluates them with the
+# Liander 2024 analysis config, and produces comparison visualizations.
+
+# %%
 target_provider = create_liander2024_benchmark_runner(
     storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
 ).target_provider
diff --git a/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb
new file mode 100644
index 000000000..b444ed759
--- /dev/null
+++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb
@@ -0,0 +1,266 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "3b5c7558",
+   "metadata": {},
+   "source": [
+    "# Ensemble Model Benchmark\n",
+    "\n",
+    "Run an ensemble of multiple base models (e.g. LightGBM + GBLinear) with a learned\n",
+    "weight combiner on the\n",
+    "[Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark).\n",
+    "\n",
+    "**What this does:**\n",
+    "\n",
+    "1. Downloads the Liander 2024 dataset from HuggingFace (automatic)\n",
+    "2. Trains multiple base models and a combiner that learns optimal weights\n",
+    "3. Produces probabilistic forecasts (7 quantiles) for a 36-hour horizon\n",
+    "4. Saves results locally for comparison\n",
+    "\n",
+    "**No code changes needed.** To benchmark your own model, see\n",
+    "[Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).\n",
+    "\n",
+    "```{admonition} Ensemble types\n",
+    "Change `ensemble_type` below to try different strategies:\n",
+    "- `\"learned_weights\"` — a combiner model learns per-quantile weights\n",
+    "- `\"stacking\"` — base model outputs become features for a meta-model\n",
+    "- `\"rules\"` — fixed rule-based combination\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36c13dbf",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "import os\n",
+    "import time\n",
+    "\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "314567bf",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fb6db81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "from datetime import timedelta\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import (\n",
+    "    create_openstef4_preset_backtest_forecaster,\n",
+    ")\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n",
+    "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n",
+    "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_meta.presets import (\n",
+    "    EnsembleForecastingWorkflowConfig,\n",
+    ")\n",
+    "from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage\n",
+    "from openstef_models.transforms.general import SampleWeightConfig\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39e49d57",
+   "metadata": {},
+   "source": [
+    "## Ensemble configuration\n",
+    "\n",
+    "Choose which base models to combine and how. The `ensemble_type` controls the\n",
+    "combination strategy; `base_models` lists which individual models to train."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "999c6c43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "OUTPUT_PATH = Path(\"./benchmark_results\")\n",
+    "\n",
+    "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", \"1\"))\n",
+    "\n",
+    "ensemble_type = \"learned_weights\"  # \"stacking\", \"learned_weights\" or \"rules\"\n",
+    "base_models = [\"lgbm\", \"gblinear\"]  # combination of \"lgbm\", \"gblinear\", \"xgboost\" and \"lgbm_linear\"\n",
+    "combiner_model = \"lgbm\"  # \"lgbm\", \"xgboost\", \"rf\" or \"logistic\" for learned weights; \"gblinear\" for stacking\n",
+    "\n",
+    "model = \"Ensemble_\" + \"_\".join(base_models) + \"_\" + ensemble_type + \"_\" + combiner_model\n",
+    "\n",
+    "# Forecast 36 hours ahead, producing 7 quantile bands\n",
+    "FORECAST_HORIZONS = [LeadTime.from_string(\"PT36H\")]\n",
+    "PREDICTION_QUANTILES = [\n",
+    "    Q(0.05),\n",
+    "    Q(0.1),\n",
+    "    Q(0.3),\n",
+    "    Q(0.5),\n",
+    "    Q(0.7),\n",
+    "    Q(0.9),\n",
+    "    Q(0.95),\n",
+    "]\n",
+    "\n",
+    "# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])\n",
+    "BENCHMARK_FILTER: list[Liander2024Category] | None = None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3e76cd86",
+   "metadata": {},
+   "source": [
+    "## Workflow configuration\n",
+    "\n",
+    "`EnsembleForecastingWorkflowConfig` extends the standard config with ensemble-specific\n",
+    "settings: which base models to use, the combiner strategy, and per-model sample weights."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b63cee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "USE_MLFLOW_STORAGE = os.environ.get(\"OPENSTEF_MLFLOW_STORAGE\", \"true\").lower() == \"true\"\n",
+    "\n",
+    "if USE_MLFLOW_STORAGE:\n",
+    "    storage = MLFlowStorage(\n",
+    "        tracking_uri=str(OUTPUT_PATH / \"mlflow_artifacts\"),\n",
+    "        local_artifacts_path=OUTPUT_PATH / \"mlflow_tracking_artifacts\",\n",
+    "    )\n",
+    "else:\n",
+    "    storage = None\n",
+    "\n",
+    "workflow_config = EnsembleForecastingWorkflowConfig(\n",
+    "    model_id=\"common_model_\",\n",
+    "    ensemble_type=ensemble_type,\n",
+    "    base_models=base_models,  # type: ignore\n",
+    "    combiner_model=combiner_model,\n",
+    "    horizons=FORECAST_HORIZONS,\n",
+    "    quantiles=PREDICTION_QUANTILES,\n",
+    "    model_reuse_enable=False,\n",
+    "    mlflow_storage=None,\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n",
+    "    wind_speed_column=\"wind_speed_80m\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    energy_price_column=\"EPEX_NL\",\n",
+    "    forecaster_sample_weights={\n",
+    "        \"gblinear\": SampleWeightConfig(method=\"exponential\", weight_exponent=1.0),\n",
+    "        \"lgbm\": SampleWeightConfig(weight_exponent=0.0),\n",
+    "        \"xgboost\": SampleWeightConfig(weight_exponent=0.0),\n",
+    "        \"lgbm_linear\": SampleWeightConfig(weight_exponent=0.0),\n",
+    "    },\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53399c01",
+   "metadata": {},
+   "source": [
+    "## Backtest schedule\n",
+    "\n",
+    "The `BacktestForecasterConfig` controls how BEAM schedules training and prediction\n",
+    "windows. Ensemble models typically need more context than single models."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ccc6dcd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "backtest_config = BacktestForecasterConfig(\n",
+    "    requires_training=True,\n",
+    "    predict_length=timedelta(days=7),\n",
+    "    predict_min_length=timedelta(minutes=15),\n",
+    "    predict_context_length=timedelta(days=14),  # Context needed for lag features\n",
+    "    predict_context_min_coverage=0.5,\n",
+    "    training_context_length=timedelta(days=90),  # Three months of training data\n",
+    "    training_context_min_coverage=0.5,\n",
+    "    predict_sample_interval=timedelta(minutes=15),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "479036b7",
+   "metadata": {},
+   "source": [
+    "## Run the benchmark"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f02a3ac1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    start_time = time.time()\n",
+    "    create_liander2024_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / model),\n",
+    "        data_dir=None,  # Path(\"../data/liander2024-energy-forecasting-benchmark\"),\n",
+    "        callbacks=[StrictExecutionCallback()],\n",
+    "    ).run(\n",
+    "        forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
+    "            workflow_config=workflow_config,\n",
+    "            cache_dir=OUTPUT_PATH / \"cache\",\n",
+    "        ),\n",
+    "        run_name=model,\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )\n",
+    "\n",
+    "    end_time = time.time()\n",
+    "    print(f\"Benchmark completed in {end_time - start_time:.2f} seconds.\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/liander_2024_ensemble.py b/examples/benchmarks/liander2024/run_ensemble_benchmark.py
similarity index 61%
rename from examples/benchmarks/liander_2024_ensemble.py
rename to examples/benchmarks/liander2024/run_ensemble_benchmark.py
index 5760d35e6..b4f5dca83 100644
--- a/examples/benchmarks/liander_2024_ensemble.py
+++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.py
@@ -1,11 +1,43 @@
-"""Liander 2024 Benchmark Example.
-
-====================================
-
-This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM.
-The benchmark will evaluate XGBoost and GBLinear models on the dataset from HuggingFace.
-"""
-
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Ensemble Model Benchmark
+#
+# Run an ensemble of multiple base models (e.g. LightGBM + GBLinear) with a learned
+# weight combiner on the
+# [Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark).
+#
+# **What this does:**
+#
+# 1. Downloads the Liander 2024 dataset from HuggingFace (automatic)
+# 2. Trains multiple base models and a combiner that learns optimal weights
+# 3. Produces probabilistic forecasts (7 quantiles) for a 36-hour horizon
+# 4. Saves results locally for comparison
+#
+# **No code changes needed.** To benchmark your own model, see
+# [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).
+#
+# ```{admonition} Ensemble types
+# Change `ensemble_type` below to try different strategies:
+# - `"learned_weights"` — a combiner model learns per-quantile weights
+# - `"stacking"` — base model outputs become features for a meta-model
+# - `"rules"` — fixed rule-based combination
+# ```
+
+# %% tags=["remove-cell"]
 # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
 #
 # SPDX-License-Identifier: MPL-2.0
@@ -13,12 +45,15 @@
 import os
 import time
 
-os.environ["OMP_NUM_THREADS"] = "1"  # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost
+os.environ["OMP_NUM_THREADS"] = "1"
 os.environ["OPENBLAS_NUM_THREADS"] = "1"
 os.environ["MKL_NUM_THREADS"] = "1"
 
+# %% [markdown]
+# ## Setup
+
+# %%
 import logging
-import multiprocessing
 from datetime import timedelta
 from pathlib import Path
 
@@ -38,20 +73,25 @@
 
 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
 
+# %% [markdown]
+# ## Ensemble configuration
+#
+# Choose which base models to combine and how. The `ensemble_type` controls the
+# combination strategy; `base_models` lists which individual models to train.
+
+# %%
 OUTPUT_PATH = Path("./benchmark_results")
 
-N_PROCESSES = 1 if True else multiprocessing.cpu_count()  # Amount of parallel processes to use for the benchmark
+N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", "1"))
 
 ensemble_type = "learned_weights"  # "stacking", "learned_weights" or "rules"
 base_models = ["lgbm", "gblinear"]  # combination of "lgbm", "gblinear", "xgboost" and "lgbm_linear"
-combiner_model = (
-    "lgbm"  # "lgbm", "xgboost", "rf" or "logistic" for learned weights combiner, gblinear for stacking combiner
-)
+combiner_model = "lgbm"  # "lgbm", "xgboost", "rf" or "logistic" for learned weights; "gblinear" for stacking
 
 model = "Ensemble_" + "_".join(base_models) + "_" + ensemble_type + "_" + combiner_model
 
-# Model configuration
-FORECAST_HORIZONS = [LeadTime.from_string("PT36H")]  # Forecast horizon(s)
+# Forecast 36 hours ahead, producing 7 quantile bands
+FORECAST_HORIZONS = [LeadTime.from_string("PT36H")]
 PREDICTION_QUANTILES = [
     Q(0.05),
     Q(0.1),
@@ -60,11 +100,19 @@
     Q(0.7),
     Q(0.9),
     Q(0.95),
-]  # Quantiles for probabilistic forecasts
+]
 
+# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])
 BENCHMARK_FILTER: list[Liander2024Category] | None = None
 
-USE_MLFLOW_STORAGE = True
+# %% [markdown]
+# ## Workflow configuration
+#
+# `EnsembleForecastingWorkflowConfig` extends the standard config with ensemble-specific
+# settings: which base models to use, the combiner strategy, and per-model sample weights.
+
+# %%
+USE_MLFLOW_STORAGE = os.environ.get("OPENSTEF_MLFLOW_STORAGE", "true").lower() == "true"
 
 if USE_MLFLOW_STORAGE:
     storage = MLFlowStorage(
@@ -98,8 +146,13 @@
     },
 )
 
+# %% [markdown]
+# ## Backtest schedule
+#
+# The `BacktestForecasterConfig` controls how BEAM schedules training and prediction
+# windows. Ensemble models typically need more context than single models.
 
-# Create the backtest configuration
+# %%
 backtest_config = BacktestForecasterConfig(
     requires_training=True,
     predict_length=timedelta(days=7),
@@ -111,7 +164,10 @@
     predict_sample_interval=timedelta(minutes=15),
 )
 
+# %% [markdown]
+# ## Run the benchmark
 
+# %%
 if __name__ == "__main__":
     start_time = time.time()
     create_liander2024_benchmark_runner(
diff --git a/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb
new file mode 100644
index 000000000..afc1d760c
--- /dev/null
+++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b0bb1eb8",
+   "metadata": {},
+   "source": [
+    "# XGBoost & GBLinear Benchmark\n",
+    "\n",
+    "Run two models head-to-head on the\n",
+    "[Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)\n",
+    "— an open dataset of Dutch energy grid measurements.\n",
+    "\n",
+    "**What this does:**\n",
+    "\n",
+    "1. Downloads the Liander 2024 dataset from HuggingFace (automatic)\n",
+    "2. Trains XGBoost and GBLinear on each target using day-by-day backtesting\n",
+    "3. Produces probabilistic forecasts (7 quantiles) for a 3-day horizon\n",
+    "4. Saves results locally for comparison (see *Compare Results* notebook)\n",
+    "\n",
+    "**No code changes needed** — just run it. To benchmark your own model instead,\n",
+    "see [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).\n",
+    "\n",
+    "```{admonition} Runtime\n",
+    "Expect 30-60 min on a laptop (uses all CPU cores by default).\n",
+    "Set `OPENSTEF_N_PROCESSES=1` for easier debugging.\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94b2f1c4",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a5a1eddb",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "Import BEAM components and configure logging."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52bea3bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import multiprocessing\n",
+    "from pathlib import Path\n",
+    "\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import (\n",
+    "    create_openstef4_preset_backtest_forecaster,\n",
+    ")\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n",
+    "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n",
+    "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage\n",
+    "from openstef_models.presets import (\n",
+    "    ForecastingWorkflowConfig,\n",
+    ")\n",
+    "\n",
+    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b29d41c",
+   "metadata": {},
+   "source": [
+    "## Configuration\n",
+    "\n",
+    "Define output paths, forecast horizons, and quantiles.\n",
+    "Each model is benchmarked in turn, with the dataset's targets processed in parallel (up to `N_PROCESSES` processes)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d255dbae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "OUTPUT_PATH = Path(\"./benchmark_results\")\n",
+    "\n",
+    "BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / \"XGBoost\"\n",
+    "BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / \"GBLinear\"\n",
+    "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", str(multiprocessing.cpu_count())))\n",
+    "\n",
+    "# Forecast 3 days ahead, producing 7 quantile bands\n",
+    "FORECAST_HORIZONS = [LeadTime.from_string(\"P3D\")]\n",
+    "PREDICTION_QUANTILES = [\n",
+    "    Q(0.05),\n",
+    "    Q(0.1),\n",
+    "    Q(0.3),\n",
+    "    Q(0.5),\n",
+    "    Q(0.7),\n",
+    "    Q(0.9),\n",
+    "    Q(0.95),\n",
+    "]\n",
+    "\n",
+    "# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])\n",
+    "BENCHMARK_FILTER: list[Liander2024Category] | None = None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37ccec9a",
+   "metadata": {},
+   "source": [
+    "## Model configuration\n",
+    "\n",
+    "`ForecastingWorkflowConfig` defines how OpenSTEF trains and predicts.\n",
+    "We create a shared base config and derive model-specific variants with `model_copy()`.\n",
+    "\n",
+    "Set `OPENSTEF_MLFLOW_STORAGE=true` to log experiment artifacts to MLflow."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfcdcb6d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "USE_MLFLOW_STORAGE = os.environ.get(\"OPENSTEF_MLFLOW_STORAGE\", \"false\").lower() == \"true\"\n",
+    "\n",
+    "if USE_MLFLOW_STORAGE:\n",
+    "    storage = MLFlowStorage(\n",
+    "        tracking_uri=str(OUTPUT_PATH / \"mlflow_artifacts\"),\n",
+    "        local_artifacts_path=OUTPUT_PATH / \"mlflow_tracking_artifacts\",\n",
+    "    )\n",
+    "else:\n",
+    "    storage = None\n",
+    "\n",
+    "common_config = ForecastingWorkflowConfig(\n",
+    "    model_id=\"common_model_\",\n",
+    "    run_name=None,\n",
+    "    model=\"flatliner\",\n",
+    "    horizons=FORECAST_HORIZONS,\n",
+    "    quantiles=PREDICTION_QUANTILES,\n",
+    "    model_reuse_enable=True,\n",
+    "    mlflow_storage=storage,\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n",
+    "    wind_speed_column=\"wind_speed_80m\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    energy_price_column=\"EPEX_NL\",\n",
+    ")\n",
+    "\n",
+    "xgboost_config = common_config.model_copy(update={\"model\": \"xgboost\"})\n",
+    "gblinear_config = common_config.model_copy(update={\"model\": \"gblinear\"})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6203242",
+   "metadata": {},
+   "source": [
+    "## Run the benchmark\n",
+    "\n",
+    "Each model gets its own output directory. `StrictExecutionCallback` raises on\n",
+    "any target failure (remove it to skip failing targets silently)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23069166",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    # --- XGBoost ---\n",
+    "    create_liander2024_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST),\n",
+    "        callbacks=[StrictExecutionCallback()],\n",
+    "    ).run(\n",
+    "        forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
+    "            workflow_config=xgboost_config,\n",
+    "            cache_dir=OUTPUT_PATH / \"cache\",\n",
+    "        ),\n",
+    "        run_name=\"xgboost\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )\n",
+    "\n",
+    "    # --- GBLinear ---\n",
+    "    create_liander2024_benchmark_runner(\n",
+    "        storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR),\n",
+    "        callbacks=[StrictExecutionCallback()],\n",
+    "    ).run(\n",
+    "        forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
+    "            workflow_config=gblinear_config,\n",
+    "            cache_dir=OUTPUT_PATH / \"cache\",\n",
+    "        ),\n",
+    "        run_name=\"gblinear\",\n",
+    "        n_processes=N_PROCESSES,\n",
+    "        filter_args=BENCHMARK_FILTER,\n",
+    "    )"
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license
new file mode 100644
index 000000000..a42c86064
--- /dev/null
+++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license
@@ -0,0 +1,3 @@
+SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+
+SPDX-License-Identifier: MPL-2.0
diff --git a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py
similarity index 57%
rename from examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py
rename to examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py
index 9ff296c5d..bcc7379ea 100644
--- a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py
+++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py
@@ -1,21 +1,57 @@
-"""Liander 2024 Benchmark Example.
-
-====================================
-
-This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM.
-The benchmark will evaluate XGBoost and GBLinear models on the dataset from HuggingFace.
-"""
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # XGBoost & GBLinear Benchmark
+#
+# Run two models head-to-head on the
+# [Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)
+# — an open dataset of Dutch energy grid measurements.
+#
+# **What this does:**
+#
+# 1. Downloads the Liander 2024 dataset from HuggingFace (automatic)
+# 2. Trains XGBoost and GBLinear on each target using day-by-day backtesting
+# 3. Produces probabilistic forecasts (7 quantiles) for a 3-day horizon
+# 4. Saves results locally for comparison (see *Compare Results* notebook)
+#
+# **No code changes needed** — just run it. To benchmark your own model instead,
+# see [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).
+#
+# ```{admonition} Runtime
+# Expect 30-60 min on a laptop (uses all CPU cores).
+# Set `OPENSTEF_N_PROCESSES=1` for easier debugging.
+# ```
 
+# %% tags=["remove-cell"]
 # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
 #
 # SPDX-License-Identifier: MPL-2.0
 
 import os
 
-os.environ["OMP_NUM_THREADS"] = "1"  # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost
+os.environ["OMP_NUM_THREADS"] = "1"
 os.environ["OPENBLAS_NUM_THREADS"] = "1"
 os.environ["MKL_NUM_THREADS"] = "1"
 
+# %% [markdown]
+# ## Setup
+#
+# Import BEAM components and configure logging.
+
+# %%
 import logging
 import multiprocessing
 from pathlib import Path
@@ -34,14 +70,21 @@
 
 logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
 
+# %% [markdown]
+# ## Configuration
+#
+# Define output paths, forecast horizons, and quantiles.
+# Models run one after another; each run processes the dataset's targets in parallel.
+
+# %%
 OUTPUT_PATH = Path("./benchmark_results")
 
 BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / "XGBoost"
 BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / "GBLinear"
-N_PROCESSES = multiprocessing.cpu_count()  # Amount of parallel processes to use for the benchmark
+N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", str(multiprocessing.cpu_count())))
 
-# Model configuration
-FORECAST_HORIZONS = [LeadTime.from_string("P3D")]  # Forecast horizon(s)
+# Forecast 3 days ahead, producing 7 quantiles (5th to 95th percentile)
+FORECAST_HORIZONS = [LeadTime.from_string("P3D")]
 PREDICTION_QUANTILES = [
     Q(0.05),
     Q(0.1),
@@ -50,11 +93,21 @@
     Q(0.7),
     Q(0.9),
     Q(0.95),
-]  # Quantiles for probabilistic forecasts
+]
 
+# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])
 BENCHMARK_FILTER: list[Liander2024Category] | None = None
 
-USE_MLFLOW_STORAGE = False
+# %% [markdown]
+# ## Model configuration
+#
+# `ForecastingWorkflowConfig` defines how OpenSTEF trains and predicts.
+# We create a shared base config and derive model-specific variants with `model_copy()`.
+#
+# Set `OPENSTEF_MLFLOW_STORAGE=true` to log experiment artifacts to MLflow.
+
+# %%
+USE_MLFLOW_STORAGE = os.environ.get("OPENSTEF_MLFLOW_STORAGE", "false").lower() == "true"
 
 if USE_MLFLOW_STORAGE:
     storage = MLFlowStorage(
@@ -82,11 +135,17 @@
 )
 
 xgboost_config = common_config.model_copy(update={"model": "xgboost"})
-
 gblinear_config = common_config.model_copy(update={"model": "gblinear"})
 
+# %% [markdown]
+# ## Run the benchmark
+#
+# Each model gets its own output directory. `StrictExecutionCallback` raises on
+# any target failure (remove it to skip failing targets silently).
+
+# %%
 if __name__ == "__main__":
-    # Run for XGBoost model
+    # --- XGBoost ---
     create_liander2024_benchmark_runner(
         storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST),
         callbacks=[StrictExecutionCallback()],
@@ -100,7 +159,7 @@
         filter_args=BENCHMARK_FILTER,
     )
 
-    # # Run for GBLinear model
+    # --- GBLinear ---
     create_liander2024_benchmark_runner(
         storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR),
         callbacks=[StrictExecutionCallback()],
diff --git a/examples/deployment/.gitkeep b/examples/deployment/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/examples/configuring_model_pipeline_example.py b/examples/examples/configuring_model_pipeline_example.py
deleted file mode 100644
index dc37fdf16..000000000
--- a/examples/examples/configuring_model_pipeline_example.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""Configuring Model Pipeline Example.
-
-====================================
-
-This example demonstrates how to configure and use a complete forecasting pipeline
-in OpenSTEF. It shows how to:
-
-1. Create synthetic time series data for demonstration
-2. Configure a full forecasting model with preprocessing and postprocessing
-3. Set up model storage for persistence
-4. Use the workflow pattern for training and prediction
-
-The example uses a ConstantMedianForecaster with feature engineering including
-holiday features, lag transforms, and data scaling. This represents a typical
-OpenSTEF forecasting setup that can be adapted for real-world use cases.
-
-Key Components:
-    - VersionedTimeSeriesDataset: Time series data structure
-    - ForecastingModel: Complete forecasting pipeline
-    - FeaturePipeline: Preprocessing with holidays and lags
-    - LocalModelStorage: File-based model persistence
-    - CustomForecastingWorkflow: High-level orchestration
-
-This example is useful for understanding how to integrate all OpenSTEF components
-into a working forecasting system.
-"""
-
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-#
-# SPDX-License-Identifier: MPL-2.0
-
-import logging
-from datetime import timedelta
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from pydantic_extra_types.country import CountryAlpha2
-
-from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
-from openstef_core.datasets import ForecastDataset, TimeSeriesDataset
-from openstef_core.mixins import TransformPipeline
-from openstef_core.types import LeadTime, Q
-from openstef_models.integrations.mlflow import MLFlowStorageCallback
-from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage
-from openstef_models.models.forecasting.gblinear_forecaster import (
-    GBLinearForecaster,
-    GBLinearHyperParams,
-)
-from openstef_models.models.forecasting_model import ForecastingModel
-from openstef_models.transforms.general import Scaler
-from openstef_models.transforms.time_domain import HolidayFeatureAdder
-from openstef_models.utils.feature_selection import FeatureSelection
-from openstef_models.workflows import CustomForecastingWorkflow
-
-logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
-logger = logging.getLogger(__name__)
-
-workspace_dir = Path(__file__).parent.resolve()
-
-# Create synthetic time series data
-n_samples = 24 * 31 * 3  # 3 months of hourly data
-rng = np.random.default_rng(42)
-temp = rng.standard_normal(size=n_samples)
-wind = rng.standard_normal(size=n_samples)
-radiation = rng.standard_normal(size=n_samples)
-timestamps = pd.date_range("2025-01-01", periods=n_samples, freq="h")
-
-dataset = TimeSeriesDataset(
-    data=pd.DataFrame(
-        {
-            "load": wind * -10 + temp * -3 + radiation * -5 + rng.standard_normal(size=n_samples) * 2,
-            "temp": temp,
-            "wind": wind,
-            "radiation": radiation,
-        },
-        index=timestamps,
-    ),
-    sample_interval=timedelta(hours=1),
-)
-
-model = ForecastingModel(
-    preprocessing=TransformPipeline(
-        transforms=[
-            Scaler(method="standard", selection=FeatureSelection(include={"temp", "wind", "radiation"})),
-            HolidayFeatureAdder(country_code=CountryAlpha2("NL")),
-        ],
-    ),
-    forecaster=GBLinearForecaster(
-        horizons=[LeadTime.from_string("PT36H")],
-        quantiles=[Q(0.5), Q(0.1), Q(0.9)],
-        hyperparams=GBLinearHyperParams(
-            n_steps=1000,
-            learning_rate=0.3,
-        ),
-        verbosity=True,
-    ),
-    target_column="load",
-    tags={
-        "model": "gblinear",
-        "version": "1.0.0",
-    },
-)
-
-pipeline = CustomForecastingWorkflow(
-    model_id="gblinear_forecaster_v1",
-    model=model,
-    callbacks=[
-        MLFlowStorageCallback(
-            storage=MLFlowStorage(
-                tracking_uri=str(workspace_dir / "mlflow_tracking"),
-                local_artifacts_path=workspace_dir / "mlflow_tracking_artifacts",
-            ),
-            model_reuse_enable=False,
-        )
-    ],
-)
-
-logger.info("Starting model training")
-result = pipeline.fit(dataset)
-if result is not None:
-    logger.info("Full eval result:\n%s", result.metrics_full.to_dataframe())
-
-    if result.metrics_test is not None:
-        logger.info("Test result:\n%s", result.metrics_test.to_dataframe())
-
-logger.info("Starting forecasting")
-forecast: ForecastDataset = pipeline.predict(dataset)
-
-print(forecast.data.tail())
-
-
-logger.info("Storing forecast plot to forecast_plot.html")
-fig = (
-    ForecastTimeSeriesPlotter()
-    .add_measurements(measurements=dataset.select_version().data["load"])
-    .add_model(model_name="gblinear", forecast=forecast.median_series, quantiles=forecast.quantiles_data)
-    .plot()
-)
-
-fig.write_html("forecast_plot.html")  # pyright: ignore[reportUnknownMemberType]
diff --git a/examples/examples/forecasting_preset_example.py b/examples/examples/forecasting_preset_example.py
deleted file mode 100644
index 480527252..000000000
--- a/examples/examples/forecasting_preset_example.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""Configuring Model Pipeline Example.
-
-====================================
-
-This example demonstrates how to configure and use a complete forecasting pipeline
-in OpenSTEF. It shows how to:
-
-1. Create synthetic time series data for demonstration
-2. Configure a full forecasting model with preprocessing and postprocessing
-3. Set up model storage for persistence
-4. Use the workflow pattern for training and prediction
-
-The example uses a ConstantMedianForecaster with feature engineering including
-holiday features, lag transforms, and data scaling. This represents a typical
-OpenSTEF forecasting setup that can be adapted for real-world use cases.
-
-Key Components:
-    - VersionedTimeSeriesDataset: Time series data structure
-    - ForecastingModel: Complete forecasting pipeline
-    - FeaturePipeline: Preprocessing with holidays and lags
-    - LocalModelStorage: File-based model persistence
-    - CustomForecastingWorkflow: High-level orchestration
-
-This example is useful for understanding how to integrate all OpenSTEF components
-into a working forecasting system.
-"""
-
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-#
-# SPDX-License-Identifier: MPL-2.0
-
-import logging
-from datetime import timedelta
-from pathlib import Path
-
-from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
-from openstef_core.datasets import ForecastDataset
-from openstef_core.testing import create_synthetic_forecasting_dataset
-from openstef_core.types import LeadTime, Q
-from openstef_models.integrations.mlflow import MLFlowStorage
-from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
-
-logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
-logger = logging.getLogger(__name__)
-
-workspace_dir = Path(__file__).parent.resolve()
-
-# Create synthetic time series data
-dataset = create_synthetic_forecasting_dataset(
-    length=timedelta(days=90),
-    wind_influence=-10.0,
-    temp_influence=5.0,
-    radiation_influence=-7.0,
-    stochastic_influence=2.0,
-    sample_interval=timedelta(hours=1),
-)
-
-workflow = create_forecasting_workflow(
-    config=ForecastingWorkflowConfig(
-        model_id="gblinear_forecaster_v1",
-        model="gblinear",
-        horizons=[LeadTime.from_string("PT36H")],
-        quantiles=[Q(0.5), Q(0.1), Q(0.9)],
-        mlflow_storage=MLFlowStorage(
-            tracking_uri=str(workspace_dir / "mlflow_tracking"),
-            local_artifacts_path=workspace_dir / "mlflow_tracking_artifacts",
-        ),
-    )
-)
-
-logger.info("Starting model training")
-result = workflow.fit(dataset)
-if result is not None:
-    logger.info("Full eval result:\n%s", result.metrics_full.to_dataframe())
-
-    if result.metrics_test is not None:
-        logger.info("Test result:\n%s", result.metrics_test.to_dataframe())
-
-logger.info("Starting forecasting")
-forecast: ForecastDataset = workflow.predict(dataset)
-
-print(forecast.data.tail())
-
-# Plot the result
-logger.info("Storing forecast plot to forecast_plot.html")
-fig = (
-    ForecastTimeSeriesPlotter()
-    .add_measurements(measurements=dataset.select_version().data["load"])
-    .add_model(model_name="gblinear", forecast=forecast.median_series, quantiles=forecast.quantiles_data)
-    .plot()
-)
-
-fig.write_html("forecast_plot.html")  # pyright: ignore[reportUnknownMemberType]
diff --git a/examples/examples/isotonic_calibration_example.py b/examples/examples/isotonic_calibration_example.py
deleted file mode 100644
index 791581ba6..000000000
--- a/examples/examples/isotonic_calibration_example.py
+++ /dev/null
@@ -1,170 +0,0 @@
-"""Isotonic Quantile Calibration Example.
-
-=======================================
-
-This example demonstrates how to use isotonic quantile calibration to improve
-the reliability of probabilistic forecasts. It shows:
-
-1. Training a forecasting model with isotonic calibration as postprocessing
-2. Visualizing calibration quality (expected vs observed coverage)
-
-Isotonic calibration ensures that predicted quantiles match observed quantile
-levels, improving the reliability of uncertainty estimates.
-"""
-
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-#
-# SPDX-License-Identifier: MPL-2.0
-
-from datetime import timedelta
-
-import numpy as np
-import pandas as pd
-import plotly.graph_objects as go
-
-from openstef_core.datasets import ForecastDataset, TimeSeriesDataset
-from openstef_core.mixins import TransformPipeline
-from openstef_core.types import LeadTime, Q
-from openstef_models.models.forecasting.gblinear_forecaster import (
-    GBLinearForecaster,
-    GBLinearHyperParams,
-)
-from openstef_models.models.forecasting_model import ForecastingModel
-from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator
-from openstef_models.workflows import CustomForecastingWorkflow
-
-# Step 1: Create synthetic time series data
-n_samples = 24 * 31 * 3  # 3 months of hourly data
-rng = np.random.default_rng(42)
-timestamps = pd.date_range("2025-01-01", periods=n_samples, freq="h")
-
-dataset = TimeSeriesDataset(
-    data=pd.DataFrame(
-        {
-            "load": rng.standard_normal(size=n_samples) * 10 + 50,
-            "feature": rng.standard_normal(size=n_samples),
-        },
-        index=timestamps,
-    ),
-    sample_interval=timedelta(hours=1),
-)
-
-# Step 2: Configure model without calibration (for comparison)
-model_uncalibrated = ForecastingModel(
-    forecaster=GBLinearForecaster(
-        horizons=[LeadTime.from_string("PT1H")],
-        quantiles=[Q(0.1), Q(0.5), Q(0.9)],
-        hyperparams=GBLinearHyperParams(n_steps=100),
-        verbosity=0,
-    ),
-    target_column="load",
-)
-
-pipeline_uncalibrated = CustomForecastingWorkflow(model_id="uncalibrated_forecaster", model=model_uncalibrated)
-pipeline_uncalibrated.fit(dataset)
-forecast_uncalibrated = pipeline_uncalibrated.predict(dataset)
-
-# Step 3: Configure model with windowed isotonic quantile calibration
-model_calibrated = ForecastingModel(
-    forecaster=GBLinearForecaster(
-        horizons=[LeadTime.from_string("PT1H")],
-        quantiles=[Q(0.1), Q(0.5), Q(0.9)],
-        hyperparams=GBLinearHyperParams(n_steps=100),
-        verbosity=0,
-    ),
-    postprocessing=TransformPipeline(
-        transforms=[
-            IsotonicQuantileCalibrator(
-                quantiles=[Q(0.1), Q(0.5), Q(0.9)],
-                use_local_quantile_estimation=True,  # Enable windowed approach
-                # window_size uses adaptive sizing by default: max(_MIN_WINDOW_SIZE, n_samples // 10)
-            ),
-        ],
-    ),
-    target_column="load",
-)
-
-# Step 4: Train and predict with calibration
-pipeline_calibrated = CustomForecastingWorkflow(model_id="calibrated_forecaster", model=model_calibrated)
-pipeline_calibrated.fit(dataset)
-forecast_calibrated = pipeline_calibrated.predict(dataset)
-
-
-# Step 5: Visualize calibration quality (before and after)
-def plot_calibration_comparison(
-    forecast_before: ForecastDataset, forecast_after: ForecastDataset, actuals: pd.Series
-) -> go.Figure:
-    """Plot expected vs observed quantile coverage before and after calibration.
-
-    Returns:
-        A Plotly figure showing the calibration comparison.
-    """
-
-    def calculate_coverage(forecast: ForecastDataset) -> list[float]:
-        common_index = forecast.data.index.intersection(actuals.index)
-        forecast_aligned = forecast.data.loc[common_index]
-        actuals_aligned = actuals.loc[common_index]
-        return [
-            (actuals_aligned <= forecast_aligned["quantile_P10"]).mean(),
-            (actuals_aligned <= forecast_aligned["quantile_P50"]).mean(),
-            (actuals_aligned <= forecast_aligned["quantile_P90"]).mean(),
-        ]
-
-    expected = [0.1, 0.5, 0.9]
-    observed_before = calculate_coverage(forecast_before)
-    observed_after = calculate_coverage(forecast_after)
-
-    fig = go.Figure()
-
-    # expected == observed line
-    fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
-        go.Scatter(
-            x=[0, 1],
-            y=[0, 1],
-            mode="lines",
-            name="expected equals observed",
-            line={"color": "gray", "dash": "dash", "width": 2},
-        )
-    )
-
-    # Before calibration
-    fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
-        go.Scatter(
-            x=expected,
-            y=observed_before,
-            mode="markers+lines",
-            name="before isotonic calibration",
-            marker={"size": 12, "color": "red", "symbol": "x"},
-            line={"color": "red", "width": 2, "dash": "dot"},
-        )
-    )
-
-    # After calibration
-    fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
-        go.Scatter(
-            x=expected,
-            y=observed_after,
-            mode="markers+lines",
-            name="after isotonic calibration",
-            marker={"size": 12, "color": "blue"},
-            line={"color": "blue", "width": 2},
-        )
-    )
-
-    fig.update_layout(  # pyright: ignore[reportUnknownMemberType]
-        title="Isontonic quantile calibration",
-        xaxis_title="expected quantile",
-        yaxis_title="observed quantile",
-        xaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]},
-        yaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]},
-        width=600,
-        height=600,
-    )
-    return fig
-
-
-calibration_fig = plot_calibration_comparison(
-    forecast_uncalibrated, forecast_calibrated, dataset.select_version().data["load"]
-)
-calibration_fig.write_html("calibration_plot.html")  # pyright: ignore[reportUnknownMemberType]
-print("Calibration plot saved to calibration_plot.html")
diff --git a/examples/tutorials/backtesting_openstef_with_beam.ipynb b/examples/tutorials/backtesting_openstef_with_beam.ipynb
deleted file mode 100644
index 3b48e1738..000000000
--- a/examples/tutorials/backtesting_openstef_with_beam.ipynb
+++ /dev/null
@@ -1,465 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "caf13084",
-   "metadata": {},
-   "source": [
-    "# 📊 Backtesting OpenSTEF Models with OpenSTEF-BEAM\n",
-    "\n",
-    "This tutorial demonstrates how to use **OpenSTEF-BEAM** (Backtesting, Evaluation, Analysis, Metrics) to systematically evaluate forecasting models. You'll learn how to:\n",
-    "\n",
-    "1. **Configure benchmark experiments** with multiple model types\n",
-    "2. **Run parallel backtests** across dozens of energy assets\n",
-    "3. **Compare model performance** with standardized metrics\n",
-    "4. **Generate analysis reports** with interactive visualizations\n",
-    "\n",
-    "> **BEAM** provides a rigorous framework for model evaluation, ensuring fair comparisons and reproducible results."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "329ce2a3",
-   "metadata": {},
-   "source": [
-    "## 🔧 Environment Setup\n",
-    "\n",
-    "First, we configure thread settings to prevent conflicts with XGBoost's internal parallelization when running multiple processes."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24d53eb6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# --- Thread Configuration ---\n",
-    "# Prevent thread contention when running parallel backtests with XGBoost\n",
-    "import os\n",
-    "\n",
-    "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n",
-    "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n",
-    "os.environ[\"MKL_NUM_THREADS\"] = \"1\"\n",
-    "\n",
-    "# --- Standard Imports ---\n",
-    "import logging\n",
-    "import multiprocessing\n",
-    "from pathlib import Path\n",
-    "\n",
-    "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0a2d9aed",
-   "metadata": {},
-   "source": [
-    "## ⚙️ Benchmark Configuration\n",
-    "\n",
-    "Configure the benchmark parameters:\n",
-    "- **Output paths** — where to store results for each model\n",
-    "- **Forecast horizons** — how far ahead to predict (using ISO 8601 duration format)\n",
-    "- **Quantiles** — prediction intervals for probabilistic evaluation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "99c03b80",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import types for configuration\n",
-    "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category\n",
-    "from openstef_core.types import LeadTime, Q  # LeadTime: forecast horizon, Q: quantile\n",
-    "\n",
-    "# --- Output Paths ---\n",
-    "OUTPUT_PATH = Path(\"./benchmark_results\")\n",
-    "BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / \"XGBoost\"\n",
-    "BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / \"GBLinear\"\n",
-    "\n",
-    "# --- Parallelization ---\n",
-    "N_PROCESSES = multiprocessing.cpu_count()  # Use all available CPU cores\n",
-    "print(f\"🖥️  Running with {N_PROCESSES} parallel processes\")\n",
-    "\n",
-    "# --- Forecast Configuration ---\n",
-    "FORECAST_HORIZONS = [LeadTime.from_string(\"P3D\")]  # 3-day ahead forecast (ISO 8601: P3D)\n",
-    "\n",
-    "# Quantiles for probabilistic forecasting (7 quantiles covering 5th to 95th percentile)\n",
-    "PREDICTION_QUANTILES = [\n",
-    "    Q(0.05),\n",
-    "    Q(0.1),\n",
-    "    Q(0.3),  # Lower quantiles\n",
-    "    Q(0.5),  # Median\n",
-    "    Q(0.7),\n",
-    "    Q(0.9),\n",
-    "    Q(0.95),  # Upper quantiles\n",
-    "]\n",
-    "\n",
-    "# --- Benchmark Filter (optional) ---\n",
-    "# Set to None to run all categories, or specify categories like:\n",
-    "# BENCHMARK_FILTER = [Liander2024Category.TRANSFORMER, Liander2024Category.MV_FEEDER]\n",
-    "BENCHMARK_FILTER: list[Liander2024Category] | None = None"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a3618966",
-   "metadata": {},
-   "source": [
-    "## 🛠️ Model Configuration\n",
-    "\n",
-    "We define a **common configuration** that both models share, then create model-specific variants. This ensures fair comparison by keeping all settings identical except the model type.\n",
-    "\n",
-    "### Available Models:\n",
-    "- **XGBoost** — Gradient boosting trees (handles complex nonlinear patterns)\n",
-    "- **GBLinear** — Gradient boosted linear model (better extrapolation, faster)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a39b756",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import workflow configuration\n",
-    "from openstef_models.presets import ForecastingWorkflowConfig\n",
-    "\n",
-    "# Common configuration shared by all models\n",
-    "# This ensures fair comparison by keeping all settings identical\n",
-    "common_config = ForecastingWorkflowConfig(\n",
-    "    model_id=\"benchmark_model_\",\n",
-    "    run_name=None,\n",
-    "    model=\"flatliner\",  # Placeholder - will be overwritten per model\n",
-    "    # Forecast settings\n",
-    "    horizons=FORECAST_HORIZONS,\n",
-    "    quantiles=PREDICTION_QUANTILES,\n",
-    "    # Model reuse: reuse trained model for same target (speeds up backtesting)\n",
-    "    model_reuse_enable=True,\n",
-    "    mlflow_storage=None,  # Disable MLflow for this demo\n",
-    "    # Weather feature column mappings (match dataset column names)\n",
-    "    radiation_column=\"shortwave_radiation\",\n",
-    "    wind_speed_column=\"wind_speed_80m\",  # 80m wind speed for better wind park predictions\n",
-    "    pressure_column=\"surface_pressure\",\n",
-    "    temperature_column=\"temperature_2m\",\n",
-    "    relative_humidity_column=\"relative_humidity_2m\",\n",
-    "    # Additional features\n",
-    "    energy_price_column=\"EPEX_NL\",  # Day-ahead electricity price\n",
-    "    rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],  # Rolling window stats\n",
-    "    # Logging\n",
-    "    verbosity=0,  # Quiet mode for batch processing\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ed202922",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create model-specific configurations by copying common config and updating model type\n",
-    "xgboost_config = common_config.model_copy(update={\"model\": \"xgboost\"})\n",
-    "gblinear_config = common_config.model_copy(update={\"model\": \"gblinear\"})\n",
-    "\n",
-    "print(\"✅ Model configurations created:\")\n",
-    "print(f\"   - XGBoost: {xgboost_config.model}\")\n",
-    "print(f\"   - GBLinear: {gblinear_config.model}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "4425a740",
-   "metadata": {},
-   "source": [
-    "## 💾 Storage Configuration\n",
-    "\n",
-    "**LocalBenchmarkStorage** manages the file structure for benchmark results:\n",
-    "```\n",
-    "benchmark_results/\n",
-    "├── XGBoost/\n",
-    "│   ├── backtest/      # Raw predictions\n",
-    "│   ├── evaluation/    # Metrics per target\n",
-    "│   └── analysis/      # Visualizations (HTML)\n",
-    "└── GBLinear/\n",
-    "    └── ...\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c2e44656",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Initialize storage backends for each model\n",
-    "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n",
-    "\n",
-    "storage_xgboost = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST)\n",
-    "storage_gblinear = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR)\n",
-    "\n",
-    "print(f\"📁 XGBoost results: {BENCHMARK_RESULTS_PATH_XGBOOST}\")\n",
-    "print(f\"📁 GBLinear results: {BENCHMARK_RESULTS_PATH_GBLINEAR}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "41e6b2e3",
-   "metadata": {},
-   "source": [
-    "## 🚀 Run Backtests\n",
-    "\n",
-    "Now we run the **Liander 2024 Benchmark** — a comprehensive evaluation suite that:\n",
-    "1. Downloads the benchmark dataset from HuggingFace Hub (if needed)\n",
-    "2. Runs backtests across 5 asset categories (transformers, feeders, solar/wind parks)\n",
-    "3. Computes metrics and generates analysis visualizations\n",
-    "\n",
-    "⚠️ **Note**: This may take several minutes depending on your hardware."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d6aae871",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import benchmark components\n",
-    "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n",
-    "from openstef_beam.benchmarking.benchmarks.liander2024 import create_liander2024_benchmark_runner\n",
-    "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n",
-    "\n",
-    "# --- Run XGBoost Benchmark ---\n",
-    "print(\"🌲 Running XGBoost benchmark...\")\n",
-    "create_liander2024_benchmark_runner(\n",
-    "    storage=storage_xgboost,\n",
-    "    callbacks=[StrictExecutionCallback()],  # Fail fast on errors\n",
-    ").run(\n",
-    "    forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
-    "        workflow_config=xgboost_config,\n",
-    "    ),\n",
-    "    run_name=\"xgboost\",\n",
-    "    n_processes=N_PROCESSES,\n",
-    "    filter_args=BENCHMARK_FILTER,\n",
-    ")\n",
-    "print(\"✅ XGBoost benchmark complete!\")\n",
-    "\n",
-    "# --- Run GBLinear Benchmark ---\n",
-    "print(\"\\n📈 Running GBLinear benchmark...\")\n",
-    "create_liander2024_benchmark_runner(\n",
-    "    storage=storage_gblinear,\n",
-    "    callbacks=[StrictExecutionCallback()],\n",
-    ").run(\n",
-    "    forecaster_factory=create_openstef4_preset_backtest_forecaster(\n",
-    "        workflow_config=gblinear_config,\n",
-    "    ),\n",
-    "    run_name=\"gblinear\",\n",
-    "    n_processes=N_PROCESSES,\n",
-    "    filter_args=BENCHMARK_FILTER,\n",
-    ")\n",
-    "print(\"✅ GBLinear benchmark complete!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "d1690a07",
-   "metadata": {},
-   "source": [
-    "## 📊 Compare Model Performance\n",
-    "\n",
-    "The **BenchmarkComparisonPipeline** generates side-by-side analysis of multiple models:\n",
-    "- Global metrics across all targets\n",
-    "- Per-category breakdowns (transformers, feeders, etc.)\n",
-    "- Time-windowed performance analysis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0a6bdfcf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Run model comparison analysis\n",
-    "from openstef_beam.benchmarking import BenchmarkComparisonPipeline\n",
-    "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n",
-    "\n",
-    "# Create comparison pipeline\n",
-    "target_provider = create_liander2024_benchmark_runner(\n",
-    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
-    ").target_provider\n",
-    "\n",
-    "comparison_pipeline = BenchmarkComparisonPipeline(\n",
-    "    analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n",
-    "    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n",
-    "    target_provider=target_provider,\n",
-    ")\n",
-    "\n",
-    "# Generate comparison reports\n",
-    "print(\"📊 Generating comparison analysis...\")\n",
-    "comparison_pipeline.run(\n",
-    "    run_data={\n",
-    "        \"xgboost\": storage_xgboost,\n",
-    "        \"gblinear\": storage_gblinear,\n",
-    "    }\n",
-    ")\n",
-    "print(\"✅ Comparison analysis complete!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c22c61f4",
-   "metadata": {},
-   "source": [
-    "## 📈 View Analysis Results\n",
-    "\n",
-    "The benchmark generates interactive HTML visualizations. Let's open the most important ones:\n",
-    "\n",
-    "### Key Metrics:\n",
-    "- **rCRPS** (relative Continuous Ranked Probability Score) — measures probabilistic forecast accuracy\n",
-    "- **rMAE** (relative Mean Absolute Error) — measures point forecast accuracy\n",
-    "- Lower values = better performance"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "af09be7e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Open key analysis plots in browser\n",
-    "# HTML visualizations are interactive and best viewed in a browser\n",
-    "import os\n",
-    "import webbrowser\n",
-    "\n",
-    "# Base path for analysis results\n",
-    "analysis_base = os.path.abspath(\"./benchmark_results/analysis/D-1T06:00\")\n",
-    "\n",
-    "# Define key visualizations to open\n",
-    "visualizations = [\n",
-    "    (\"rCRPS Grouped by Category\", \"rCRPS_grouped.html\"),\n",
-    "    (\"rCRPS Time-Windowed (7 days)\", \"rCRPS_windowed_7D.html\"),\n",
-    "]\n",
-    "\n",
-    "print(\"🌐 Opening analysis visualizations in browser...\\n\")\n",
-    "for name, filename in visualizations:\n",
-    "    filepath = os.path.join(analysis_base, filename)\n",
-    "    if Path(filepath).exists():\n",
-    "        print(f\"   📊 {name}\")\n",
-    "        webbrowser.open(f\"file://{filepath}\")\n",
-    "    else:\n",
-    "        print(f\"   ⚠️  {name} not found at {filepath}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "59e8d779",
-   "metadata": {},
-   "source": [
-    "### 🔍 Explore Individual Target Results\n",
-    "\n",
-    "You can also view time series plots for individual targets. Let's look at a transformer forecast:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea2fd469",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# List available target-specific visualizations\n",
-    "import glob\n",
-    "\n",
-    "# Find all time series plots for individual targets\n",
-    "target_plots = glob.glob(\"./benchmark_results/XGBoost/analysis/*/*/time_series_plot*.html\")\n",
-    "\n",
-    "if target_plots:\n",
-    "    print(\"📊 Available target-specific time series plots:\\n\")\n",
-    "    for i, plot in enumerate(sorted(target_plots)[:5]):  # Show first 5\n",
-    "        parts = plot.split(\"/\")\n",
-    "        category = parts[-3]  # e.g., \"transformer\"\n",
-    "        target = parts[-2]  # e.g., \"OS Apeldoorn\"\n",
-    "        print(f\"   {i + 1}. {category}/{target}\")\n",
-    "\n",
-    "    # Open the first transformer plot as an example\n",
-    "    transformer_plots = [p for p in target_plots if \"transformer\" in p]\n",
-    "    if transformer_plots:\n",
-    "        example_plot = os.path.abspath(transformer_plots[0])\n",
-    "        print(f\"\\n🌐 Opening example: {transformer_plots[0]}\")\n",
-    "        webbrowser.open(f\"file://{example_plot}\")\n",
-    "else:\n",
-    "    print(\"⚠️  No target-specific plots found. Run the benchmark first.\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e41df479",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🎯 Summary\n",
-    "\n",
-    "In this tutorial, you learned how to:\n",
-    "\n",
-    "1. ✅ **Configure benchmark experiments** with `ForecastingWorkflowConfig`\n",
-    "2. ✅ **Run parallel backtests** using the Liander 2024 benchmark\n",
-    "3. ✅ **Compare models** (XGBoost vs GBLinear) with `BenchmarkComparisonPipeline`\n",
-    "4. ✅ **Analyze results** with interactive HTML visualizations\n",
-    "\n",
-    "### 📁 Output Structure\n",
-    "\n",
-    "```\n",
-    "benchmark_results/\n",
-    "├── XGBoost/\n",
-    "│   ├── backtest/       # Raw predictions (parquet)\n",
-    "│   ├── evaluation/     # Metrics per target\n",
-    "│   └── analysis/       # HTML visualizations\n",
-    "├── GBLinear/\n",
-    "│   └── ...\n",
-    "└── analysis/           # Comparison analysis (both models)\n",
-    "    └── D-1T06:00/\n",
-    "        ├── rCRPS_grouped.html      # Probabilistic accuracy by category\n",
-    "        ├── rMAE_grouped.html       # Point forecast accuracy\n",
-    "        └── summary.html            # Overall summary\n",
-    "```\n",
-    "\n",
-    "### 🚀 Next Steps\n",
-    "\n",
-    "- Experiment with different `FORECAST_HORIZONS` (e.g., `\"PT6H\"`, `\"P7D\"`)\n",
-    "- Add more quantiles for higher resolution prediction intervals\n",
-    "- Filter specific categories with `BENCHMARK_FILTER`\n",
-    "- Integrate MLflow for experiment tracking"
-   ]
-  }
- ],
- "metadata": {
-  "jupytext": {
-   "formats": "ipynb,py:percent"
-  },
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.13.5"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/tutorials/backtesting_openstef_with_beam.py b/examples/tutorials/backtesting_openstef_with_beam.py
deleted file mode 100644
index 0c30580ec..000000000
--- a/examples/tutorials/backtesting_openstef_with_beam.py
+++ /dev/null
@@ -1,338 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.19.1
-#   kernelspec:
-#     display_name: .venv
-#     language: python
-#     name: python3
-# ---
-
-# %% [markdown]
-# # 📊 Backtesting OpenSTEF Models with OpenSTEF-BEAM
-#
-# This tutorial demonstrates how to use **OpenSTEF-BEAM** (Backtesting, Evaluation, Analysis, Metrics) to systematically evaluate forecasting models. You'll learn how to:
-#
-# 1. **Configure benchmark experiments** with multiple model types
-# 2. **Run parallel backtests** across dozens of energy assets
-# 3. **Compare model performance** with standardized metrics
-# 4. **Generate analysis reports** with interactive visualizations
-#
-# > **BEAM** provides a rigorous framework for model evaluation, ensuring fair comparisons and reproducible results.
-
-# %% [markdown]
-# ## 🔧 Environment Setup
-#
-# First, we configure thread settings to prevent conflicts with XGBoost's internal parallelization when running multiple processes.
-
-# %%
-# --- Thread Configuration ---
-# Prevent thread contention when running parallel backtests with XGBoost
-import os
-
-os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["OPENBLAS_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
-
-# --- Standard Imports ---
-import logging
-import multiprocessing
-from pathlib import Path
-
-logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
-
-# %% [markdown]
-# ## ⚙️ Benchmark Configuration
-#
-# Configure the benchmark parameters:
-# - **Output paths** — where to store results for each model
-# - **Forecast horizons** — how far ahead to predict (using ISO 8601 duration format)
-# - **Quantiles** — prediction intervals for probabilistic evaluation
-
-# %%
-# Import types for configuration
-from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category
-from openstef_core.types import LeadTime, Q  # LeadTime: forecast horizon, Q: quantile
-
-# --- Output Paths ---
-OUTPUT_PATH = Path("./benchmark_results")
-BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / "XGBoost"
-BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / "GBLinear"
-
-# --- Parallelization ---
-N_PROCESSES = multiprocessing.cpu_count()  # Use all available CPU cores
-print(f"🖥️  Running with {N_PROCESSES} parallel processes")
-
-# --- Forecast Configuration ---
-FORECAST_HORIZONS = [LeadTime.from_string("P3D")]  # 3-day ahead forecast (ISO 8601: P3D)
-
-# Quantiles for probabilistic forecasting (7 quantiles covering 5th to 95th percentile)
-PREDICTION_QUANTILES = [
-    Q(0.05),
-    Q(0.1),
-    Q(0.3),  # Lower quantiles
-    Q(0.5),  # Median
-    Q(0.7),
-    Q(0.9),
-    Q(0.95),  # Upper quantiles
-]
-
-# --- Benchmark Filter (optional) ---
-# Set to None to run all categories, or specify categories like:
-# BENCHMARK_FILTER = [Liander2024Category.TRANSFORMER, Liander2024Category.MV_FEEDER]
-BENCHMARK_FILTER: list[Liander2024Category] | None = None
-
-# %% [markdown]
-# ## 🛠️ Model Configuration
-#
-# We define a **common configuration** that both models share, then create model-specific variants. This ensures fair comparison by keeping all settings identical except the model type.
-#
-# ### Available Models:
-# - **XGBoost** — Gradient boosting trees (handles complex nonlinear patterns)
-# - **GBLinear** — Gradient boosted linear model (better extrapolation, faster)
-
-# %%
-# Import workflow configuration
-from openstef_models.presets import ForecastingWorkflowConfig
-
-# Common configuration shared by all models
-# This ensures fair comparison by keeping all settings identical
-common_config = ForecastingWorkflowConfig(
-    model_id="benchmark_model_",
-    run_name=None,
-    model="flatliner",  # Placeholder - will be overwritten per model
-    # Forecast settings
-    horizons=FORECAST_HORIZONS,
-    quantiles=PREDICTION_QUANTILES,
-    # Model reuse: reuse trained model for same target (speeds up backtesting)
-    model_reuse_enable=True,
-    mlflow_storage=None,  # Disable MLflow for this demo
-    # Weather feature column mappings (match dataset column names)
-    radiation_column="shortwave_radiation",
-    wind_speed_column="wind_speed_80m",  # 80m wind speed for better wind park predictions
-    pressure_column="surface_pressure",
-    temperature_column="temperature_2m",
-    relative_humidity_column="relative_humidity_2m",
-    # Additional features
-    energy_price_column="EPEX_NL",  # Day-ahead electricity price
-    rolling_aggregate_features=["mean", "median", "max", "min"],  # Rolling window stats
-    # Logging
-    verbosity=0,  # Quiet mode for batch processing
-)
-
-# %%
-# Create model-specific configurations by copying common config and updating model type
-xgboost_config = common_config.model_copy(update={"model": "xgboost"})
-gblinear_config = common_config.model_copy(update={"model": "gblinear"})
-
-print("✅ Model configurations created:")
-print(f"   - XGBoost: {xgboost_config.model}")
-print(f"   - GBLinear: {gblinear_config.model}")
-
-# %% [markdown]
-# ## 💾 Storage Configuration
-#
-# **LocalBenchmarkStorage** manages the file structure for benchmark results:
-# ```
-# benchmark_results/
-# ├── XGBoost/
-# │   ├── backtest/      # Raw predictions
-# │   ├── evaluation/    # Metrics per target
-# │   └── analysis/      # Visualizations (HTML)
-# └── GBLinear/
-#     └── ...
-# ```
-
-# %%
-# Initialize storage backends for each model
-from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage
-
-storage_xgboost = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST)
-storage_gblinear = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR)
-
-print(f"📁 XGBoost results: {BENCHMARK_RESULTS_PATH_XGBOOST}")
-print(f"📁 GBLinear results: {BENCHMARK_RESULTS_PATH_GBLINEAR}")
-
-# %% [markdown]
-# ## 🚀 Run Backtests
-#
-# Now we run the **Liander 2024 Benchmark** — a comprehensive evaluation suite that:
-# 1. Downloads the benchmark dataset from HuggingFace Hub (if needed)
-# 2. Runs backtests across 5 asset categories (transformers, feeders, solar/wind parks)
-# 3. Computes metrics and generates analysis visualizations
-#
-# ⚠️ **Note**: This may take several minutes depending on your hardware.
-
-# %%
-# Import benchmark components
-from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster
-from openstef_beam.benchmarking.benchmarks.liander2024 import create_liander2024_benchmark_runner
-from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback
-
-# --- Run XGBoost Benchmark ---
-print("🌲 Running XGBoost benchmark...")
-create_liander2024_benchmark_runner(
-    storage=storage_xgboost,
-    callbacks=[StrictExecutionCallback()],  # Fail fast on errors
-).run(
-    forecaster_factory=create_openstef4_preset_backtest_forecaster(
-        workflow_config=xgboost_config,
-    ),
-    run_name="xgboost",
-    n_processes=N_PROCESSES,
-    filter_args=BENCHMARK_FILTER,
-)
-print("✅ XGBoost benchmark complete!")
-
-# --- Run GBLinear Benchmark ---
-print("\n📈 Running GBLinear benchmark...")
-create_liander2024_benchmark_runner(
-    storage=storage_gblinear,
-    callbacks=[StrictExecutionCallback()],
-).run(
-    forecaster_factory=create_openstef4_preset_backtest_forecaster(
-        workflow_config=gblinear_config,
-    ),
-    run_name="gblinear",
-    n_processes=N_PROCESSES,
-    filter_args=BENCHMARK_FILTER,
-)
-print("✅ GBLinear benchmark complete!")
-
-# %% [markdown]
-# ## 📊 Compare Model Performance
-#
-# The **BenchmarkComparisonPipeline** generates side-by-side analysis of multiple models:
-# - Global metrics across all targets
-# - Per-category breakdowns (transformers, feeders, etc.)
-# - Time-windowed performance analysis
-
-# %%
-# Run model comparison analysis
-from openstef_beam.benchmarking import BenchmarkComparisonPipeline
-from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG
-
-# Create comparison pipeline
-target_provider = create_liander2024_benchmark_runner(
-    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
-).target_provider
-
-comparison_pipeline = BenchmarkComparisonPipeline(
-    analysis_config=LIANDER2024_ANALYSIS_CONFIG,
-    storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),
-    target_provider=target_provider,
-)
-
-# Generate comparison reports
-print("📊 Generating comparison analysis...")
-comparison_pipeline.run(
-    run_data={
-        "xgboost": storage_xgboost,
-        "gblinear": storage_gblinear,
-    }
-)
-print("✅ Comparison analysis complete!")
-
-# %% [markdown]
-# ## 📈 View Analysis Results
-#
-# The benchmark generates interactive HTML visualizations. Let's open the most important ones:
-#
-# ### Key Metrics:
-# - **rCRPS** (relative Continuous Ranked Probability Score) — measures probabilistic forecast accuracy
-# - **rMAE** (relative Mean Absolute Error) — measures point forecast accuracy
-# - Lower values = better performance
-
-# %%
-# Open key analysis plots in browser
-# HTML visualizations are interactive and best viewed in a browser
-import os
-import webbrowser
-
-# Base path for analysis results
-analysis_base = os.path.abspath("./benchmark_results/analysis/D-1T06:00")
-
-# Define key visualizations to open
-visualizations = [
-    ("rCRPS Grouped by Category", "rCRPS_grouped.html"),
-    ("rCRPS Time-Windowed (7 days)", "rCRPS_windowed_7D.html"),
-]
-
-print("🌐 Opening analysis visualizations in browser...\n")
-for name, filename in visualizations:
-    filepath = os.path.join(analysis_base, filename)
-    if Path(filepath).exists():
-        print(f"   📊 {name}")
-        webbrowser.open(f"file://{filepath}")
-    else:
-        print(f"   ⚠️  {name} not found at {filepath}")
-
-# %% [markdown]
-# ### 🔍 Explore Individual Target Results
-#
-# You can also view time series plots for individual targets. Let's look at a transformer forecast:
-
-# %%
-# List available target-specific visualizations
-import glob
-
-# Find all time series plots for individual targets
-target_plots = glob.glob("./benchmark_results/XGBoost/analysis/*/*/time_series_plot*.html")
-
-if target_plots:
-    print("📊 Available target-specific time series plots:\n")
-    for i, plot in enumerate(sorted(target_plots)[:5]):  # Show first 5
-        parts = plot.split("/")
-        category = parts[-3]  # e.g., "transformer"
-        target = parts[-2]  # e.g., "OS Apeldoorn"
-        print(f"   {i + 1}. {category}/{target}")
-
-    # Open the first transformer plot as an example
-    transformer_plots = [p for p in target_plots if "transformer" in p]
-    if transformer_plots:
-        example_plot = os.path.abspath(transformer_plots[0])
-        print(f"\n🌐 Opening example: {transformer_plots[0]}")
-        webbrowser.open(f"file://{example_plot}")
-else:
-    print("⚠️  No target-specific plots found. Run the benchmark first.")
-
-# %% [markdown]
-# ---
-#
-# ## 🎯 Summary
-#
-# In this tutorial, you learned how to:
-#
-# 1. ✅ **Configure benchmark experiments** with `ForecastingWorkflowConfig`
-# 2. ✅ **Run parallel backtests** using the Liander 2024 benchmark
-# 3. ✅ **Compare models** (XGBoost vs GBLinear) with `BenchmarkComparisonPipeline`
-# 4. ✅ **Analyze results** with interactive HTML visualizations
-#
-# ### 📁 Output Structure
-#
-# ```
-# benchmark_results/
-# ├── XGBoost/
-# │   ├── backtest/       # Raw predictions (parquet)
-# │   ├── evaluation/     # Metrics per target
-# │   └── analysis/       # HTML visualizations
-# ├── GBLinear/
-# │   └── ...
-# └── analysis/           # Comparison analysis (both models)
-#     └── D-1T06:00/
-#         ├── rCRPS_grouped.html      # Probabilistic accuracy by category
-#         ├── rMAE_grouped.html       # Point forecast accuracy
-#         └── summary.html            # Overall summary
-# ```
-#
-# ### 🚀 Next Steps
-#
-# - Experiment with different `FORECAST_HORIZONS` (e.g., `"PT6H"`, `"P7D"`)
-# - Add more quantiles for higher resolution prediction intervals
-# - Filter specific categories with `BENCHMARK_FILTER`
-# - Integrate MLflow for experiment tracking
diff --git a/examples/tutorials/backtesting_quickstart.ipynb b/examples/tutorials/backtesting_quickstart.ipynb
new file mode 100644
index 000000000..4b8304116
--- /dev/null
+++ b/examples/tutorials/backtesting_quickstart.ipynb
@@ -0,0 +1,435 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "712d20d4",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ab1b247e",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "        \"lightgbm\",\n",
+    "        \"openstef_beam.backtesting\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35a1520c",
+   "metadata": {},
+   "source": [
+    "# Backtesting Quickstart\n",
+    "\n",
+    "Backtesting simulates how a forecasting model would have performed in a\n",
+    "real operational setting.  Unlike a simple train/test split, it respects\n",
+    "temporal constraints: models are retrained on a schedule and predictions\n",
+    "use only data that would have been available at prediction time.\n",
+    "\n",
+    "**What you will learn:**\n",
+    "\n",
+    "- How to set up a backtesting pipeline with [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)\n",
+    "- How to configure prediction and retraining schedules\n",
+    "- How to evaluate backtest results with standardized metrics\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial shows the low-level backtesting API step by step.\n",
+    "For production use, the **benchmark framework** (`openstef_beam.benchmarking`)\n",
+    "wraps all of this into a single pipeline call — see\n",
+    "`examples/benchmarks/` for ready-to-run examples.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)\n",
+    "· [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)\n",
+    "· [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.backtest_forecaster.BacktestForecasterConfig.html)\n",
+    "· [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef3b0032",
+   "metadata": {},
+   "source": [
+    "## How backtesting works\n",
+    "\n",
+    "A backtesting pipeline replays history as if it were happening in real time:\n",
+    "\n",
+    "1. **Event generation** — the pipeline creates a schedule of prediction and\n",
+    "   retraining events based on configured intervals.\n",
+    "2. **Training** — at each retraining event, the model is fitted on all data\n",
+    "   available up to that point (no lookahead).\n",
+    "3. **Prediction** — at each prediction event, the model generates a forecast\n",
+    "   using only data published before that moment.\n",
+    "4. **Collection** — all forecasts are gathered into a single dataset for\n",
+    "   evaluation against ground truth."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39f81027",
+   "metadata": {},
+   "source": [
+    "## Load the versioned dataset\n",
+    "\n",
+    "Backtesting requires **versioned** data — each data point carries an\n",
+    "`available_at` timestamp indicating when it became known.  This prevents\n",
+    "the model from accidentally using future information.\n",
+    "[`VersionedTimeSeriesDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.VersionedTimeSeriesDataset.html)\n",
+    "provides this out of the box."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74ff72b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from openstef_core.datasets import VersionedTimeSeriesDataset\n",
+    "\n",
+    "data_dir = Path(\"liander_dataset\")\n",
+    "\n",
+    "# Ground truth: actual load measurements\n",
+    "ground_truth = VersionedTimeSeriesDataset.read_parquet(\n",
+    "    data_dir / \"load_measurements\" / \"mv_feeder\" / \"OS Gorredijk.parquet\"\n",
+    ")\n",
+    "\n",
+    "# Predictors: versioned weather forecasts (available_at < forecast time)\n",
+    "predictors = VersionedTimeSeriesDataset.read_parquet(\n",
+    "    data_dir / \"weather_forecasts_versioned\" / \"mv_feeder\" / \"OS Gorredijk.parquet\"\n",
+    ")\n",
+    "\n",
+    "print(f\"Ground truth: {len(ground_truth.index):,} timestamps, {len(ground_truth.feature_names)} features\")\n",
+    "print(f\"Predictors:   {len(predictors.index):,} timestamps, {len(predictors.feature_names)} features\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "52acd6ce",
+   "metadata": {},
+   "source": [
+    "## Configure the forecaster\n",
+    "\n",
+    "We build a workflow from a standard [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) and wrap it in an\n",
+    "`OpenSTEF4BacktestForecaster`, which implements the backtesting interface\n",
+    "(fit/predict with temporal constraints)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c1aa76d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import timedelta\n",
+    "\n",
+    "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import OpenSTEF4BacktestForecaster\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
+    "\n",
+    "workflow_config = ForecastingWorkflowConfig(\n",
+    "    model_id=\"backtest_demo\",\n",
+    "    model=\"xgboost\",\n",
+    "    horizons=[LeadTime.from_string(\"PT48H\")],\n",
+    "    quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
+    "    target_column=\"load\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    wind_speed_column=\"wind_speed_10m\",\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    mlflow_storage=None,\n",
+    "    verbosity=0,\n",
+    ")\n",
+    "\n",
+    "backtest_forecaster_config = BacktestForecasterConfig(\n",
+    "    requires_training=True,\n",
+    "    predict_length=timedelta(hours=48),\n",
+    "    predict_min_length=timedelta(minutes=15),\n",
+    "    predict_context_length=timedelta(days=14),\n",
+    "    predict_context_min_coverage=0.5,\n",
+    "    training_context_length=timedelta(days=90),\n",
+    "    training_context_min_coverage=0.5,\n",
+    ")\n",
+    "\n",
+    "workflow = create_forecasting_workflow(workflow_config)\n",
+    "forecaster = OpenSTEF4BacktestForecaster(\n",
+    "    config=backtest_forecaster_config,\n",
+    "    workflow_template=workflow,\n",
+    "    cache_dir=Path(\"cache/backtest_demo\"),\n",
+    ")\n",
+    "\n",
+    "print(f\"Model:            {workflow_config.model}\")\n",
+    "print(f\"Training window:  {backtest_forecaster_config.training_context_length}\")\n",
+    "print(f\"Predict horizon:  {backtest_forecaster_config.predict_length}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc797d30",
+   "metadata": {},
+   "source": [
+    "## Run the backtest\n",
+    "\n",
+    "We configure the pipeline to predict every 6 hours and retrain weekly.\n",
+    "The backtest covers a short 5-day window for fast execution."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11b0adeb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "from openstef_beam.backtesting import BacktestConfig, BacktestPipeline\n",
+    "\n",
+    "backtest_config = BacktestConfig(\n",
+    "    prediction_sample_interval=timedelta(minutes=15),\n",
+    "    predict_interval=timedelta(hours=6),\n",
+    "    train_interval=timedelta(days=7),\n",
+    ")\n",
+    "\n",
+    "pipeline = BacktestPipeline(config=backtest_config, forecaster=forecaster)\n",
+    "\n",
+    "# Short evaluation window: 5 days starting well into the dataset\n",
+    "backtest_start = datetime.fromisoformat(\"2024-05-01T00:00:00Z\")\n",
+    "backtest_end = datetime.fromisoformat(\"2024-05-06T00:00:00Z\")\n",
+    "\n",
+    "predictions = pipeline.run(\n",
+    "    ground_truth=ground_truth,\n",
+    "    predictors=predictors,\n",
+    "    start=backtest_start,\n",
+    "    end=backtest_end,\n",
+    ")\n",
+    "\n",
+    "print(f\"Predictions generated: {predictions.data.shape[0]:,} rows\")\n",
+    "print(f\"Time range: {predictions.data.index.min()} to {predictions.data.index.max()}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94bdbdc2",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert predictions.data.shape[0] > 100, f\"Expected >100 prediction rows, got {predictions.data.shape[0]}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "37e479ab",
+   "metadata": {},
+   "source": [
+    "## Evaluate the results\n",
+    "\n",
+    "The [`EvaluationPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationPipeline.html) computes metrics over configurable time windows.\n",
+    "It filters predictions by lead time to produce meaningful comparisons\n",
+    "(e.g., day-ahead forecasts only).\n",
+    "\n",
+    "We use [rMAE](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RMAEProvider.html) (relative Mean Absolute Error) and [rCRPS](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) (relative Continuous\n",
+    "Ranked Probability Score) — both normalized by mean absolute actuals.\n",
+    "See the full list of [available metrics](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.html).\n",
+    "If your scores are suboptimal, {doc}`hyperparameter_tuning_with_optuna`\n",
+    "shows how to optimize model parameters before re-running the backtest."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e938dc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_beam.evaluation import EvaluationConfig, EvaluationPipeline, Window\n",
+    "from openstef_beam.evaluation.metric_providers import RCRPSProvider, RMAEProvider\n",
+    "\n",
+    "evaluation_config = EvaluationConfig(\n",
+    "    windows=[Window(lag=timedelta(hours=0), size=timedelta(days=5))],\n",
+    "    lead_times=[],  # Only use available_at filtering (day-ahead)\n",
+    ")\n",
+    "\n",
+    "eval_pipeline = EvaluationPipeline(\n",
+    "    config=evaluation_config,\n",
+    "    quantiles=workflow_config.quantiles,\n",
+    "    window_metric_providers=[\n",
+    "        RMAEProvider(quantiles=[Q(0.5)]),\n",
+    "        RCRPSProvider(),\n",
+    "    ],\n",
+    "    global_metric_providers=[\n",
+    "        RMAEProvider(quantiles=[Q(0.5)]),\n",
+    "        RCRPSProvider(),\n",
+    "    ],\n",
+    ")\n",
+    "\n",
+    "report = eval_pipeline.run(\n",
+    "    predictions=predictions,\n",
+    "    ground_truth=ground_truth,\n",
+    "    target_column=\"load\",\n",
+    ")\n",
+    "\n",
+    "print(\"Backtest evaluation metrics (day-ahead):\")\n",
+    "for subset_report in report.subset_reports:\n",
+    "    print(f\"\\n  Lead-time filter: {subset_report.filtering}\")\n",
+    "    for metric in subset_report.metrics:\n",
+    "        df = metric.to_dataframe()\n",
+    "        print(f\"  Window: {metric.window}\")\n",
+    "        print(df.to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "98c0b36d",
+   "metadata": {},
+   "source": [
+    "## Visualize predictions vs actuals\n",
+    "\n",
+    "The evaluation report contains a properly filtered [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) for\n",
+    "each lead-time subset.  We use this directly for visualization — it\n",
+    "shows only day-ahead predictions aligned with their corresponding actuals."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b17332ed",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
+    "\n",
+    "# The evaluation subset contains actuals + predictions filtered by lead time\n",
+    "subset = report.subset_reports[0].subset\n",
+    "\n",
+    "plotter = ForecastTimeSeriesPlotter()\n",
+    "plotter.add_measurements(measurements=subset.target_series)\n",
+    "plotter.add_model(\n",
+    "    model_name=\"XGBoost (day-ahead)\",\n",
+    "    forecast=subset.median_series,\n",
+    "    quantiles=subset.quantiles_data,\n",
+    ")\n",
+    "\n",
+    "fig = plotter.plot()\n",
+    "fig.update_layout(\n",
+    "    title=\"Backtest: Day-Ahead Forecast vs Actuals\",\n",
+    "    xaxis_title=\"Time\",\n",
+    "    yaxis_title=\"Load (W)\",\n",
+    "    height=400,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9e6e5ad7",
+   "metadata": {},
+   "source": [
+    "## The easy way: benchmark framework\n",
+    "\n",
+    "The code above demonstrates each backtesting step explicitly.  In practice,\n",
+    "the **benchmark framework** handles all of this (data loading, target\n",
+    "management, evaluation, analysis) in a single pipeline:\n",
+    "\n",
+    "```python\n",
+    "from openstef_beam.benchmarking.benchmarks.liander2024 import (\n",
+    "    create_liander2024_benchmark_runner,\n",
+    ")\n",
+    "from openstef_beam.benchmarking.baselines.openstef4 import (\n",
+    "    create_openstef4_preset_backtest_forecaster,\n",
+    ")\n",
+    "\n",
+    "runner = create_liander2024_benchmark_runner()\n",
+    "forecaster_factory = create_openstef4_preset_backtest_forecaster(workflow_config)\n",
+    "runner.run(forecaster_factory, run_name=\"my_experiment\")\n",
+    "```\n",
+    "\n",
+    "The benchmark runner automatically:\n",
+    "- Downloads and manages the dataset\n",
+    "- Iterates over all targets (feeders, transformers, solar parks, etc.)\n",
+    "- Runs backtests with standardized configuration\n",
+    "- Computes metrics and generates analysis visualizations\n",
+    "\n",
+    "See `examples/benchmarks/` for complete benchmark scripts that will be\n",
+    "converted to Jupytext tutorials in a future update."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9814713",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`hyperparameter_tuning_with_optuna` — optimize model parameters,\n",
+    "  then re-run the backtest to measure improvement.\n",
+    "- {doc}`ensemble_forecasting` — backtest an ensemble of diverse models."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/backtesting_quickstart.py b/examples/tutorials/backtesting_quickstart.py
new file mode 100644
index 000000000..cccc0d072
--- /dev/null
+++ b/examples/tutorials/backtesting_quickstart.py
@@ -0,0 +1,316 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% tags=["remove-cell"]
+import warnings
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+        "lightgbm",
+        "openstef_beam.backtesting",
+    ),
+)
+
+# %% [markdown]
+# # Backtesting Quickstart
+#
+# Backtesting simulates how a forecasting model would have performed in a
+# real operational setting.  Unlike a simple train/test split, it respects
+# temporal constraints: models are retrained on a schedule and predictions
+# use only data that would have been available at prediction time.
+#
+# **What you will learn:**
+#
+# - How to set up a backtesting pipeline with [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)
+# - How to configure prediction and retraining schedules
+# - How to evaluate backtest results with standardized metrics
+#
+# ```{note}
+# This tutorial shows the low-level backtesting API step by step.
+# For production use, the **benchmark framework** (`openstef_beam.benchmarking`)
+# wraps all of this into a single pipeline call — see
+# `examples/benchmarks/` for ready-to-run examples.
+# ```
+#
+# **Key API references:**
+# [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)
+# · [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)
+# · [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.backtest_forecaster.BacktestForecasterConfig.html)
+# · [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)
+
+# %% [markdown]
+# ## How backtesting works
+#
+# A backtesting pipeline replays history as if it were happening in real time:
+#
+# 1. **Event generation** — the pipeline creates a schedule of prediction and
+#    retraining events based on configured intervals.
+# 2. **Training** — at each retraining event, the model is fitted on all data
+#    available up to that point (no lookahead).
+# 3. **Prediction** — at each prediction event, the model generates a forecast
+#    using only data published before that moment.
+# 4. **Collection** — all forecasts are gathered into a single dataset for
+#    evaluation against ground truth.
+
+# %% [markdown]
+# ## Load the versioned dataset
+#
+# Backtesting requires **versioned** data — each data point carries an
+# `available_at` timestamp indicating when it became known.  This prevents
+# the model from accidentally using future information.
+# [`VersionedTimeSeriesDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.VersionedTimeSeriesDataset.html)
+# provides this out of the box.
+
+# %%
+from pathlib import Path
+
+from openstef_core.datasets import VersionedTimeSeriesDataset
+
+data_dir = Path("liander_dataset")
+
+# Ground truth: actual load measurements
+ground_truth = VersionedTimeSeriesDataset.read_parquet(
+    data_dir / "load_measurements" / "mv_feeder" / "OS Gorredijk.parquet"
+)
+
+# Predictors: versioned weather forecasts (available_at < forecast time)
+predictors = VersionedTimeSeriesDataset.read_parquet(
+    data_dir / "weather_forecasts_versioned" / "mv_feeder" / "OS Gorredijk.parquet"
+)
+
+print(f"Ground truth: {len(ground_truth.index):,} timestamps, {len(ground_truth.feature_names)} features")
+print(f"Predictors:   {len(predictors.index):,} timestamps, {len(predictors.feature_names)} features")
+
+# %% [markdown]
+# ## Configure the forecaster
+#
+# We build a workflow from a standard [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) and wrap it in an
+# `OpenSTEF4BacktestForecaster`, which implements the backtesting interface
+# (fit/predict with temporal constraints).
+
+# %%
+from datetime import timedelta
+
+from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig
+from openstef_beam.benchmarking.baselines.openstef4 import OpenSTEF4BacktestForecaster
+from openstef_core.types import LeadTime, Q
+from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
+
+workflow_config = ForecastingWorkflowConfig(
+    model_id="backtest_demo",
+    model="xgboost",
+    horizons=[LeadTime.from_string("PT48H")],
+    quantiles=[Q(0.5), Q(0.1), Q(0.9)],
+    target_column="load",
+    temperature_column="temperature_2m",
+    relative_humidity_column="relative_humidity_2m",
+    wind_speed_column="wind_speed_10m",
+    radiation_column="shortwave_radiation",
+    pressure_column="surface_pressure",
+    mlflow_storage=None,
+    verbosity=0,
+)
+
+backtest_forecaster_config = BacktestForecasterConfig(
+    requires_training=True,
+    predict_length=timedelta(hours=48),
+    predict_min_length=timedelta(minutes=15),
+    predict_context_length=timedelta(days=14),
+    predict_context_min_coverage=0.5,
+    training_context_length=timedelta(days=90),
+    training_context_min_coverage=0.5,
+)
+
+workflow = create_forecasting_workflow(workflow_config)
+forecaster = OpenSTEF4BacktestForecaster(
+    config=backtest_forecaster_config,
+    workflow_template=workflow,
+    cache_dir=Path("cache/backtest_demo"),
+)
+
+print(f"Model:            {workflow_config.model}")
+print(f"Training window:  {backtest_forecaster_config.training_context_length}")
+print(f"Predict horizon:  {backtest_forecaster_config.predict_length}")
+
+# %% [markdown]
+# ## Run the backtest
+#
+# We configure the pipeline to predict every 6 hours and retrain weekly.
+# The backtest covers a short 5-day window for fast execution.
+
+# %%
+from datetime import datetime
+
+from openstef_beam.backtesting import BacktestConfig, BacktestPipeline
+
+backtest_config = BacktestConfig(
+    prediction_sample_interval=timedelta(minutes=15),
+    predict_interval=timedelta(hours=6),
+    train_interval=timedelta(days=7),
+)
+
+pipeline = BacktestPipeline(config=backtest_config, forecaster=forecaster)
+
+# Short evaluation window: 5 days starting well into the dataset
+backtest_start = datetime.fromisoformat("2024-05-01T00:00:00Z")
+backtest_end = datetime.fromisoformat("2024-05-06T00:00:00Z")
+
+predictions = pipeline.run(
+    ground_truth=ground_truth,
+    predictors=predictors,
+    start=backtest_start,
+    end=backtest_end,
+)
+
+print(f"Predictions generated: {predictions.data.shape[0]:,} rows")
+print(f"Time range: {predictions.data.index.min()} to {predictions.data.index.max()}")
+
+# %% tags=["remove-cell"]
+assert predictions.data.shape[0] > 100, f"Expected >100 prediction rows, got {predictions.data.shape[0]}"
+
+# %% [markdown]
+# ## Evaluate the results
+#
+# The [`EvaluationPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationPipeline.html) computes metrics over configurable time windows.
+# It filters predictions by lead time to produce meaningful comparisons
+# (e.g., day-ahead forecasts only).
+#
+# We use [rMAE](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RMAEProvider.html) (relative Mean Absolute Error) and [rCRPS](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) (relative Continuous
+# Ranked Probability Score) — both normalized by mean absolute actuals.
+# See the full list of [available metrics](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.html).
+# If your scores are suboptimal, {doc}`hyperparameter_tuning_with_optuna`
+# shows how to optimize model parameters before re-running the backtest.
+
+# %%
+from openstef_beam.evaluation import EvaluationConfig, EvaluationPipeline, Window
+from openstef_beam.evaluation.metric_providers import RCRPSProvider, RMAEProvider
+
+evaluation_config = EvaluationConfig(
+    windows=[Window(lag=timedelta(hours=0), size=timedelta(days=5))],
+    lead_times=[],  # Only use available_at filtering (day-ahead)
+)
+
+eval_pipeline = EvaluationPipeline(
+    config=evaluation_config,
+    quantiles=workflow_config.quantiles,
+    window_metric_providers=[
+        RMAEProvider(quantiles=[Q(0.5)]),
+        RCRPSProvider(),
+    ],
+    global_metric_providers=[
+        RMAEProvider(quantiles=[Q(0.5)]),
+        RCRPSProvider(),
+    ],
+)
+
+report = eval_pipeline.run(
+    predictions=predictions,
+    ground_truth=ground_truth,
+    target_column="load",
+)
+
+print("Backtest evaluation metrics (day-ahead):")
+for subset_report in report.subset_reports:
+    print(f"\n  Lead-time filter: {subset_report.filtering}")
+    for metric in subset_report.metrics:
+        df = metric.to_dataframe()
+        print(f"  Window: {metric.window}")
+        print(df.to_string(index=False))
+
+# %% [markdown]
+# ## Visualize predictions vs actuals
+#
+# The evaluation report contains a properly filtered [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) for
+# each lead-time subset.  We use this directly for visualization — it
+# shows only day-ahead predictions aligned with their corresponding actuals.
+
+# %% tags=["hide-input"]
+from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
+
+# The evaluation subset contains actuals + predictions filtered by lead time
+subset = report.subset_reports[0].subset
+
+plotter = ForecastTimeSeriesPlotter()
+plotter.add_measurements(measurements=subset.target_series)
+plotter.add_model(
+    model_name="XGBoost (day-ahead)",
+    forecast=subset.median_series,
+    quantiles=subset.quantiles_data,
+)
+
+fig = plotter.plot()
+fig.update_layout(
+    title="Backtest: Day-Ahead Forecast vs Actuals",
+    xaxis_title="Time",
+    yaxis_title="Load (W)",
+    height=400,
+)
+fig.show()
+
+# %% [markdown]
+# ## The easy way: benchmark framework
+#
+# The code above demonstrates each backtesting step explicitly.  In practice,
+# the **benchmark framework** handles all of this (data loading, target
+# management, evaluation, analysis) in a single pipeline:
+#
+# ```python
+# from openstef_beam.benchmarking.benchmarks.liander2024 import (
+#     create_liander2024_benchmark_runner,
+# )
+# from openstef_beam.benchmarking.baselines.openstef4 import (
+#     create_openstef4_preset_backtest_forecaster,
+# )
+#
+# runner = create_liander2024_benchmark_runner()
+# forecaster_factory = create_openstef4_preset_backtest_forecaster(workflow_config)
+# runner.run(forecaster_factory, run_name="my_experiment")
+# ```
+#
+# The benchmark runner automatically:
+# - Downloads and manages the dataset
+# - Iterates over all targets (feeders, transformers, solar parks, etc.)
+# - Runs backtests with standardized configuration
+# - Computes metrics and generates analysis visualizations
+#
+# See `examples/benchmarks/` for complete benchmark scripts that will be
+# converted to Jupytext tutorials in a future update.
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`hyperparameter_tuning_with_optuna` — optimize model parameters,
+#   then re-run the backtest to measure improvement.
+# - {doc}`ensemble_forecasting` — backtest an ensemble of diverse models.
diff --git a/examples/tutorials/custom_pipeline.ipynb b/examples/tutorials/custom_pipeline.ipynb
new file mode 100644
index 000000000..143609092
--- /dev/null
+++ b/examples/tutorials/custom_pipeline.ipynb
@@ -0,0 +1,499 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e932643b",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7068def",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81d0d171",
+   "metadata": {},
+   "source": [
+    "# Building a Custom Pipeline\n",
+    "\n",
+    "The [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) preset handles pipeline assembly\n",
+    "automatically.  When you need full control — custom transforms, different\n",
+    "feature engineering, or non-standard postprocessing — you can build a\n",
+    "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) from individual components.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Assemble preprocessing, forecaster, and postprocessing into a pipeline\n",
+    "- Select and configure individual transforms\n",
+    "- Train and predict with a hand-built pipeline\n",
+    "- Use individual components (transforms, forecaster) on their own\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial is for advanced users who need to go beyond presets.\n",
+    "Start with {doc}`forecasting_quickstart` for the standard approach.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html)\n",
+    "· [`TransformPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.TransformPipeline.html)\n",
+    "· [`GBLinearForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9418a6bc",
+   "metadata": {},
+   "source": [
+    "## Load the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7322debe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import timedelta\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "\n",
+    "dataset = load_liander_dataset()\n",
+    "\n",
+    "from datetime import datetime\n",
+    "\n",
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "print(f\"Training:  {train_dataset.data.shape[0]:,} rows\")\n",
+    "print(f\"Predict:   {predict_dataset.data.shape[0]:,} rows\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3c407256",
+   "metadata": {},
+   "source": [
+    "## Define pipeline components\n",
+    "\n",
+    "A [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) has three stages:\n",
+    "\n",
+    "1. **Preprocessing** — feature engineering and data cleaning transforms\n",
+    "2. **Forecaster** — the model that produces predictions\n",
+    "3. **Postprocessing** — transforms applied to the forecast output\n",
+    "\n",
+    "Below we build each stage explicitly."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "431e2214",
+   "metadata": {},
+   "source": [
+    "### Preprocessing\n",
+    "\n",
+    "We select transforms from the available modules:\n",
+    "\n",
+    "| Module | Transforms |\n",
+    "|--------|-----------|\n",
+    "| `transforms.general` | Scaler, Imputer, NaNDropper, OutlierHandler, EmptyFeatureRemover |\n",
+    "| `transforms.time_domain` | HolidayFeatureAdder, DatetimeFeaturesAdder, CyclicFeaturesAdder, LagsAdder |\n",
+    "| `transforms.weather_domain` | AtmosphereDerivedFeaturesAdder, DaylightFeatureAdder, RadiationDerivedFeaturesAdder |\n",
+    "| `transforms.energy_domain` | WindPowerFeatureAdder |\n",
+    "| `transforms.validation` | CompletenessChecker, FlatlineChecker |"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b8ae7bb5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_core.mixins import TransformPipeline\n",
+    "from openstef_models.transforms.general import EmptyFeatureRemover, Imputer, NaNDropper, Scaler\n",
+    "from openstef_models.transforms.time_domain import CyclicFeaturesAdder, HolidayFeatureAdder\n",
+    "from openstef_models.transforms.time_domain.lags_adder import LagsAdder\n",
+    "from openstef_models.utils.feature_selection import Exclude\n",
+    "\n",
+    "quantiles = [Q(0.1), Q(0.5), Q(0.9)]\n",
+    "horizons = [LeadTime.from_string(\"PT36H\")]\n",
+    "\n",
+    "preprocessing = TransformPipeline(\n",
+    "    transforms=[\n",
+    "        # Feature engineering\n",
+    "        LagsAdder(\n",
+    "            history_available=timedelta(days=14),\n",
+    "            horizons=horizons,\n",
+    "            add_trivial_lags=False,\n",
+    "            target_column=\"load\",\n",
+    "            custom_lags=[timedelta(days=7)],\n",
+    "            lag_fallback_offset=timedelta(days=7),\n",
+    "        ),\n",
+    "        CyclicFeaturesAdder(),\n",
+    "        HolidayFeatureAdder(country_code=\"NL\"),\n",
+    "        # Standardization\n",
+    "        Scaler(selection=Exclude(\"load\"), method=\"standard\"),\n",
+    "        EmptyFeatureRemover(),\n",
+    "        # Missing value handling\n",
+    "        Imputer(selection=Exclude(\"load\"), imputation_strategy=\"mean\"),\n",
+    "        NaNDropper(selection=Exclude(\"load\")),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "print(f\"Preprocessing steps: {len(preprocessing.transforms)}\")\n",
+    "for t in preprocessing.transforms:\n",
+    "    print(f\"  - {type(t).__name__}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7e7a9dab",
+   "metadata": {},
+   "source": [
+    "### Forecaster\n",
+    "\n",
+    "We use `GBLinearForecaster` — a gradient-boosted linear model that works well\n",
+    "with the Imputer + NaNDropper preprocessing pattern above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fdfcd4cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.models.forecasting.gblinear_forecaster import (\n",
+    "    GBLinearForecaster,\n",
+    "    GBLinearHyperParams,\n",
+    ")\n",
+    "\n",
+    "forecaster = GBLinearForecaster(\n",
+    "    quantiles=quantiles,\n",
+    "    horizons=horizons,\n",
+    "    hyperparams=GBLinearHyperParams(\n",
+    "        n_steps=100,\n",
+    "        learning_rate=0.3,\n",
+    "    ),\n",
+    "    verbosity=0,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5ed31867",
+   "metadata": {},
+   "source": [
+    "### Postprocessing\n",
+    "\n",
+    "We add a [`QuantileSorter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.QuantileSorter.html) (ensures quantile ordering) and a\n",
+    "[`ConfidenceIntervalApplicator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.ConfidenceIntervalApplicator.html) (adds confidence interval columns)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f200e18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.transforms.postprocessing import (\n",
+    "    ConfidenceIntervalApplicator,\n",
+    "    QuantileSorter,\n",
+    ")\n",
+    "\n",
+    "postprocessing = TransformPipeline(\n",
+    "    transforms=[\n",
+    "        QuantileSorter(),\n",
+    "        ConfidenceIntervalApplicator(\n",
+    "            quantiles=quantiles,\n",
+    "            add_quantiles_from_std=False,\n",
+    "        ),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3cd19602",
+   "metadata": {},
+   "source": [
+    "## Assemble the model\n",
+    "\n",
+    "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) combines all three stages.  We wrap it in a\n",
+    "[`CustomForecastingWorkflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.workflows.custom_forecasting_workflow.CustomForecastingWorkflow.html) which adds train/predict orchestration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "055172cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.models.forecasting_model import ForecastingModel\n",
+    "from openstef_models.workflows import CustomForecastingWorkflow\n",
+    "\n",
+    "model = ForecastingModel(\n",
+    "    preprocessing=preprocessing,\n",
+    "    forecaster=forecaster,\n",
+    "    postprocessing=postprocessing,\n",
+    "    target_column=\"load\",\n",
+    ")\n",
+    "\n",
+    "workflow = CustomForecastingWorkflow(\n",
+    "    model_id=\"custom_pipeline_demo\",\n",
+    "    model=model,\n",
+    "    callbacks=[],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "12bc07c5",
+   "metadata": {},
+   "source": [
+    "## Train and predict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4876ddc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = workflow.fit(train_dataset)\n",
+    "forecast = workflow.predict(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "print(f\"Forecast rows:  {len(forecast.data)}\")\n",
+    "print(f\"Columns:        {list(forecast.data.columns)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cdabc2d1",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "56f58706",
+   "metadata": {},
+   "source": [
+    "## Visualize the result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec9920d7",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
+    "\n",
+    "fig = (\n",
+    "    ForecastTimeSeriesPlotter()\n",
+    "    .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n",
+    "    .add_model(\n",
+    "        model_name=\"Custom GBLinear\",\n",
+    "        forecast=forecast.median_series,\n",
+    "        quantiles=forecast.quantiles_data,\n",
+    "    )\n",
+    "    .plot()\n",
+    ")\n",
+    "fig.update_layout(\n",
+    "    title=\"Custom pipeline — forecast vs actuals\",\n",
+    "    yaxis_title=\"Load (MW)\",\n",
+    "    xaxis_title=\"Time\",\n",
+    "    height=450,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ac970781",
+   "metadata": {},
+   "source": [
+    "## Using components individually\n",
+    "\n",
+    "`ForecastingModel` is convenient, but every component also works on its\n",
+    "own.  You can run the preprocessing pipeline, inspect intermediate data,\n",
+    "and call the forecaster directly."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c5caef5f",
+   "metadata": {},
+   "source": [
+    "### Run preprocessing on raw data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "638c7673",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preprocessed = model.preprocessing.transform(train_dataset)\n",
+    "\n",
+    "print(f\"Before preprocessing: {train_dataset.data.shape[1]} columns\")\n",
+    "print(f\"After preprocessing:  {preprocessed.data.shape[1]} columns\")\n",
+    "print(f\"\\nAdded features: {sorted(set(preprocessed.data.columns) - set(train_dataset.data.columns))[:8]}...\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2560845f",
+   "metadata": {},
+   "source": [
+    "### Run a single transform"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "926884f3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "single_transform = CyclicFeaturesAdder()\n",
+    "single_transform.fit(train_dataset)\n",
+    "result_single = single_transform.transform(train_dataset)\n",
+    "\n",
+    "print(\n",
+    "    f\"CyclicFeaturesAdder added {len(single_transform.features_added())} columns: {single_transform.features_added()}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0baca441",
+   "metadata": {},
+   "source": [
+    "### Call the forecaster directly\n",
+    "\n",
+    "After preprocessing, you can wrap the data in a [`ForecastInputDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastInputDataset.html)\n",
+    "and call the forecaster directly.\n",
+    "This is useful for debugging or integrating into custom workflows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e610d16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_core.datasets import ForecastInputDataset\n",
+    "\n",
+    "# Preprocess the prediction data\n",
+    "preprocessed_predict = model.preprocessing.transform(predict_dataset)\n",
+    "\n",
+    "# Convert to ForecastInputDataset (what the forecaster expects)\n",
+    "forecast_input = ForecastInputDataset(\n",
+    "    data=preprocessed_predict.data,\n",
+    "    sample_interval=preprocessed_predict.sample_interval,\n",
+    "    target_column=\"load\",\n",
+    "    forecast_start=train_end,\n",
+    ")\n",
+    "\n",
+    "# Call the forecaster directly\n",
+    "raw_forecast = model.forecaster.predict(forecast_input)\n",
+    "print(f\"Raw forecast shape: {raw_forecast.data.shape}\")\n",
+    "print(f\"Raw forecast columns: {list(raw_forecast.data.columns)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "064b3de5",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`ensemble_forecasting` — combine your custom pipeline with other\n",
+    "  models into an ensemble for better accuracy.\n",
+    "- {doc}`quantile_calibration` — append isotonic calibration to your\n",
+    "  postprocessing for more reliable confidence intervals."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/custom_pipeline.py b/examples/tutorials/custom_pipeline.py
new file mode 100644
index 000000000..0d623a607
--- /dev/null
+++ b/examples/tutorials/custom_pipeline.py
@@ -0,0 +1,320 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% tags=["remove-cell"]
+import warnings
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+    ),
+)
+
+# %% [markdown]
+# # Building a Custom Pipeline
+#
+# The [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) preset handles pipeline assembly
+# automatically.  When you need full control — custom transforms, different
+# feature engineering, or non-standard postprocessing — you can build a
+# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) from individual components.
+#
+# **What you'll learn:**
+#
+# - Assemble preprocessing, forecaster, and postprocessing into a pipeline
+# - Select and configure individual transforms
+# - Train and predict with a hand-built pipeline
+# - Run individual transforms and the forecaster on their own
+#
+# ```{note}
+# This tutorial is for advanced users who need to go beyond presets.
+# Start with {doc}`forecasting_quickstart` for the standard approach.
+# ```
+#
+# **Key API references:**
+# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html)
+# · [`TransformPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.TransformPipeline.html)
+# · [`GBLinearForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)
+
+# %% [markdown]
+# ## Load the dataset
+
+# %%
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
+
+dataset = load_liander_dataset()
+
+# Train on 45 days of data, then forecast the 7 days that follow.
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+# Start 14 days early, matching the history_available passed to LagsAdder below.
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+print(f"Training:  {train_dataset.data.shape[0]:,} rows")
+print(f"Predict:   {predict_dataset.data.shape[0]:,} rows")
+
+# %% [markdown]
+# ## Define pipeline components
+#
+# A [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) has three stages:
+#
+# 1. **Preprocessing** — feature engineering and data cleaning transforms
+# 2. **Forecaster** — the model that produces predictions
+# 3. **Postprocessing** — transforms applied to the forecast output
+#
+# Below we build each stage explicitly.
+
+# %% [markdown]
+# ### Preprocessing
+#
+# We select transforms from the available modules:
+#
+# | Module | Transforms |
+# |--------|-----------|
+# | `transforms.general` | Scaler, Imputer, NaNDropper, OutlierHandler, EmptyFeatureRemover |
+# | `transforms.time_domain` | HolidayFeatureAdder, DatetimeFeaturesAdder, CyclicFeaturesAdder, LagsAdder |
+# | `transforms.weather_domain` | AtmosphereDerivedFeaturesAdder, DaylightFeatureAdder, RadiationDerivedFeaturesAdder |
+# | `transforms.energy_domain` | WindPowerFeatureAdder |
+# | `transforms.validation` | CompletenessChecker, FlatlineChecker |
+
+# %%
+from openstef_core.mixins import TransformPipeline
+from openstef_models.transforms.general import EmptyFeatureRemover, Imputer, NaNDropper, Scaler
+from openstef_models.transforms.time_domain import CyclicFeaturesAdder, HolidayFeatureAdder
+from openstef_models.transforms.time_domain.lags_adder import LagsAdder
+from openstef_models.utils.feature_selection import Exclude
+
+quantiles = [Q(0.1), Q(0.5), Q(0.9)]
+horizons = [LeadTime.from_string("PT36H")]
+
+preprocessing = TransformPipeline(
+    transforms=[
+        # Feature engineering
+        LagsAdder(
+            history_available=timedelta(days=14),
+            horizons=horizons,
+            add_trivial_lags=False,
+            target_column="load",
+            custom_lags=[timedelta(days=7)],
+            lag_fallback_offset=timedelta(days=7),
+        ),
+        CyclicFeaturesAdder(),
+        HolidayFeatureAdder(country_code="NL"),
+        # Standardization
+        Scaler(selection=Exclude("load"), method="standard"),
+        EmptyFeatureRemover(),
+        # Missing value handling
+        Imputer(selection=Exclude("load"), imputation_strategy="mean"),
+        NaNDropper(selection=Exclude("load")),
+    ]
+)
+
+print(f"Preprocessing steps: {len(preprocessing.transforms)}")
+for t in preprocessing.transforms:
+    print(f"  - {type(t).__name__}")
+
+# %% [markdown]
+# ### Forecaster
+#
+# We use `GBLinearForecaster` — a gradient-boosted linear model that works well
+# with the Imputer + NaNDropper preprocessing pattern above.
+
+# %%
+from openstef_models.models.forecasting.gblinear_forecaster import (
+    GBLinearForecaster,
+    GBLinearHyperParams,
+)
+
+forecaster = GBLinearForecaster(
+    quantiles=quantiles,
+    horizons=horizons,
+    hyperparams=GBLinearHyperParams(
+        n_steps=100,
+        learning_rate=0.3,
+    ),
+    verbosity=0,
+)
+
+# %% [markdown]
+# ### Postprocessing
+#
+# We add a [`QuantileSorter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.QuantileSorter.html) (ensures quantile ordering) and a
+# [`ConfidenceIntervalApplicator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.ConfidenceIntervalApplicator.html) (adds confidence interval columns).
+
+# %%
+from openstef_models.transforms.postprocessing import (
+    ConfidenceIntervalApplicator,
+    QuantileSorter,
+)
+
+postprocessing = TransformPipeline(
+    transforms=[
+        QuantileSorter(),
+        ConfidenceIntervalApplicator(
+            quantiles=quantiles,
+            add_quantiles_from_std=False,
+        ),
+    ]
+)
+
+# %% [markdown]
+# ## Assemble the model
+#
+# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) combines all three stages.  We wrap it in a
+# [`CustomForecastingWorkflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.workflows.custom_forecasting_workflow.CustomForecastingWorkflow.html) which adds train/predict orchestration.
+
+# %%
+from openstef_models.models.forecasting_model import ForecastingModel
+from openstef_models.workflows import CustomForecastingWorkflow
+
+model = ForecastingModel(
+    preprocessing=preprocessing,
+    forecaster=forecaster,
+    postprocessing=postprocessing,
+    target_column="load",
+)
+
+workflow = CustomForecastingWorkflow(
+    model_id="custom_pipeline_demo",
+    model=model,
+    callbacks=[],
+)
+
+# %% [markdown]
+# ## Train and predict
+
+# %%
+result = workflow.fit(train_dataset)
+forecast = workflow.predict(predict_dataset, forecast_start=train_end)
+
+print(f"Forecast rows:  {len(forecast.data)}")
+print(f"Columns:        {list(forecast.data.columns)}")
+
+# %% tags=["remove-cell"]
+assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}"
+
+# %% [markdown]
+# ## Visualize the result
+
+# %% tags=["hide-input"]
+from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
+
+fig = (
+    ForecastTimeSeriesPlotter()
+    .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:])
+    .add_model(
+        model_name="Custom GBLinear",
+        forecast=forecast.median_series,
+        quantiles=forecast.quantiles_data,
+    )
+    .plot()
+)
+fig.update_layout(
+    title="Custom pipeline — forecast vs actuals",
+    yaxis_title="Load (MW)",
+    xaxis_title="Time",
+    height=450,
+)
+fig.show()
+
+# %% [markdown]
+# ## Using components individually
+#
+# `ForecastingModel` is convenient, but every component also works on its
+# own.  You can run the preprocessing pipeline, inspect intermediate data,
+# and call the forecaster directly.
+
+# %% [markdown]
+# ### Run preprocessing on raw data
+
+# %%
+preprocessed = model.preprocessing.transform(train_dataset)
+
+print(f"Before preprocessing: {train_dataset.data.shape[1]} columns")
+print(f"After preprocessing:  {preprocessed.data.shape[1]} columns")
+print(f"\nAdded features: {sorted(set(preprocessed.data.columns) - set(train_dataset.data.columns))[:8]}...")
+
+# %% [markdown]
+# ### Run a single transform
+
+# %%
+single_transform = CyclicFeaturesAdder()
+single_transform.fit(train_dataset)
+result_single = single_transform.transform(train_dataset)
+
+print(
+    f"CyclicFeaturesAdder added {len(single_transform.features_added())} columns: {single_transform.features_added()}"
+)
+
+# %% [markdown]
+# ### Call the forecaster directly
+#
+# After preprocessing, you can wrap the data in a [`ForecastInputDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastInputDataset.html)
+# and pass it to the forecaster's `predict` method.
+# This is useful for debugging or integrating into custom workflows.
+
+# %%
+from openstef_core.datasets import ForecastInputDataset
+
+# Preprocess the prediction data
+preprocessed_predict = model.preprocessing.transform(predict_dataset)
+
+# Convert to ForecastInputDataset (what the forecaster expects)
+forecast_input = ForecastInputDataset(
+    data=preprocessed_predict.data,
+    sample_interval=preprocessed_predict.sample_interval,
+    target_column="load",
+    forecast_start=train_end,
+)
+
+# Call the forecaster directly
+raw_forecast = model.forecaster.predict(forecast_input)
+print(f"Raw forecast shape: {raw_forecast.data.shape}")
+print(f"Raw forecast columns: {list(raw_forecast.data.columns)}")
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`ensemble_forecasting` — combine your custom pipeline with other
+#   models into an ensemble for better accuracy.
+# - {doc}`quantile_calibration` — append isotonic calibration to your
+#   postprocessing for more reliable confidence intervals.
diff --git a/examples/tutorials/ensemble_forecasting.ipynb b/examples/tutorials/ensemble_forecasting.ipynb
new file mode 100644
index 000000000..af7d492e4
--- /dev/null
+++ b/examples/tutorials/ensemble_forecasting.ipynb
@@ -0,0 +1,536 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d87cffae",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4c12c212",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "        \"lightgbm\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e283b926",
+   "metadata": {},
+   "source": [
+    "# Ensemble Forecasting\n",
+    "\n",
+    "OpenSTEF supports ensemble models that combine multiple base forecasters\n",
+    "into a single prediction.  A **combiner** learns which base model performs\n",
+    "best under different conditions and weights their outputs accordingly.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Why combining tree-based and linear models improves forecasts\n",
+    "- How to configure and train an ensemble with [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)\n",
+    "- How to inspect combiner behavior (which model does it prefer?)\n",
+    "- How ensemble predictions compare to individual base models\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial uses a small dataset for fast execution.\n",
+    "See `examples/benchmarks/` for production-scale ensemble runs.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)\n",
+    "· [`create_ensemble_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.create_ensemble_forecasting_workflow.html)\n",
+    "· [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4dbf4b68",
+   "metadata": {},
+   "source": [
+    "## Why ensemble forecasting?\n",
+    "\n",
+    "Different model types have complementary strengths:\n",
+    "\n",
+    "| Model | Strengths | Weaknesses |\n",
+    "|-------|-----------|------------|\n",
+    "| **Tree-based** (LightGBM, XGBoost) | Captures complex non-linear patterns, handles feature interactions well | Poor extrapolation — struggles with unseen peaks, seasonal shifts, or values outside training range |\n",
+    "| **Linear** (GBLinear) | Extrapolates naturally to new ranges, captures seasonal/solar trends | Cannot model non-linear interactions |\n",
+    "\n",
+    "In energy forecasting, load peaks during extreme weather or seasonal\n",
+    "transitions often fall outside the training distribution.  A tree-based\n",
+    "model underestimates these peaks while a linear model captures the trend\n",
+    "but misses finer patterns.  An **ensemble** combines both: the combiner\n",
+    "learns *when* each model is more reliable and weights accordingly.\n",
+    "\n",
+    "## How it works\n",
+    "\n",
+    "An ensemble workflow has three layers:\n",
+    "\n",
+    "1. **Common preprocessing** — shared feature engineering (lags, holidays,\n",
+    "   cyclic features, scaling) applied once to raw data.\n",
+    "2. **Base forecasters** — multiple models each trained on the preprocessed\n",
+    "   data, with optional per-model transforms (e.g. GBLinear gets fewer lags\n",
+    "   to avoid collinearity).\n",
+    "3. **Combiner** — learns to aggregate base forecaster outputs.  Two modes:\n",
+    "   - *Learned weights*: a classifier predicts which base model will perform\n",
+    "     best for each sample, then weights predictions accordingly.\n",
+    "   - *Stacking*: a meta-regressor trained on base model outputs per quantile."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3725ade6",
+   "metadata": {},
+   "source": [
+    "## Load the dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9f5936af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "\n",
+    "dataset = load_liander_dataset()\n",
+    "\n",
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "print(f\"Training:  {train_dataset.data.shape[0]:,} rows\")\n",
+    "print(f\"Predict:   {predict_dataset.data.shape[0]:,} rows\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ac238dc8",
+   "metadata": {},
+   "source": [
+    "## Configure the ensemble\n",
+    "\n",
+    "[`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) sets up the full pipeline.\n",
+    "Key parameters:\n",
+    "\n",
+    "- `base_models` — which forecasters to include\n",
+    "- `ensemble_type` — how to combine them (`\"learned_weights\"` or `\"stacking\"`)\n",
+    "- `combiner_model` — the algorithm used by the combiner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "791aa085",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_meta.presets import EnsembleForecastingWorkflowConfig, create_ensemble_forecasting_workflow\n",
+    "\n",
+    "ensemble_config = EnsembleForecastingWorkflowConfig(\n",
+    "    model_id=\"ensemble_demo\",\n",
+    "    # Ensemble architecture\n",
+    "    ensemble_type=\"learned_weights\",\n",
+    "    base_models=[\"lgbm\", \"gblinear\"],\n",
+    "    combiner_model=\"lgbm\",\n",
+    "    # Forecast settings\n",
+    "    horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "    quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
+    "    # Data columns\n",
+    "    target_column=\"load\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    wind_speed_column=\"wind_speed_10m\",\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    # Disable MLFlow for tutorial\n",
+    "    mlflow_storage=None,\n",
+    ")\n",
+    "\n",
+    "print(f\"Base models:    {list(ensemble_config.base_models)}\")\n",
+    "print(f\"Ensemble type:  {ensemble_config.ensemble_type}\")\n",
+    "print(f\"Combiner:       {ensemble_config.combiner_model}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "52685e3b",
+   "metadata": {},
+   "source": [
+    "## Create and train the ensemble workflow"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "355268ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "workflow = create_ensemble_forecasting_workflow(ensemble_config)\n",
+    "fit_result = workflow.fit(train_dataset)\n",
+    "\n",
+    "print(\"Ensemble trained successfully\")\n",
+    "print(\"\\nPer-model validation R2:\")\n",
+    "for name, child_result in fit_result.component_fit_results.items():\n",
+    "    r2 = child_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n",
+    "    print(f\"  {name:12s}: {r2:.4f}\")\n",
+    "\n",
+    "# Get combiner (ensemble-level) R2\n",
+    "ensemble_r2 = fit_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n",
+    "print(f\"  {'ensemble':12s}: {ensemble_r2:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ab6bba0",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert ensemble_r2 is not None and ensemble_r2 > 0.0, f\"Expected positive R2, got {ensemble_r2}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08ab509c",
+   "metadata": {},
+   "source": [
+    "## Generate forecasts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cf50a50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "forecast = workflow.predict(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "print(f\"Forecast rows: {len(forecast.data):,}\")\n",
+    "print(f\"Quantiles: {[float(q) for q in forecast.quantiles]}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb1f3f30",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a52947e",
+   "metadata": {},
+   "source": [
+    "## Compare: ensemble vs individual base models\n",
+    "\n",
+    "To show the benefit of ensembling, let's also train each base model\n",
+    "individually and compare their forecasts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d47ceef4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
+    "\n",
+    "individual_forecasts = {}\n",
+    "for model_type in [\"lgbm\", \"gblinear\"]:\n",
+    "    config = ForecastingWorkflowConfig(\n",
+    "        model_id=f\"{model_type}_solo\",\n",
+    "        model=model_type,\n",
+    "        horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "        quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
+    "        target_column=\"load\",\n",
+    "        temperature_column=\"temperature_2m\",\n",
+    "        relative_humidity_column=\"relative_humidity_2m\",\n",
+    "        wind_speed_column=\"wind_speed_10m\",\n",
+    "        radiation_column=\"shortwave_radiation\",\n",
+    "        pressure_column=\"surface_pressure\",\n",
+    "        mlflow_storage=None,\n",
+    "        verbosity=0,\n",
+    "    )\n",
+    "    wf = create_forecasting_workflow(config)\n",
+    "    wf.fit(train_dataset)\n",
+    "    individual_forecasts[model_type] = wf.predict(predict_dataset, forecast_start=train_end)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "002117a5",
+   "metadata": {},
+   "source": [
+    "## Visualize the comparison"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cfea55d",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
+    "\n",
+    "plotter = ForecastTimeSeriesPlotter()\n",
+    "plotter.add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n",
+    "\n",
+    "# Add individual models\n",
+    "for name, fc in individual_forecasts.items():\n",
+    "    plotter.add_model(model_name=name, forecast=fc.median_series, quantiles=fc.quantiles_data)\n",
+    "\n",
+    "# Add ensemble\n",
+    "plotter.add_model(model_name=\"Ensemble\", forecast=forecast.median_series, quantiles=forecast.quantiles_data)\n",
+    "\n",
+    "fig = plotter.plot()\n",
+    "fig.update_layout(\n",
+    "    title=\"Ensemble vs Individual Models\",\n",
+    "    xaxis_title=\"Time\",\n",
+    "    yaxis_title=\"MW\",\n",
+    "    height=450,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8f632fa8",
+   "metadata": {},
+   "source": [
+    "## Combiner insights\n",
+    "\n",
+    "The learned-weights combiner trains a classifier that predicts — for each\n",
+    "timestep — which base model will be most accurate.  It then uses those\n",
+    "predicted probabilities as mixing weights.\n",
+    "\n",
+    "We can inspect this behavior at two levels:\n",
+    "\n",
+    "1. **Global feature importances** — which input signals the classifier\n",
+    "   relies on most when deciding between models.\n",
+    "2. **Per-timestamp selection weights** — the actual mixing probabilities\n",
+    "   assigned to each model during forecasting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d2093d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "ensemble_model = workflow.model\n",
+    "combiner = ensemble_model.combiner\n",
+    "\n",
+    "# Global importances: shows how much the classifier attends to each base\n",
+    "# model's prediction value when deciding which model to trust.\n",
+    "importances = combiner.feature_importances\n",
+    "print(\"Combiner feature importances (per quantile):\")\n",
+    "print(importances.to_string())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0f83593",
+   "metadata": {},
+   "source": [
+    "With two base models, the feature importance tells us how much the\n",
+    "combiner's internal classifier *uses* each model's prediction to decide\n",
+    "who should contribute more.  A higher importance means the classifier pays\n",
+    "more attention to that model's output when making the selection decision.\n",
+    "\n",
+    "More informative is the **per-timestamp weight** — the actual probability\n",
+    "the combiner assigns to each model at each point in time during forecasting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cde7d87",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Reproduce the internal flow to extract per-timestamp weights\n",
+    "ensemble_dataset = ensemble_model._predict_forecasters(predict_dataset, forecast_start=train_end)\n",
+    "base_preds = ensemble_dataset.get_base_predictions_for_quantile(Q(0.5))\n",
+    "input_data = base_preds.input_data()\n",
+    "weights = combiner._predict_weights(input_data, Q(0.5))\n",
+    "\n",
+    "fig = go.Figure()\n",
+    "for col in weights.columns:\n",
+    "    fig.add_trace(go.Scatter(x=weights.index, y=weights[col], mode=\"lines\", name=col, stackgroup=\"one\"))\n",
+    "fig.update_layout(\n",
+    "    title=\"Combiner Model Selection Weights Over Time (q50)\",\n",
+    "    xaxis_title=\"Time\",\n",
+    "    yaxis_title=\"Weight (probability)\",\n",
+    "    yaxis_range=[0, 1],\n",
+    "    height=350,\n",
+    "    legend_title=\"Base model\",\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2c8cb8de",
+   "metadata": {},
+   "source": [
+    "The stacked area chart reveals *when* the combiner trusts each model.\n",
+    "Typical patterns:\n",
+    "\n",
+    "- **gblinear dominates at peaks/troughs** — its linear extrapolation\n",
+    "  handles values near or beyond the training range better.\n",
+    "- **lgbm dominates during stable periods** — its tree-based flexibility\n",
+    "  captures non-linear patterns (time-of-day effects, weather interactions)\n",
+    "  more accurately when extrapolation is not needed.\n",
+    "\n",
+    "This adaptive selection is the core advantage of ensembling: neither model\n",
+    "alone achieves the accuracy of the dynamically-weighted combination."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0a012679",
+   "metadata": {},
+   "source": [
+    "## Metrics comparison\n",
+    "\n",
+    "Let's quantify the ensemble advantage with relative MAE (rMAE) on the\n",
+    "forecast period.  rMAE normalizes the MAE by the range of actuals, making it\n",
+    "easier to compare across datasets with different scales.  We use the\n",
+    "implementation from `openstef_beam.metrics`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b327d1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from openstef_beam.metrics import rmae\n",
+    "\n",
+    "actuals = predict_dataset.data[\"load\"].loc[train_end:forecast_end]\n",
+    "\n",
+    "models = {\"lgbm\": individual_forecasts[\"lgbm\"], \"gblinear\": individual_forecasts[\"gblinear\"], \"Ensemble\": forecast}\n",
+    "\n",
+    "print(f\"{'Model':<12} {'rMAE':>8}\")\n",
+    "print(f\"{'':-<12} {'':-^8}\")\n",
+    "for name, fc in models.items():\n",
+    "    common = actuals.index.intersection(fc.median_series.index)\n",
+    "    print(f\"{name:<12} {rmae(actuals.loc[common].to_numpy(), fc.median_series.loc[common].to_numpy()):>8.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "afc3f9ca",
+   "metadata": {},
+   "source": [
+    "The ensemble typically achieves the lowest rMAE by combining the\n",
+    "strengths of both models.  In production with longer training windows and\n",
+    "more diverse base models (e.g. adding XGBoost or a neural forecaster),\n",
+    "the improvement typically grows larger.  To validate ensemble gains over\n",
+    "longer periods, run a full {doc}`backtesting_quickstart`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc0bae78",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`hyperparameter_tuning_with_optuna` — tune each base model's\n",
+    "  parameters before combining them.\n",
+    "- {doc}`quantile_calibration` — calibrate the ensemble's uncertainty\n",
+    "  estimates for more reliable confidence intervals."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/ensemble_forecasting.py b/examples/tutorials/ensemble_forecasting.py
new file mode 100644
index 000000000..065003205
--- /dev/null
+++ b/examples/tutorials/ensemble_forecasting.py
@@ -0,0 +1,349 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% tags=["remove-cell"]
+import warnings
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+        "lightgbm",
+    ),
+)
+
+# %% [markdown]
+# # Ensemble Forecasting
+#
+# OpenSTEF supports ensemble models that combine multiple base forecasters
+# into a single prediction.  A **combiner** learns which base model performs
+# best under different conditions and weights their outputs accordingly.
+#
+# **What you'll learn:**
+#
+# - Why combining tree-based and linear models improves forecasts
+# - How to configure and train an ensemble with [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)
+# - How to inspect combiner behavior (which model does it prefer?)
+# - How ensemble predictions compare to individual base models
+#
+# ```{note}
+# This tutorial uses a small dataset for fast execution.
+# See `examples/benchmarks/` for production-scale ensemble runs.
+# ```
+#
+# **Key API references:**
+# [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)
+# · [`create_ensemble_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.create_ensemble_forecasting_workflow.html)
+# · [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)
+
+# %% [markdown]
+# ## Why ensemble forecasting?
+#
+# Different model types have complementary strengths:
+#
+# | Model | Strengths | Weaknesses |
+# |-------|-----------|------------|
+# | **Tree-based** (LightGBM, XGBoost) | Captures complex non-linear patterns, handles feature interactions well | Poor extrapolation — struggles with unseen peaks, seasonal shifts, or values outside training range |
+# | **Linear** (GBLinear) | Extrapolates naturally to new ranges, captures seasonal/solar trends | Cannot model non-linear interactions |
+#
+# In energy forecasting, load peaks during extreme weather or seasonal
+# transitions often fall outside the training distribution.  A tree-based
+# model underestimates these peaks while a linear model captures the trend
+# but misses finer patterns.  An **ensemble** combines both: the combiner
+# learns *when* each model is more reliable and weights accordingly.
+#
+# ## How it works
+#
+# An ensemble workflow has three layers:
+#
+# 1. **Common preprocessing** — shared feature engineering (lags, holidays,
+#    cyclic features, scaling) applied once to raw data.
+# 2. **Base forecasters** — multiple models each trained on the preprocessed
+#    data, with optional per-model transforms (e.g. GBLinear gets fewer lags
+#    to avoid collinearity).
+# 3. **Combiner** — learns to aggregate base forecaster outputs.  Two modes:
+#    - *Learned weights*: a classifier predicts which base model will perform
+#      best for each sample, then weights predictions accordingly.
+#    - *Stacking*: a meta-regressor trained on base model outputs per quantile.
+
+# %% [markdown]
+# ## Load the dataset
+
+# %%
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
+
+dataset = load_liander_dataset()
+
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+print(f"Training:  {train_dataset.data.shape[0]:,} rows")
+print(f"Predict:   {predict_dataset.data.shape[0]:,} rows")
+
+# %% [markdown]
+# ## Configure the ensemble
+#
+# [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) sets up the full pipeline.
+# Key parameters:
+#
+# - `base_models` — which forecasters to include
+# - `ensemble_type` — how to combine them (`"learned_weights"` or `"stacking"`)
+# - `combiner_model` — the algorithm used by the combiner
+
+# %%
+from openstef_meta.presets import EnsembleForecastingWorkflowConfig, create_ensemble_forecasting_workflow
+
+ensemble_config = EnsembleForecastingWorkflowConfig(
+    model_id="ensemble_demo",
+    # Ensemble architecture
+    ensemble_type="learned_weights",
+    base_models=["lgbm", "gblinear"],
+    combiner_model="lgbm",
+    # Forecast settings
+    horizons=[LeadTime.from_string("PT36H")],
+    quantiles=[Q(0.5), Q(0.1), Q(0.9)],
+    # Data columns
+    target_column="load",
+    temperature_column="temperature_2m",
+    relative_humidity_column="relative_humidity_2m",
+    wind_speed_column="wind_speed_10m",
+    radiation_column="shortwave_radiation",
+    pressure_column="surface_pressure",
+    # Disable MLFlow for tutorial
+    mlflow_storage=None,
+)
+
+print(f"Base models:    {list(ensemble_config.base_models)}")
+print(f"Ensemble type:  {ensemble_config.ensemble_type}")
+print(f"Combiner:       {ensemble_config.combiner_model}")
+
+# %% [markdown]
+# ## Create and train the ensemble workflow
+
+# %%
+workflow = create_ensemble_forecasting_workflow(ensemble_config)
+fit_result = workflow.fit(train_dataset)
+
+print("Ensemble trained successfully")
+print("\nPer-model validation R2:")
+for name, child_result in fit_result.component_fit_results.items():
+    r2 = child_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2")
+    print(f"  {name:12s}: {r2:.4f}")
+
+# Get combiner (ensemble-level) R2
+ensemble_r2 = fit_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2")
+print(f"  {'ensemble':12s}: {ensemble_r2:.4f}")
+
+# %% tags=["remove-cell"]
+assert ensemble_r2 is not None and ensemble_r2 > 0.0, f"Expected positive R2, got {ensemble_r2}"
+
+# %% [markdown]
+# ## Generate forecasts
+
+# %%
+forecast = workflow.predict(predict_dataset, forecast_start=train_end)
+
+print(f"Forecast rows: {len(forecast.data):,}")
+print(f"Quantiles: {[float(q) for q in forecast.quantiles]}")
+
+# %% tags=["remove-cell"]
+assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}"
+
+# %% [markdown]
+# ## Compare: ensemble vs individual base models
+#
+# To show the benefit of ensembling, let's also train each base model
+# individually and compare their forecasts.
+
+# %%
+from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
+
+individual_forecasts = {}
+for model_type in ["lgbm", "gblinear"]:
+    config = ForecastingWorkflowConfig(
+        model_id=f"{model_type}_solo",
+        model=model_type,
+        horizons=[LeadTime.from_string("PT36H")],
+        quantiles=[Q(0.5), Q(0.1), Q(0.9)],
+        target_column="load",
+        temperature_column="temperature_2m",
+        relative_humidity_column="relative_humidity_2m",
+        wind_speed_column="wind_speed_10m",
+        radiation_column="shortwave_radiation",
+        pressure_column="surface_pressure",
+        mlflow_storage=None,
+        verbosity=0,
+    )
+    wf = create_forecasting_workflow(config)
+    wf.fit(train_dataset)
+    individual_forecasts[model_type] = wf.predict(predict_dataset, forecast_start=train_end)
+
+# %% [markdown]
+# ## Visualize the comparison
+
+# %% tags=["hide-input"]
+from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
+
+plotter = ForecastTimeSeriesPlotter()
+plotter.add_measurements(measurements=predict_dataset.data["load"].loc[train_end:])
+
+# Add individual models
+for name, fc in individual_forecasts.items():
+    plotter.add_model(model_name=name, forecast=fc.median_series, quantiles=fc.quantiles_data)
+
+# Add ensemble
+plotter.add_model(model_name="Ensemble", forecast=forecast.median_series, quantiles=forecast.quantiles_data)
+
+fig = plotter.plot()
+fig.update_layout(
+    title="Ensemble vs Individual Models",
+    xaxis_title="Time",
+    yaxis_title="MW",
+    height=450,
+)
+fig.show()
+
+# %% [markdown]
+# ## Combiner insights
+#
+# The learned-weights combiner trains a classifier that predicts — for each
+# timestep — which base model will be most accurate.  It then uses those
+# predicted probabilities as mixing weights.
+#
+# We can inspect this behavior at two levels:
+#
+# 1. **Global feature importances** — which input signals the classifier
+#    relies on most when deciding between models.
+# 2. **Per-timestamp selection weights** — the actual mixing probabilities
+#    assigned to each model during forecasting.
+
+# %%
+import plotly.graph_objects as go
+
+ensemble_model = workflow.model
+combiner = ensemble_model.combiner
+
+# Global importances: shows how much the classifier attends to each base
+# model's prediction value when deciding which model to trust.
+importances = combiner.feature_importances
+print("Combiner feature importances (per quantile):")
+print(importances.to_string())
+
+# %% [markdown]
+# With two base models, the feature importance tells us how much the
+# combiner's internal classifier *uses* each model's prediction to decide
+# who should contribute more.  A higher importance means the classifier pays
+# more attention to that model's output when making the selection decision.
+#
+# More informative is the **per-timestamp weight** — the actual probability
+# the combiner assigns to each model at each point in time during forecasting.
+
+# %% tags=["hide-input"]
+# Reproduce the internal flow to extract per-timestamp weights
+ensemble_dataset = ensemble_model._predict_forecasters(predict_dataset, forecast_start=train_end)
+base_preds = ensemble_dataset.get_base_predictions_for_quantile(Q(0.5))
+input_data = base_preds.input_data()
+weights = combiner._predict_weights(input_data, Q(0.5))
+
+fig = go.Figure()
+for col in weights.columns:
+    fig.add_trace(go.Scatter(x=weights.index, y=weights[col], mode="lines", name=col, stackgroup="one"))
+fig.update_layout(
+    title="Combiner Model Selection Weights Over Time (q50)",
+    xaxis_title="Time",
+    yaxis_title="Weight (probability)",
+    yaxis_range=[0, 1],
+    height=350,
+    legend_title="Base model",
+)
+fig.show()
+
+# %% [markdown]
+# The stacked area chart reveals *when* the combiner trusts each model.
+# Typical patterns:
+#
+# - **gblinear dominates at peaks/troughs** — its linear extrapolation
+#   handles values near or beyond the training range better.
+# - **lgbm dominates during stable periods** — its tree-based flexibility
+#   captures non-linear patterns (time-of-day effects, weather interactions)
+#   more accurately when extrapolation is not needed.
+#
+# This adaptive selection is the core advantage of ensembling: neither model
+# alone achieves the accuracy of the dynamically-weighted combination.
+
+# %% [markdown]
+# ## Metrics comparison
+#
+# Let's quantify the ensemble advantage with relative MAE (rMAE) on the
+# forecast period.  rMAE normalizes the MAE by the range of actuals, making it
+# easier to compare across datasets with different scales.  We use the
+# implementation from `openstef_beam.metrics`.
+
+# %%
+
+from openstef_beam.metrics import rmae
+
+actuals = predict_dataset.data["load"].loc[train_end:forecast_end]
+
+models = {"lgbm": individual_forecasts["lgbm"], "gblinear": individual_forecasts["gblinear"], "Ensemble": forecast}
+
+print(f"{'Model':<12} {'rMAE':>8}")
+print(f"{'':-<12} {'':-^8}")
+for name, fc in models.items():
+    common = actuals.index.intersection(fc.median_series.index)
+    print(f"{name:<12} {rmae(actuals.loc[common].to_numpy(), fc.median_series.loc[common].to_numpy()):>8.4f}")
+
+# %% [markdown]
+# The ensemble usually achieves the lowest rMAE by combining the
+# strengths of both models.  In production with longer training windows and
+# more diverse base models (e.g. adding XGBoost or a neural forecaster),
+# the improvement typically grows larger.  To validate ensemble gains over
+# longer periods, run a full {doc}`backtesting_quickstart`.
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`hyperparameter_tuning_with_optuna` — tune each base model's
+#   parameters before combining them.
+# - {doc}`quantile_calibration` — calibrate the ensemble's uncertainty
+#   estimates for more reliable confidence intervals.
diff --git a/examples/tutorials/forecasting_quickstart.ipynb b/examples/tutorials/forecasting_quickstart.ipynb
new file mode 100644
index 000000000..c9379ae3e
--- /dev/null
+++ b/examples/tutorials/forecasting_quickstart.ipynb
@@ -0,0 +1,359 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bff4101e",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eb73d789",
+   "metadata": {},
+   "source": [
+    "# Forecasting Quickstart\n",
+    "\n",
+    "Train a GBLinear model on real energy data and generate probabilistic forecasts\n",
+    "with confidence intervals — all in under a minute.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Load the Liander 2024 benchmark dataset\n",
+    "- Configure a forecasting workflow with `ForecastingWorkflowConfig`\n",
+    "- Train a model and inspect evaluation metrics\n",
+    "- Generate quantile forecasts (P10 / P50 / P90)\n",
+    "- Visualize predictions against actuals\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial uses a small data slice for fast execution.\n",
+    "See `examples/benchmarks/` for production-scale runs.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)\n",
+    "· [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html)\n",
+    "· [`LeadTime`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.LeadTime.html)\n",
+    "· [`Q`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ebcdecfb",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "from typing import Any, cast\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6a1838eb",
+   "metadata": {},
+   "source": [
+    "## Load the dataset\n",
+    "\n",
+    "The [Liander 2024 benchmark](https://huggingface.co/datasets/Alliander/MSL_Benchmark_Dataset)\n",
+    "dataset contains load measurements, versioned weather forecasts, EPEX prices, and\n",
+    "load profiles for a medium-voltage feeder in the Netherlands.\n",
+    "\n",
+    "We split the data into:\n",
+    "\n",
+    "- **45 days** of training data\n",
+    "- **7 days** for forecasting\n",
+    "\n",
+    "The predict window includes **14 days of history** before the forecast start so\n",
+    "that lag features (e.g. `load_lag_P7D`) can be computed during prediction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac530157",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "\n",
+    "dataset = load_liander_dataset()\n",
+    "\n",
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "\n",
+    "# Include 14 days of history before forecast start for lag feature computation\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "print(\n",
+    "    f\"Training:  {train_dataset.data.shape[0]:,} rows, \"\n",
+    "    f\"{train_dataset.data.index.min():%Y-%m-%d} to {train_dataset.data.index.max():%Y-%m-%d}\"\n",
+    ")\n",
+    "print(\n",
+    "    f\"Predict:   {predict_dataset.data.shape[0]:,} rows, \"\n",
+    "    f\"{predict_dataset.data.index.min():%Y-%m-%d} to {predict_dataset.data.index.max():%Y-%m-%d}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "810b4ff7",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Quick look at the target variable\n",
+    "fig = cast(Any, train_dataset.data[[\"load\"]].plot(title=\"Training period — energy load\"))\n",
+    "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d890079d",
+   "metadata": {},
+   "source": [
+    "## Configure the workflow\n",
+    "\n",
+    "[`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) bundles all settings — model type, horizons, quantiles,\n",
+    "and feature columns — into a single object.  [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) turns\n",
+    "it into a ready-to-use pipeline with preprocessing, training, and postprocessing.\n",
+    "\n",
+    "We pick **GBLinear** (gradient-boosted linear model) for its speed and\n",
+    "ability to extrapolate beyond training data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79ad62bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
+    "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n",
+    "\n",
+    "workflow = create_forecasting_workflow(\n",
+    "    config=ForecastingWorkflowConfig(\n",
+    "        model_id=\"quickstart_gblinear\",\n",
+    "        model=\"gblinear\",\n",
+    "        horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "        quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
+    "        target_column=\"load\",\n",
+    "        # Weather features available in the Liander dataset\n",
+    "        temperature_column=\"temperature_2m\",\n",
+    "        relative_humidity_column=\"relative_humidity_2m\",\n",
+    "        wind_speed_column=\"wind_speed_10m\",\n",
+    "        radiation_column=\"shortwave_radiation\",\n",
+    "        pressure_column=\"surface_pressure\",\n",
+    "        verbosity=0,\n",
+    "        mlflow_storage=None,\n",
+    "        gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n",
+    "    )\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c3d0a581",
+   "metadata": {},
+   "source": [
+    "## Train the model\n",
+    "\n",
+    "`workflow.fit()` runs the full pipeline: feature engineering, data validation,\n",
+    "model training, and evaluation on a held-out test split."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b7f49e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result = workflow.fit(train_dataset)\n",
+    "\n",
+    "if result is not None:\n",
+    "    print(\"Training metrics:\")\n",
+    "    print(result.metrics_full.to_dataframe())\n",
+    "\n",
+    "    if result.metrics_test is not None:\n",
+    "        print(\"\\nTest-set metrics:\")\n",
+    "        print(result.metrics_test.to_dataframe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4f6ea9e",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert result is not None, \"Training should produce a result\"\n",
+    "assert result.metrics_full is not None, \"Full metrics should be present\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d1d35a3d",
+   "metadata": {},
+   "source": [
+    "## Generate forecasts\n",
+    "\n",
+    "The trained workflow produces a [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) with point predictions and\n",
+    "quantile bands.  The P10-P90 interval covers 80 % of expected outcomes.\n",
+    "To improve the reliability of these quantile estimates, see\n",
+    "{doc}`quantile_calibration`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "476f2918",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_core.datasets import ForecastDataset\n",
+    "\n",
+    "forecast: ForecastDataset = workflow.predict(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "print(f\"Forecast rows: {len(forecast.data)}\")\n",
+    "print(f\"Quantiles:     {forecast.quantiles}\")\n",
+    "forecast.data.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d892621d",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\"\n",
+    "assert forecast.quantiles is not None, \"Quantile data should be present\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33f6f4dd",
+   "metadata": {},
+   "source": [
+    "## Visualize the results\n",
+    "\n",
+    "[`ForecastTimeSeriesPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.analysis.plots.ForecastTimeSeriesPlotter.html) overlays measurements and predictions with shaded\n",
+    "confidence bands in a single interactive chart."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6cc34a1",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
+    "\n",
+    "fig = (\n",
+    "    ForecastTimeSeriesPlotter()\n",
+    "    .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n",
+    "    .add_model(\n",
+    "        model_name=\"GBLinear\",\n",
+    "        forecast=forecast.median_series,\n",
+    "        quantiles=forecast.quantiles_data,\n",
+    "    )\n",
+    "    .plot()\n",
+    ")\n",
+    "fig.update_layout(\n",
+    "    title=\"Forecast vs actuals\",\n",
+    "    yaxis_title=\"Load (MW)\",\n",
+    "    xaxis_title=\"Time\",\n",
+    "    height=500,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "96fe385b",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`backtesting_quickstart` — evaluate how this model performs on\n",
+    "  historical data with realistic temporal constraints.\n",
+    "- {doc}`custom_pipeline` — build a model from individual transforms when\n",
+    "  presets don't cover your use case."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/forecasting_quickstart.py b/examples/tutorials/forecasting_quickstart.py
new file mode 100644
index 000000000..676657948
--- /dev/null
+++ b/examples/tutorials/forecasting_quickstart.py
@@ -0,0 +1,228 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% [markdown]
+# # Forecasting Quickstart
+#
+# Train a GBLinear model on real energy data and generate probabilistic forecasts
+# with confidence intervals — all in under a minute.
+#
+# **What you'll learn:**
+#
+# - Load the Liander 2024 benchmark dataset
+# - Configure a forecasting workflow with `ForecastingWorkflowConfig`
+# - Train a model and inspect evaluation metrics
+# - Generate quantile forecasts (P10 / P50 / P90)
+# - Visualize predictions against actuals
+#
+# ```{note}
+# This tutorial uses a small data slice for fast execution.
+# See `examples/benchmarks/` for production-scale runs.
+# ```
+#
+# **Key API references:**
+# [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)
+# · [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html)
+# · [`LeadTime`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.LeadTime.html)
+# · [`Q`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html)
+
+# %% tags=["remove-cell"]
+import warnings
+from typing import Any, cast
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+    ),
+)
+
+# %% [markdown]
+# ## Load the dataset
+#
+# The [Liander 2024 benchmark](https://huggingface.co/datasets/Alliander/MSL_Benchmark_Dataset)
+# dataset contains load measurements, versioned weather forecasts, EPEX prices, and
+# load profiles for a medium-voltage feeder in the Netherlands.
+#
+# We split the data into:
+#
+# - **45 days** of training data
+# - **7 days** for forecasting
+#
+# The predict window includes **14 days of history** before the forecast start so
+# that lag features (e.g. `load_lag_P7D`) can be computed during prediction.
+
+# %%
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+
+dataset = load_liander_dataset()
+
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+
+# Include 14 days of history before forecast start for lag feature computation
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+print(
+    f"Training:  {train_dataset.data.shape[0]:,} rows, "
+    f"{train_dataset.data.index.min():%Y-%m-%d} to {train_dataset.data.index.max():%Y-%m-%d}"
+)
+print(
+    f"Predict:   {predict_dataset.data.shape[0]:,} rows, "
+    f"{predict_dataset.data.index.min():%Y-%m-%d} to {predict_dataset.data.index.max():%Y-%m-%d}"
+)
+
+# %% tags=["hide-input"]
+# Quick look at the target variable
+fig = cast(Any, train_dataset.data[["load"]].plot(title="Training period — energy load"))
+fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time")
+fig.show()
+
+# %% [markdown]
+# ## Configure the workflow
+#
+# [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) bundles all settings — model type, horizons, quantiles,
+# and feature columns — into a single object.  [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) turns
+# it into a ready-to-use pipeline with preprocessing, training, and postprocessing.
+#
+# We pick **GBLinear** (gradient-boosted linear model) for its speed and
+# ability to extrapolate beyond training data.
+
+# %%
+from openstef_core.types import LeadTime, Q
+from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
+from openstef_models.presets.forecasting_workflow import GBLinearForecaster
+
+workflow = create_forecasting_workflow(
+    config=ForecastingWorkflowConfig(
+        model_id="quickstart_gblinear",
+        model="gblinear",
+        horizons=[LeadTime.from_string("PT36H")],
+        quantiles=[Q(0.5), Q(0.1), Q(0.9)],
+        target_column="load",
+        # Weather features available in the Liander dataset
+        temperature_column="temperature_2m",
+        relative_humidity_column="relative_humidity_2m",
+        wind_speed_column="wind_speed_10m",
+        radiation_column="shortwave_radiation",
+        pressure_column="surface_pressure",
+        verbosity=0,
+        mlflow_storage=None,
+        gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),
+    )
+)
+
+# %% [markdown]
+# ## Train the model
+#
+# `workflow.fit()` runs the full pipeline: feature engineering, data validation,
+# model training, and evaluation on a held-out test split.
+
+# %%
+result = workflow.fit(train_dataset)
+
+if result is not None:
+    print("Training metrics:")
+    print(result.metrics_full.to_dataframe())
+
+    if result.metrics_test is not None:
+        print("\nTest-set metrics:")
+        print(result.metrics_test.to_dataframe())
+
+# %% tags=["remove-cell"]
+assert result is not None, "Training should produce a result"
+assert result.metrics_full is not None, "Full metrics should be present"
+
+# %% [markdown]
+# ## Generate forecasts
+#
+# The trained workflow produces a [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) with point predictions and
+# quantile bands.  The P10-P90 interval covers 80 % of expected outcomes.
+# To improve the reliability of these quantile estimates, see
+# {doc}`quantile_calibration`.
+
+# %%
+from openstef_core.datasets import ForecastDataset
+
+forecast: ForecastDataset = workflow.predict(predict_dataset, forecast_start=train_end)
+
+print(f"Forecast rows: {len(forecast.data)}")
+print(f"Quantiles:     {forecast.quantiles}")
+forecast.data.tail()
+
+# %% tags=["remove-cell"]
+assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}"
+assert forecast.quantiles is not None, "Quantile data should be present"
+
+# %% [markdown]
+# ## Visualize the results
+#
+# [`ForecastTimeSeriesPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.analysis.plots.ForecastTimeSeriesPlotter.html) overlays measurements and predictions with shaded
+# confidence bands in a single interactive chart.
+
+# %% tags=["hide-input"]
+from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
+
+fig = (
+    ForecastTimeSeriesPlotter()
+    .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:])
+    .add_model(
+        model_name="GBLinear",
+        forecast=forecast.median_series,
+        quantiles=forecast.quantiles_data,
+    )
+    .plot()
+)
+fig.update_layout(
+    title="Forecast vs actuals",
+    yaxis_title="Load (MW)",
+    xaxis_title="Time",
+    height=500,
+)
+fig.show()
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`backtesting_quickstart` — evaluate how this model performs on
+#   historical data with realistic temporal constraints.
+# - {doc}`custom_pipeline` — build a model from individual transforms when
+#   presets don't cover your use case.
diff --git a/examples/tutorials/forecasting_with_workflow_presets.ipynb b/examples/tutorials/forecasting_with_workflow_presets.ipynb
deleted file mode 100644
index dd0910ee4..000000000
--- a/examples/tutorials/forecasting_with_workflow_presets.ipynb
+++ /dev/null
@@ -1,434 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e8eae4f7",
-   "metadata": {
-    "tags": [
-     "remove-cell"
-    ]
-   },
-   "outputs": [],
-   "source": [
-    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
-    "#\n",
-    "# SPDX-License-Identifier: MPL-2.0\n",
-    "\n",
-    "# pyright: basic"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "20e1bef8",
-   "metadata": {},
-   "source": [
-    "# 🔮 Forecasting with OpenSTEF 4.0 Workflow Presets\n",
-    "\n",
-    "This tutorial demonstrates how to use **OpenSTEF 4.0** to create energy load forecasts\n",
-    "using the **Workflow Presets** pattern. You'll learn how to:\n",
-    "\n",
-    "1. **Load real-world energy data** from the Liander 2024 benchmark dataset\n",
-    "2. **Configure a forecasting workflow** with weather features and prediction quantiles\n",
-    "3. **Train a model** and inspect its performance\n",
-    "4. **Generate probabilistic forecasts** with confidence intervals\n",
-    "5. **Visualize results** and explain feature importance\n",
-    "\n",
-    "> **OpenSTEF** (Short-Term Energy Forecasting) is a modular library for creating\n",
-    "> accurate energy forecasts in the power grid domain."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a6b72dc5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# --- Setup: Logging and Display Configuration ---\n",
-    "from typing import Any, cast\n",
-    "\n",
-    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
-    "\n",
-    "configure_notebook_display()\n",
-    "logger = setup_notebook_logging(__name__)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fdddf593",
-   "metadata": {},
-   "source": [
-    "## 📦 Step 1: Download the Dataset\n",
-    "\n",
-    "We'll use the **Liander 2024 Energy Forecasting Benchmark** dataset from HuggingFace Hub. This dataset contains:\n",
-    "- **Load measurements** — historical energy consumption from various installations (mv feeders, transformers, etc.)\n",
-    "- **Weather forecasts** — versioned weather predictions (temperature, radiation, wind, etc.)\n",
-    "- **EPEX prices** — day-ahead electricity market prices\n",
-    "- **Profiles** — typical daily/weekly load patterns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3a97b4a1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.\n",
-    "from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets\n",
-    "\n",
-    "dataset = load_liander_dataset()\n",
-    "\n",
-    "print(f\"Dataset shape: {dataset.data.shape}\")\n",
-    "print(f\"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}\")\n",
-    "dataset.data.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "c1f4660d",
-   "metadata": {},
-   "source": [
-    "## ✂️ Step 3: Split Data into Training and Forecast Periods\n",
-    "\n",
-    "We'll use:\n",
-    "- **90 days** of historical data for training\n",
-    "- **14 days** as the forecast period (where we'll generate predictions)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "95f4a401",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Split the dataset into training (90 days) and forecast (14 days) periods.\n",
-    "train_dataset, forecast_dataset = prepare_tutorial_datasets()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "162c0fc2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize the training data\n",
-    "# The plot shows the 'load' column (energy consumption in MW) over time\n",
-    "# cast() needed: pandas returns plotly Figure at runtime (backend=\"plotly\") but typed as Axes\n",
-    "fig = cast(Any, train_dataset.data[[\"load\"]].plot(title=\"Training Data: Energy Load over Time\"))\n",
-    "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\")\n",
-    "fig.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6c64db35",
-   "metadata": {},
-   "source": [
-    "## ⚙️ Step 4: Configure the Forecasting Workflow\n",
-    "\n",
-    "OpenSTEF uses a **ForecastingWorkflowConfig** to define all aspects of the forecasting pipeline:\n",
-    "- **Model type** — `gblinear` (gradient boosted linear model) or `xgboost`\n",
-    "- **Forecast horizons** — how far ahead to predict (e.g., 36 hours)\n",
-    "- **Quantiles** — prediction intervals for probabilistic forecasts\n",
-    "- **Feature columns** — which weather variables to use\n",
-    "\n",
-    "The **GBLinear** model is particularly good for energy forecasting because:\n",
-    "1. It can extrapolate beyond training data (important for rare events)\n",
-    "2. It provides interpretable feature importance\n",
-    "3. It's fast to train and predict"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5afd85eb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Import workflow components\n",
-    "from openstef_core.types import LeadTime, Q  # LeadTime: forecast horizon, Q: quantile\n",
-    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
-    "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n",
-    "\n",
-    "# Configure the forecasting workflow\n",
-    "workflow = create_forecasting_workflow(\n",
-    "    config=ForecastingWorkflowConfig(\n",
-    "        # Model identification\n",
-    "        model_id=\"gblinear_demo_v1\",\n",
-    "        model=\"gblinear\",  # Use gradient boosted linear model\n",
-    "        # Forecast settings\n",
-    "        horizons=[LeadTime.from_string(\"PT36H\")],  # Predict up to 36 hours ahead\n",
-    "        quantiles=[Q(0.5), Q(0.1), Q(0.9)],  # Median + 80% prediction interval\n",
-    "        # Target column (what we're predicting)\n",
-    "        target_column=\"load\",\n",
-    "        # Weather feature columns (from the dataset)\n",
-    "        temperature_column=\"temperature_2m\",\n",
-    "        relative_humidity_column=\"relative_humidity_2m\",\n",
-    "        wind_speed_column=\"wind_speed_10m\",\n",
-    "        radiation_column=\"shortwave_radiation\",  # Solar radiation\n",
-    "        pressure_column=\"surface_pressure\",\n",
-    "        # Training settings\n",
-    "        verbosity=1,  # Show progress during training\n",
-    "        mlflow_storage=None,  # Disable MLflow tracking for this demo\n",
-    "        # Model-specific hyperparameters\n",
-    "        gblinear_hyperparams=GBLinearForecaster.HyperParams(\n",
-    "            n_steps=50  # Number of boosting iterations\n",
-    "        ),\n",
-    "    )\n",
-    ")\n",
-    "\n",
-    "print(\"✅ Workflow configured successfully!\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "293d3e51",
-   "metadata": {},
-   "source": [
-    "## 🏋️ Step 5: Train the Model\n",
-    "\n",
-    "The workflow's `fit()` method handles the entire training pipeline:\n",
-    "1. **Preprocessing** — feature engineering, data validation, scaling\n",
-    "2. **Training** — fit the model on historical data\n",
-    "3. **Evaluation** — compute metrics on training data"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7ab71aca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Train the model on historical data\n",
-    "logger.info(\"🏋️ Starting model training...\")\n",
-    "\n",
-    "result = workflow.fit(train_dataset)\n",
-    "\n",
-    "# Display training metrics\n",
-    "if result is not None:\n",
-    "    logger.info(\"✅ Training complete!\")\n",
-    "    print(\"\\n📊 Training Evaluation Metrics:\")\n",
-    "    print(result.metrics_full.to_dataframe())\n",
-    "\n",
-    "    if result.metrics_test is not None:\n",
-    "        print(\"\\n📊 Test Set Metrics (held-out validation):\")\n",
-    "        print(result.metrics_test.to_dataframe())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "95f5f18a",
-   "metadata": {},
-   "source": [
-    "## 🔮 Step 6: Generate Forecasts\n",
-    "\n",
-    "Now we use the trained model to predict energy load for the next 14 days.\n",
-    "The output is a **ForecastDataset** containing:\n",
-    "- **Median prediction** (`quantile_P50`)\n",
-    "- **Lower bound** (`quantile_P10`) — 10th percentile\n",
-    "- **Upper bound** (`quantile_P90`) — 90th percentile"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b341466f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Generate probabilistic forecasts for the forecast period\n",
-    "from openstef_core.datasets import ForecastDataset\n",
-    "\n",
-    "logger.info(\"🔮 Generating forecasts...\")\n",
-    "forecast: ForecastDataset = workflow.predict(forecast_dataset)\n",
-    "\n",
-    "# Display forecast summary\n",
-    "print(f\"\\n📈 Forecast generated for {len(forecast.data)} timestamps\")\n",
-    "print(f\"📊 Quantiles: {forecast.quantiles}\")\n",
-    "print(\"\\n🔍 Last 5 forecast values:\")\n",
-    "print(forecast.data.tail())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "f3c5c3c6",
-   "metadata": {},
-   "source": [
-    "## 📈 Step 7: Visualize Forecast Results\n",
-    "\n",
-    "OpenSTEF-BEAM provides **ForecastTimeSeriesPlotter** for beautiful interactive visualizations:\n",
-    "- Actual measurements shown as a line\n",
-    "- Forecast median shown as another line\n",
-    "- Prediction intervals shown as shaded areas"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "17801cef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Create an interactive forecast visualization\n",
-    "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
-    "\n",
-    "fig = (\n",
-    "    ForecastTimeSeriesPlotter()\n",
-    "    # Add actual measurements (ground truth)\n",
-    "    .add_measurements(measurements=forecast_dataset.data[\"load\"])\n",
-    "    # Add model predictions with confidence bands\n",
-    "    .add_model(\n",
-    "        model_name=\"GBLinear\",\n",
-    "        forecast=forecast.median_series,  # P50 prediction\n",
-    "        quantiles=forecast.quantiles_data,  # P10-P90 confidence band\n",
-    "    )\n",
-    "    .plot()\n",
-    ")\n",
-    "\n",
-    "# Update layout for better presentation\n",
-    "fig.update_layout(\n",
-    "    title=\"🔮 Energy Load Forecast vs Actual\",\n",
-    "    yaxis_title=\"Load (MW)\",\n",
-    "    xaxis_title=\"Time\",\n",
-    "    height=500,\n",
-    ")\n",
-    "fig.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6f24ae06",
-   "metadata": {},
-   "source": [
-    "## 🔍 Step 8: Explain Feature Importance\n",
-    "\n",
-    "Understanding **why** the model makes certain predictions is crucial for trust\n",
-    "and debugging. GBLinear models provide clear feature importance rankings."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "932b62c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Visualize feature importance using the ExplainableForecaster interface\n",
-    "from typing import cast\n",
-    "\n",
-    "from openstef_models.explainability import ExplainableForecaster\n",
-    "from openstef_models.models.forecasting_model import ForecastingModel\n",
-    "\n",
-    "# The GBLinear model implements ExplainableForecaster, providing feature importance\n",
-    "forecaster = cast(ForecastingModel, workflow.model).forecaster\n",
-    "explainable_model = cast(ExplainableForecaster, forecaster)\n",
-    "\n",
-    "# Create an interactive treemap of feature importances\n",
-    "# Larger boxes = more important features\n",
-    "fig = explainable_model.plot_feature_importances()\n",
-    "fig.update_layout(title=\"🔍 Feature Importance Treemap\")\n",
-    "fig.show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "609b7509",
-   "metadata": {},
-   "source": [
-    "## 🔬 Step 9: Visualize Feature Contributions (SHAP)\n",
-    "\n",
-    "While feature importance shows **which** features matter overall, **contributions**\n",
-    "show how each feature pushed the prediction up or down for every individual timestep.\n",
-    "GBLinear models expose these as SHAP values via `predict_contributions()`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1dddfe95",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Compute per-timestep feature contributions for the forecast period\n",
-    "from openstef_models.explainability import ContributionsPlotter\n",
-    "\n",
-    "contributions = workflow.model.predict_contributions(forecast_dataset)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "539e6156",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Heatmap: contributions over time with prediction line\n",
-    "ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True).show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d68f4bbc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Waterfall: decompose a single timestep's prediction\n",
-    "ContributionsPlotter.plot_waterfall(contributions, timestep=0, top_n=10).show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "19582854",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Bar chart: mean absolute contribution per feature\n",
-    "ContributionsPlotter.plot_bar(contributions, top_n=10).show()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1f53f172",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "\n",
-    "## 🎯 Summary\n",
-    "\n",
-    "In this tutorial, you learned how to:\n",
-    "\n",
-    "1. ✅ **Load energy data** from the Liander 2024 benchmark dataset\n",
-    "2. ✅ **Configure a workflow** with `ForecastingWorkflowConfig`\n",
-    "3. ✅ **Train a GBLinear model** for probabilistic forecasting\n",
-    "4. ✅ **Generate forecasts** with confidence intervals\n",
-    "5. ✅ **Visualize results** and feature importance\n",
-    "\n",
-    "### 🚀 Next Steps\n",
-    "\n",
-    "- Try different models: `\"xgboost\"` for more complex patterns\n",
-    "- Experiment with more quantiles for narrower prediction intervals\n",
-    "- Use the **backtesting notebook** to evaluate model performance systematically\n",
-    "- Explore MLflow integration for experiment tracking"
-   ]
-  }
- ],
- "metadata": {
-  "jupytext": {
-   "formats": "ipynb,py:percent"
-  },
-  "kernelspec": {
-   "display_name": ".venv",
-   "language": "python",
-   "name": "python3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/examples/tutorials/forecasting_with_workflow_presets.py b/examples/tutorials/forecasting_with_workflow_presets.py
deleted file mode 100644
index c1d5526b3..000000000
--- a/examples/tutorials/forecasting_with_workflow_presets.py
+++ /dev/null
@@ -1,280 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: ipynb,py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.19.1
-#   kernelspec:
-#     display_name: .venv
-#     language: python
-#     name: python3
-# ---
-
-# %% tags=["remove-cell"]
-# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
-#
-# SPDX-License-Identifier: MPL-2.0
-
-# pyright: basic
-
-# %% [markdown]
-# # 🔮 Forecasting with OpenSTEF 4.0 Workflow Presets
-#
-# This tutorial demonstrates how to use **OpenSTEF 4.0** to create energy load forecasts
-# using the **Workflow Presets** pattern. You'll learn how to:
-#
-# 1. **Load real-world energy data** from the Liander 2024 benchmark dataset
-# 2. **Configure a forecasting workflow** with weather features and prediction quantiles
-# 3. **Train a model** and inspect its performance
-# 4. **Generate probabilistic forecasts** with confidence intervals
-# 5. **Visualize results** and explain feature importance
-#
-# > **OpenSTEF** (Short-Term Energy Forecasting) is a modular library for creating
-# > accurate energy forecasts in the power grid domain.
-
-# %%
-# --- Setup: Logging and Display Configuration ---
-from typing import Any, cast
-
-from openstef_core.testing import configure_notebook_display, setup_notebook_logging
-
-configure_notebook_display()
-logger = setup_notebook_logging(__name__)
-
-# %% [markdown]
-# ## 📦 Step 1: Download the Dataset
-#
-# We'll use the **Liander 2024 Energy Forecasting Benchmark** dataset from HuggingFace Hub. This dataset contains:
-# - **Load measurements** — historical energy consumption from various installations (mv feeders, transformers, etc.)
-# - **Weather forecasts** — versioned weather predictions (temperature, radiation, wind, etc.)
-# - **EPEX prices** — day-ahead electricity market prices
-# - **Profiles** — typical daily/weekly load patterns
-
-# %%
-# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.
-from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets
-
-dataset = load_liander_dataset()
-
-print(f"Dataset shape: {dataset.data.shape}")
-print(f"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}")
-dataset.data.head()
-
-# %% [markdown]
-# ## ✂️ Step 3: Split Data into Training and Forecast Periods
-#
-# We'll use:
-# - **90 days** of historical data for training
-# - **14 days** as the forecast period (where we'll generate predictions)
-
-# %%
-# Split the dataset into training (90 days) and forecast (14 days) periods.
-train_dataset, forecast_dataset = prepare_tutorial_datasets()
-
-# %%
-# Visualize the training data
-# The plot shows the 'load' column (energy consumption in MW) over time
-# cast() needed: pandas returns plotly Figure at runtime (backend="plotly") but typed as Axes
-fig = cast(Any, train_dataset.data[["load"]].plot(title="Training Data: Energy Load over Time"))
-fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time")
-fig.show()
-
-# %% [markdown]
-# ## ⚙️ Step 4: Configure the Forecasting Workflow
-#
-# OpenSTEF uses a **ForecastingWorkflowConfig** to define all aspects of the forecasting pipeline:
-# - **Model type** — `gblinear` (gradient boosted linear model) or `xgboost`
-# - **Forecast horizons** — how far ahead to predict (e.g., 36 hours)
-# - **Quantiles** — prediction intervals for probabilistic forecasts
-# - **Feature columns** — which weather variables to use
-#
-# The **GBLinear** model is particularly good for energy forecasting because:
-# 1. It can extrapolate beyond training data (important for rare events)
-# 2. It provides interpretable feature importance
-# 3. It's fast to train and predict
-
-# %%
-# Import workflow components
-from openstef_core.types import LeadTime, Q  # LeadTime: forecast horizon, Q: quantile
-from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
-from openstef_models.presets.forecasting_workflow import GBLinearForecaster
-
-# Configure the forecasting workflow
-workflow = create_forecasting_workflow(
-    config=ForecastingWorkflowConfig(
-        # Model identification
-        model_id="gblinear_demo_v1",
-        model="gblinear",  # Use gradient boosted linear model
-        # Forecast settings
-        horizons=[LeadTime.from_string("PT36H")],  # Predict up to 36 hours ahead
-        quantiles=[Q(0.5), Q(0.1), Q(0.9)],  # Median + 80% prediction interval
-        # Target column (what we're predicting)
-        target_column="load",
-        # Weather feature columns (from the dataset)
-        temperature_column="temperature_2m",
-        relative_humidity_column="relative_humidity_2m",
-        wind_speed_column="wind_speed_10m",
-        radiation_column="shortwave_radiation",  # Solar radiation
-        pressure_column="surface_pressure",
-        # Training settings
-        verbosity=1,  # Show progress during training
-        mlflow_storage=None,  # Disable MLflow tracking for this demo
-        # Model-specific hyperparameters
-        gblinear_hyperparams=GBLinearForecaster.HyperParams(
-            n_steps=50  # Number of boosting iterations
-        ),
-    )
-)
-
-print("✅ Workflow configured successfully!")
-
-# %% [markdown]
-# ## 🏋️ Step 5: Train the Model
-#
-# The workflow's `fit()` method handles the entire training pipeline:
-# 1. **Preprocessing** — feature engineering, data validation, scaling
-# 2. **Training** — fit the model on historical data
-# 3. **Evaluation** — compute metrics on training data
-
-# %%
-# Train the model on historical data
-logger.info("🏋️ Starting model training...")
-
-result = workflow.fit(train_dataset)
-
-# Display training metrics
-if result is not None:
-    logger.info("✅ Training complete!")
-    print("\n📊 Training Evaluation Metrics:")
-    print(result.metrics_full.to_dataframe())
-
-    if result.metrics_test is not None:
-        print("\n📊 Test Set Metrics (held-out validation):")
-        print(result.metrics_test.to_dataframe())
-
-# %% [markdown]
-# ## 🔮 Step 6: Generate Forecasts
-#
-# Now we use the trained model to predict energy load for the next 14 days.
-# The output is a **ForecastDataset** containing:
-# - **Median prediction** (`quantile_P50`)
-# - **Lower bound** (`quantile_P10`) — 10th percentile
-# - **Upper bound** (`quantile_P90`) — 90th percentile
-
-# %%
-# Generate probabilistic forecasts for the forecast period
-from openstef_core.datasets import ForecastDataset
-
-logger.info("🔮 Generating forecasts...")
-forecast: ForecastDataset = workflow.predict(forecast_dataset)
-
-# Display forecast summary
-print(f"\n📈 Forecast generated for {len(forecast.data)} timestamps")
-print(f"📊 Quantiles: {forecast.quantiles}")
-print("\n🔍 Last 5 forecast values:")
-print(forecast.data.tail())
-
-# %% [markdown]
-# ## 📈 Step 7: Visualize Forecast Results
-#
-# OpenSTEF-BEAM provides **ForecastTimeSeriesPlotter** for beautiful interactive visualizations:
-# - Actual measurements shown as a line
-# - Forecast median shown as another line
-# - Prediction intervals shown as shaded areas
-
-# %%
-# Create an interactive forecast visualization
-from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
-
-fig = (
-    ForecastTimeSeriesPlotter()
-    # Add actual measurements (ground truth)
-    .add_measurements(measurements=forecast_dataset.data["load"])
-    # Add model predictions with confidence bands
-    .add_model(
-        model_name="GBLinear",
-        forecast=forecast.median_series,  # P50 prediction
-        quantiles=forecast.quantiles_data,  # P10-P90 confidence band
-    )
-    .plot()
-)
-
-# Update layout for better presentation
-fig.update_layout(
-    title="🔮 Energy Load Forecast vs Actual",
-    yaxis_title="Load (MW)",
-    xaxis_title="Time",
-    height=500,
-)
-fig.show()
-
-# %% [markdown]
-# ## 🔍 Step 8: Explain Feature Importance
-#
-# Understanding **why** the model makes certain predictions is crucial for trust
-# and debugging. GBLinear models provide clear feature importance rankings.
-
-# %%
-# Visualize feature importance using the ExplainableForecaster interface
-from typing import cast
-
-from openstef_models.explainability import ExplainableForecaster
-from openstef_models.models.forecasting_model import ForecastingModel
-
-# The GBLinear model implements ExplainableForecaster, providing feature importance
-forecaster = cast(ForecastingModel, workflow.model).forecaster
-explainable_model = cast(ExplainableForecaster, forecaster)
-
-# Create an interactive treemap of feature importances
-# Larger boxes = more important features
-fig = explainable_model.plot_feature_importances()
-fig.update_layout(title="🔍 Feature Importance Treemap")
-fig.show()
-
-# %% [markdown]
-# ## 🔬 Step 9: Visualize Feature Contributions (SHAP)
-#
-# While feature importance shows **which** features matter overall, **contributions**
-# show how each feature pushed the prediction up or down for every individual timestep.
-# GBLinear models expose these as SHAP values via `predict_contributions()`.
-
-# %%
-# Compute per-timestep feature contributions for the forecast period
-from openstef_models.explainability import ContributionsPlotter
-
-contributions = workflow.model.predict_contributions(forecast_dataset)
-
-# %%
-# Heatmap: contributions over time with prediction line
-ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True).show()
-
-# %%
-# Waterfall: decompose a single timestep's prediction
-ContributionsPlotter.plot_waterfall(contributions, timestep=0, top_n=10).show()
-
-# %%
-# Bar chart: mean absolute contribution per feature
-ContributionsPlotter.plot_bar(contributions, top_n=10).show()
-
-# %% [markdown]
-# ---
-#
-# ## 🎯 Summary
-#
-# In this tutorial, you learned how to:
-#
-# 1. ✅ **Load energy data** from the Liander 2024 benchmark dataset
-# 2. ✅ **Configure a workflow** with `ForecastingWorkflowConfig`
-# 3. ✅ **Train a GBLinear model** for probabilistic forecasting
-# 4. ✅ **Generate forecasts** with confidence intervals
-# 5. ✅ **Visualize results** and feature importance
-#
-# ### 🚀 Next Steps
-#
-# - Try different models: `"xgboost"` for more complex patterns
-# - Experiment with more quantiles for narrower prediction intervals
-# - Use the **backtesting notebook** to evaluate model performance systematically
-# - Explore MLflow integration for experiment tracking
diff --git a/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb b/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb
index ecf1de68d..39458e24b 100644
--- a/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb
+++ b/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb
@@ -22,17 +22,72 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "31e74f93",
-   "metadata": {},
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
    "outputs": [],
    "source": [
-    "# --- Setup: Logging and Display Configuration ---\n",
-    "# Configure logging and display settings for the notebook\n",
-    "from typing import Literal\n",
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
     "\n",
     "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
     "\n",
     "configure_notebook_display()\n",
-    "logger = setup_notebook_logging(__name__)"
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "        \"optuna\",\n",
+    "        \"lightgbm\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "44af0dd4",
+   "metadata": {},
+   "source": [
+    "# Hyperparameter Tuning with Optuna\n",
+    "\n",
+    "OpenSTEF integrates with [Optuna](https://optuna.org/) for Bayesian\n",
+    "hyperparameter optimization.  Every forecaster in OpenSTEF declares\n",
+    "sensible search bounds on its hyperparameters — you just choose which\n",
+    "ones to activate for tuning.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Why models ship with built-in search spaces\n",
+    "- How to activate, deactivate, and customize tunable parameters\n",
+    "- How to change the optimization metric (e.g. rCRPS for probabilistic scoring)\n",
+    "- How to compare an untuned baseline against the tuned model\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial runs only 5 trials for fast execution.\n",
+    "Increase `n_trials` for production use.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`HyperparameterTuner`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.integrations.optuna.HyperparameterTuner.html)\n",
+    "· [`XGBoostHyperParams`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)\n",
+    "· [`FloatRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.FloatRange.html) / [`IntRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.IntRange.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a8bdc792",
+   "metadata": {},
+   "source": [
+    "## Load the dataset"
    ]
   },
   {
@@ -42,267 +97,388 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.\n",
-    "from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets\n",
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "from openstef_core.types import LeadTime, Q\n",
     "\n",
     "dataset = load_liander_dataset()\n",
     "\n",
-    "print(f\"Dataset shape: {dataset.data.shape}\")\n",
-    "print(f\"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}\")\n",
-    "dataset.data.head()"
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "print(f\"Training:  {train_dataset.data.shape[0]:,} rows\")\n",
+    "print(f\"Predict:   {predict_dataset.data.shape[0]:,} rows\")"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "eee2c101",
+   "cell_type": "markdown",
+   "id": "4665d14e",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "# Split the dataset into training (90 days) and forecast (14 days) periods.\n",
-    "train_dataset, forecast_dataset = prepare_tutorial_datasets()"
+    "## Understanding built-in search spaces\n",
+    "\n",
+    "Each forecaster's `HyperParams` class uses Python's `Annotated` type hints\n",
+    "to declare valid search bounds on every parameter.  For example,\n",
+    "`XGBoostHyperParams` defines:\n",
+    "\n",
+    "```python\n",
+    "n_estimators: Annotated[int, IntRange(50, 500)] = 100\n",
+    "learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = 0.3\n",
+    "max_depth: Annotated[int, IntRange(1, 15)] = 6\n",
+    "subsample: Annotated[float, FloatRange(0.5, 1.0)] = 1.0\n",
+    "```\n",
+    "\n",
+    "These ranges define **where** Optuna can search, but tuning is **not active\n",
+    "by default**.  The `tune=True` flag explicitly activates each parameter.\n",
+    "This design means you always get sensible bounds without accidentally\n",
+    "tuning everything.\n",
+    "\n",
+    "Let's see the default search space — with nothing activated:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "695ea7fd",
+   "id": "eee2c101",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Visualize the training data\n",
-    "# The plot shows the 'load' column (energy consumption in MW) over time\n",
-    "fig = train_dataset.data[[\"load\"]].plot(title=\"Training Data: Energy Load over Time\")\n",
-    "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\")  # type: ignore[union-attr]  # plotly Figure\n",
-    "fig.show()  # type: ignore[union-attr]"
+    "from openstef_core.mixins.param_ranges import FloatRange, IntRange\n",
+    "from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams\n",
+    "\n",
+    "default_hp = XGBoostHyperParams()\n",
+    "default_space = default_hp.get_search_space()\n",
+    "print(f\"Default tunable parameters: {len(default_space)}\")\n",
+    "print(\"(All parameters use their fixed defaults until you opt in with tune=True)\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "ed1b233e",
+   "id": "4329d363",
    "metadata": {},
    "source": [
-    "## Define a base config with inline search space\n",
+    "## Customizing the search space\n",
+    "\n",
+    "To activate tuning on a parameter, pass a range with `tune=True`.\n",
+    "You can also narrow or widen the bounds, or leave bounds as `None` to\n",
+    "inherit the class-level defaults from the `Annotated` metadata.\n",
+    "\n",
+    "**Activate with custom bounds:**\n",
+    "```python\n",
+    "learning_rate=FloatRange(0.01, 0.3, log=True, tune=True)\n",
+    "```\n",
     "\n",
-    "Override default hyperparameters with `TuningRange(tune=True)` to mark them for tuning.\n",
-    "Any parameter left as a plain value keeps its default during tuning."
+    "**Activate with default bounds** (inherits from Annotated metadata):\n",
+    "```python\n",
+    "subsample=FloatRange(tune=True)\n",
+    "```\n",
+    "\n",
+    "**Keep a parameter fixed** (don't pass a range — just a plain value or omit it):\n",
+    "```python\n",
+    "max_depth=6  # fixed, not tuned\n",
+    "```\n",
+    "\n",
+    "Let's configure XGBoost with 4 tunable parameters and keep `reg_alpha`\n",
+    "fixed at a known-good value:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fc276f61",
+   "id": "695ea7fd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, RMAEProvider\n",
-    "from openstef_core.mixins.param_ranges import FloatRange, IntRange\n",
-    "from openstef_core.types import LeadTime, Q\n",
-    "from openstef_models.integrations.optuna import HyperparameterTuner\n",
-    "from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams\n",
     "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
     "\n",
     "config = ForecastingWorkflowConfig(\n",
     "    model_id=\"tuning_demo\",\n",
     "    model=\"xgboost\",\n",
-    "    # Forecast settings\n",
-    "    horizons=[LeadTime.from_string(\"PT36H\")],  # Predict up to 36 hours ahead\n",
-    "    quantiles=[Q(0.5), Q(0.1), Q(0.9)],  # Median + 80% prediction interval\n",
-    "    # Target column (what we're predicting)\n",
+    "    horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "    quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
     "    target_column=\"load\",\n",
-    "    # Weather feature columns (from the dataset)\n",
     "    temperature_column=\"temperature_2m\",\n",
     "    relative_humidity_column=\"relative_humidity_2m\",\n",
     "    wind_speed_column=\"wind_speed_10m\",\n",
-    "    radiation_column=\"shortwave_radiation\",  # Solar radiation\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
     "    pressure_column=\"surface_pressure\",\n",
-    "    # Hyperparameters to tune\n",
     "    xgboost_hyperparams=XGBoostHyperParams(\n",
-    "        learning_rate=FloatRange(0.01, 0.3, log=True, tune=True),  # pyright: ignore[reportCallIssue]  # ranges accepted at runtime via Annotated\n",
-    "        n_estimators=IntRange(50, 500, tune=True),\n",
-    "        max_depth=IntRange(3, 10, tune=True),\n",
-    "        subsample=FloatRange(0.5, 1.0, tune=True),\n",
-    "        colsample_bytree=FloatRange(0.5, 1.0, tune=True),\n",
+    "        # Tuned — custom bounds\n",
+    "        learning_rate=FloatRange(0.01, 0.3, log=True, tune=True),  # pyright: ignore[reportCallIssue]\n",
+    "        n_estimators=IntRange(50, 300, tune=True),\n",
+    "        # Tuned — inherits class-level bounds [1, 15]\n",
+    "        max_depth=IntRange(tune=True),\n",
+    "        # Tuned — custom narrower bounds\n",
+    "        subsample=FloatRange(0.6, 1.0, tune=True),\n",
+    "        # Fixed — not tuned\n",
+    "        reg_alpha=0.1,\n",
     "    ),\n",
-    "    evaluation_metrics=[RMAEProvider(), ObservedProbabilityProvider()],\n",
-    "    mlflow_storage=None,  # Disable MLFlow tune to avoid reusing models between trials.\n",
+    "    mlflow_storage=None,\n",
+    "    verbosity=0,\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "374108f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "space = config.xgboost_hyperparams.get_search_space()\n",
+    "\n",
+    "print(f\"Active search space ({len(space)} parameters):\")\n",
+    "for name, param in space.items():\n",
+    "    if isinstance(param, (FloatRange, IntRange)):\n",
+    "        scale = \" [log]\" if param.log else \"\"\n",
+    "        print(f\"  {name:20s}: {type(param).__name__}  [{param.low} — {param.high}]{scale}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
-   "id": "7168a1a8",
+   "id": "ed1b233e",
    "metadata": {},
    "source": [
-    "## Inspect the resolved search space\n"
+    "## Changing the tuning metric\n",
+    "\n",
+    "By default, `HyperparameterTuner` optimizes `R2` on the median quantile.\n",
+    "For probabilistic forecasts, the **relative Continuous Ranked Probability\n",
+    "Score (rCRPS)** is a better choice — it evaluates the full quantile\n",
+    "distribution, not just the median.\n",
+    "\n",
+    "To use rCRPS, set `metric_name=\"rCRPS\"`, `target_quantile=\"global\"` and `direction=\"minimize\"` (lower is better) when creating the tuner later on.\n",
+    "First, add [`RCRPSProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) to the config's `evaluation_metrics`:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "41f1b6ab",
+   "id": "fc276f61",
    "metadata": {},
    "outputs": [],
    "source": [
+    "from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, R2Provider, RCRPSProvider\n",
     "\n",
-    "# Get the search space from the hyperparams instance (resolve fills None bounds from class-level defaults).\n",
-    "resolved_space = config.xgboost_hyperparams.get_search_space()\n",
-    "\n",
-    "print(\"Resolved search space:\")\n",
-    "for name, param in resolved_space.items():\n",
-    "    if isinstance(param, (FloatRange, IntRange)):\n",
-    "        scale = \"  [log]\" if param.log else \"\"\n",
-    "        print(f\"  {name:25s}: {type(param).__name__}  [{param.low} — {param.high}]{scale}\")\n",
-    "    else:\n",
-    "        print(f\"  {name:25s}: CategoricalRange  {param.choices}\")"
+    "config_with_rcrps = config.model_copy(\n",
+    "    update={\"evaluation_metrics\": [R2Provider(), ObservedProbabilityProvider(), RCRPSProvider()]}\n",
+    ")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "dabeaa30",
+   "id": "5a158b63",
    "metadata": {},
    "source": [
-    "## Run the Optuna study with `HyperparameterTuner`"
+    "## Train an untuned baseline\n",
+    "\n",
+    "Before tuning, let's train a model with untuned hyperparameters (the class\n",
+    "defaults plus the same fixed `reg_alpha=0.1`) so we can measure the improvement."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8b170806",
+   "id": "cb569a95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "baseline_config = config_with_rcrps.model_copy(\n",
+    "    update={\n",
+    "        \"xgboost_hyperparams\": XGBoostHyperParams(reg_alpha=0.1),\n",
+    "    }\n",
+    ")\n",
+    "baseline_workflow = create_forecasting_workflow(baseline_config)\n",
+    "baseline_result = baseline_workflow.fit(train_dataset)\n",
+    "baseline_forecast = baseline_workflow.predict(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "baseline_r2 = baseline_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n",
+    "baseline_rcrps = baseline_result.metrics_val.get_metric(quantile=\"global\", metric_name=\"rCRPS\")\n",
+    "print(f\"Baseline R2:    {baseline_r2:.4f}\")\n",
+    "print(f\"Baseline rCRPS: {baseline_rcrps:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cdfac18",
    "metadata": {
-    "lines_to_next_cell": 2
+    "tags": [
+     "remove-cell"
+    ]
    },
    "outputs": [],
+   "source": [
+    "assert baseline_r2 is not None and baseline_r2 > 0.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9b795a56",
+   "metadata": {},
+   "source": [
+    "## Run the Optuna study\n",
+    "\n",
+    "`HyperparameterTuner.fit_with_tuning()` runs the study and trains a final\n",
+    "workflow using [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) on the full training set with the best hyperparameters.\n",
+    "The first trial always evaluates the default values so the search starts\n",
+    "from a known baseline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c628cb4",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "import optuna\n",
     "\n",
-    "optuna.logging.set_verbosity(optuna.logging.WARNING)  # Suppress per-trial logs\n",
+    "from openstef_models.integrations.optuna import HyperparameterTuner\n",
+    "\n",
+    "optuna.logging.set_verbosity(optuna.logging.WARNING)\n",
     "\n",
     "tuner = HyperparameterTuner(\n",
-    "    config=config,\n",
+    "    config=config_with_rcrps,\n",
     "    train_dataset=train_dataset,\n",
     "    create_workflow=create_forecasting_workflow,\n",
-    "    target_quantile=Q(0.5),\n",
-    "    metric_name=\"rMAE\",\n",
+    "    target_quantile=\"global\",\n",
+    "    metric_name=\"rCRPS\",\n",
     "    direction=\"minimize\",\n",
-    "    n_trials=20,\n",
+    "    n_trials=5,\n",
     "    seed=42,\n",
     ")\n",
-    "tuning_result = tuner.fit_with_tuning()\n",
+    "tuning_result = tuner.fit_with_tuning(show_progress_bar=False)\n",
     "\n",
-    "print(f\"Study complete: {len(tuning_result.study.trials)} trials\")\n",
-    "print(f\"Best value: {tuning_result.study.best_value:.4f}\")\n",
+    "print(f\"Trials completed: {len(tuning_result.study.trials)}\")\n",
+    "print(f\"Best rCRPS: {tuning_result.study.best_value:.4f}\")\n",
     "print(f\"Best params: {tuning_result.study.best_params}\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "602f6b4f",
+   "id": "26bb13cc",
    "metadata": {
-    "lines_to_next_cell": 2
+    "tags": [
+     "remove-cell"
+    ]
    },
    "outputs": [],
    "source": [
-    "# Inspect which hyperparameters were tuned vs kept at their default.\n",
-    "best_config = tuning_result.best_config  # type: ignore[union-attr]  # known to be ForecastingWorkflowConfig\n",
-    "print(\"Final XGBoost hyperparameters (tuned values marked):\")\n",
-    "final_hp = best_config.xgboost_hyperparams\n",
-    "baseline_hp = config.xgboost_hyperparams\n",
-    "best_params = tuning_result.study.best_params\n",
-    "\n",
-    "for field in type(final_hp).model_fields:\n",
-    "    value = getattr(final_hp, field)\n",
-    "    baseline = getattr(baseline_hp, field)\n",
-    "    marker: Literal[\" <- tuned\", \"\"] = \" <- tuned\" if field in best_params else \"\"\n",
-    "    print(f\"  {field:25s}: {value}{marker}\")"
+    "assert len(tuning_result.study.trials) == 5, f\"Expected 5 trials, got {len(tuning_result.study.trials)}\""
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "2875de25",
+   "id": "7168a1a8",
    "metadata": {},
    "source": [
-    "## The fitted workflow\n",
-    "\n",
-    "`fit_with_tuning()` already trains a final workflow on the full training set using the best\n",
-    "hyperparameters — no separate fit step is needed. The result is in `tuning_result.workflow`.\n"
+    "## Inspect the best hyperparameters"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8d91be63",
+   "id": "41f1b6ab",
    "metadata": {},
    "outputs": [],
    "source": [
-    "workflow = tuning_result.workflow"
+    "best_hp = tuning_result.best_config.xgboost_hyperparams\n",
+    "best_params = tuning_result.study.best_params\n",
+    "\n",
+    "print(\"Final hyperparameters (tuned values marked):\")\n",
+    "for field in type(best_hp).model_fields:\n",
+    "    value = getattr(best_hp, field)\n",
+    "    marker = \" <- tuned\" if field in best_params else \"\"\n",
+    "    print(f\"  {field:20s}: {value}{marker}\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "bb6ba8c7",
+   "id": "dabeaa30",
    "metadata": {},
    "source": [
-    "## Inspect the study and forecast\n",
+    "## Visualize optimization history\n",
     "\n",
-    "1. How did $rMAE$ improve over trials?\n",
-    "2. Which parameters had the most impact?\n",
-    "3. Final tuned model predictions on the held-out forecast window.\n"
+    "The optimization history shows how rCRPS decreased over trials.  With only\n",
+    "5 trials results are noisy — increase `n_trials` for smoother convergence."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2d34a7fa",
+   "id": "8b170806",
    "metadata": {
-    "lines_to_next_cell": 2
+    "tags": [
+     "hide-input"
+    ]
    },
    "outputs": [],
    "source": [
-    "from optuna.visualization import plot_optimization_history, plot_param_importances\n",
-    "\n",
-    "study = tuning_result.study\n",
+    "from optuna.visualization import plot_optimization_history\n",
     "\n",
-    "# How the best score evolved over trials\n",
-    "fig = plot_optimization_history(study)\n",
-    "fig.update_layout(title=\"Optimization History: rMAE over Trials\")\n",
-    "fig.show()\n",
+    "fig = plot_optimization_history(tuning_result.study)\n",
+    "fig.update_layout(title=\"Optimization History: rCRPS over Trials\", height=400)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4fbe1d32",
+   "metadata": {},
+   "source": [
+    "## Compare: untuned vs tuned\n",
     "\n",
-    "# Which hyperparameters mattered most (requires ≥ ~20 trials for reliable ranking)\n",
-    "fig2 = plot_param_importances(study)\n",
-    "fig2.update_layout(title=\"Hyperparameter Importances\")\n",
-    "fig2.show()"
+    "Plot both models on the same chart to visualize the improvement.\n",
+    "Once you're happy with the tuned parameters, run a\n",
+    "{doc}`backtesting_quickstart` to measure the gain over a realistic\n",
+    "operational timeline."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1b2b1124",
+   "id": "602f6b4f",
    "metadata": {
-    "lines_to_next_cell": 2
+    "tags": [
+     "hide-input"
+    ]
    },
    "outputs": [],
    "source": [
     "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n",
     "\n",
-    "forecast = workflow.predict(forecast_dataset)\n",
+    "tuned_forecast = tuning_result.workflow.predict(predict_dataset, forecast_start=train_end)\n",
     "\n",
     "fig = (\n",
     "    ForecastTimeSeriesPlotter()\n",
-    "    .add_measurements(measurements=forecast_dataset.data[\"load\"])\n",
+    "    .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n",
+    "    .add_model(\n",
+    "        model_name=\"XGBoost (default)\",\n",
+    "        forecast=baseline_forecast.median_series,\n",
+    "        quantiles=baseline_forecast.quantiles_data,\n",
+    "    )\n",
     "    .add_model(\n",
     "        model_name=\"XGBoost (tuned)\",\n",
-    "        forecast=forecast.median_series,\n",
-    "        quantiles=forecast.quantiles_data,\n",
+    "        forecast=tuned_forecast.median_series,\n",
+    "        quantiles=tuned_forecast.quantiles_data,\n",
     "    )\n",
     "    .plot()\n",
     ")\n",
+    "\n",
     "fig.update_layout(\n",
-    "    title=\"Tuned XGBoost Forecast vs Actual\",\n",
-    "    yaxis_title=\"Load (MW)\",\n",
+    "    title=\"Hyperparameter Tuning: Default vs Tuned XGBoost\",\n",
     "    xaxis_title=\"Time\",\n",
-    "    height=500,\n",
+    "    yaxis_title=\"MW\",\n",
+    "    height=400,\n",
     ")\n",
     "fig.show()"
    ]
@@ -310,10 +486,31 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f4a861db",
+   "id": "3a388a0c",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "tuned_rcrps = tuning_result.study.best_value\n",
+    "\n",
+    "print(f\"{'Model':<20} {'rCRPS':>10}\")\n",
+    "print(f\"{'':-<20} {'':-^10}\")\n",
+    "print(f\"{'XGBoost (default)':<20} {baseline_rcrps:>10.4f}\")\n",
+    "print(f\"{'XGBoost (tuned)':<20} {tuned_rcrps:>10.4f}\")\n",
+    "print(f\"{'Improvement':<20} {baseline_rcrps - tuned_rcrps:>10.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2875de25",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`ensemble_forecasting` — combine tuned models into an ensemble\n",
+    "  for even better accuracy.\n",
+    "- {doc}`backtesting_quickstart` — validate tuned parameters on longer\n",
+    "  historical windows."
+   ]
   }
  ],
  "metadata": {
diff --git a/examples/tutorials/hyperparameter_tuning_with_optuna.py b/examples/tutorials/hyperparameter_tuning_with_optuna.py
index 6014e1afa..90d973f46 100644
--- a/examples/tutorials/hyperparameter_tuning_with_optuna.py
+++ b/examples/tutorials/hyperparameter_tuning_with_optuna.py
@@ -20,190 +20,323 @@
 
 # pyright: basic
 
-# %%
-# --- Setup: Logging and Display Configuration ---
-# Configure logging and display settings for the notebook
-from typing import Literal
+# %% tags=["remove-cell"]
+import warnings
+
+warnings.filterwarnings("ignore")
 
 from openstef_core.testing import configure_notebook_display, setup_notebook_logging
 
 configure_notebook_display()
-logger = setup_notebook_logging(__name__)
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+        "optuna",
+        "lightgbm",
+    ),
+)
+
+# %% [markdown]
+# # Hyperparameter Tuning with Optuna
+#
+# OpenSTEF integrates with [Optuna](https://optuna.org/) for Bayesian
+# hyperparameter optimization.  Every forecaster in OpenSTEF declares
+# sensible search bounds on its hyperparameters — you just choose which
+# ones to activate for tuning.
+#
+# **What you'll learn:**
+#
+# - Why models ship with built-in search spaces
+# - How to activate, deactivate, and customize tunable parameters
+# - How to change the optimization metric (e.g. rCRPS for probabilistic scoring)
+# - How to compare an untuned baseline against the tuned model
+#
+# ```{note}
+# This tutorial runs only 5 trials for fast execution.
+# Increase `n_trials` for production use.
+# ```
+#
+# **Key API references:**
+# [`HyperparameterTuner`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.integrations.optuna.HyperparameterTuner.html)
+# · [`XGBoostHyperParams`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)
+# · [`FloatRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.FloatRange.html) / [`IntRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.IntRange.html)
+
+# %% [markdown]
+# ## Load the dataset
 
 # %%
-# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.
-from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
 
 dataset = load_liander_dataset()
 
-print(f"Dataset shape: {dataset.data.shape}")
-print(f"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}")
-dataset.data.head()
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
 
-# %%
-# Split the dataset into training (90 days) and forecast (14 days) periods.
-train_dataset, forecast_dataset = prepare_tutorial_datasets()
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
 
-# %%
-# Visualize the training data
-# The plot shows the 'load' column (energy consumption in MW) over time
-fig = train_dataset.data[["load"]].plot(title="Training Data: Energy Load over Time")
-fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time")  # type: ignore[union-attr]  # plotly Figure
-fig.show()  # type: ignore[union-attr]
+print(f"Training:  {train_dataset.data.shape[0]:,} rows")
+print(f"Predict:   {predict_dataset.data.shape[0]:,} rows")
 
 # %% [markdown]
-# ## Define a base config with inline search space
+# ## Understanding built-in search spaces
+#
+# Each forecaster's `HyperParams` class uses Python's `Annotated` type hints
+# to declare valid search bounds on every parameter.  For example,
+# `XGBoostHyperParams` defines:
 #
-# Override default hyperparameters with `TuningRange(tune=True)` to mark them for tuning.
-# Any parameter left as a plain value keeps its default during tuning.
+# ```python
+# n_estimators: Annotated[int, IntRange(50, 500)] = 100
+# learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = 0.3
+# max_depth: Annotated[int, IntRange(1, 15)] = 6
+# subsample: Annotated[float, FloatRange(0.5, 1.0)] = 1.0
+# ```
+#
+# These ranges define **where** Optuna can search, but tuning is **not active
+# by default**.  The `tune=True` flag explicitly activates each parameter.
+# This design means you always get sensible bounds without accidentally
+# tuning everything.
+#
+# Let's see the default search space — with nothing activated:
 
 # %%
-from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, RMAEProvider
 from openstef_core.mixins.param_ranges import FloatRange, IntRange
-from openstef_core.types import LeadTime, Q
-from openstef_models.integrations.optuna import HyperparameterTuner
 from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams
+
+default_hp = XGBoostHyperParams()
+default_space = default_hp.get_search_space()
+print(f"Default tunable parameters: {len(default_space)}")
+print("(All parameters use their fixed defaults until you opt in with tune=True)")
+
+# %% [markdown]
+# ## Customizing the search space
+#
+# To activate tuning on a parameter, pass a range with `tune=True`.
+# You can also narrow or widen the bounds, or leave bounds as `None` to
+# inherit the class-level defaults from the `Annotated` metadata.
+#
+# **Activate with custom bounds:**
+# ```python
+# learning_rate=FloatRange(0.01, 0.3, log=True, tune=True)
+# ```
+#
+# **Activate with default bounds** (inherits from Annotated metadata):
+# ```python
+# subsample=FloatRange(tune=True)
+# ```
+#
+# **Keep a parameter fixed** (don't pass a range — just a plain value or omit it):
+# ```python
+# max_depth=6  # fixed, not tuned
+# ```
+#
+# Let's configure XGBoost with 4 tunable parameters and keep `reg_alpha`
+# fixed at a known-good value:
+
+# %%
 from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
 
 config = ForecastingWorkflowConfig(
     model_id="tuning_demo",
     model="xgboost",
-    # Forecast settings
-    horizons=[LeadTime.from_string("PT36H")],  # Predict up to 36 hours ahead
-    quantiles=[Q(0.5), Q(0.1), Q(0.9)],  # Median + 80% prediction interval
-    # Target column (what we're predicting)
+    horizons=[LeadTime.from_string("PT36H")],
+    quantiles=[Q(0.5), Q(0.1), Q(0.9)],
     target_column="load",
-    # Weather feature columns (from the dataset)
     temperature_column="temperature_2m",
     relative_humidity_column="relative_humidity_2m",
     wind_speed_column="wind_speed_10m",
-    radiation_column="shortwave_radiation",  # Solar radiation
+    radiation_column="shortwave_radiation",
     pressure_column="surface_pressure",
-    # Hyperparameters to tune
     xgboost_hyperparams=XGBoostHyperParams(
-        learning_rate=FloatRange(0.01, 0.3, log=True, tune=True),  # pyright: ignore[reportCallIssue]  # ranges accepted at runtime via Annotated
-        n_estimators=IntRange(50, 500, tune=True),
-        max_depth=IntRange(3, 10, tune=True),
-        subsample=FloatRange(0.5, 1.0, tune=True),
-        colsample_bytree=FloatRange(0.5, 1.0, tune=True),
+        # Tuned — custom bounds
+        learning_rate=FloatRange(0.01, 0.3, log=True, tune=True),  # pyright: ignore[reportCallIssue]
+        n_estimators=IntRange(50, 300, tune=True),
+        # Tuned — inherits class-level bounds [1, 15]
+        max_depth=IntRange(tune=True),
+        # Tuned — custom narrower bounds
+        subsample=FloatRange(0.6, 1.0, tune=True),
+        # Fixed — not tuned
+        reg_alpha=0.1,
     ),
-    evaluation_metrics=[RMAEProvider(), ObservedProbabilityProvider()],
-    mlflow_storage=None,  # Disable MLFlow tune to avoid reusing models between trials.
+    mlflow_storage=None,
+    verbosity=0,
 )
 
+# %%
+space = config.xgboost_hyperparams.get_search_space()
+
+print(f"Active search space ({len(space)} parameters):")
+for name, param in space.items():
+    if isinstance(param, (FloatRange, IntRange)):
+        scale = " [log]" if param.log else ""
+        print(f"  {name:20s}: {type(param).__name__}  [{param.low} — {param.high}]{scale}")
+
 # %% [markdown]
-# ## Inspect the resolved search space
+# ## Changing the tuning metric
+#
+# By default, `HyperparameterTuner` optimizes `R2` on the median quantile.
+# For probabilistic forecasts, the **relative Continuous Ranked Probability
+# Score (rCRPS)** is a better choice — it evaluates the full quantile
+# distribution, not just the median.
 #
+# To use rCRPS, set `metric_name="rCRPS"`, `target_quantile="global"` and `direction="minimize"` (lower is better) when creating the tuner later on.
+# First, add [`RCRPSProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) to the config's `evaluation_metrics`:
 
 # %%
+from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, R2Provider, RCRPSProvider
 
-# Get the search space from the hyperparams instance (resolve fills None bounds from class-level defaults).
-resolved_space = config.xgboost_hyperparams.get_search_space()
+config_with_rcrps = config.model_copy(
+    update={"evaluation_metrics": [R2Provider(), ObservedProbabilityProvider(), RCRPSProvider()]}
+)
 
-print("Resolved search space:")
-for name, param in resolved_space.items():
-    if isinstance(param, (FloatRange, IntRange)):
-        scale = "  [log]" if param.log else ""
-        print(f"  {name:25s}: {type(param).__name__}  [{param.low} — {param.high}]{scale}")
-    else:
-        print(f"  {name:25s}: CategoricalRange  {param.choices}")
+# %% [markdown]
+# ## Train an untuned baseline
+#
+# Before tuning, let's train a model with untuned hyperparameters (the class
+# defaults plus the same fixed `reg_alpha=0.1`) so we can measure the improvement.
+
+# %%
+baseline_config = config_with_rcrps.model_copy(
+    update={
+        "xgboost_hyperparams": XGBoostHyperParams(reg_alpha=0.1),
+    }
+)
+baseline_workflow = create_forecasting_workflow(baseline_config)
+baseline_result = baseline_workflow.fit(train_dataset)
+baseline_forecast = baseline_workflow.predict(predict_dataset, forecast_start=train_end)
+
+baseline_r2 = baseline_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2")
+baseline_rcrps = baseline_result.metrics_val.get_metric(quantile="global", metric_name="rCRPS")
+print(f"Baseline R2:    {baseline_r2:.4f}")
+print(f"Baseline rCRPS: {baseline_rcrps:.4f}")
+
+# %% tags=["remove-cell"]
+assert baseline_r2 is not None and baseline_r2 > 0.0
 
 # %% [markdown]
-# ## Run the Optuna study with `HyperparameterTuner`
+# ## Run the Optuna study
+#
+# `HyperparameterTuner.fit_with_tuning()` runs the study and trains a final
+# workflow using [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) on the full training set with the best hyperparameters.
+# The first trial always evaluates the default values so the search starts
+# from a known baseline.
 
 # %%
 import optuna
 
-optuna.logging.set_verbosity(optuna.logging.WARNING)  # Suppress per-trial logs
+from openstef_models.integrations.optuna import HyperparameterTuner
+
+optuna.logging.set_verbosity(optuna.logging.WARNING)
 
 tuner = HyperparameterTuner(
-    config=config,
+    config=config_with_rcrps,
     train_dataset=train_dataset,
     create_workflow=create_forecasting_workflow,
-    target_quantile=Q(0.5),
-    metric_name="rMAE",
+    target_quantile="global",
+    metric_name="rCRPS",
     direction="minimize",
-    n_trials=20,
+    n_trials=5,
     seed=42,
 )
-tuning_result = tuner.fit_with_tuning()
+tuning_result = tuner.fit_with_tuning(show_progress_bar=False)
 
-print(f"Study complete: {len(tuning_result.study.trials)} trials")
-print(f"Best value: {tuning_result.study.best_value:.4f}")
+print(f"Trials completed: {len(tuning_result.study.trials)}")
+print(f"Best rCRPS: {tuning_result.study.best_value:.4f}")
 print(f"Best params: {tuning_result.study.best_params}")
 
-
-# %%
-# Inspect which hyperparameters were tuned vs kept at their default.
-best_config = tuning_result.best_config  # type: ignore[union-attr]  # known to be ForecastingWorkflowConfig
-print("Final XGBoost hyperparameters (tuned values marked):")
-final_hp = best_config.xgboost_hyperparams
-baseline_hp = config.xgboost_hyperparams
-best_params = tuning_result.study.best_params
-
-for field in type(final_hp).model_fields:
-    value = getattr(final_hp, field)
-    baseline = getattr(baseline_hp, field)
-    marker: Literal[" <- tuned", ""] = " <- tuned" if field in best_params else ""
-    print(f"  {field:25s}: {value}{marker}")
-
+# %% tags=["remove-cell"]
+assert len(tuning_result.study.trials) == 5, f"Expected 5 trials, got {len(tuning_result.study.trials)}"
 
 # %% [markdown]
-# ## The fitted workflow
-#
-# `fit_with_tuning()` already trains a final workflow on the full training set using the best
-# hyperparameters — no separate fit step is needed. The result is in `tuning_result.workflow`.
-#
+# ## Inspect the best hyperparameters
 
 # %%
-workflow = tuning_result.workflow
+best_hp = tuning_result.best_config.xgboost_hyperparams
+best_params = tuning_result.study.best_params  # parameters sampled in the best Optuna trial
+
+print("Final hyperparameters (tuned values marked):")
+for field in type(best_hp).model_fields:  # iterate over every declared hyperparameter field name
+    value = getattr(best_hp, field)
+    marker = " <- tuned" if field in best_params else ""  # flag only fields the study searched over
+    print(f"  {field:20s}: {value}{marker}")
 
 # %% [markdown]
-# ## Inspect the study and forecast
-#
-# 1. How did $rMAE$ improve over trials?
-# 2. Which parameters had the most impact?
-# 3. Final tuned model predictions on the held-out forecast window.
+# ## Visualize optimization history
 #
+# The optimization history shows how rCRPS evolved over the trials.  With only
+# 5 trials, results are noisy — increase `n_trials` for smoother convergence.
 
-# %%
-from optuna.visualization import plot_optimization_history, plot_param_importances
-
-study = tuning_result.study
+# %% tags=["hide-input"]
+from optuna.visualization import plot_optimization_history
 
-# How the best score evolved over trials
-fig = plot_optimization_history(study)
-fig.update_layout(title="Optimization History: rMAE over Trials")
+fig = plot_optimization_history(tuning_result.study)
+fig.update_layout(title="Optimization History: rCRPS over Trials", height=400)
 fig.show()
 
-# Which hyperparameters mattered most (requires ≥ ~20 trials for reliable ranking)
-fig2 = plot_param_importances(study)
-fig2.update_layout(title="Hyperparameter Importances")
-fig2.show()
-
+# %% [markdown]
+# ## Compare: untuned vs tuned
+#
+# Plot both models on the same chart to visualize the improvement.
+# Once you're happy with the tuned parameters, follow the
+# {doc}`backtesting_quickstart` to measure the gain over a realistic
+# operational timeline.
 
-# %%
+# %% tags=["hide-input"]
 from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
 
-forecast = workflow.predict(forecast_dataset)
+tuned_forecast = tuning_result.workflow.predict(predict_dataset, forecast_start=train_end)
 
 fig = (
     ForecastTimeSeriesPlotter()
-    .add_measurements(measurements=forecast_dataset.data["load"])
+    .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:])
+    .add_model(
+        model_name="XGBoost (default)",
+        forecast=baseline_forecast.median_series,
+        quantiles=baseline_forecast.quantiles_data,
+    )
     .add_model(
         model_name="XGBoost (tuned)",
-        forecast=forecast.median_series,
-        quantiles=forecast.quantiles_data,
+        forecast=tuned_forecast.median_series,
+        quantiles=tuned_forecast.quantiles_data,
     )
     .plot()
 )
+
 fig.update_layout(
-    title="Tuned XGBoost Forecast vs Actual",
-    yaxis_title="Load (MW)",
+    title="Hyperparameter Tuning: Default vs Tuned XGBoost",
     xaxis_title="Time",
-    height=500,
+    yaxis_title="MW",
+    height=400,
 )
 fig.show()
 
-
 # %%
+tuned_rcrps = tuning_result.study.best_value  # best (lowest) rCRPS objective found by the study
+# NOTE(review): presumably scored on the same validation data as baseline_rcrps — confirm before comparing.
+print(f"{'Model':<20} {'rCRPS':>10}")
+print(f"{'':-<20} {'':-^10}")
+print(f"{'XGBoost (default)':<20} {baseline_rcrps:>10.4f}")
+print(f"{'XGBoost (tuned)':<20} {tuned_rcrps:>10.4f}")
+print(f"{'Improvement':<20} {baseline_rcrps - tuned_rcrps:>10.4f}")  # positive = tuned model is better
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`ensemble_forecasting` — combine tuned models into an ensemble
+#   for even better accuracy.
+# - {doc}`backtesting_quickstart` — validate tuned parameters on longer
+#   historical windows.
diff --git a/examples/tutorials/model_explainability.ipynb b/examples/tutorials/model_explainability.ipynb
new file mode 100644
index 000000000..4d493d99f
--- /dev/null
+++ b/examples/tutorials/model_explainability.ipynb
@@ -0,0 +1,347 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7efd95d4",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f565776d",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "from typing import cast\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9047f142",
+   "metadata": {},
+   "source": [
+    "# Model Explainability\n",
+    "\n",
+    "Understand why a forecasting model makes the predictions it does, using\n",
+    "feature importance scores and per-timestep SHAP contributions.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Inspect global feature importance with an interactive treemap\n",
+    "- Compute per-timestep feature contributions (SHAP values)\n",
+    "- Visualize contributions with heatmaps, waterfall charts, and bar charts\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial uses a small data slice for fast execution.\n",
+    "See `examples/benchmarks/` for production-scale runs.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`ExplainableForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ExplainableForecaster.html)\n",
+    "· [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)\n",
+    "· [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79a5bbf9",
+   "metadata": {},
+   "source": [
+    "## Train a model\n",
+    "\n",
+    "We reuse the same setup as the {doc}`forecasting_quickstart` — train a GBLinear\n",
+    "model on 45 days of Liander data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "862a46b8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
+    "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n",
+    "\n",
+    "dataset = load_liander_dataset()\n",
+    "\n",
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "workflow = create_forecasting_workflow(\n",
+    "    config=ForecastingWorkflowConfig(\n",
+    "        model_id=\"explainability_gblinear\",\n",
+    "        model=\"gblinear\",\n",
+    "        horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "        quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n",
+    "        target_column=\"load\",\n",
+    "        temperature_column=\"temperature_2m\",\n",
+    "        relative_humidity_column=\"relative_humidity_2m\",\n",
+    "        wind_speed_column=\"wind_speed_10m\",\n",
+    "        radiation_column=\"shortwave_radiation\",\n",
+    "        pressure_column=\"surface_pressure\",\n",
+    "        verbosity=0,\n",
+    "        mlflow_storage=None,\n",
+    "        gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "result = workflow.fit(train_dataset)\n",
+    "print(\"Training complete.\")\n",
+    "print(result.metrics_full.to_dataframe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "622af3db",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert result is not None, \"Training should produce a result\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0680e73c",
+   "metadata": {},
+   "source": [
+    "## Feature importance\n",
+    "\n",
+    "Feature importance scores rank features by their overall impact on the model's\n",
+    "predictions.  The [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html) treemap visualization groups features by magnitude — larger\n",
+    "tiles represent more influential features."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca38989f",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "from openstef_models.explainability import ExplainableForecaster\n",
+    "from openstef_models.models.forecasting_model import ForecastingModel\n",
+    "\n",
+    "forecaster = cast(ForecastingModel, workflow.model).forecaster\n",
+    "explainable_model = cast(ExplainableForecaster, forecaster)\n",
+    "\n",
+    "fig = explainable_model.plot_feature_importances()\n",
+    "fig.update_layout(title=\"Feature importance (treemap)\", height=500)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bf6cec05",
+   "metadata": {},
+   "source": [
+    "## Feature contributions\n",
+    "\n",
+    "While feature importance is a global summary, **feature contributions** explain\n",
+    "individual predictions.  For each timestep, they decompose the prediction into\n",
+    "additive terms: one per feature plus a bias.\n",
+    "\n",
+    "GBLinear models provide exact SHAP values, making this decomposition faithful\n",
+    "to the model's internal logic.  Use [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)\n",
+    "to visualize contributions as heatmaps, bar charts, or waterfall charts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "63e99143",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.explainability import ContributionsPlotter\n",
+    "\n",
+    "contributions = workflow.model.predict_contributions(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "print(f\"Contributions shape: {contributions.data.shape}\")\n",
+    "print(f\"Features: {contributions.data.columns.tolist()[:5]} ... ({len(contributions.data.columns)} total)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d8c89b86",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert contributions.data.shape[0] > 100, f\"Expected >100 rows, got {contributions.data.shape[0]}\"\n",
+    "assert \"bias\" in contributions.data.columns, \"Contributions should include bias column\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "06217b33",
+   "metadata": {},
+   "source": [
+    "### Heatmap — contributions over time\n",
+    "\n",
+    "Each row is a feature, each column is a timestep.  Red cells indicate positive\n",
+    "contributions (pushing the prediction up), blue cells indicate negative ones.\n",
+    "The prediction line overlays the total."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b15c2e1a",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "fig = ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True)\n",
+    "fig.update_layout(height=500)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "35727a5e",
+   "metadata": {},
+   "source": [
+    "### Bar chart — average feature impact\n",
+    "\n",
+    "Mean absolute contribution per feature, ranked from most to least impactful.\n",
+    "This gives a complementary view to global importance — here you see which\n",
+    "features actively moved predictions during the forecast window.  If certain\n",
+    "features dominate unexpectedly, consider adjusting the pipeline via\n",
+    "{doc}`custom_pipeline`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a61dc558",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "fig = ContributionsPlotter.plot_bar(contributions, top_n=12)\n",
+    "fig.update_layout(title=\"Mean absolute contribution per feature\", height=450)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8a2abed2",
+   "metadata": {},
+   "source": [
+    "### Waterfall — single timestep decomposition\n",
+    "\n",
+    "The waterfall chart breaks down one specific prediction into its components.\n",
+    "Starting from the bias (baseline prediction), each feature adds to or subtracts\n",
+    "from the running total until the final predicted value is reached."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a44aa28",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "fig = ContributionsPlotter.plot_waterfall(contributions, timestep=48, top_n=10)\n",
+    "fig.update_layout(title=\"Prediction decomposition (timestep 48)\", height=500)\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc95f759",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`hyperparameter_tuning_with_optuna` — use explainability insights\n",
+    "  to guide which parameters to tune.\n",
+    "- {doc}`custom_pipeline` — fine-tune feature engineering based on what\n",
+    "  the contributions reveal."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/model_explainability.py b/examples/tutorials/model_explainability.py
new file mode 100644
index 000000000..e8f7ead18
--- /dev/null
+++ b/examples/tutorials/model_explainability.py
@@ -0,0 +1,203 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% tags=["remove-cell"]
+import warnings
+from typing import cast
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+    ),
+)
+
+# %% [markdown]
+# # Model Explainability
+#
+# Understand why a forecasting model makes the predictions it does, using
+# feature importance scores and per-timestep SHAP contributions.
+#
+# **What you'll learn:**
+#
+# - Inspect global feature importance with an interactive treemap
+# - Compute per-timestep feature contributions (SHAP values)
+# - Visualize contributions with heatmaps, waterfall charts, and bar charts
+#
+# ```{note}
+# This tutorial uses a small data slice for fast execution.
+# See `examples/benchmarks/` for production-scale runs.
+# ```
+#
+# **Key API references:**
+# [`ExplainableForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ExplainableForecaster.html)
+# · [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)
+# · [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html)
+
+# %% [markdown]
+# ## Train a model
+#
+# We reuse the same setup as the {doc}`forecasting_quickstart` — train a GBLinear
+# model on 45 days of Liander data.
+
+# %%
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
+from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
+from openstef_models.presets.forecasting_workflow import GBLinearForecaster
+
+dataset = load_liander_dataset()
+
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+workflow = create_forecasting_workflow(
+    config=ForecastingWorkflowConfig(
+        model_id="explainability_gblinear",
+        model="gblinear",
+        horizons=[LeadTime.from_string("PT36H")],
+        quantiles=[Q(0.5), Q(0.1), Q(0.9)],
+        target_column="load",
+        temperature_column="temperature_2m",
+        relative_humidity_column="relative_humidity_2m",
+        wind_speed_column="wind_speed_10m",
+        radiation_column="shortwave_radiation",
+        pressure_column="surface_pressure",
+        verbosity=0,
+        mlflow_storage=None,
+        gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),
+    )
+)
+
+result = workflow.fit(train_dataset)
+print("Training complete.")
+print(result.metrics_full.to_dataframe())
+
+# %% tags=["remove-cell"]
+assert result is not None, "Training should produce a result"
+
+# %% [markdown]
+# ## Feature importance
+#
+# Feature importance scores rank features by their overall impact on the model's
+# predictions.  The [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html) treemap visualization groups features by magnitude — larger
+# tiles represent more influential features.
+
+# %% tags=["hide-input"]
+from openstef_models.explainability import ExplainableForecaster
+from openstef_models.models.forecasting_model import ForecastingModel
+
+forecaster = cast(ForecastingModel, workflow.model).forecaster
+explainable_model = cast(ExplainableForecaster, forecaster)
+
+fig = explainable_model.plot_feature_importances()
+fig.update_layout(title="Feature importance (treemap)", height=500)
+fig.show()
+
+# %% [markdown]
+# ## Feature contributions
+#
+# While feature importance is a global summary, **feature contributions** explain
+# individual predictions.  For each timestep, they decompose the prediction into
+# additive terms: one per feature plus a bias.
+#
+# GBLinear models provide exact SHAP values, making this decomposition faithful
+# to the model's internal logic.  Use [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)
+# to visualize contributions as heatmaps, bar charts, or waterfall charts.
+
+# %%
+from openstef_models.explainability import ContributionsPlotter
+
+contributions = workflow.model.predict_contributions(predict_dataset, forecast_start=train_end)
+
+print(f"Contributions shape: {contributions.data.shape}")
+print(f"Features: {contributions.data.columns.tolist()[:5]} ... ({len(contributions.data.columns)} total)")
+
+# %% tags=["remove-cell"]
+assert contributions.data.shape[0] > 100, f"Expected >100 rows, got {contributions.data.shape[0]}"
+assert "bias" in contributions.data.columns, "Contributions should include bias column"
+
+# %% [markdown]
+# ### Heatmap — contributions over time
+#
+# Each row is a feature, each column is a timestep.  Red cells indicate positive
+# contributions (pushing the prediction up), blue cells indicate negative ones.
+# The prediction line overlays the total.
+
+# %% tags=["hide-input"]
+fig = ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True)
+fig.update_layout(height=500)
+fig.show()
+
+# %% [markdown]
+# ### Bar chart — average feature impact
+#
+# Mean absolute contribution per feature, ranked from most to least impactful.
+# This gives a complementary view to global importance — here you see which
+# features actively moved predictions during the forecast window.  If certain
+# features dominate unexpectedly, consider adjusting the pipeline via
+# {doc}`custom_pipeline`.
+
+# %% tags=["hide-input"]
+fig = ContributionsPlotter.plot_bar(contributions, top_n=12)
+fig.update_layout(title="Mean absolute contribution per feature", height=450)
+fig.show()
+
+# %% [markdown]
+# ### Waterfall — single timestep decomposition
+#
+# The waterfall chart breaks down one specific prediction into its components.
+# Starting from the bias (baseline prediction), each feature adds to or subtracts
+# from the running total until the final predicted value is reached.
+
+# %% tags=["hide-input"]
+fig = ContributionsPlotter.plot_waterfall(contributions, timestep=48, top_n=10)
+fig.update_layout(title="Prediction decomposition (timestep 48)", height=500)
+fig.show()
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`hyperparameter_tuning_with_optuna` — use explainability insights
+#   to guide which parameters to tune.
+# - {doc}`custom_pipeline` — fine-tune feature engineering based on what
+#   the contributions reveal.
diff --git a/examples/tutorials/quantile_calibration.ipynb b/examples/tutorials/quantile_calibration.ipynb
new file mode 100644
index 000000000..a772e9db5
--- /dev/null
+++ b/examples/tutorials/quantile_calibration.ipynb
@@ -0,0 +1,376 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28ab33c6",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>\n",
+    "#\n",
+    "# SPDX-License-Identifier: MPL-2.0\n",
+    "\n",
+    "# pyright: basic"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8d1ac2cd",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "warnings.filterwarnings(\"ignore\")\n",
+    "\n",
+    "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n",
+    "\n",
+    "configure_notebook_display()\n",
+    "logger = setup_notebook_logging(\n",
+    "    __name__,\n",
+    "    suppress=(\n",
+    "        \"choreographer\",\n",
+    "        \"kaleido\",\n",
+    "        \"httpx\",\n",
+    "        \"huggingface_hub\",\n",
+    "        \"fsspec\",\n",
+    "        \"filelock\",\n",
+    "        \"openstef_core.datasets\",\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2297aa0d",
+   "metadata": {},
+   "source": [
+    "# Quantile Calibration\n",
+    "\n",
+    "Improve the reliability of probabilistic forecasts using isotonic quantile\n",
+    "calibration.  A well-calibrated P10 quantile should exceed actual values\n",
+    "roughly 10 % of the time — this tutorial shows how to measure and correct\n",
+    "deviations.\n",
+    "\n",
+    "**What you'll learn:**\n",
+    "\n",
+    "- Measure quantile calibration with observed coverage\n",
+    "- Add isotonic calibration as a postprocessing step\n",
+    "- Compare before/after calibration on real data\n",
+    "\n",
+    "```{note}\n",
+    "This tutorial uses a small data slice for fast execution.\n",
+    "See `examples/benchmarks/` for production-scale runs.\n",
+    "```\n",
+    "\n",
+    "**Key API references:**\n",
+    "[`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html)\n",
+    "· [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4c98c476",
+   "metadata": {},
+   "source": [
+    "## Load data and train an uncalibrated model\n",
+    "\n",
+    "We start with the same GBLinear setup as the {doc}`forecasting_quickstart` and\n",
+    "measure how well its predicted quantiles match observed coverage.\n",
+    "The [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)\n",
+    "defines the model architecture and quantile levels."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bf2cde43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime, timedelta\n",
+    "\n",
+    "import pandas as pd\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from openstef_core.testing import load_liander_dataset\n",
+    "from openstef_core.types import LeadTime, Q\n",
+    "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n",
+    "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n",
+    "\n",
+    "dataset = load_liander_dataset()\n",
+    "\n",
+    "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n",
+    "train_end = train_start + timedelta(days=45)\n",
+    "forecast_end = train_end + timedelta(days=7)\n",
+    "\n",
+    "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n",
+    "predict_dataset = dataset.filter_by_range(\n",
+    "    start=train_end - timedelta(days=14),\n",
+    "    end=forecast_end,\n",
+    ")\n",
+    "\n",
+    "quantiles = [Q(0.1), Q(0.5), Q(0.9)]\n",
+    "\n",
+    "config = ForecastingWorkflowConfig(\n",
+    "    model_id=\"uncalibrated_gblinear\",\n",
+    "    model=\"gblinear\",\n",
+    "    horizons=[LeadTime.from_string(\"PT36H\")],\n",
+    "    quantiles=quantiles,\n",
+    "    target_column=\"load\",\n",
+    "    temperature_column=\"temperature_2m\",\n",
+    "    relative_humidity_column=\"relative_humidity_2m\",\n",
+    "    wind_speed_column=\"wind_speed_10m\",\n",
+    "    radiation_column=\"shortwave_radiation\",\n",
+    "    pressure_column=\"surface_pressure\",\n",
+    "    verbosity=0,\n",
+    "    mlflow_storage=None,\n",
+    "    gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n",
+    ")\n",
+    "\n",
+    "workflow_uncal = create_forecasting_workflow(config=config)\n",
+    "workflow_uncal.fit(train_dataset)\n",
+    "forecast_uncal = workflow_uncal.predict(predict_dataset, forecast_start=train_end)\n",
+    "\n",
+    "print(f\"Forecast rows: {len(forecast_uncal.data)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "018066a2",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert len(forecast_uncal.data) > 100, f\"Expected >100 forecast rows, got {len(forecast_uncal.data)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "390de36e",
+   "metadata": {},
+   "source": [
+    "## Measure calibration quality\n",
+    "\n",
+    "For a perfectly calibrated forecast at quantile $p$, the fraction of\n",
+    "observations falling below the predicted value should equal $p$.  We compute\n",
+    "the **observed coverage** for each quantile and compare it to the expected\n",
+    "level."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0229c1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "actuals = predict_dataset.data[\"load\"].loc[train_end:].reindex(forecast_uncal.data.index).dropna()\n",
+    "forecast_aligned = forecast_uncal.data.loc[actuals.index]\n",
+    "\n",
+    "expected = [float(q) for q in quantiles]\n",
+    "observed_uncal = [float((actuals <= forecast_aligned[f\"quantile_P{int(float(q) * 100)}\"]).mean()) for q in quantiles]\n",
+    "\n",
+    "calibration_df = pd.DataFrame({\n",
+    "    \"quantile\": [f\"P{int(float(q) * 100)}\" for q in quantiles],\n",
+    "    \"expected\": expected,\n",
+    "    \"observed\": observed_uncal,\n",
+    "    \"error\": [o - e for o, e in zip(observed_uncal, expected, strict=True)],\n",
+    "})\n",
+    "print(\"Calibration before isotonic correction:\")\n",
+    "print(calibration_df.to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0b8a434",
+   "metadata": {},
+   "source": [
+    "## Add isotonic calibration\n",
+    "\n",
+    "[`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html) is a postprocessing transform that learns a\n",
+    "monotonic mapping from predicted quantiles to observed quantile levels.\n",
+    "During training it fits on the validation split; during prediction it\n",
+    "corrects each quantile value.\n",
+    "\n",
+    "We create a second workflow identical to the first, but with the calibrator\n",
+    "appended to its postprocessing pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dcd100f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator\n",
+    "\n",
+    "config_cal = config.model_copy(update={\"model_id\": \"calibrated_gblinear\"})\n",
+    "workflow_cal = create_forecasting_workflow(config=config_cal)\n",
+    "\n",
+    "# Append isotonic calibration to the existing postprocessing pipeline\n",
+    "workflow_cal.model.postprocessing.transforms.append(\n",
+    "    IsotonicQuantileCalibrator(\n",
+    "        quantiles=quantiles,\n",
+    "        use_local_quantile_estimation=True,\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "workflow_cal.fit(train_dataset)\n",
+    "forecast_cal = workflow_cal.predict(predict_dataset, forecast_start=train_end)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7d92a1c",
+   "metadata": {
+    "tags": [
+     "remove-cell"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "assert len(forecast_cal.data) > 100, f\"Expected >100 calibrated forecast rows, got {len(forecast_cal.data)}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0dc57ac6",
+   "metadata": {},
+   "source": [
+    "## Compare calibration before and after"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b28cbc3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "forecast_cal_aligned = forecast_cal.data.loc[actuals.index]\n",
+    "\n",
+    "observed_cal = [float((actuals <= forecast_cal_aligned[f\"quantile_P{int(float(q) * 100)}\"]).mean()) for q in quantiles]\n",
+    "\n",
+    "comparison_df = pd.DataFrame({\n",
+    "    \"quantile\": [f\"P{int(float(q) * 100)}\" for q in quantiles],\n",
+    "    \"expected\": expected,\n",
+    "    \"observed (before)\": observed_uncal,\n",
+    "    \"observed (after)\": observed_cal,\n",
+    "    \"error (before)\": [o - e for o, e in zip(observed_uncal, expected, strict=True)],\n",
+    "    \"error (after)\": [o - e for o, e in zip(observed_cal, expected, strict=True)],\n",
+    "})\n",
+    "print(comparison_df.to_string(index=False))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aea63869",
+   "metadata": {
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "fig = go.Figure()\n",
+    "\n",
+    "fig.add_trace(  # pyright: ignore[reportUnknownMemberType]\n",
+    "    go.Scatter(\n",
+    "        x=[0, 1],\n",
+    "        y=[0, 1],\n",
+    "        mode=\"lines\",\n",
+    "        name=\"Perfect calibration\",\n",
+    "        line={\"color\": \"gray\", \"dash\": \"dash\", \"width\": 2},\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "fig.add_trace(  # pyright: ignore[reportUnknownMemberType]\n",
+    "    go.Scatter(\n",
+    "        x=expected,\n",
+    "        y=observed_uncal,\n",
+    "        mode=\"markers+lines\",\n",
+    "        name=\"Before calibration\",\n",
+    "        marker={\"size\": 12, \"color\": \"red\", \"symbol\": \"x\"},\n",
+    "        line={\"color\": \"red\", \"width\": 2, \"dash\": \"dot\"},\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "fig.add_trace(  # pyright: ignore[reportUnknownMemberType]\n",
+    "    go.Scatter(\n",
+    "        x=expected,\n",
+    "        y=observed_cal,\n",
+    "        mode=\"markers+lines\",\n",
+    "        name=\"After calibration\",\n",
+    "        marker={\"size\": 12, \"color\": \"blue\"},\n",
+    "        line={\"color\": \"blue\", \"width\": 2},\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "fig.update_layout(  # pyright: ignore[reportUnknownMemberType]\n",
+    "    title=\"Quantile calibration: expected vs observed coverage\",\n",
+    "    xaxis_title=\"Expected quantile level\",\n",
+    "    yaxis_title=\"Observed coverage\",\n",
+    "    xaxis={\"range\": [0, 1], \"tickvals\": [0, 0.1, 0.5, 0.9, 1]},\n",
+    "    yaxis={\"range\": [0, 1], \"tickvals\": [0, 0.1, 0.5, 0.9, 1]},\n",
+    "    height=500,\n",
+    "    width=600,\n",
+    ")\n",
+    "fig.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fb2c92a8",
+   "metadata": {},
+   "source": [
+    "Points closer to the diagonal indicate better calibration.  The isotonic\n",
+    "correction pulls the observed coverage towards the expected level, improving\n",
+    "the reliability of uncertainty estimates.  To measure calibration stability\n",
+    "over longer time horizons, combine this with a {doc}`backtesting_quickstart`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "187cc8df",
+   "metadata": {},
+   "source": [
+    "## Next steps\n",
+    "\n",
+    "- {doc}`backtesting_quickstart` — measure calibration consistency over\n",
+    "  realistic operational periods.\n",
+    "- {doc}`ensemble_forecasting` — apply calibration to ensemble models\n",
+    "  for combined accuracy and reliable uncertainty."
+   ]
+  }
+ ],
+ "metadata": {
+  "jupytext": {
+   "formats": "ipynb,py:percent"
+  },
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/tutorials/quantile_calibration.py b/examples/tutorials/quantile_calibration.py
new file mode 100644
index 000000000..bf8976e07
--- /dev/null
+++ b/examples/tutorials/quantile_calibration.py
@@ -0,0 +1,256 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: ipynb,py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.19.1
+#   kernelspec:
+#     display_name: .venv
+#     language: python
+#     name: python3
+# ---
+
+# %% tags=["remove-cell"]
+# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <openstef@lfenergy.org>
+#
+# SPDX-License-Identifier: MPL-2.0
+
+# pyright: basic
+
+# %% tags=["remove-cell"]
+import warnings
+
+warnings.filterwarnings("ignore")
+
+from openstef_core.testing import configure_notebook_display, setup_notebook_logging
+
+configure_notebook_display()
+logger = setup_notebook_logging(
+    __name__,
+    suppress=(
+        "choreographer",
+        "kaleido",
+        "httpx",
+        "huggingface_hub",
+        "fsspec",
+        "filelock",
+        "openstef_core.datasets",
+    ),
+)
+
+# %% [markdown]
+# # Quantile Calibration
+#
+# Improve the reliability of probabilistic forecasts using isotonic quantile
+# calibration.  A well-calibrated P10 quantile should exceed actual values
+# roughly 10 % of the time — this tutorial shows how to measure and correct
+# deviations.
+#
+# **What you'll learn:**
+#
+# - Measure quantile calibration with observed coverage
+# - Add isotonic calibration as a postprocessing step
+# - Compare before/after calibration on real data
+#
+# ```{note}
+# This tutorial uses a small data slice for fast execution.
+# See `examples/benchmarks/` for production-scale runs.
+# ```
+#
+# **Key API references:**
+# [`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html)
+# · [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)
+
+# %% [markdown]
+# ## Load data and train an uncalibrated model
+#
+# We start with the same GBLinear setup as the {doc}`forecasting_quickstart` and
+# measure how well its predicted quantiles match observed coverage.
+# The [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)
+# defines the model architecture and quantile levels.
+
+# %%
+from datetime import datetime, timedelta
+
+import pandas as pd
+import plotly.graph_objects as go
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
+from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow
+from openstef_models.presets.forecasting_workflow import GBLinearForecaster
+
+dataset = load_liander_dataset()
+
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+quantiles = [Q(0.1), Q(0.5), Q(0.9)]
+
+config = ForecastingWorkflowConfig(
+    model_id="uncalibrated_gblinear",
+    model="gblinear",
+    horizons=[LeadTime.from_string("PT36H")],
+    quantiles=quantiles,
+    target_column="load",
+    temperature_column="temperature_2m",
+    relative_humidity_column="relative_humidity_2m",
+    wind_speed_column="wind_speed_10m",
+    radiation_column="shortwave_radiation",
+    pressure_column="surface_pressure",
+    verbosity=0,
+    mlflow_storage=None,
+    gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),
+)
+
+workflow_uncal = create_forecasting_workflow(config=config)
+workflow_uncal.fit(train_dataset)
+forecast_uncal = workflow_uncal.predict(predict_dataset, forecast_start=train_end)
+
+print(f"Forecast rows: {len(forecast_uncal.data)}")
+
+# %% tags=["remove-cell"]
+assert len(forecast_uncal.data) > 100, f"Expected >100 forecast rows, got {len(forecast_uncal.data)}"
+
+# %% [markdown]
+# ## Measure calibration quality
+#
+# For a perfectly calibrated forecast at quantile $p$, the fraction of
+# observations falling below the predicted value should equal $p$.  We compute
+# the **observed coverage** for each quantile and compare it to the expected
+# level.
+
+# %%
+actuals = predict_dataset.data["load"].loc[train_end:].reindex(forecast_uncal.data.index).dropna()
+forecast_aligned = forecast_uncal.data.loc[actuals.index]
+
+expected = [float(q) for q in quantiles]
+observed_uncal = [float((actuals <= forecast_aligned[f"quantile_P{int(float(q) * 100)}"]).mean()) for q in quantiles]
+
+calibration_df = pd.DataFrame({
+    "quantile": [f"P{int(float(q) * 100)}" for q in quantiles],
+    "expected": expected,
+    "observed": observed_uncal,
+    "error": [o - e for o, e in zip(observed_uncal, expected, strict=True)],
+})
+print("Calibration before isotonic correction:")
+print(calibration_df.to_string(index=False))
+
+# %% [markdown]
+# ## Add isotonic calibration
+#
+# [`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html) is a postprocessing transform that learns a
+# monotonic mapping from predicted quantiles to observed quantile levels.
+# During training it fits on the validation split; during prediction it
+# corrects each quantile value.
+#
+# We create a second workflow identical to the first, but with the calibrator
+# appended to its postprocessing pipeline.
+
+# %%
+from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator
+
+config_cal = config.model_copy(update={"model_id": "calibrated_gblinear"})
+workflow_cal = create_forecasting_workflow(config=config_cal)
+
+# Append isotonic calibration to the existing postprocessing pipeline
+workflow_cal.model.postprocessing.transforms.append(
+    IsotonicQuantileCalibrator(
+        quantiles=quantiles,
+        use_local_quantile_estimation=True,
+    )
+)
+
+workflow_cal.fit(train_dataset)
+forecast_cal = workflow_cal.predict(predict_dataset, forecast_start=train_end)
+
+# %% tags=["remove-cell"]
+assert len(forecast_cal.data) > 100, f"Expected >100 calibrated forecast rows, got {len(forecast_cal.data)}"
+
+# %% [markdown]
+# ## Compare calibration before and after
+
+# %%
+forecast_cal_aligned = forecast_cal.data.loc[actuals.index]
+
+observed_cal = [float((actuals <= forecast_cal_aligned[f"quantile_P{int(float(q) * 100)}"]).mean()) for q in quantiles]
+
+comparison_df = pd.DataFrame({
+    "quantile": [f"P{int(float(q) * 100)}" for q in quantiles],
+    "expected": expected,
+    "observed (before)": observed_uncal,
+    "observed (after)": observed_cal,
+    "error (before)": [o - e for o, e in zip(observed_uncal, expected, strict=True)],
+    "error (after)": [o - e for o, e in zip(observed_cal, expected, strict=True)],
+})
+print(comparison_df.to_string(index=False))
+
+# %% tags=["hide-input"]
+fig = go.Figure()
+
+fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
+    go.Scatter(
+        x=[0, 1],
+        y=[0, 1],
+        mode="lines",
+        name="Perfect calibration",
+        line={"color": "gray", "dash": "dash", "width": 2},
+    )
+)
+
+fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
+    go.Scatter(
+        x=expected,
+        y=observed_uncal,
+        mode="markers+lines",
+        name="Before calibration",
+        marker={"size": 12, "color": "red", "symbol": "x"},
+        line={"color": "red", "width": 2, "dash": "dot"},
+    )
+)
+
+fig.add_trace(  # pyright: ignore[reportUnknownMemberType]
+    go.Scatter(
+        x=expected,
+        y=observed_cal,
+        mode="markers+lines",
+        name="After calibration",
+        marker={"size": 12, "color": "blue"},
+        line={"color": "blue", "width": 2},
+    )
+)
+
+fig.update_layout(  # pyright: ignore[reportUnknownMemberType]
+    title="Quantile calibration: expected vs observed coverage",
+    xaxis_title="Expected quantile level",
+    yaxis_title="Observed coverage",
+    xaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]},
+    yaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]},
+    height=500,
+    width=600,
+)
+fig.show()
+
+# %% [markdown]
+# Points closer to the diagonal indicate better calibration.  The isotonic
+# correction pulls the observed coverage towards the expected level, improving
+# the reliability of uncertainty estimates.  To measure calibration stability
+# over longer time horizons, combine this with a {doc}`backtesting_quickstart`.
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`backtesting_quickstart` — measure calibration consistency over
+#   realistic operational periods.
+# - {doc}`ensemble_forecasting` — apply calibration to ensemble models
+#   for combined accuracy and reliable uncertainty.
diff --git a/packages/openstef-core/src/openstef_core/testing.py b/packages/openstef-core/src/openstef_core/testing.py
index 3c8c4e8b2..30109c167 100644
--- a/packages/openstef-core/src/openstef_core/testing.py
+++ b/packages/openstef-core/src/openstef_core/testing.py
@@ -8,10 +8,11 @@
 DataFrames and Series with equality semantics.
 """
 
+import logging
 from collections.abc import Sequence
 from datetime import datetime, timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, override
+from typing import Any, override
 
 import numpy as np
 import pandas as pd
@@ -19,9 +20,6 @@
 from openstef_core.constants import LIANDER_DATASET_REPO_ID
 from openstef_core.datasets import TimeSeriesDataset, VersionedTimeSeriesDataset
 
-if TYPE_CHECKING:
-    import logging
-
 
 class IsSamePandas:
     """Utility class to allow comparison of pandas DataFrames in assertion / calls."""
@@ -185,6 +183,7 @@ def load_liander_dataset(
     """
     try:
         from huggingface_hub import hf_hub_download  # pyright: ignore[reportUnknownVariableType]  # noqa: PLC0415
+        from huggingface_hub.utils import logging as hf_logging  # noqa: PLC0415
     except ImportError:
         msg = "huggingface-hub is required for benchmark datasets: pip install openstef-core[benchmark]"
         raise ImportError(msg) from None
@@ -197,13 +196,14 @@ def load_liander_dataset(
         *(extra_files or []),
     ]
 
+    # Suppress HF Hub noise (unauthenticated requests warning, progress bars)
+    hf_logging.set_verbosity_error()
     for filename in files_to_download:
         hf_hub_download(  # pyright: ignore[reportCallIssue]
             repo_id=repo_id,
             filename=filename,
             repo_type="dataset",
             local_dir=local_dir,
-            local_dir_use_symlinks=False,
         )
 
     datasets = [VersionedTimeSeriesDataset.read_parquet(local_dir / f) for f in files_to_download]
@@ -238,33 +238,46 @@ def configure_notebook_display(renderer: str = "png") -> None:
     pio.renderers.default = renderer
 
 
-_DEFAULT_NOISY_LOGGERS: tuple[str, ...] = ("choreographer", "kaleido")
+_DEFAULT_NOISY_LOGGERS: tuple[str, ...] = (
+    "choreographer",
+    "kaleido",
+    "huggingface_hub",
+    "huggingface_hub.utils._http",
+    "openstef_core.datasets.timeseries_dataset",
+)
 
 
 def setup_notebook_logging(
     name: str | None = None,
     suppress: Sequence[str] | None = None,
-) -> "logging.Logger":
+) -> logging.Logger:
     """Configure logging for tutorial notebooks and return a named logger.
 
     Sets the root logger to INFO level and silences the loggers in *suppress*
-    by raising their level to ERROR and disabling them entirely.
+    by raising their level to ERROR and disabling propagation.  Child loggers
+    that already exist under a suppressed name are silenced the same way.
 
     Args:
         name: Logger name, typically ``__name__`` of the calling module.
         suppress: Sequence of logger names to silence.  Defaults to
-            ``("choreographer", "kaleido")``.
+            ``_DEFAULT_NOISY_LOGGERS``.
 
     Returns:
         Configured Logger instance.
     """
-    import logging  # noqa: PLC0415
-
     noisy = suppress if suppress is not None else _DEFAULT_NOISY_LOGGERS
     logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s")
     for logger_name in noisy:
-        logging.getLogger(logger_name).setLevel(logging.ERROR)
-        logging.getLogger(logger_name).disabled = True
+        lgr = logging.getLogger(logger_name)
+        lgr.setLevel(logging.ERROR)
+        lgr.propagate = False
+        # Also silence existing child loggers; iterate a snapshot of the registry keys
+        prefix = logger_name + "."
+        for key in list(logging.Logger.manager.loggerDict):
+            if key.startswith(prefix):
+                child = logging.getLogger(key)
+                child.setLevel(logging.ERROR)
+                child.propagate = False
     return logging.getLogger(name)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index bdb01fff2..e0e52efbd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -127,16 +127,33 @@ lint.ignore = [
   "TC006",  # Let's not force quoting the first param of typing.cast
   "TRY003", # simplify exception messages
 ]
-lint.per-file-ignores."./examples/tutorials/*" = [
-  "D100",   # Notebooks don't need module docstrings
+lint.per-file-ignores."./examples/benchmarks/custom/*" = [
   "E402",   # Imports not at top — notebook cells have natural ordering
-  "E501",   # Long lines in notebooks — readability is cell-scoped
+  "E501",   # Long lines in markdown cells (API doc URLs)
+  "ERA001", # Jupytext YAML frontmatter looks like commented-out code
+  "T201",   # Benchmarks may use print for demonstration
+]
+lint.per-file-ignores."./examples/benchmarks/liander2024/*" = [
+  "D100",   # Module docstring is in a remove-cell for clean notebook rendering
+  "E402",   # Imports not at top — os.environ must be set before imports
+  "E501",   # Long lines in markdown cells (API doc URLs)
   "ERA001", # Jupytext YAML frontmatter looks like commented-out code
-  "F821",   # Cell-scoped names appear undefined to Ruff's module-level analysis
-  "INP001", # Not a namespace package — notebooks don't need __init__.py
-  "PTH",    # Tutorials may use os.path for simplicity
-  "S101",   # Tutorials may use assert for runtime checks
-  "T201",   # Tutorials may use print for demonstration
+  "T201",   # Benchmarks may use print for demonstration
+]
+lint.per-file-ignores."./examples/tutorials/*" = [
+  "D100",    # Notebooks don't need module docstrings
+  "D103",    # Inline helpers in notebooks don't need docstrings
+  "E402",    # Imports not at top — notebook cells have natural ordering
+  "E501",    # Long lines in notebooks — readability is cell-scoped
+  "ERA001",  # Jupytext YAML frontmatter looks like commented-out code
+  "F821",    # Cell-scoped names appear undefined to Ruff's module-level analysis
+  "INP001",  # Not a namespace package — notebooks don't need __init__.py
+  "PLR2004", # Magic values in assertions are fine for notebook checks
+  "PT018",   # Assertion style doesn't matter in notebooks
+  "PTH",     # Tutorials may use os.path for simplicity
+  "S101",    # Tutorials may use assert for runtime checks
+  "SLF001",  # Private member access needed to demonstrate internals
+  "T201",    # Tutorials may use print for demonstration
 ]
 lint.per-file-ignores."./packages/*/tests/*" = [
   "ARG",     # Unused function args -> fixtures nevertheless are functionally relevant...
@@ -285,17 +302,24 @@ cmd = "pytest --numprocesses=auto --doctest-modules packages/*/src --maxfail=1"
 
 [tool.poe.tasks.notebooks]
 help = "Sync jupytext .py percent sources → .ipynb notebooks (and vice versa)"
-cmd = "jupytext --sync examples/tutorials/*.py"
+sequence = [
+  { cmd = "jupytext --sync examples/tutorials/*.py" },
+  { cmd = "jupytext --sync examples/benchmarks/custom/*.py examples/benchmarks/liander2024/*.py" },
+]
 
 [tool.poe.tasks.notebooks-clear]
 help = "Strip outputs from all .ipynb notebooks"
-cmd = "jupyter nbconvert --clear-output --inplace examples/tutorials/*.ipynb"
+sequence = [
+  { cmd = "jupyter nbconvert --clear-output --inplace examples/tutorials/*.ipynb" },
+  { cmd = "jupyter nbconvert --clear-output --inplace examples/benchmarks/custom/*.ipynb examples/benchmarks/liander2024/*.ipynb" },
+]
 
 [tool.poe.tasks.notebooks-check]
 help = "Check that .ipynb notebooks are in sync with their jupytext .py sources and have no outputs"
 sequence = [
   { cmd = "jupytext --sync examples/tutorials/*.py" },
-  { cmd = "git diff --exit-code -- examples/tutorials/" },
+  { cmd = "jupytext --sync examples/benchmarks/custom/*.py examples/benchmarks/liander2024/*.py" },
+  { cmd = "git diff --exit-code -- examples/tutorials/ examples/benchmarks/" },
   { script = "tools.check_notebook_outputs:main" },
 ]
 
@@ -386,8 +410,8 @@ sequence = [
 ]
 
 [tool.poe.tasks._docs_sync]
-help = "Sync tutorial sources into docs/source for Sphinx"
-cmd = "python -c \"import shutil; shutil.rmtree('docs/source/tutorials', ignore_errors=True); shutil.copytree('examples/tutorials', 'docs/source/tutorials')\""
+help = "Sync tutorial and benchmark sources into docs/source for Sphinx"
+cmd = "python -c \"import shutil; shutil.rmtree('docs/source/tutorials', ignore_errors=True); shutil.copytree('examples/tutorials', 'docs/source/tutorials'); shutil.rmtree('docs/source/benchmarks', ignore_errors=True); shutil.copytree('examples/benchmarks', 'docs/source/benchmarks')\""
 
 [tool.poe.tasks.docs]
 help = "Build the documentation"
diff --git a/sonar-project.properties b/sonar-project.properties
index 28abab60f..5361f74cb 100644
--- a/sonar-project.properties
+++ b/sonar-project.properties
@@ -2,10 +2,15 @@
 #
 # SPDX-License-Identifier: MPL-2.0
 
-# Exclude generated Jupyter notebook files from copy-paste detection.
-# Each .ipynb is generated from its paired .py jupytext source of truth,
-# so SonarCloud reports their content as duplication — it is a false positive.
-# The paired .py jupytext source files share the same boilerplate dataset setup,
-# so they are also excluded for the same reason.
-# Test files are excluded because fixture and helper duplication is expected and intentional.
-sonar.cpd.exclusions=**/*.ipynb,examples/tutorials/*.py,**/tests/**/*.py
+# Exclude generated Jupyter notebooks and example scripts from SonarQube analysis.
+# .ipynb files are generated from paired .py jupytext sources — analysing both is redundant.
+# Tutorial and benchmark .py files are interactive notebook scripts, not library code;
+# SonarQube flags patterns like user-toggles (USE_MLFLOW_STORAGE = False; if USE_MLFLOW_STORAGE)
+# as "constant expressions", which are false positives for configurable notebook cells.
+sonar.exclusions=**/*.ipynb,examples/tutorials/*.py,examples/benchmarks/**/*.py,docs/source/tutorials/**,docs/source/benchmarks/**
+
+# Target Python version for precise analysis.
+sonar.python.version=3.12
+
+# Exclude expected duplication in tests (fixtures, helpers, parametrized patterns).
+sonar.cpd.exclusions=**/tests/**/*.py
diff --git a/tools/check_notebook_outputs.py b/tools/check_notebook_outputs.py
index 37f039822..1a75bba91 100644
--- a/tools/check_notebook_outputs.py
+++ b/tools/check_notebook_outputs.py
@@ -8,25 +8,29 @@
 from pathlib import Path
 
 TUTORIALS_DIR = Path("examples/tutorials")
+BENCHMARKS_DIRS = [Path("examples/benchmarks/custom"), Path("examples/benchmarks/liander2024")]
 
 
 def main() -> None:
-    """Validate that no .ipynb in tutorials has stored outputs."""
+    """Validate that no .ipynb in tutorials or benchmarks has stored outputs."""
     failures: list[str] = []
+    checked = 0
 
-    for nb_path in sorted(TUTORIALS_DIR.glob("*.ipynb")):
-        nb = json.loads(nb_path.read_text(encoding="utf-8"))
-        for i, cell in enumerate(nb.get("cells", [])):
-            if cell.get("cell_type") == "code" and cell.get("outputs"):
-                failures.append(f"  {nb_path.name}: cell {i} has outputs")
-                break
+    for search_dir in [TUTORIALS_DIR, *BENCHMARKS_DIRS]:
+        for nb_path in sorted(search_dir.glob("*.ipynb")):
+            checked += 1
+            nb = json.loads(nb_path.read_text(encoding="utf-8"))
+            for i, cell in enumerate(nb.get("cells", [])):
+                if cell.get("cell_type") == "code" and cell.get("outputs"):
+                    failures.append(f"  {nb_path.relative_to('.')}: cell {i} has outputs")
+                    break
 
     if failures:
         print("ERROR: Notebooks with outputs found (run `poe notebooks-clear`):")
         print("\n".join(failures))
         sys.exit(1)
 
-    print(f"OK: {len(list(TUTORIALS_DIR.glob('*.ipynb')))} notebooks checked, no outputs found")
+    print(f"OK: {checked} notebooks checked, no outputs found")
 
 
 if __name__ == "__main__":