diff --git a/.gitignore b/.gitignore index 790e5bc66..710a26c21 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ dmypy.json docs/_build/ docs/source/api/generated/ docs/source/tutorials/ +docs/source/benchmarks/ # docs/_doctrees/ # docs/_static_gen/ @@ -138,3 +139,4 @@ liander_dataset/ # Jupyter notebook cache (myst-nb execution outputs) .jupyter_cache/ +docs/build.zip diff --git a/docs/source/conf.py b/docs/source/conf.py index 45fb908ea..efd6e9909 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -150,17 +150,20 @@ def _discover_submodules(fullname: str) -> list[str]: # Configure MyST for docstrings myst_enable_extensions = [ "deflist", + "dollarmath", "tasklist", "colon_fence", ] # -- Notebook execution (myst-nb) ------------------------------------------- nb_custom_formats = {".py": ["jupytext.reads", {"fmt": "py:percent"}]} -nb_execution_mode = "off" # TODO(#884): enable "cache" once tutorials are optimized for faster execution +nb_execution_mode = "cache" nb_execution_timeout = 120 nb_execution_raise_on_error = True -# TODO(#884): backtesting notebook exceeds timeout — needs rewrite or execution split -nb_execution_excludepatterns = ["tutorials/backtesting_openstef_with_beam*"] +nb_execution_excludepatterns = [ + "benchmarks/*", # Benchmarks are too expensive to execute during docs build + "benchmarks/*/*", +] # Sphinx version switcher config = SphinxConfig("../../pyproject.toml", globalns=globals()) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index d184ef7b0..dd5f7ba51 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -7,12 +7,45 @@ Examples ======== -End-to-end tutorials demonstrating OpenSTEF workflows. Each example is a runnable +End-to-end tutorials demonstrating OpenSTEF workflows. Each tutorial is a runnable Jupyter notebook rendered with executed outputs. +Tutorials +--------- + +.. 
toctree:: + :maxdepth: 1 + :caption: Getting Started + + Forecasting Quickstart + Backtesting Quickstart + .. toctree:: :maxdepth: 1 + :caption: Model Training - Forecasting with Presets + Building a Custom Pipeline + Ensemble Forecasting Hyperparameter Tuning - Backtesting with BEAM + +.. toctree:: + :maxdepth: 1 + :caption: Evaluation & Analysis + + Model Explainability + Quantile Calibration + + +Benchmarks +---------- + +Compare models on real energy data. These notebooks are **not executed** during +docs build — run them locally. + +.. toctree:: + :maxdepth: 2 + :caption: Benchmarking + + Benchmarking Guide + Liander 2024 + Build Your Own diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md new file mode 100644 index 000000000..10a600dc6 --- /dev/null +++ b/examples/benchmarks/README.md @@ -0,0 +1,40 @@ + + +# Benchmarks + +End-to-end benchmarking using **BEAM** (Backtesting, Evaluation, Analysis, Metrics). + +BEAM replays historical data day by day, trains your model, makes forecasts, and scores them — all without data leakage. + +## Which notebook do I need? 
+ +| I want to… | Start here | +|---|---| +| **See how OpenSTEF performs** (just run, no code changes) | [XGBoost & GBLinear](liander2024/run_xgboost_gblinear_benchmark) | +| **Benchmark my own model** | [Implement a Custom Forecaster](custom/custom_forecaster) | +| **Benchmark on my own data** | [Configure a Custom Benchmark](custom/custom_benchmark) | +| **Score predictions I already have** | [Evaluate Existing Forecasts](custom/evaluate_existing_forecasts) | + +## Quick start + +```bash +# Install (requires uv: https://docs.astral.sh/uv/) +uv sync --all-extras --all-groups --all-packages + +# Run the built-in Liander 2024 benchmark (XGBoost + GBLinear) +uv run python -m examples.benchmarks.liander2024.run_xgboost_gblinear_benchmark +``` + +## Liander 2024 + +Pre-made benchmarks on the [Liander 2024 STEF benchmark dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark). +No code changes needed — just run. + +## Build Your Own + +Templates for benchmarking custom models or custom data. See the +[Build Your Own](custom/README) section for a detailed walkthrough. diff --git a/examples/benchmarks/custom/README.md b/examples/benchmarks/custom/README.md new file mode 100644 index 000000000..a5557c226 --- /dev/null +++ b/examples/benchmarks/custom/README.md @@ -0,0 +1,136 @@ + + +# Custom Benchmark Templates + +Copy this folder as a starting point for your own BEAM benchmarks. + +## Which file do I start with? + +| I want to… | Start here | +|---|---| +| **Benchmark my own model** | `custom_forecaster.py` — implement `BacktestForecasterMixin` | +| **Benchmark on my own data** | `custom_benchmark.py` — extend `SimpleTargetProvider` | +| **Score predictions I already have** | `evaluate_existing_forecasts.py` | + +## Files + +| File | Role | +|---|---| +| `custom_forecaster.py` | **Template: your model.** Implements the `BacktestForecasterMixin` interface (config, quantiles, fit, predict). 
| +| `custom_benchmark.py` | **Template: your benchmark.** Defines where data lives, which metrics to use, and assembles the pipeline. | +| `run_liander2024_benchmark.py` | **Entry point:** test your forecaster on the built-in Liander 2024 dataset (auto-downloaded). | +| `run_custom_benchmark.py` | **Entry point:** run your forecaster on your own data (uses `custom_benchmark.py`). | +| `evaluate_existing_forecasts.py` | **Entry point:** bring your own prediction parquets, skip backtesting. | +| `compare_benchmark_runs.py` | **Entry point:** compare results from multiple runs side-by-side. | + +## Quick start + +```bash +# Install (requires uv: https://docs.astral.sh/uv/) +uv sync --all-extras --all-groups --all-packages + +# Test the example forecaster on Liander 2024 +uv run python -m examples.benchmarks.custom.run_liander2024_benchmark + +# Run with your custom data/targets +uv run python -m examples.benchmarks.custom.run_custom_benchmark +``` + +## Creating your own + +### 1. Write a forecaster + +Copy `custom_forecaster.py` and implement two methods: + +- **`fit(data)`** — called periodically with recent history. Train your model here. +- **`predict(data)`** — called every few hours. Return a `TimeSeriesDataset` with a `"load"` column and one column per quantile (e.g. `"quantile_P05"`, `"quantile_P50"`). + +The `data` argument is a `RestrictedHorizonVersionedTimeSeries` — it enforces no-lookahead by only exposing data available at `data.horizon`. + +### 2. Define a benchmark (optional) + +Copy `custom_benchmark.py` if you want to use **your own data**. Override `_get_measurements_path_for_target()` and `_get_weather_path_for_target()` to point to your parquet files. + +If you're fine with the Liander 2024 dataset, skip this step and use `create_liander2024_benchmark_runner()` directly. + +### 3. Run it + +Copy `run_custom_benchmark.py`. Register your models as forecaster factories and call `pipeline.run()`. 
+ +## Evaluating pre-existing forecasts + +If you already have predictions, place them in this layout: + +``` +benchmark_results/MyForecasts/ +└── backtest/ + └── <group_name>/ # e.g. "solar_park" + └── <target_name>/ # e.g. "Within 15 kilometers of Opmeer_normalized" + └── predictions.parquet +``` + +`group_name` and `target_name` must match the values from your targets YAML. You can list them: + +```bash +uv run python -c " +from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner +from openstef_beam.benchmarking import LocalBenchmarkStorage +from pathlib import Path +runner = create_custom_benchmark_runner(storage=LocalBenchmarkStorage(base_path=Path('./tmp'))) +for t in runner.target_provider.get_targets(['solar_park']): + print(t.group_name, '/', t.name) +" +``` + +Each `predictions.parquet` must have: + +| Column | Type | Description | +|---|---|---| +| *(index)* `timestamp` | `DatetimeIndex` | When each prediction is valid for. 15-min intervals, tz-naive UTC. | +| `available_at` | `datetime64` | When the prediction was generated (enables D-1 / lead-time filtering). | +| `quantile_P05` | `float` | 5th percentile prediction. | +| `quantile_P50` | `float` | Median prediction (**required**). | +| `quantile_P95` | `float` | 95th percentile prediction. | +| ... | `float` | One column per quantile, named with `Quantile(x).format()`. | + +Example rows: + +``` +timestamp (index) available_at quantile_P05 quantile_P50 quantile_P95 +2023-01-15 12:00:00 2023-01-14 06:00:00 0.5 1.2 2.0 +2023-01-15 12:15:00 2023-01-14 06:00:00 0.6 1.3 2.1 +``` + +Then run: + +```bash +uv run python -m examples.benchmarks.custom.evaluate_existing_forecasts +``` + +Results are written to `./benchmark_results/`. Each model gets its own subfolder with backtest predictions, evaluation scores, and analysis plots. + +## Comparing results + +After running at least two models, generate side-by-side comparison plots (global, per-group, per-target). 
The scripts automatically detect which targets are available in all runs: + +```bash +uv run python -m examples.benchmarks.custom.compare_benchmark_runs +``` + +Output (HTML plots) is saved to `./benchmark_results_comparison/`. + +```{toctree} +:maxdepth: 1 +:hidden: + +Implement a Custom Forecaster +Configure a Custom Benchmark +Run on Liander 2024 Data +Run on Your Own Data +Evaluate Existing Forecasts +Compare Multiple Runs +``` diff --git a/examples/benchmarks/custom_benchmark/__init__.py b/examples/benchmarks/custom/__init__.py similarity index 100% rename from examples/benchmarks/custom_benchmark/__init__.py rename to examples/benchmarks/custom/__init__.py diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb b/examples/benchmarks/custom/compare_benchmark_runs.ipynb new file mode 100644 index 000000000..643b159a6 --- /dev/null +++ b/examples/benchmarks/custom/compare_benchmark_runs.ipynb @@ -0,0 +1,130 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "eb06ddbf", + "metadata": {}, + "source": [ + "# Compare Benchmark Runs\n", + "\n", + "Generate side-by-side comparison plots from multiple benchmark runs.\n", + "Uses [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)\n", + "to produce global, per-group, and per-target HTML visualizations.\n", + "\n", + "**Prerequisites:** Run at least two models first (e.g. via `run_liander2024_benchmark.py`)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16bad08a", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Compare benchmark results from different runs on the Liander 2024 dataset.\n", + "\n", + "Usage:\n", + " 1. First run at least two models with run_liander2024_benchmark.py\n", + " (e.g. ExampleBaseline and GBLinear).\n", + " 2. 
Then run this script to generate side-by-side comparison plots.\n", + "\n", + "Output is saved to ./benchmark_results_comparison/liander2024/.\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0" + ] + }, + { + "cell_type": "markdown", + "id": "51bc5fb2", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Point at the result directories from your benchmark runs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3148cff", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from pathlib import Path\n", + "from typing import cast\n", + "\n", + "from openstef_beam.analysis.models import RunName\n", + "from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage\n", + "from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n", + "from openstef_beam.benchmarking.storage import BenchmarkStorage\n", + "\n", + "# One storage per run — keys are human-readable labels shown in comparison plots.\n", + "run_storages: dict[RunName, BenchmarkStorage] = {\n", + " \"ExampleBaseline\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/ExampleBaseline\")),\n", + " \"GBLinear\": LocalBenchmarkStorage(base_path=Path(\"./benchmark_results/GBLinear\")),\n", + "}\n", + "\n", + "# Check that results exist.\n", + "for name, storage in run_storages.items():\n", + " base_path = cast(LocalBenchmarkStorage, storage).base_path\n", + " if not base_path.exists():\n", + " msg = f\"Benchmark directory not found for '{name}': {base_path}. 
Run the benchmarks first.\"\n", + " raise FileNotFoundError(msg)" + ] + }, + { + "cell_type": "markdown", + "id": "1cb05b48", + "metadata": {}, + "source": [ + "## Run comparison\n", + "\n", + "The pipeline loads predictions from each run, re-evaluates them, and produces\n", + "comparison visualizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1548b09a", + "metadata": {}, + "outputs": [], + "source": [ + "# Reuse the Liander 2024 target provider.\n", + "OUTPUT_PATH = Path(\"./benchmark_results_comparison/liander2024\")\n", + "target_provider = create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", + ").target_provider\n", + "\n", + "# Run the comparison — generates global, group, and per-target HTML plots.\n", + "comparison = BenchmarkComparisonPipeline(\n", + " analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", + " target_provider=target_provider,\n", + ")\n", + "comparison.run(run_data=run_storages)" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license b/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/custom/compare_benchmark_runs.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/compare_liander2024_results.py b/examples/benchmarks/custom/compare_benchmark_runs.py similarity index 67% rename from examples/benchmarks/custom_benchmark/compare_liander2024_results.py rename to examples/benchmarks/custom/compare_benchmark_runs.py index 
f0e0ea725..ef5846375 100644 --- a/examples/benchmarks/custom_benchmark/compare_liander2024_results.py +++ b/examples/benchmarks/custom/compare_benchmark_runs.py @@ -1,3 +1,28 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Compare Benchmark Runs +# +# Generate side-by-side comparison plots from multiple benchmark runs. +# Uses [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html) +# to produce global, per-group, and per-target HTML visualizations. +# +# **Prerequisites:** Run at least two models first (e.g. via `run_liander2024_benchmark.py`). + +# %% tags=["remove-cell"] """Compare benchmark results from different runs on the Liander 2024 dataset. Usage: @@ -12,6 +37,13 @@ # # SPDX-License-Identifier: MPL-2.0 +# %% [markdown] +# ## Setup +# +# Point at the result directories from your benchmark runs. + +# %% + from pathlib import Path from typing import cast @@ -34,6 +66,13 @@ msg = f"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first." raise FileNotFoundError(msg) +# %% [markdown] +# ## Run comparison +# +# The pipeline loads predictions from each run, re-evaluates them, and produces +# comparison visualizations. + +# %% # Reuse the Liander 2024 target provider. 
OUTPUT_PATH = Path("./benchmark_results_comparison/liander2024") target_provider = create_liander2024_benchmark_runner( diff --git a/examples/benchmarks/custom/custom_benchmark.ipynb b/examples/benchmarks/custom/custom_benchmark.ipynb new file mode 100644 index 000000000..395575269 --- /dev/null +++ b/examples/benchmarks/custom/custom_benchmark.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2d7a92c5", + "metadata": {}, + "source": [ + "# Custom Benchmark Configuration\n", + "\n", + "Defines a complete benchmark: where your data lives, which metrics to compute,\n", + "and how to assemble the pipeline.\n", + "\n", + "**User story:** *\"I want to benchmark on my own data.\"*\n", + "\n", + "Copy this file and modify `MyTargetProvider` to point at your dataset.\n", + "The pipeline configuration (`create_custom_benchmark_runner`) shows all the\n", + "knobs: backtest schedule, evaluation windows, analysis visualizations.\n", + "\n", + "**See also:**\n", + "- [TargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html) — abstract interface\n", + "- [SimpleTargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html) — file-based implementation (what we extend here)\n", + "- [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — the orchestrator\n", + "- [EvaluationConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) — how predictions are sliced and scored\n", + "- [Custom Forecaster template](./custom_forecaster.ipynb) — implement your model here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "556e06da", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Example: custom benchmark with your own target provider.\n", + 
"\n", + "Shows how to extend SimpleTargetProvider to load your own data and build a\n", + "benchmark pipeline. Uses the Liander 2024 dataset as example data source --\n", + "replace paths and logic with your own.\n", + "\n", + "Expected directory layout (customize via path overrides)::\n", + "\n", + " data_dir/\n", + " ├── targets.yaml # Target definitions\n", + " ├── load_measurements/\n", + " │ └── /.parquet # Measurements per target\n", + " └── features/\n", + " └── /.parquet # Features per target (weather, etc.)\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb46c6a9", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from datetime import timedelta\n", + "from pathlib import Path\n", + "from typing import Literal, override\n", + "\n", + "from huggingface_hub import snapshot_download # pyright: ignore[reportUnknownVariableType]\n", + "from pydantic import Field\n", + "\n", + "from openstef_beam.analysis import AnalysisConfig\n", + "from openstef_beam.analysis.visualizations import WindowedMetricVisualization\n", + "from openstef_beam.analysis.visualizations.grouped_target_metric_visualization import GroupedTargetMetricVisualization\n", + "from openstef_beam.analysis.visualizations.quantile_probability_visualization import QuantileProbabilityVisualization\n", + "from openstef_beam.analysis.visualizations.summary_table_visualization import SummaryTableVisualization\n", + "from openstef_beam.analysis.visualizations.timeseries_visualization import TimeSeriesVisualization\n", + "from openstef_beam.backtesting import BacktestConfig\n", + "from openstef_beam.benchmarking import BenchmarkPipeline, BenchmarkTarget, StrictExecutionCallback\n", + "from openstef_beam.benchmarking.storage.base import BenchmarkStorage\n", + "from openstef_beam.benchmarking.target_provider import 
SimpleTargetProvider\n", + "from openstef_beam.evaluation import EvaluationConfig, Window\n", + "from openstef_beam.evaluation.metric_providers import MetricProvider, RCRPSProvider, RMAEProvider\n", + "from openstef_core.types import AvailableAt, LeadTime, Quantile\n", + "\n", + "# Define your own target categories for filtering (must match group_name in targets.yaml)\n", + "type MyCategory = Literal[\"solar_park\", \"wind_park\"]" + ] + }, + { + "cell_type": "markdown", + "id": "8d6120df", + "metadata": {}, + "source": [ + "## Target Provider\n", + "\n", + "The [`TargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html)\n", + "tells BEAM where your data lives and which metrics to compute.\n", + "Here we extend [`SimpleTargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html)\n", + "which handles file-based datasets with a targets YAML + parquet files.\n", + "\n", + "**CUSTOMIZE HERE:** Change path templates, category types, and metric selection." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc28efb5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "class MyTargetProvider(SimpleTargetProvider[BenchmarkTarget, list[MyCategory]]):\n", + " \"\"\"Custom target provider -- extend SimpleTargetProvider to load your own data.\n", + "\n", + " Configure path templates and data flags, then override methods to customize\n", + " target filtering, metrics, and file resolution.\n", + " \"\"\"\n", + "\n", + " # Path templates -- adapt to your directory structure\n", + " # {name} is replaced with target.name from targets.yaml\n", + " targets_file_path: str = Field(default=\"liander2024_targets.yaml\", init=False)\n", + " measurements_path_template: str = Field(default=\"{name}.parquet\", init=False)\n", + " weather_path_template: str = Field(default=\"{name}.parquet\", init=False)\n", + "\n", + " # Disable shared profiles and prices -- only per-target features are used\n", + " # Set to True if you have shared data files (profiles.parquet, prices.parquet)\n", + " use_profiles: bool = False\n", + " use_prices: bool = False\n", + "\n", + " @override\n", + " def get_targets(self, filter_args: list[MyCategory] | None = None) -> list[BenchmarkTarget]:\n", + " \"\"\"Load targets and optionally filter by category.\n", + "\n", + " Returns:\n", + " Filtered list of benchmark targets.\n", + " \"\"\"\n", + " # super().get_targets() reads targets from the YAML file\n", + " targets = super().get_targets(filter_args)\n", + " # Keep only targets whose group_name matches one of the filter categories\n", + " if filter_args is not None:\n", + " targets = [t for t in targets if t.group_name in filter_args]\n", + " return targets\n", + "\n", + " @override\n", + " def get_metrics_for_target(self, target: BenchmarkTarget) -> list[MetricProvider]:\n", + " \"\"\"Define which metrics to compute per target.\n", + "\n", + " Returns:\n", + " List of metric providers.\n", + " \"\"\"\n", + " # rMAE: 
deterministic accuracy at the median (lower is better)\n", + " # rCRPS: probabilistic accuracy across all quantiles (lower is better)\n", + " return [\n", + " RMAEProvider(quantiles=[Quantile(0.5)], lower_quantile=0.01, upper_quantile=0.99),\n", + " RCRPSProvider(lower_quantile=0.01, upper_quantile=0.99),\n", + " ]\n", + "\n", + " @override\n", + " def _get_measurements_path_for_target(self, target: BenchmarkTarget) -> Path:\n", + " \"\"\"Resolve path to load measurement parquet.\n", + "\n", + " Liander 2024 uses: data_dir/load_measurements/<group_name>/<name>.parquet\n", + " Change this to match your directory structure.\n", + "\n", + " Returns:\n", + " Path to the measurement parquet file.\n", + " \"\"\"\n", + " return self.data_dir / \"load_measurements\" / target.group_name / f\"{target.name}.parquet\"\n", + "\n", + " @override\n", + " def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path:\n", + " \"\"\"Resolve path to features parquet (weather, etc.).\n", + "\n", + " Liander 2024 uses: data_dir/weather_forecasts_versioned/<group_name>/<name>.parquet\n", + " Change this to match your directory structure.\n", + "\n", + " Returns:\n", + " Path to the features parquet file.\n", + " \"\"\"\n", + " return self.data_dir / \"weather_forecasts_versioned\" / target.group_name / f\"{target.name}.parquet\"" + ] + }, + { + "cell_type": "markdown", + "id": "8e3c1663", + "metadata": {}, + "source": [ + "## Analysis Configuration\n", + "\n", + "Choose which visualizations and summary tables BEAM generates after evaluation.\n", + "Add or remove providers to customize the output report." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5bffcc06", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "# --- Analysis config: which plots and tables to generate after evaluation ---\n", + "ANALYSIS_CONFIG = AnalysisConfig(\n", + " visualization_providers=[\n", + " TimeSeriesVisualization(name=\"time_series\"),\n", + " WindowedMetricVisualization(\n", + " name=\"rMAE_7D\",\n", + " metric=(\"rMAE\", Quantile(0.5)),\n", + " window=Window(lag=timedelta(hours=0), size=timedelta(days=7)),\n", + " ),\n", + " WindowedMetricVisualization(\n", + " name=\"rCRPS_30D\",\n", + " metric=\"rCRPS\",\n", + " window=Window(lag=timedelta(hours=0), size=timedelta(days=30)),\n", + " ),\n", + " GroupedTargetMetricVisualization(name=\"rMAE_grouped\", metric=\"rMAE\", quantile=Quantile(0.5)),\n", + " GroupedTargetMetricVisualization(name=\"rCRPS_grouped\", metric=\"rCRPS\"),\n", + " SummaryTableVisualization(name=\"summary\"),\n", + " QuantileProbabilityVisualization(name=\"quantile_probability\"),\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9e74b7c6", + "metadata": {}, + "source": [ + "## Pipeline Assembly\n", + "\n", + "Wire everything together: backtest schedule, evaluation config, analysis, and target provider.\n", + "See [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)\n", + "and [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)\n", + "for all available options.\n", + "\n", + "**CUSTOMIZE HERE:** Adjust `predict_interval`, `train_interval`, evaluation windows, and lead times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a48b987", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def create_custom_benchmark_runner(\n", + " storage: BenchmarkStorage,\n", + " data_dir: Path | None = None,\n", + ") -> BenchmarkPipeline[BenchmarkTarget, list[MyCategory]]:\n", + " \"\"\"Assemble a benchmark pipeline with the custom target provider.\n", + "\n", + " Args:\n", + " storage: Where to save results.\n", + " data_dir: Dataset path. Downloads Liander 2024 from HuggingFace if None.\n", + "\n", + " Returns:\n", + " Ready-to-run benchmark pipeline.\n", + " \"\"\"\n", + " if data_dir is None:\n", + " data_dir = Path(snapshot_download(repo_id=\"OpenSTEF/liander2024-stef-benchmark\", repo_type=\"dataset\"))\n", + "\n", + " return BenchmarkPipeline[BenchmarkTarget, list[MyCategory]](\n", + " # Backtest: how to replay history\n", + " backtest_config=BacktestConfig(\n", + " prediction_sample_interval=timedelta(minutes=15), # Data resolution\n", + " predict_interval=timedelta(hours=6), # New forecast every 6 hours\n", + " train_interval=timedelta(days=7), # Retrain model every 7 days\n", + " ),\n", + " # Evaluation: how to slice and score the results\n", + " evaluation_config=EvaluationConfig(\n", + " available_ats=[AvailableAt.from_string(\"D-1T06:00\")], # Day-ahead forecast at 06:00\n", + " lead_times=[\n", + " LeadTime.from_string(\"P1D\"), # 1 day ahead\n", + " ], # Evaluate all lead times\n", + " windows=[ # Rolling windows for metrics\n", + " Window(lag=timedelta(hours=0), size=timedelta(days=7)),\n", + " Window(lag=timedelta(hours=0), size=timedelta(days=30)),\n", + " ],\n", + " ),\n", + " analysis_config=ANALYSIS_CONFIG,\n", + " target_provider=MyTargetProvider(data_dir=data_dir),\n", + " storage=storage,\n", + " callbacks=[StrictExecutionCallback()], # Fail fast on errors\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 
3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/custom_benchmark.ipynb.license b/examples/benchmarks/custom/custom_benchmark.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/custom/custom_benchmark.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/example_benchmark.py b/examples/benchmarks/custom/custom_benchmark.py similarity index 73% rename from examples/benchmarks/custom_benchmark/example_benchmark.py rename to examples/benchmarks/custom/custom_benchmark.py index 655756199..62e7aee70 100644 --- a/examples/benchmarks/custom_benchmark/example_benchmark.py +++ b/examples/benchmarks/custom/custom_benchmark.py @@ -1,3 +1,38 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Custom Benchmark Configuration +# +# Defines a complete benchmark: where your data lives, which metrics to compute, +# and how to assemble the pipeline. +# +# **User story:** *"I want to benchmark on my own data."* +# +# Copy this file and modify `MyTargetProvider` to point at your dataset. +# The pipeline configuration (`create_custom_benchmark_runner`) shows all the +# knobs: backtest schedule, evaluation windows, analysis visualizations. 
+# +# **See also:** +# - [TargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html) — abstract interface +# - [SimpleTargetProvider](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html) — file-based implementation (what we extend here) +# - [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — the orchestrator +# - [EvaluationConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) — how predictions are sliced and scored +# - [Custom Forecaster template](./custom_forecaster.ipynb) — implement your model here + +# %% tags=["remove-cell"] """Example: custom benchmark with your own target provider. Shows how to extend SimpleTargetProvider to load your own data and build a @@ -18,6 +53,8 @@ # # SPDX-License-Identifier: MPL-2.0 +# %% + from datetime import timedelta from pathlib import Path from typing import Literal, override @@ -42,6 +79,18 @@ # Define your own target categories for filtering (must match group_name in targets.yaml) type MyCategory = Literal["solar_park", "wind_park"] +# %% [markdown] +# ## Target Provider +# +# The [`TargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.TargetProvider.html) +# tells BEAM where your data lives and which metrics to compute. +# Here we extend [`SimpleTargetProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.target_provider.SimpleTargetProvider.html) +# which handles file-based datasets with a targets YAML + parquet files. +# +# **CUSTOMIZE HERE:** Change path templates, category types, and metric selection. + +# %% + class MyTargetProvider(SimpleTargetProvider[BenchmarkTarget, list[MyCategory]]): """Custom target provider -- extend SimpleTargetProvider to load your own data. 
@@ -114,6 +163,13 @@ def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path: return self.data_dir / "weather_forecasts_versioned" / target.group_name / f"{target.name}.parquet" +# %% [markdown] +# ## Analysis Configuration +# +# Choose which visualizations and summary tables BEAM generates after evaluation. +# Add or remove providers to customize the output report. + +# %% # --- Analysis config: which plots and tables to generate after evaluation --- ANALYSIS_CONFIG = AnalysisConfig( visualization_providers=[ @@ -136,6 +192,19 @@ def _get_weather_path_for_target(self, target: BenchmarkTarget) -> Path: ) +# %% [markdown] +# ## Pipeline Assembly +# +# Wire everything together: backtest schedule, evaluation config, analysis, and target provider. +# See [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html) +# and [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) +# for all available options. +# +# **CUSTOMIZE HERE:** Adjust `predict_interval`, `train_interval`, evaluation windows, and lead times. 
+ +# %% + + def create_custom_benchmark_runner( storage: BenchmarkStorage, data_dir: Path | None = None, diff --git a/examples/benchmarks/custom/custom_forecaster.ipynb b/examples/benchmarks/custom/custom_forecaster.ipynb new file mode 100644 index 000000000..7beca1857 --- /dev/null +++ b/examples/benchmarks/custom/custom_forecaster.ipynb @@ -0,0 +1,176 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7c5e1252", + "metadata": {}, + "source": [ + "# Custom Forecaster Template\n", + "\n", + "Implements [`BacktestForecasterMixin`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterMixin.html)\n", + "— the interface BEAM needs to run any model in its backtesting/benchmarking pipeline.\n", + "\n", + "**User story:** *\"I want to benchmark my own model.\"*\n", + "\n", + "Copy this file and modify `fit()` and `predict()` to wrap your model.\n", + "\n", + "**See also:**\n", + "- [BacktestForecasterConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) — scheduling settings\n", + "- [RestrictedHorizonVersionedTimeSeries](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.RestrictedHorizonVersionedTimeSeries.html) — the data view passed to `fit()` and `predict()`\n", + "- [Backtesting quickstart tutorial](../../tutorials/backtesting_quickstart.ipynb) — introduction to backtesting concepts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f44c64a2", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Custom baseline: predicts a constant value (last known median) for all future timestamps.\n", + "\n", + "Implements BacktestForecasterMixin — the interface BEAM needs to run any model\n", + "in its backtesting/benchmarking pipeline. 
To create your own baseline, copy this\n", + "file and modify fit() and predict().\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "045ccd48", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from datetime import timedelta\n", + "from typing import override\n", + "\n", + "import pandas as pd\n", + "\n", + "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig, BacktestForecasterMixin\n", + "from openstef_beam.backtesting.restricted_horizon_timeseries import RestrictedHorizonVersionedTimeSeries\n", + "from openstef_core.datasets import TimeSeriesDataset\n", + "from openstef_core.types import Q, Quantile" + ] + }, + { + "cell_type": "markdown", + "id": "067a5f9b", + "metadata": {}, + "source": [ + "## The `BacktestForecasterMixin` interface\n", + "\n", + "Your forecaster must implement:\n", + "- `config` — a [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) that tells BEAM how to schedule training and prediction\n", + "- `quantiles` — which probabilistic bands to produce (e.g. `[Q(0.05), Q(0.5), Q(0.95)]`)\n", + "- `fit(data)` — train your model on restricted-horizon data (no lookahead)\n", + "- `predict(data)` → `TimeSeriesDataset | None` — produce a forecast\n", + "\n", + "BEAM calls `fit()` at `train_interval` spacing, and `predict()` at `predict_interval` spacing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94bee2f3", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "class ExampleBenchmarkForecaster(BacktestForecasterMixin):\n", + " \"\"\"Predicts a constant median of recent history for all future timestamps.\n", + "\n", + " All quantile columns get the same value -- no uncertainty estimation.\n", + " \"\"\"\n", + "\n", + " def __init__(self, predict_quantiles: list[Quantile] | None = None) -> None: # noqa: D107\n", + " # Quantiles define the probabilistic forecast bands (e.g. P05 = 5th percentile)\n", + " self._quantiles = predict_quantiles or [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n", + " self._median: float = 0.0\n", + "\n", + " # BacktestForecasterConfig tells BEAM how to schedule training and prediction\n", + " self.config = BacktestForecasterConfig(\n", + " requires_training=True, # Call fit() before predict()\n", + " predict_length=timedelta(days=7), # How far ahead to forecast\n", + " predict_min_length=timedelta(minutes=15),\n", + " predict_context_length=timedelta(minutes=15), # Data needed before forecast start (>0)\n", + " predict_context_min_coverage=0.0,\n", + " training_context_length=timedelta(days=30), # How much history fit() sees\n", + " training_context_min_coverage=0.3, # Min 30% non-NaN data required\n", + " predict_sample_interval=timedelta(minutes=15), # Output resolution (15-min intervals)\n", + " )\n", + "\n", + " @property\n", + " @override\n", + " def quantiles(self) -> list[Quantile]:\n", + " \"\"\"Quantiles this forecaster produces.\"\"\"\n", + " return self._quantiles\n", + "\n", + " @override\n", + " def fit(self, data: RestrictedHorizonVersionedTimeSeries) -> None:\n", + " \"\"\"Compute median of recent load data.\n", + "\n", + " Args:\n", + " data: Restricted-horizon view -- only sees data available at data.horizon.\n", + " \"\"\"\n", + " # data.horizon = the current point in time during backtesting\n", + " # get_window() 
returns only data that was available at that point (no lookahead)\n", + " training = data.get_window(\n", + " start=data.horizon - self.config.training_context_length, # 30 days before horizon\n", + " end=data.horizon,\n", + " available_before=data.horizon, # Ensures no future data leaks in\n", + " )\n", + " # \"load\" is the target column (actual energy consumption/generation)\n", + " if \"load\" in training.data.columns:\n", + " self._median = float(training.data[\"load\"].median())\n", + "\n", + " @override\n", + " def predict(self, data: RestrictedHorizonVersionedTimeSeries) -> TimeSeriesDataset | None:\n", + " \"\"\"Return constant median prediction for the forecast horizon.\n", + "\n", + " Returns:\n", + " Forecast with all quantiles set to the training median, or None on failure.\n", + " \"\"\"\n", + " # Build a DataFrame with \"load\" + one column per quantile (e.g. \"quantile_P05\")\n", + " # All values are the same constant (the median from fit())\n", + " # q.format() converts Q(0.05) -> \"quantile_P05\" (the required column naming)\n", + " return TimeSeriesDataset(\n", + " data=pd.DataFrame(\n", + " data={\"load\": self._median} | {q.format(): self._median for q in self._quantiles},\n", + " index=pd.DatetimeIndex(\n", + " pd.date_range(\n", + " data.horizon,\n", + " periods=int(self.config.predict_length / self.config.predict_sample_interval),\n", + " freq=self.config.predict_sample_interval,\n", + " ),\n", + " name=\"datetime\",\n", + " ),\n", + " ),\n", + " sample_interval=self.config.predict_sample_interval,\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/custom_forecaster.ipynb.license b/examples/benchmarks/custom/custom_forecaster.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ 
b/examples/benchmarks/custom/custom_forecaster.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/example_baseline.py b/examples/benchmarks/custom/custom_forecaster.py similarity index 68% rename from examples/benchmarks/custom_benchmark/example_baseline.py rename to examples/benchmarks/custom/custom_forecaster.py index ed3732e89..2dda468a4 100644 --- a/examples/benchmarks/custom_benchmark/example_baseline.py +++ b/examples/benchmarks/custom/custom_forecaster.py @@ -1,3 +1,34 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Custom Forecaster Template +# +# Implements [`BacktestForecasterMixin`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterMixin.html) +# — the interface BEAM needs to run any model in its backtesting/benchmarking pipeline. +# +# **User story:** *"I want to benchmark my own model."* +# +# Copy this file and modify `fit()` and `predict()` to wrap your model. +# +# **See also:** +# - [BacktestForecasterConfig](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) — scheduling settings +# - [RestrictedHorizonVersionedTimeSeries](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.RestrictedHorizonVersionedTimeSeries.html) — the data view passed to `fit()` and `predict()` +# - [Backtesting quickstart tutorial](../../tutorials/backtesting_quickstart.ipynb) — introduction to backtesting concepts + +# %% tags=["remove-cell"] """Custom baseline: predicts a constant value (last known median) for all future timestamps. 
Implements BacktestForecasterMixin — the interface BEAM needs to run any model @@ -9,6 +40,8 @@ # # SPDX-License-Identifier: MPL-2.0 +# %% + from datetime import timedelta from typing import override @@ -19,6 +52,19 @@ from openstef_core.datasets import TimeSeriesDataset from openstef_core.types import Q, Quantile +# %% [markdown] +# ## The `BacktestForecasterMixin` interface +# +# Your forecaster must implement: +# - `config` — a [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestForecasterConfig.html) that tells BEAM how to schedule training and prediction +# - `quantiles` — which probabilistic bands to produce (e.g. `[Q(0.05), Q(0.5), Q(0.95)]`) +# - `fit(data)` — train your model on restricted-horizon data (no lookahead) +# - `predict(data)` → `TimeSeriesDataset | None` — produce a forecast +# +# BEAM calls `fit()` at `train_interval` spacing, and `predict()` at `predict_interval` spacing. + +# %% + class ExampleBenchmarkForecaster(BacktestForecasterMixin): """Predicts a constant median of recent history for all future timestamps. diff --git a/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb new file mode 100644 index 000000000..15a90b2ad --- /dev/null +++ b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb @@ -0,0 +1,241 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "24b1f426", + "metadata": {}, + "source": [ + "# Evaluate Existing Forecasts\n", + "\n", + "Skip backtesting entirely — bring your own prediction parquets and run only\n", + "evaluation + analysis.\n", + "\n", + "**User story:** *\"I already have forecasts from my own system. 
I just want to\n", + "score them with BEAM's metrics and visualizations.\"*\n", + "\n", + "**See also:**\n", + "- [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — auto-detects existing predictions and skips backtesting\n", + "- [Custom Benchmark configuration](./custom_benchmark.ipynb) — defines which targets and metrics to use\n", + "- [Quantile naming convention](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html) — `Quantile(x).format()` → column names\n", + "\n", + "## Expected directory layout\n", + "\n", + "```\n", + "benchmark_results/MyForecasts/\n", + "└── backtest/\n", + " └── <group_name>/ # e.g. \"solar_park\"\n", + " └── <target_name>/ # e.g. \"Within 15 kilometers of Opmeer_normalized\"\n", + " └── predictions.parquet\n", + "```\n", + "\n", + "## Expected parquet format\n", + "\n", + "| Column | Type | Description |\n", + "|--------|------|-------------|\n", + "| *index* | `DatetimeIndex` (name=\"timestamp\", tz-naive UTC, 15-min) | Forecast timestamp |\n", + "| `available_at` | datetime | When the prediction was generated |\n", + "| `quantile_P05` | float | 5th percentile |\n", + "| `quantile_P50` | float | Median (required) |\n", + "| `quantile_P95` | float | 95th percentile |\n", + "| ... | float | One column per quantile via `Quantile(x).format()` |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "768a9cb8", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Evaluate pre-existing forecasts without running backtesting.\n", + "\n", + "If you already have forecast predictions (e.g. from your own model or an external\n", + "system), you can point the benchmark pipeline at them and run only the evaluation\n", + "and analysis steps.\n", + "\n", + "How it works:\n", + " 1. Place your prediction parquet files in the expected directory layout (see below).\n", + " 2. 
Run this script — the pipeline detects existing backtest output and\n", + " automatically skips to evaluation + analysis.\n", + "\n", + "Expected directory layout::\n", + "\n", + " benchmark_results/MyForecasts/\n", + " └── backtest/\n", + " └── <group_name>/ # e.g. \"solar_park\"\n", + " └── <target_name>/ # e.g. \"Within 15 kilometers of Opmeer_normalized\"\n", + " └── predictions.parquet\n", + "\n", + "Expected parquet format::\n", + "\n", + " Index: pd.DatetimeIndex (name=\"timestamp\", tz-naive UTC, 15-min intervals)\n", + " Columns:\n", + " - \"available_at\" (datetime) — when the prediction was generated\n", + " - \"quantile_P05\" (float) — 5th percentile prediction\n", + " - \"quantile_P50\" (float) — median prediction (REQUIRED)\n", + " - \"quantile_P95\" (float) — 95th percentile prediction\n", + " - ...one column per quantile, named with Quantile(x).format()\n", + "\n", + "Example row::\n", + "\n", + " timestamp (index) available_at quantile_P05 quantile_P50 quantile_P95\n", + " 2023-01-15 12:00:00 2023-01-14 06:00:00 0.5 1.2 2.0\n", + "\n", + "You can list the expected target names and group names by checking the targets.yaml\n", + "in your dataset, or by running::\n", + "\n", + " runner = create_custom_benchmark_runner()\n", + " for t in runner.target_provider.get_targets([\"solar_park\"]):\n", + " print(t.group_name, t.name)\n", + "\n", + "The pipeline still needs a \"forecaster factory\" to know which quantiles were used,\n", + "but fit() and predict() are never called. 
We use DummyForecaster for this.\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "import os\n", + "\n", + "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"MKL_NUM_THREADS\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "990fa931", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f4f5f17", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import multiprocessing\n", + "from pathlib import Path\n", + "\n", + "from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner\n", + "from openstef_beam.backtesting.backtest_forecaster import DummyForecaster\n", + "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage\n", + "from openstef_core.types import Q\n", + "\n", + "_logger = logging.getLogger(__name__)\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" + ] + }, + { + "cell_type": "markdown", + "id": "51fbc379", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Point at the folder containing your prediction parquets and list the quantiles\n", + "they were generated for." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9f94d32", + "metadata": {}, + "outputs": [], + "source": [ + "# Path to the folder that contains the backtest/ directory with your parquets.\n", + "OUTPUT_PATH = Path(\"./benchmark_results/MyForecasts\")\n", + "N_PROCESSES = multiprocessing.cpu_count()\n", + "\n", + "# Quantiles your forecasts were generated for (must include 0.5 = median).\n", + "# Adjust this list to match whatever quantiles are in your parquet columns.\n", + "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]" + ] + }, + { + "cell_type": "markdown", + "id": "cb9a09e1", + "metadata": {}, + "source": [ + "## Dummy forecaster factory\n", + "\n", + "The pipeline still needs a factory to know which quantiles were used, but\n", + "`fit()` and `predict()` are never called — backtesting is skipped." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d47f17d", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyForecaster:\n", + " \"\"\"Factory that returns a DummyForecaster (backtesting is skipped).\n", + "\n", + " DummyForecaster provides quantile info to the pipeline but never runs\n", + " fit() or predict() since backtest output already exists on disk.\n", + "\n", + " Returns:\n", + " DummyForecaster with the configured quantiles.\n", + " \"\"\"\n", + " return DummyForecaster(predict_quantiles=PREDICTION_QUANTILES)" + ] + }, + { + "cell_type": "markdown", + "id": "09907e33", + "metadata": {}, + "source": [ + "## Run evaluation\n", + "\n", + "The pipeline reads existing parquets and runs evaluation + analysis only." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d61f1e93", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " storage = LocalBenchmarkStorage(base_path=OUTPUT_PATH)\n", + "\n", + " runner = create_custom_benchmark_runner(storage=storage)\n", + "\n", + " runner.run(\n", + " forecaster_factory=stub_factory,\n", + " run_name=\"my_forecasts\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=[\"solar_park\"],\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/custom/evaluate_existing_forecasts.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py b/examples/benchmarks/custom/evaluate_existing_forecasts.py similarity index 59% rename from examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py rename to examples/benchmarks/custom/evaluate_existing_forecasts.py index c1a4dbaeb..348d1af46 100644 --- a/examples/benchmarks/custom_benchmark/evaluate_existing_forecasts.py +++ b/examples/benchmarks/custom/evaluate_existing_forecasts.py @@ -1,3 +1,54 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Evaluate Existing Forecasts +# +# Skip backtesting entirely — bring your own prediction 
parquets and run only +# evaluation + analysis. +# +# **User story:** *"I already have forecasts from my own system. I just want to +# score them with BEAM's metrics and visualizations."* +# +# **See also:** +# - [BenchmarkPipeline](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkPipeline.html) — auto-detects existing predictions and skips backtesting +# - [Custom Benchmark configuration](./custom_benchmark.ipynb) — defines which targets and metrics to use +# - [Quantile naming convention](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html) — `Quantile(x).format()` → column names +# +# ## Expected directory layout +# +# ``` +# benchmark_results/MyForecasts/ +# └── backtest/ +# └── <group_name>/ # e.g. "solar_park" +# └── <target_name>/ # e.g. "Within 15 kilometers of Opmeer_normalized" +# └── predictions.parquet +# ``` +# +# ## Expected parquet format +# +# | Column | Type | Description | +# |--------|------|-------------| +# | *index* | `DatetimeIndex` (name="timestamp", tz-naive UTC, 15-min) | Forecast timestamp | +# | `available_at` | datetime | When the prediction was generated | +# | `quantile_P05` | float | 5th percentile | +# | `quantile_P50` | float | Median (required) | +# | `quantile_P95` | float | 95th percentile | +# | ... | float | One column per quantile via `Quantile(x).format()` | + +# %% tags=["remove-cell"] """Evaluate pre-existing forecasts without running backtesting. If you already have forecast predictions (e.g. 
from your own model or an external @@ -47,24 +98,36 @@ # # SPDX-License-Identifier: MPL-2.0 +import os + +os.environ["OMP_NUM_THREADS"] = "1" +os.environ["OPENBLAS_NUM_THREADS"] = "1" +os.environ["MKL_NUM_THREADS"] = "1" + +# %% [markdown] +# ## Setup + +# %% import logging import multiprocessing -import os from pathlib import Path -from examples.benchmarks.custom_benchmark.example_benchmark import create_custom_benchmark_runner +from examples.benchmarks.custom.custom_benchmark import create_custom_benchmark_runner from openstef_beam.backtesting.backtest_forecaster import DummyForecaster from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage from openstef_core.types import Q -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" - _logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") +# %% [markdown] +# ## Configuration +# +# Point at the folder containing your prediction parquets and list the quantiles +# they were generated for. + +# %% # Path to the folder that contains the backtest/ directory with your parquets. OUTPUT_PATH = Path("./benchmark_results/MyForecasts") N_PROCESSES = multiprocessing.cpu_count() @@ -73,6 +136,14 @@ # Adjust this list to match whatever quantiles are in your parquet columns. PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)] +# %% [markdown] +# ## Dummy forecaster factory +# +# The pipeline still needs a factory to know which quantiles were used, but +# `fit()` and `predict()` are never called — backtesting is skipped. + +# %% + def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyForecaster: """Factory that returns a DummyForecaster (backtesting is skipped). 
@@ -86,16 +157,17 @@ def stub_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> DummyF return DummyForecaster(predict_quantiles=PREDICTION_QUANTILES) +# %% [markdown] +# ## Run evaluation +# +# The pipeline reads existing parquets and runs evaluation + analysis only. + +# %% if __name__ == "__main__": - # Point the storage at your results folder. - # The pipeline reads parquets from: - # OUTPUT_PATH / backtest / / / predictions.parquet storage = LocalBenchmarkStorage(base_path=OUTPUT_PATH) runner = create_custom_benchmark_runner(storage=storage) - # Run the pipeline — backtesting is auto-skipped for every target that - # already has a predictions.parquet on disk. runner.run( forecaster_factory=stub_factory, run_name="my_forecasts", diff --git a/examples/benchmarks/custom/run_custom_benchmark.ipynb b/examples/benchmarks/custom/run_custom_benchmark.ipynb new file mode 100644 index 000000000..c6e10acac --- /dev/null +++ b/examples/benchmarks/custom/run_custom_benchmark.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f7f9fc26", + "metadata": {}, + "source": [ + "# Run Custom Benchmark\n", + "\n", + "Entry point: run your custom forecaster on your own data using the pipeline\n", + "configured in [`custom_benchmark.py`](./custom_benchmark.ipynb).\n", + "\n", + "**See also:**\n", + "- [Custom Forecaster template](./custom_forecaster.ipynb) — define your model\n", + "- [Custom Benchmark configuration](./custom_benchmark.ipynb) — configure targets and metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "769fc12e", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Run the custom benchmark: example baseline vs OpenSTEF GBLinear.\n", + "\n", + "Uses the custom benchmark pipeline from example_benchmark.py (which extends\n", + "SimpleTargetProvider) instead of the built-in Liander 2024 runner.\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 
Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "import os\n", + "\n", + "# Prevent thread contention when running multiple targets in parallel\n", + "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"MKL_NUM_THREADS\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "b474c24a", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65322a90", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import logging\n", + "import multiprocessing\n", + "from pathlib import Path\n", + "\n", + "from examples.benchmarks.custom.custom_benchmark import MyCategory, create_custom_benchmark_runner\n", + "from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster\n", + "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage\n", + "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" + ] + }, + { + "cell_type": "markdown", + "id": "e8298b0b", + "metadata": {}, + "source": [ + "## Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a517fc9", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "OUTPUT_PATH = Path(\"./benchmark_results\")\n", + "N_PROCESSES = multiprocessing.cpu_count()\n", + "\n", + "# Optional: filter to specific target categories (None = run all)\n", + "BENCHMARK_FILTER: list[MyCategory] | None = [\"solar_park\"]\n", + "\n", + "# Quantiles define the probabilistic forecast bands\n", + "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n", + 
"\n", + "# --- GBLinear config ---\n", + "# Map column names in your data to what OpenSTEF expects\n", + "gblinear_config = ForecastingWorkflowConfig(\n", + " model_id=\"custom_benchmark_\",\n", + " run_name=None,\n", + " model=\"gblinear\",\n", + " horizons=[LeadTime.from_string(\"P3D\")],\n", + " quantiles=PREDICTION_QUANTILES,\n", + " model_reuse_enable=True,\n", + " radiation_column=\"shortwave_radiation\",\n", + " wind_speed_column=\"wind_speed_80m\",\n", + " pressure_column=\"surface_pressure\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " energy_price_column=\"EPEX_NL\",\n", + " rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "4b965092", + "metadata": {}, + "source": [ + "## Forecaster factory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "486c3ec8", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "# --- Example baseline factory ---\n", + "def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:\n", + " \"\"\"Create an example forecaster for a benchmark target.\n", + "\n", + " Returns:\n", + " Configured ExampleBenchmarkForecaster instance.\n", + " \"\"\"\n", + " return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)" + ] + }, + { + "cell_type": "markdown", + "id": "51c6bd6a", + "metadata": {}, + "source": [ + "## Run benchmark\n", + "\n", + "Run the custom baseline and GBLinear on your data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5f7bbe2", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " # 1. 
Run example baseline using the custom benchmark pipeline\n", + " create_custom_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"ExampleBaseline\"),\n", + " ).run(\n", + " forecaster_factory=example_factory,\n", + " run_name=\"example_baseline\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )\n", + "\n", + " # 2. Run GBLinear using the same custom pipeline\n", + " create_custom_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"GBLinear\"),\n", + " ).run(\n", + " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", + " workflow_config=gblinear_config,\n", + " cache_dir=OUTPUT_PATH / \"cache\",\n", + " ),\n", + " run_name=\"gblinear\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/run_custom_benchmark.ipynb.license b/examples/benchmarks/custom/run_custom_benchmark.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/custom/run_custom_benchmark.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/run_benchmark.py b/examples/benchmarks/custom/run_custom_benchmark.py similarity index 74% rename from examples/benchmarks/custom_benchmark/run_benchmark.py rename to examples/benchmarks/custom/run_custom_benchmark.py index e744cf5b9..2e0aa3739 100644 --- a/examples/benchmarks/custom_benchmark/run_benchmark.py +++ b/examples/benchmarks/custom/run_custom_benchmark.py @@ -1,3 +1,29 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# 
format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Run Custom Benchmark +# +# Entry point: run your custom forecaster on your own data using the pipeline +# configured in [`custom_benchmark.py`](./custom_benchmark.ipynb). +# +# **See also:** +# - [Custom Forecaster template](./custom_forecaster.ipynb) — define your model +# - [Custom Benchmark configuration](./custom_benchmark.ipynb) — configure targets and metrics + +# %% tags=["remove-cell"] """Run the custom benchmark: example baseline vs OpenSTEF GBLinear. Uses the custom benchmark pipeline from example_benchmark.py (which extends @@ -15,12 +41,17 @@ os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" +# %% [markdown] +# ## Setup + +# %% + import logging import multiprocessing from pathlib import Path -from examples.benchmarks.custom_benchmark.example_baseline import ExampleBenchmarkForecaster -from examples.benchmarks.custom_benchmark.example_benchmark import MyCategory, create_custom_benchmark_runner +from examples.benchmarks.custom.custom_benchmark import MyCategory, create_custom_benchmark_runner +from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster from openstef_core.types import LeadTime, Q @@ -28,6 +59,10 @@ logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") +# %% [markdown] +# ## Configuration + +# %% OUTPUT_PATH = Path("./benchmark_results") N_PROCESSES = multiprocessing.cpu_count() @@ -56,6 +91,12 @@ ) +# %% [markdown] +# ## Forecaster factory + +# %% + + # --- Example baseline factory --- def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> 
ExampleBenchmarkForecaster: """Create an example forecaster for a benchmark target. @@ -66,6 +107,12 @@ def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> Exa return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES) +# %% [markdown] +# ## Run benchmark +# +# Run the custom baseline and GBLinear on your data. + +# %% if __name__ == "__main__": # 1. Run example baseline using the custom benchmark pipeline create_custom_benchmark_runner( diff --git a/examples/benchmarks/custom/run_liander2024_benchmark.ipynb b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb new file mode 100644 index 000000000..6f7993f85 --- /dev/null +++ b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "05ca0933", + "metadata": {}, + "source": [ + "# Run Liander 2024 Benchmark (Custom Forecaster)\n", + "\n", + "Entry point: test your custom forecaster on the built-in\n", + "[Liander 2024 dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)\n", + "(auto-downloaded from HuggingFace).\n", + "\n", + "Uses [`create_liander2024_benchmark_runner()`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.benchmarks.liander2024.html)\n", + "which pre-configures backtest settings, evaluation windows, metrics, and target definitions.\n", + "\n", + "**See also:** [Custom Forecaster template](./custom_forecaster.ipynb) — define your model here." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8e9d860", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "\"\"\"Example: run the built-in Liander 2024 benchmark with a custom baseline and GBLinear.\n", + "\n", + "Uses create_liander2024_benchmark_runner() which pre-configures everything:\n", + "backtest settings, evaluation windows, metrics, analysis plots, and target\n", + "definitions. 
Data is auto-downloaded from HuggingFace.\n", + "\"\"\"\n", + "\n", + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "import os\n", + "\n", + "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"MKL_NUM_THREADS\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "1aa6b9e9", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a8b1f5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import logging\n", + "import multiprocessing\n", + "from pathlib import Path\n", + "\n", + "from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster\n", + "from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage, StrictExecutionCallback\n", + "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" + ] + }, + { + "cell_type": "markdown", + "id": "ae1d69fc", + "metadata": {}, + "source": [ + "## Configuration\n", + "\n", + "Define output paths, quantiles, and the GBLinear model config." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e115ded", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [], + "source": [ + "OUTPUT_PATH = Path(\"./benchmark_results\")\n", + "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", str(multiprocessing.cpu_count())))\n", + "\n", + "# Optional: filter to specific target categories (None = run all)\n", + "BENCHMARK_FILTER: list[Liander2024Category] | None = None\n", + "\n", + "# Quantiles define the probabilistic forecast bands\n", + "# Q(0.05) = 5th percentile, Q(0.5) = median, Q(0.95) = 95th percentile\n", + "PREDICTION_QUANTILES = [Q(0.05), Q(0.1), Q(0.3), Q(0.5), Q(0.7), Q(0.9), Q(0.95)]\n", + "\n", + "# --- GBLinear model config ---\n", + "# Map column names in your data to what OpenSTEF expects\n", + "gblinear_config = ForecastingWorkflowConfig(\n", + " model_id=\"liander_benchmark_\",\n", + " run_name=None,\n", + " model=\"gblinear\",\n", + " horizons=[LeadTime.from_string(\"P3D\")],\n", + " quantiles=PREDICTION_QUANTILES,\n", + " model_reuse_enable=True,\n", + " radiation_column=\"shortwave_radiation\",\n", + " wind_speed_column=\"wind_speed_80m\",\n", + " pressure_column=\"surface_pressure\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " energy_price_column=\"EPEX_NL\",\n", + " rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "96e99ff0", + "metadata": {}, + "source": [ + "## Forecaster factory\n", + "\n", + "The benchmark pipeline calls this function once per target. Return your custom forecaster." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cfa35952", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster:\n", + " \"\"\"Create the example baseline forecaster.\n", + "\n", + " Returns:\n", + " Configured ExampleBenchmarkForecaster instance.\n", + " \"\"\"\n", + " return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES)" + ] + }, + { + "cell_type": "markdown", + "id": "9ed22818", + "metadata": {}, + "source": [ + "## Run benchmark\n", + "\n", + "Run the custom baseline and GBLinear on all Liander 2024 targets.\n", + "Results are saved to `./benchmark_results/<model_name>/`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d62c5aef", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " # 1. Run custom baseline on Liander 2024\n", + " # create_liander2024_benchmark_runner() sets up everything: data download, configs, metrics\n", + " # LocalBenchmarkStorage writes results as parquet files to disk\n", + " create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"ExampleBaseline\"),\n", + " callbacks=[StrictExecutionCallback()], # Fail fast on errors\n", + " ).run(\n", + " forecaster_factory=example_factory, # Your model factory (called per target)\n", + " run_name=\"example_baseline\", # Label for this run\n", + " n_processes=N_PROCESSES, # Parallel targets\n", + " filter_args=BENCHMARK_FILTER, # None = all categories\n", + " )\n", + "\n", + " # 2.
Run GBLinear on Liander 2024\n", + " # create_openstef4_preset_backtest_forecaster returns a factory that wraps OpenSTEF models\n", + " create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / \"GBLinear\"),\n", + " callbacks=[StrictExecutionCallback()],\n", + " ).run(\n", + " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", + " workflow_config=gblinear_config,\n", + " cache_dir=OUTPUT_PATH / \"cache\",\n", + " ),\n", + " run_name=\"gblinear\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/custom/run_liander2024_benchmark.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py b/examples/benchmarks/custom/run_liander2024_benchmark.py similarity index 70% rename from examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py rename to examples/benchmarks/custom/run_liander2024_benchmark.py index 0e9918c8d..bc5524d87 100644 --- a/examples/benchmarks/custom_benchmark/run_liander2024_benchmark.py +++ b/examples/benchmarks/custom/run_liander2024_benchmark.py @@ -1,3 +1,31 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Run 
Liander 2024 Benchmark (Custom Forecaster) +# +# Entry point: test your custom forecaster on the built-in +# [Liander 2024 dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark) +# (auto-downloaded from HuggingFace). +# +# Uses [`create_liander2024_benchmark_runner()`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.benchmarks.liander2024.html) +# which pre-configures backtest settings, evaluation windows, metrics, and target definitions. +# +# **See also:** [Custom Forecaster template](./custom_forecaster.ipynb) — define your model here. + +# %% tags=["remove-cell"] """Example: run the built-in Liander 2024 benchmark with a custom baseline and GBLinear. Uses create_liander2024_benchmark_runner() which pre-configures everything: @@ -15,11 +43,16 @@ os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" +# %% [markdown] +# ## Setup + +# %% + import logging import multiprocessing from pathlib import Path -from examples.benchmarks.custom_benchmark.example_baseline import ExampleBenchmarkForecaster +from examples.benchmarks.custom.custom_forecaster import ExampleBenchmarkForecaster from openstef_beam.benchmarking import BenchmarkContext, BenchmarkTarget, LocalBenchmarkStorage, StrictExecutionCallback from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner @@ -28,8 +61,14 @@ logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") +# %% [markdown] +# ## Configuration +# +# Define output paths, quantiles, and the GBLinear model config. 
+ +# %% OUTPUT_PATH = Path("./benchmark_results") -N_PROCESSES = multiprocessing.cpu_count() +N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", str(multiprocessing.cpu_count()))) # Optional: filter to specific target categories (None = run all) BENCHMARK_FILTER: list[Liander2024Category] | None = None @@ -57,6 +96,14 @@ ) +# %% [markdown] +# ## Forecaster factory +# +# The benchmark pipeline calls this function once per target. Return your custom forecaster. + +# %% + + def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> ExampleBenchmarkForecaster: """Create the example baseline forecaster. @@ -66,6 +113,13 @@ def example_factory(_context: BenchmarkContext, _target: BenchmarkTarget) -> Exa return ExampleBenchmarkForecaster(predict_quantiles=PREDICTION_QUANTILES) +# %% [markdown] +# ## Run benchmark +# +# Run the custom baseline and GBLinear on all Liander 2024 targets. +# Results are saved to `./benchmark_results/<model_name>/`. + +# %% if __name__ == "__main__": # 1. Run custom baseline on Liander 2024 # create_liander2024_benchmark_runner() sets up everything: data download, configs, metrics diff --git a/examples/benchmarks/custom_benchmark/README.md b/examples/benchmarks/custom_benchmark/README.md deleted file mode 100644 index 54db5a672..000000000 --- a/examples/benchmarks/custom_benchmark/README.md +++ /dev/null @@ -1,142 +0,0 @@ - - -# Custom Benchmark Example - -End-to-end examples for running and customizing OpenSTEF **BEAM** (Backtesting, Evaluation, Analysis, Metrics) benchmarks. - -## What is BEAM? - -BEAM replays historical data day by day, trains your model, makes forecasts, and scores them -- all without data leakage. It works with any model that implements the `BacktestForecasterMixin` interface. - -## Files - -| File | What it does | -|---|---| -| `example_baseline.py` | **Start here.** A minimal forecaster that predicts the median of recent history.
Shows the `BacktestForecasterMixin` interface (`config`, `quantiles`, `fit`, `predict`). | -| `example_benchmark.py` | Defines a custom benchmark: target provider (where data lives), metrics, and pipeline assembly. Extends `SimpleTargetProvider` directly -- adapt this when you have your own data layout. | -| `run_liander2024_benchmark.py` | Runs the example baseline + GBLinear on the built-in **Liander 2024** dataset (auto-downloaded from HuggingFace). Good starting point if you just want to try things out. | -| `run_benchmark.py` | Same as above but uses the custom benchmark pipeline from `example_benchmark.py`. | -| `evaluate_existing_forecasts.py` | **Bring your own forecasts.** Points the pipeline at pre-existing prediction parquets and runs only evaluation + analysis (no backtesting). | -| `compare_liander2024_results.py` | Compare results from multiple runs on the **Liander 2024** dataset. Auto-detects which targets are available in all runs. | -| `compare_custom_results.py` | Compare results from multiple runs on the **custom** benchmark. Same auto-detection as above. | - -## Quick Start - -```bash -# 1. Clone the repo -git clone git@github.com:OpenSTEF/openstef.git -b "release/v4.0.0" -cd openstef - -# 2. Install all packages (requires uv: https://docs.astral.sh/uv/) -uv sync --all-extras --all-groups --all-packages -``` - -### Run the Liander 2024 benchmark - -Uses the built-in Liander 2024 dataset (auto-downloaded from HuggingFace). Runs the example baseline and GBLinear on all target categories. - -```bash -uv run python -m examples.benchmarks.custom_benchmark.run_liander2024_benchmark -``` - -### Run the custom benchmark - -Uses the custom target provider from `example_benchmark.py` with your own pipeline config. Runs on `solar_park` targets by default. 
- -```bash -uv run python -m examples.benchmarks.custom_benchmark.run_benchmark -``` - -### Evaluate pre-existing forecasts (no backtesting) - -If you already have predictions from your own model or external system, you can skip backtesting entirely. Place your forecast parquets in the expected directory layout and run only evaluation + analysis. - -#### Required directory layout - -``` -benchmark_results/MyForecasts/ -└── backtest/ - └── <group_name>/ # e.g. "solar_park" - └── <target_name>/ # e.g. "Within 15 kilometers of Opmeer_normalized" - └── predictions.parquet -``` - -`group_name` and `target_name` must match the values from your targets YAML. You can list them: - -```bash -uv run python -c " -from examples.benchmarks.custom_benchmark.example_benchmark import create_custom_benchmark_runner -for t in create_custom_benchmark_runner().target_provider.get_targets(['solar_park']): - print(t.group_name, '/', t.name) -" -``` - -#### Required parquet format - -Each `predictions.parquet` must have: - -| Column | Type | Description | -|---|---|---| -| *(index)* `timestamp` | `DatetimeIndex` | When each prediction is valid for. 15-min intervals, tz-naive UTC. | -| `available_at` | `datetime64` | When the prediction was generated (enables D-1 / lead-time filtering). | -| `quantile_P05` | `float` | 5th percentile prediction. | -| `quantile_P50` | `float` | Median prediction (**required**). | -| `quantile_P95` | `float` | 95th percentile prediction. | -| ... | `float` | One column per quantile, named with `Quantile(x).format()`. | - -Example rows: - -``` -timestamp (index) available_at quantile_P05 quantile_P50 quantile_P95 -2023-01-15 12:00:00 2023-01-14 06:00:00 0.5 1.2 2.0 -2023-01-15 12:15:00 2023-01-14 06:00:00 0.6 1.3 2.1 -``` - -#### Run - -```bash -uv run python -m examples.benchmarks.custom_benchmark.evaluate_existing_forecasts -``` - -See `evaluate_existing_forecasts.py` for the full script. - -Results are written to `./benchmark_results/`.
Each model gets its own subfolder with backtest predictions, evaluation scores, and analysis plots. - -### Compare results across runs - -After running at least two models, generate side-by-side comparison plots (global, per-group, per-target). The scripts automatically detect which targets are available in all runs. - -```bash -# Compare on the Liander 2024 dataset -uv run python -m examples.benchmarks.custom_benchmark.compare_liander2024_results - -# Compare on the custom benchmark -uv run python -m examples.benchmarks.custom_benchmark.compare_custom_results -``` - -Comparison output (HTML plots) is saved to `./benchmark_results_comparison/`. - -## Creating Your Own - -### 1. Write a forecaster - -Copy `example_baseline.py` and implement two methods: - -- **`fit(data)`** -- called periodically with recent history. Train your model here. -- **`predict(data)`** -- called every few hours. Return a `TimeSeriesDataset` with a `"load"` column and one column per quantile (e.g. `"quantile_P05"`, `"quantile_P50"`). - -The `data` argument is a `RestrictedHorizonVersionedTimeSeries` -- it enforces no-lookahead by only exposing data available at `data.horizon`. Use `data.get_window(start, end, available_before)` to retrieve slices. - -### 2. Define a benchmark (optional) - -Copy `example_benchmark.py` if you want to use **your own data**. The key class is `SimpleTargetProvider` -- override `_get_measurements_path_for_target()` and `_get_weather_path_for_target()` to point to your parquet files. - -If you're fine with the Liander 2024 dataset, skip this step and use `create_liander2024_benchmark_runner()` directly. - -### 3. Write a runner - -Copy `run_benchmark.py`. Register your models as forecaster factories and call `pipeline.run()`. 
diff --git a/examples/benchmarks/custom_benchmark/compare_custom_results.py b/examples/benchmarks/custom_benchmark/compare_custom_results.py deleted file mode 100644 index 692ce9b79..000000000 --- a/examples/benchmarks/custom_benchmark/compare_custom_results.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Compare benchmark results from different runs on the custom benchmark. - -Usage: - 1. First run at least two models with run_benchmark.py - (e.g. ExampleBaseline and GBLinear). - 2. Then run this script to generate side-by-side comparison plots. - -Output is saved to ./benchmark_results_comparison/custom/. -""" - -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -from pathlib import Path -from typing import cast - -from examples.benchmarks.custom_benchmark.example_benchmark import ANALYSIS_CONFIG, create_custom_benchmark_runner -from openstef_beam.analysis.models import RunName -from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage -from openstef_beam.benchmarking.storage import BenchmarkStorage - -# One storage per run — keys are human-readable labels shown in comparison plots. -run_storages: dict[RunName, BenchmarkStorage] = { - "ExampleBaseline": LocalBenchmarkStorage(base_path=Path("./benchmark_results/ExampleBaseline")), - "GBLinear": LocalBenchmarkStorage(base_path=Path("./benchmark_results/GBLinear")), -} - -# Check that results exist. -for name, storage in run_storages.items(): - base_path = cast(LocalBenchmarkStorage, storage).base_path - if not base_path.exists(): - msg = f"Benchmark directory not found for '{name}': {base_path}. Run the benchmarks first." - raise FileNotFoundError(msg) - -# Reuse the custom target provider. 
-OUTPUT_PATH = Path("./benchmark_results_comparison/custom") -target_provider = create_custom_benchmark_runner( - storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH), -).target_provider - -# Run the comparison — generates global, group, and per-target HTML plots. -comparison = BenchmarkComparisonPipeline( - analysis_config=ANALYSIS_CONFIG, - storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH), - target_provider=target_provider, -) -comparison.run(run_data=run_storages, filter_args=["solar_park"]) diff --git a/examples/benchmarks/liander2024/README.md b/examples/benchmarks/liander2024/README.md new file mode 100644 index 000000000..29dd36c9f --- /dev/null +++ b/examples/benchmarks/liander2024/README.md @@ -0,0 +1,26 @@ + + +# Liander 2024 + +Pre-made benchmarks on the [Liander 2024 STEF benchmark dataset](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark) +— an open dataset of Dutch energy grid measurements (solar, wind, consumption). + +**No code changes needed.** Pick a notebook below and run it. Data is +auto-downloaded from HuggingFace. 
+ +```bash +# Run the XGBoost + GBLinear benchmark +uv run python -m examples.benchmarks.liander2024.run_xgboost_gblinear_benchmark +``` + +```{toctree} +:maxdepth: 1 + +XGBoost & GBLinear <run_xgboost_gblinear_benchmark> +Ensemble Models <run_ensemble_benchmark> +Compare Results <compare_benchmark_runs> +``` diff --git a/examples/examples/.gitignore b/examples/benchmarks/liander2024/__init__.py similarity index 77% rename from examples/examples/.gitignore rename to examples/benchmarks/liander2024/__init__.py index 39116e399..54f9ce2f2 100644 --- a/examples/examples/.gitignore +++ b/examples/benchmarks/liander2024/__init__.py @@ -1,5 +1,4 @@ # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project # # SPDX-License-Identifier: MPL-2.0 - -mlflow_tracking* +"""Liander 2024 benchmark scripts.""" diff --git a/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb new file mode 100644 index 000000000..4cc99e83e --- /dev/null +++ b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb @@ -0,0 +1,149 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0d864501", + "metadata": {}, + "source": [ + "# Compare Benchmark Runs\n", + "\n", + "Generate side-by-side comparison plots from multiple benchmark runs on the\n", + "Liander 2024 dataset.\n", + "\n", + "**Prerequisites:** Run at least two models first (e.g. XGBoost + GBLinear via\n", + "the *XGBoost & GBLinear* notebook).\n", + "\n", + "**What this does:**\n", + "\n", + "1. Loads results from multiple model runs (each stored in its own directory)\n", + "2. Computes metrics across all targets using\n", + " [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html)\n", + "3.
Produces comparison visualizations (boxplots, ranking tables, per-target breakdowns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8fd216c5", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0" + ] + }, + { + "cell_type": "markdown", + "id": "ffb9eb1c", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Point at the result directories from your benchmark runs." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "960f4f2c", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from openstef_beam.analysis.models import RunName\n", + "from openstef_beam.benchmarking import BenchmarkComparisonPipeline, LocalBenchmarkStorage\n", + "from openstef_beam.benchmarking.benchmarks import create_liander2024_benchmark_runner\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n", + "from openstef_beam.benchmarking.storage import BenchmarkStorage\n", + "\n", + "BASE_DIR = Path()\n", + "\n", + "OUTPUT_PATH = BASE_DIR / \"./benchmark_results_comparison\"\n", + "\n", + "BENCHMARK_DIR_GBLINEAR = BASE_DIR / \"benchmark_results\" / \"GBLinear\"\n", + "BENCHMARK_DIR_XGBOOST = BASE_DIR / \"benchmark_results\" / \"XGBoost\"" + ] + }, + { + "cell_type": "markdown", + "id": "0188f84f", + "metadata": {}, + "source": [ + "## Load run results\n", + "\n", + "Each run is identified by a name and backed by a `LocalBenchmarkStorage` that\n", + "points at the directory where that model's results were saved." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4dc75226", + "metadata": {}, + "outputs": [], + "source": [ + "check_dirs = [\n", + " BENCHMARK_DIR_GBLINEAR,\n", + " BENCHMARK_DIR_XGBOOST,\n", + "]\n", + "for dir_path in check_dirs:\n", + " if not dir_path.exists():\n", + " msg = f\"Benchmark directory not found: {dir_path}. Make sure to run the benchmarks first.\"\n", + " raise FileNotFoundError(msg)\n", + "\n", + "run_storages: dict[RunName, BenchmarkStorage] = {\n", + " \"gblinear\": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_GBLINEAR),\n", + " \"xgboost\": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_XGBOOST),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "a58f6240", + "metadata": {}, + "source": [ + "## Run comparison\n", + "\n", + "The pipeline loads predictions from each run, re-evaluates them with the\n", + "Liander 2024 analysis config, and produces comparison visualizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3bdb97d", + "metadata": {}, + "outputs": [], + "source": [ + "target_provider = create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", + ").target_provider\n", + "\n", + "comparison_pipeline = BenchmarkComparisonPipeline(\n", + " analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", + " target_provider=target_provider,\n", + ")\n", + "comparison_pipeline.run(run_data=run_storages)" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ 
b/examples/benchmarks/liander2024/compare_benchmark_runs.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/liander_2024_compare_results.py b/examples/benchmarks/liander2024/compare_benchmark_runs.py similarity index 52% rename from examples/benchmarks/liander_2024_compare_results.py rename to examples/benchmarks/liander2024/compare_benchmark_runs.py index de57191fa..ffc988b99 100644 --- a/examples/benchmarks/liander_2024_compare_results.py +++ b/examples/benchmarks/liander2024/compare_benchmark_runs.py @@ -1,8 +1,45 @@ -"""Example for comparing benchmark results from different runs on the Liander 2024 dataset.""" +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Compare Benchmark Runs +# +# Generate side-by-side comparison plots from multiple benchmark runs on the +# Liander 2024 dataset. +# +# **Prerequisites:** Run at least two models first (e.g. XGBoost + GBLinear via +# the *XGBoost & GBLinear* notebook). +# +# **What this does:** +# +# 1. Loads results from multiple model runs (each stored in its own directory) +# 2. Computes metrics across all targets using +# [`BenchmarkComparisonPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.benchmarking.BenchmarkComparisonPipeline.html) +# 3. Produces comparison visualizations (boxplots, ranking tables, per-target breakdowns) + +# %% tags=["remove-cell"] # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project # # SPDX-License-Identifier: MPL-2.0 +# %% [markdown] +# ## Setup +# +# Point at the result directories from your benchmark runs. 
+ +# %% from pathlib import Path from openstef_beam.analysis.models import RunName @@ -18,6 +55,13 @@ BENCHMARK_DIR_GBLINEAR = BASE_DIR / "benchmark_results" / "GBLinear" BENCHMARK_DIR_XGBOOST = BASE_DIR / "benchmark_results" / "XGBoost" +# %% [markdown] +# ## Load run results +# +# Each run is identified by a name and backed by a `LocalBenchmarkStorage` that +# points at the directory where that model's results were saved. + +# %% check_dirs = [ BENCHMARK_DIR_GBLINEAR, BENCHMARK_DIR_XGBOOST, @@ -32,6 +76,13 @@ "xgboost": LocalBenchmarkStorage(base_path=BENCHMARK_DIR_XGBOOST), } +# %% [markdown] +# ## Run comparison +# +# The pipeline loads predictions from each run, re-evaluates them with the +# Liander 2024 analysis config, and produces comparison visualizations. + +# %% target_provider = create_liander2024_benchmark_runner( storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH), ).target_provider diff --git a/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb new file mode 100644 index 000000000..b444ed759 --- /dev/null +++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3b5c7558", + "metadata": {}, + "source": [ + "# Ensemble Model Benchmark\n", + "\n", + "Run an ensemble of multiple base models (e.g. LightGBM + GBLinear) with a learned\n", + "weight combiner on the\n", + "[Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark).\n", + "\n", + "**What this does:**\n", + "\n", + "1. Downloads the Liander 2024 dataset from HuggingFace (automatic)\n", + "2. Trains multiple base models and a combiner that learns optimal weights\n", + "3. Produces probabilistic forecasts (7 quantiles) for a 36-hour horizon\n", + "4. 
Saves results locally for comparison\n", + "\n", + "**No code changes needed.** To benchmark your own model, see\n", + "[Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).\n", + "\n", + "```{admonition} Ensemble types\n", + "Change `ensemble_type` below to try different strategies:\n", + "- `\"learned_weights\"` — a combiner model learns per-quantile weights\n", + "- `\"stacking\"` — base model outputs become features for a meta-model\n", + "- `\"rules\"` — fixed rule-based combination\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36c13dbf", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "import os\n", + "import time\n", + "\n", + "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"MKL_NUM_THREADS\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "314567bf", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9fb6db81", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "from datetime import timedelta\n", + "from pathlib import Path\n", + "\n", + "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig\n", + "from openstef_beam.benchmarking.baselines.openstef4 import (\n", + " create_openstef4_preset_backtest_forecaster,\n", + ")\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n", + "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n", + "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_meta.presets import (\n", + " 
EnsembleForecastingWorkflowConfig,\n", + ")\n", + "from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage\n", + "from openstef_models.transforms.general import SampleWeightConfig\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" + ] + }, + { + "cell_type": "markdown", + "id": "39e49d57", + "metadata": {}, + "source": [ + "## Ensemble configuration\n", + "\n", + "Choose which base models to combine and how. The `ensemble_type` controls the\n", + "combination strategy; `base_models` lists which individual models to train." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "999c6c43", + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUT_PATH = Path(\"./benchmark_results\")\n", + "\n", + "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", \"1\"))\n", + "\n", + "ensemble_type = \"learned_weights\" # \"stacking\", \"learned_weights\" or \"rules\"\n", + "base_models = [\"lgbm\", \"gblinear\"] # combination of \"lgbm\", \"gblinear\", \"xgboost\" and \"lgbm_linear\"\n", + "combiner_model = \"lgbm\" # \"lgbm\", \"xgboost\", \"rf\" or \"logistic\" for learned weights; \"gblinear\" for stacking\n", + "\n", + "model = \"Ensemble_\" + \"_\".join(base_models) + \"_\" + ensemble_type + \"_\" + combiner_model\n", + "\n", + "# Forecast 36 hours ahead, producing 7 quantile bands\n", + "FORECAST_HORIZONS = [LeadTime.from_string(\"PT36H\")]\n", + "PREDICTION_QUANTILES = [\n", + " Q(0.05),\n", + " Q(0.1),\n", + " Q(0.3),\n", + " Q(0.5),\n", + " Q(0.7),\n", + " Q(0.9),\n", + " Q(0.95),\n", + "]\n", + "\n", + "# Set to a list of categories to run only a subset (e.g. 
[Liander2024Category.SOLAR])\n", + "BENCHMARK_FILTER: list[Liander2024Category] | None = None" + ] + }, + { + "cell_type": "markdown", + "id": "3e76cd86", + "metadata": {}, + "source": [ + "## Workflow configuration\n", + "\n", + "`EnsembleForecastingWorkflowConfig` extends the standard config with ensemble-specific\n", + "settings: which base models to use, the combiner strategy, and per-model sample weights." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b63cee1", + "metadata": {}, + "outputs": [], + "source": [ + "USE_MLFLOW_STORAGE = os.environ.get(\"OPENSTEF_MLFLOW_STORAGE\", \"true\").lower() == \"true\"\n", + "\n", + "if USE_MLFLOW_STORAGE:\n", + " storage = MLFlowStorage(\n", + " tracking_uri=str(OUTPUT_PATH / \"mlflow_artifacts\"),\n", + " local_artifacts_path=OUTPUT_PATH / \"mlflow_tracking_artifacts\",\n", + " )\n", + "else:\n", + " storage = None\n", + "\n", + "workflow_config = EnsembleForecastingWorkflowConfig(\n", + " model_id=\"common_model_\",\n", + " ensemble_type=ensemble_type,\n", + " base_models=base_models, # type: ignore\n", + " combiner_model=combiner_model,\n", + " horizons=FORECAST_HORIZONS,\n", + " quantiles=PREDICTION_QUANTILES,\n", + " model_reuse_enable=False,\n", + " mlflow_storage=None,\n", + " radiation_column=\"shortwave_radiation\",\n", + " rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n", + " wind_speed_column=\"wind_speed_80m\",\n", + " pressure_column=\"surface_pressure\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " energy_price_column=\"EPEX_NL\",\n", + " forecaster_sample_weights={\n", + " \"gblinear\": SampleWeightConfig(method=\"exponential\", weight_exponent=1.0),\n", + " \"lgbm\": SampleWeightConfig(weight_exponent=0.0),\n", + " \"xgboost\": SampleWeightConfig(weight_exponent=0.0),\n", + " \"lgbm_linear\": SampleWeightConfig(weight_exponent=0.0),\n", + " },\n", + ")" + ] + }, + { + "cell_type": 
"markdown", + "id": "53399c01", + "metadata": {}, + "source": [ + "## Backtest schedule\n", + "\n", + "The `BacktestForecasterConfig` controls how BEAM schedules training and prediction\n", + "windows. Ensemble models typically need more context than single models." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ccc6dcd", + "metadata": {}, + "outputs": [], + "source": [ + "backtest_config = BacktestForecasterConfig(\n", + " requires_training=True,\n", + " predict_length=timedelta(days=7),\n", + " predict_min_length=timedelta(minutes=15),\n", + " predict_context_length=timedelta(days=14), # Context needed for lag features\n", + " predict_context_min_coverage=0.5,\n", + " training_context_length=timedelta(days=90), # Three months of training data\n", + " training_context_min_coverage=0.5,\n", + " predict_sample_interval=timedelta(minutes=15),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "479036b7", + "metadata": {}, + "source": [ + "## Run the benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f02a3ac1", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " start_time = time.time()\n", + " create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH / model),\n", + " data_dir=None, # Path(\"../data/liander2024-energy-forecasting-benchmark\"),\n", + " callbacks=[StrictExecutionCallback()],\n", + " ).run(\n", + " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", + " workflow_config=workflow_config,\n", + " cache_dir=OUTPUT_PATH / \"cache\",\n", + " ),\n", + " run_name=model,\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )\n", + "\n", + " end_time = time.time()\n", + " print(f\"Benchmark completed in {end_time - start_time:.2f} seconds.\")" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + 
"language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/liander_2024_ensemble.py b/examples/benchmarks/liander2024/run_ensemble_benchmark.py similarity index 61% rename from examples/benchmarks/liander_2024_ensemble.py rename to examples/benchmarks/liander2024/run_ensemble_benchmark.py index 5760d35e6..b4f5dca83 100644 --- a/examples/benchmarks/liander_2024_ensemble.py +++ b/examples/benchmarks/liander2024/run_ensemble_benchmark.py @@ -1,11 +1,43 @@ -"""Liander 2024 Benchmark Example. - -==================================== - -This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM. -The benchmark will evaluate XGBoost and GBLinear models on the dataset from HuggingFace. -""" - +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # Ensemble Model Benchmark +# +# Run an ensemble of multiple base models (e.g. LightGBM + GBLinear) with a learned +# weight combiner on the +# [Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark). +# +# **What this does:** +# +# 1. Downloads the Liander 2024 dataset from HuggingFace (automatic) +# 2. Trains multiple base models and a combiner that learns optimal weights +# 3. Produces probabilistic forecasts (7 quantiles) for a 36-hour horizon +# 4. 
Saves results locally for comparison +# +# **No code changes needed.** To benchmark your own model, see +# [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb). +# +# ```{admonition} Ensemble types +# Change `ensemble_type` below to try different strategies: +# - `"learned_weights"` — a combiner model learns per-quantile weights +# - `"stacking"` — base model outputs become features for a meta-model +# - `"rules"` — fixed rule-based combination +# ``` + +# %% tags=["remove-cell"] # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project # # SPDX-License-Identifier: MPL-2.0 @@ -13,12 +45,15 @@ import os import time -os.environ["OMP_NUM_THREADS"] = "1" # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost +os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" +# %% [markdown] +# ## Setup + +# %% import logging -import multiprocessing from datetime import timedelta from pathlib import Path @@ -38,20 +73,25 @@ logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") +# %% [markdown] +# ## Ensemble configuration +# +# Choose which base models to combine and how. The `ensemble_type` controls the +# combination strategy; `base_models` lists which individual models to train. 
+ +# %% OUTPUT_PATH = Path("./benchmark_results") -N_PROCESSES = 1 if True else multiprocessing.cpu_count() # Amount of parallel processes to use for the benchmark +N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", "1")) ensemble_type = "learned_weights" # "stacking", "learned_weights" or "rules" base_models = ["lgbm", "gblinear"] # combination of "lgbm", "gblinear", "xgboost" and "lgbm_linear" -combiner_model = ( - "lgbm" # "lgbm", "xgboost", "rf" or "logistic" for learned weights combiner, gblinear for stacking combiner -) +combiner_model = "lgbm" # "lgbm", "xgboost", "rf" or "logistic" for learned weights; "gblinear" for stacking model = "Ensemble_" + "_".join(base_models) + "_" + ensemble_type + "_" + combiner_model -# Model configuration -FORECAST_HORIZONS = [LeadTime.from_string("PT36H")] # Forecast horizon(s) +# Forecast 36 hours ahead, producing 7 quantile bands +FORECAST_HORIZONS = [LeadTime.from_string("PT36H")] PREDICTION_QUANTILES = [ Q(0.05), Q(0.1), @@ -60,11 +100,19 @@ Q(0.7), Q(0.9), Q(0.95), -] # Quantiles for probabilistic forecasts +] +# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR]) BENCHMARK_FILTER: list[Liander2024Category] | None = None -USE_MLFLOW_STORAGE = True +# %% [markdown] +# ## Workflow configuration +# +# `EnsembleForecastingWorkflowConfig` extends the standard config with ensemble-specific +# settings: which base models to use, the combiner strategy, and per-model sample weights. + +# %% +USE_MLFLOW_STORAGE = os.environ.get("OPENSTEF_MLFLOW_STORAGE", "true").lower() == "true" if USE_MLFLOW_STORAGE: storage = MLFlowStorage( @@ -98,8 +146,13 @@ }, ) +# %% [markdown] +# ## Backtest schedule +# +# The `BacktestForecasterConfig` controls how BEAM schedules training and prediction +# windows. Ensemble models typically need more context than single models. 
-# Create the backtest configuration +# %% backtest_config = BacktestForecasterConfig( requires_training=True, predict_length=timedelta(days=7), @@ -111,7 +164,10 @@ predict_sample_interval=timedelta(minutes=15), ) +# %% [markdown] +# ## Run the benchmark +# %% if __name__ == "__main__": start_time = time.time() create_liander2024_benchmark_runner( diff --git a/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb new file mode 100644 index 000000000..afc1d760c --- /dev/null +++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0bb1eb8", + "metadata": {}, + "source": [ + "# XGBoost & GBLinear Benchmark\n", + "\n", + "Run two models head-to-head on the\n", + "[Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark)\n", + "— an open dataset of Dutch energy grid measurements.\n", + "\n", + "**What this does:**\n", + "\n", + "1. Downloads the Liander 2024 dataset from HuggingFace (automatic)\n", + "2. Trains XGBoost and GBLinear on each target using day-by-day backtesting\n", + "3. Produces probabilistic forecasts (7 quantiles) for a 3-day horizon\n", + "4. Saves results locally for comparison (see *Compare Results* notebook)\n", + "\n", + "**No code changes needed** — just run it. 
To benchmark your own model instead,\n", + "see [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb).\n", + "\n", + "```{admonition} Runtime\n", + "Expect 30-60 min on a laptop (uses all CPU cores).\n", + "Set `OPENSTEF_N_PROCESSES=1` for easier debugging.\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94b2f1c4", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "import os\n", + "\n", + "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", + "os.environ[\"MKL_NUM_THREADS\"] = \"1\"" + ] + }, + { + "cell_type": "markdown", + "id": "a5a1eddb", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "Import BEAM components and configure logging." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52bea3bf", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import multiprocessing\n", + "from pathlib import Path\n", + "\n", + "from openstef_beam.benchmarking.baselines.openstef4 import (\n", + " create_openstef4_preset_backtest_forecaster,\n", + ")\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category, create_liander2024_benchmark_runner\n", + "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n", + "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage\n", + "from openstef_models.presets import (\n", + " ForecastingWorkflowConfig,\n", + ")\n", + "\n", + "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" + ] + }, + { + "cell_type": "markdown", + "id": "6b29d41c", + "metadata": 
{}, + "source": [ + "## Configuration\n", + "\n", + "Define output paths, forecast horizons, and quantiles.\n", + "The benchmark runs each model in parallel across all targets in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d255dbae", + "metadata": {}, + "outputs": [], + "source": [ + "OUTPUT_PATH = Path(\"./benchmark_results\")\n", + "\n", + "BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / \"XGBoost\"\n", + "BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / \"GBLinear\"\n", + "N_PROCESSES = int(os.environ.get(\"OPENSTEF_N_PROCESSES\", str(multiprocessing.cpu_count())))\n", + "\n", + "# Forecast 3 days ahead, producing 7 quantile bands\n", + "FORECAST_HORIZONS = [LeadTime.from_string(\"P3D\")]\n", + "PREDICTION_QUANTILES = [\n", + " Q(0.05),\n", + " Q(0.1),\n", + " Q(0.3),\n", + " Q(0.5),\n", + " Q(0.7),\n", + " Q(0.9),\n", + " Q(0.95),\n", + "]\n", + "\n", + "# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR])\n", + "BENCHMARK_FILTER: list[Liander2024Category] | None = None" + ] + }, + { + "cell_type": "markdown", + "id": "37ccec9a", + "metadata": {}, + "source": [ + "## Model configuration\n", + "\n", + "`ForecastingWorkflowConfig` defines how OpenSTEF trains and predicts.\n", + "We create a shared base config and derive model-specific variants with `model_copy()`.\n", + "\n", + "Set `OPENSTEF_MLFLOW_STORAGE=true` to log experiment artifacts to MLflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfcdcb6d", + "metadata": {}, + "outputs": [], + "source": [ + "USE_MLFLOW_STORAGE = os.environ.get(\"OPENSTEF_MLFLOW_STORAGE\", \"false\").lower() == \"true\"\n", + "\n", + "if USE_MLFLOW_STORAGE:\n", + " storage = MLFlowStorage(\n", + " tracking_uri=str(OUTPUT_PATH / \"mlflow_artifacts\"),\n", + " local_artifacts_path=OUTPUT_PATH / \"mlflow_tracking_artifacts\",\n", + " )\n", + "else:\n", + " storage = None\n", + "\n", + "common_config = ForecastingWorkflowConfig(\n", + " model_id=\"common_model_\",\n", + " run_name=None,\n", + " model=\"flatliner\",\n", + " horizons=FORECAST_HORIZONS,\n", + " quantiles=PREDICTION_QUANTILES,\n", + " model_reuse_enable=True,\n", + " mlflow_storage=storage,\n", + " radiation_column=\"shortwave_radiation\",\n", + " rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"],\n", + " wind_speed_column=\"wind_speed_80m\",\n", + " pressure_column=\"surface_pressure\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " energy_price_column=\"EPEX_NL\",\n", + ")\n", + "\n", + "xgboost_config = common_config.model_copy(update={\"model\": \"xgboost\"})\n", + "gblinear_config = common_config.model_copy(update={\"model\": \"gblinear\"})" + ] + }, + { + "cell_type": "markdown", + "id": "c6203242", + "metadata": {}, + "source": [ + "## Run the benchmark\n", + "\n", + "Each model gets its own output directory. `StrictExecutionCallback` raises on\n", + "any target failure (remove it to skip failing targets silently)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23069166", + "metadata": {}, + "outputs": [], + "source": [ + "if __name__ == \"__main__\":\n", + " # --- XGBoost ---\n", + " create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST),\n", + " callbacks=[StrictExecutionCallback()],\n", + " ).run(\n", + " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", + " workflow_config=xgboost_config,\n", + " cache_dir=OUTPUT_PATH / \"cache\",\n", + " ),\n", + " run_name=\"xgboost\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )\n", + "\n", + " # --- GBLinear ---\n", + " create_liander2024_benchmark_runner(\n", + " storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR),\n", + " callbacks=[StrictExecutionCallback()],\n", + " ).run(\n", + " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", + " workflow_config=gblinear_config,\n", + " cache_dir=OUTPUT_PATH / \"cache\",\n", + " ),\n", + " run_name=\"gblinear\",\n", + " n_processes=N_PROCESSES,\n", + " filter_args=BENCHMARK_FILTER,\n", + " )" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license new file mode 100644 index 000000000..a42c86064 --- /dev/null +++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.ipynb.license @@ -0,0 +1,3 @@ +SPDX-FileCopyrightText: 2026 Contributors to the OpenSTEF project + +SPDX-License-Identifier: MPL-2.0 diff --git a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py similarity index 57% rename from 
examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py rename to examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py index 9ff296c5d..bcc7379ea 100644 --- a/examples/benchmarks/liander_2024_benchmark_xgboost_gblinear.py +++ b/examples/benchmarks/liander2024/run_xgboost_gblinear_benchmark.py @@ -1,21 +1,57 @@ -"""Liander 2024 Benchmark Example. - -==================================== - -This example demonstrates how to set up and run the Liander 2024 STEF benchmark using OpenSTEF BEAM. -The benchmark will evaluate XGBoost and GBLinear models on the dataset from HuggingFace. -""" +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +# %% [markdown] +# # XGBoost & GBLinear Benchmark +# +# Run two models head-to-head on the +# [Liander 2024 STEF benchmark](https://huggingface.co/datasets/OpenSTEF/liander2024-stef-benchmark) +# — an open dataset of Dutch energy grid measurements. +# +# **What this does:** +# +# 1. Downloads the Liander 2024 dataset from HuggingFace (automatic) +# 2. Trains XGBoost and GBLinear on each target using day-by-day backtesting +# 3. Produces probabilistic forecasts (7 quantiles) for a 3-day horizon +# 4. Saves results locally for comparison (see *Compare Results* notebook) +# +# **No code changes needed** — just run it. To benchmark your own model instead, +# see [Implement a Custom Forecaster](../custom/custom_forecaster.ipynb). +# +# ```{admonition} Runtime +# Expect 30-60 min on a laptop (uses all CPU cores). +# Set `OPENSTEF_N_PROCESSES=1` for easier debugging. 
+# ``` +# %% tags=["remove-cell"] # SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project # # SPDX-License-Identifier: MPL-2.0 import os -os.environ["OMP_NUM_THREADS"] = "1" # Set OMP_NUM_THREADS to 1 to avoid issues with parallel execution and xgboost +os.environ["OMP_NUM_THREADS"] = "1" os.environ["OPENBLAS_NUM_THREADS"] = "1" os.environ["MKL_NUM_THREADS"] = "1" +# %% [markdown] +# ## Setup +# +# Import BEAM components and configure logging. + +# %% import logging import multiprocessing from pathlib import Path @@ -34,14 +70,21 @@ logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") +# %% [markdown] +# ## Configuration +# +# Define output paths, forecast horizons, and quantiles. +# The benchmark runs each model in parallel across all targets in the dataset. + +# %% OUTPUT_PATH = Path("./benchmark_results") BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / "XGBoost" BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / "GBLinear" -N_PROCESSES = multiprocessing.cpu_count() # Amount of parallel processes to use for the benchmark +N_PROCESSES = int(os.environ.get("OPENSTEF_N_PROCESSES", str(multiprocessing.cpu_count()))) -# Model configuration -FORECAST_HORIZONS = [LeadTime.from_string("P3D")] # Forecast horizon(s) +# Forecast 3 days ahead, producing 7 quantile bands +FORECAST_HORIZONS = [LeadTime.from_string("P3D")] PREDICTION_QUANTILES = [ Q(0.05), Q(0.1), @@ -50,11 +93,21 @@ Q(0.7), Q(0.9), Q(0.95), -] # Quantiles for probabilistic forecasts +] +# Set to a list of categories to run only a subset (e.g. [Liander2024Category.SOLAR]) BENCHMARK_FILTER: list[Liander2024Category] | None = None -USE_MLFLOW_STORAGE = False +# %% [markdown] +# ## Model configuration +# +# `ForecastingWorkflowConfig` defines how OpenSTEF trains and predicts. +# We create a shared base config and derive model-specific variants with `model_copy()`. +# +# Set `OPENSTEF_MLFLOW_STORAGE=true` to log experiment artifacts to MLflow. 
+ +# %% +USE_MLFLOW_STORAGE = os.environ.get("OPENSTEF_MLFLOW_STORAGE", "false").lower() == "true" if USE_MLFLOW_STORAGE: storage = MLFlowStorage( @@ -82,11 +135,17 @@ ) xgboost_config = common_config.model_copy(update={"model": "xgboost"}) - gblinear_config = common_config.model_copy(update={"model": "gblinear"}) +# %% [markdown] +# ## Run the benchmark +# +# Each model gets its own output directory. `StrictExecutionCallback` raises on +# any target failure (remove it to skip failing targets silently). + +# %% if __name__ == "__main__": - # Run for XGBoost model + # --- XGBoost --- create_liander2024_benchmark_runner( storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST), callbacks=[StrictExecutionCallback()], @@ -100,7 +159,7 @@ filter_args=BENCHMARK_FILTER, ) - # # Run for GBLinear model + # --- GBLinear --- create_liander2024_benchmark_runner( storage=LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR), callbacks=[StrictExecutionCallback()], diff --git a/examples/deployment/.gitkeep b/examples/deployment/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/examples/configuring_model_pipeline_example.py b/examples/examples/configuring_model_pipeline_example.py deleted file mode 100644 index dc37fdf16..000000000 --- a/examples/examples/configuring_model_pipeline_example.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Configuring Model Pipeline Example. - -==================================== - -This example demonstrates how to configure and use a complete forecasting pipeline -in OpenSTEF. It shows how to: - -1. Create synthetic time series data for demonstration -2. Configure a full forecasting model with preprocessing and postprocessing -3. Set up model storage for persistence -4. Use the workflow pattern for training and prediction - -The example uses a ConstantMedianForecaster with feature engineering including -holiday features, lag transforms, and data scaling. 
This represents a typical -OpenSTEF forecasting setup that can be adapted for real-world use cases. - -Key Components: - - VersionedTimeSeriesDataset: Time series data structure - - ForecastingModel: Complete forecasting pipeline - - FeaturePipeline: Preprocessing with holidays and lags - - LocalModelStorage: File-based model persistence - - CustomForecastingWorkflow: High-level orchestration - -This example is useful for understanding how to integrate all OpenSTEF components -into a working forecasting system. -""" - -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -import logging -from datetime import timedelta -from pathlib import Path - -import numpy as np -import pandas as pd -from pydantic_extra_types.country import CountryAlpha2 - -from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter -from openstef_core.datasets import ForecastDataset, TimeSeriesDataset -from openstef_core.mixins import TransformPipeline -from openstef_core.types import LeadTime, Q -from openstef_models.integrations.mlflow import MLFlowStorageCallback -from openstef_models.integrations.mlflow.mlflow_storage import MLFlowStorage -from openstef_models.models.forecasting.gblinear_forecaster import ( - GBLinearForecaster, - GBLinearHyperParams, -) -from openstef_models.models.forecasting_model import ForecastingModel -from openstef_models.transforms.general import Scaler -from openstef_models.transforms.time_domain import HolidayFeatureAdder -from openstef_models.utils.feature_selection import FeatureSelection -from openstef_models.workflows import CustomForecastingWorkflow - -logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") -logger = logging.getLogger(__name__) - -workspace_dir = Path(__file__).parent.resolve() - -# Create synthetic time series data -n_samples = 24 * 31 * 3 # 3 months of hourly data -rng = np.random.default_rng(42) -temp = rng.standard_normal(size=n_samples) 
-wind = rng.standard_normal(size=n_samples) -radiation = rng.standard_normal(size=n_samples) -timestamps = pd.date_range("2025-01-01", periods=n_samples, freq="h") - -dataset = TimeSeriesDataset( - data=pd.DataFrame( - { - "load": wind * -10 + temp * -3 + radiation * -5 + rng.standard_normal(size=n_samples) * 2, - "temp": temp, - "wind": wind, - "radiation": radiation, - }, - index=timestamps, - ), - sample_interval=timedelta(hours=1), -) - -model = ForecastingModel( - preprocessing=TransformPipeline( - transforms=[ - Scaler(method="standard", selection=FeatureSelection(include={"temp", "wind", "radiation"})), - HolidayFeatureAdder(country_code=CountryAlpha2("NL")), - ], - ), - forecaster=GBLinearForecaster( - horizons=[LeadTime.from_string("PT36H")], - quantiles=[Q(0.5), Q(0.1), Q(0.9)], - hyperparams=GBLinearHyperParams( - n_steps=1000, - learning_rate=0.3, - ), - verbosity=True, - ), - target_column="load", - tags={ - "model": "gblinear", - "version": "1.0.0", - }, -) - -pipeline = CustomForecastingWorkflow( - model_id="gblinear_forecaster_v1", - model=model, - callbacks=[ - MLFlowStorageCallback( - storage=MLFlowStorage( - tracking_uri=str(workspace_dir / "mlflow_tracking"), - local_artifacts_path=workspace_dir / "mlflow_tracking_artifacts", - ), - model_reuse_enable=False, - ) - ], -) - -logger.info("Starting model training") -result = pipeline.fit(dataset) -if result is not None: - logger.info("Full eval result:\n%s", result.metrics_full.to_dataframe()) - - if result.metrics_test is not None: - logger.info("Test result:\n%s", result.metrics_test.to_dataframe()) - -logger.info("Starting forecasting") -forecast: ForecastDataset = pipeline.predict(dataset) - -print(forecast.data.tail()) - - -logger.info("Storing forecast plot to forecast_plot.html") -fig = ( - ForecastTimeSeriesPlotter() - .add_measurements(measurements=dataset.select_version().data["load"]) - .add_model(model_name="gblinear", forecast=forecast.median_series, quantiles=forecast.quantiles_data) - 
.plot() -) - -fig.write_html("forecast_plot.html") # pyright: ignore[reportUnknownMemberType] diff --git a/examples/examples/forecasting_preset_example.py b/examples/examples/forecasting_preset_example.py deleted file mode 100644 index 480527252..000000000 --- a/examples/examples/forecasting_preset_example.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Configuring Model Pipeline Example. - -==================================== - -This example demonstrates how to configure and use a complete forecasting pipeline -in OpenSTEF. It shows how to: - -1. Create synthetic time series data for demonstration -2. Configure a full forecasting model with preprocessing and postprocessing -3. Set up model storage for persistence -4. Use the workflow pattern for training and prediction - -The example uses a ConstantMedianForecaster with feature engineering including -holiday features, lag transforms, and data scaling. This represents a typical -OpenSTEF forecasting setup that can be adapted for real-world use cases. - -Key Components: - - VersionedTimeSeriesDataset: Time series data structure - - ForecastingModel: Complete forecasting pipeline - - FeaturePipeline: Preprocessing with holidays and lags - - LocalModelStorage: File-based model persistence - - CustomForecastingWorkflow: High-level orchestration - -This example is useful for understanding how to integrate all OpenSTEF components -into a working forecasting system. 
-""" - -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -import logging -from datetime import timedelta -from pathlib import Path - -from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter -from openstef_core.datasets import ForecastDataset -from openstef_core.testing import create_synthetic_forecasting_dataset -from openstef_core.types import LeadTime, Q -from openstef_models.integrations.mlflow import MLFlowStorage -from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow - -logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") -logger = logging.getLogger(__name__) - -workspace_dir = Path(__file__).parent.resolve() - -# Create synthetic time series data -dataset = create_synthetic_forecasting_dataset( - length=timedelta(days=90), - wind_influence=-10.0, - temp_influence=5.0, - radiation_influence=-7.0, - stochastic_influence=2.0, - sample_interval=timedelta(hours=1), -) - -workflow = create_forecasting_workflow( - config=ForecastingWorkflowConfig( - model_id="gblinear_forecaster_v1", - model="gblinear", - horizons=[LeadTime.from_string("PT36H")], - quantiles=[Q(0.5), Q(0.1), Q(0.9)], - mlflow_storage=MLFlowStorage( - tracking_uri=str(workspace_dir / "mlflow_tracking"), - local_artifacts_path=workspace_dir / "mlflow_tracking_artifacts", - ), - ) -) - -logger.info("Starting model training") -result = workflow.fit(dataset) -if result is not None: - logger.info("Full eval result:\n%s", result.metrics_full.to_dataframe()) - - if result.metrics_test is not None: - logger.info("Test result:\n%s", result.metrics_test.to_dataframe()) - -logger.info("Starting forecasting") -forecast: ForecastDataset = workflow.predict(dataset) - -print(forecast.data.tail()) - -# Plot the result -logger.info("Storing forecast plot to forecast_plot.html") -fig = ( - ForecastTimeSeriesPlotter() - 
.add_measurements(measurements=dataset.select_version().data["load"]) - .add_model(model_name="gblinear", forecast=forecast.median_series, quantiles=forecast.quantiles_data) - .plot() -) - -fig.write_html("forecast_plot.html") # pyright: ignore[reportUnknownMemberType] diff --git a/examples/examples/isotonic_calibration_example.py b/examples/examples/isotonic_calibration_example.py deleted file mode 100644 index 791581ba6..000000000 --- a/examples/examples/isotonic_calibration_example.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Isotonic Quantile Calibration Example. - -======================================= - -This example demonstrates how to use isotonic quantile calibration to improve -the reliability of probabilistic forecasts. It shows: - -1. Training a forecasting model with isotonic calibration as postprocessing -2. Visualizing calibration quality (expected vs observed coverage) - -Isotonic calibration ensures that predicted quantiles match observed quantile -levels, improving the reliability of uncertainty estimates. 
-""" - -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -from datetime import timedelta - -import numpy as np -import pandas as pd -import plotly.graph_objects as go - -from openstef_core.datasets import ForecastDataset, TimeSeriesDataset -from openstef_core.mixins import TransformPipeline -from openstef_core.types import LeadTime, Q -from openstef_models.models.forecasting.gblinear_forecaster import ( - GBLinearForecaster, - GBLinearHyperParams, -) -from openstef_models.models.forecasting_model import ForecastingModel -from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator -from openstef_models.workflows import CustomForecastingWorkflow - -# Step 1: Create synthetic time series data -n_samples = 24 * 31 * 3 # 3 months of hourly data -rng = np.random.default_rng(42) -timestamps = pd.date_range("2025-01-01", periods=n_samples, freq="h") - -dataset = TimeSeriesDataset( - data=pd.DataFrame( - { - "load": rng.standard_normal(size=n_samples) * 10 + 50, - "feature": rng.standard_normal(size=n_samples), - }, - index=timestamps, - ), - sample_interval=timedelta(hours=1), -) - -# Step 2: Configure model without calibration (for comparison) -model_uncalibrated = ForecastingModel( - forecaster=GBLinearForecaster( - horizons=[LeadTime.from_string("PT1H")], - quantiles=[Q(0.1), Q(0.5), Q(0.9)], - hyperparams=GBLinearHyperParams(n_steps=100), - verbosity=0, - ), - target_column="load", -) - -pipeline_uncalibrated = CustomForecastingWorkflow(model_id="uncalibrated_forecaster", model=model_uncalibrated) -pipeline_uncalibrated.fit(dataset) -forecast_uncalibrated = pipeline_uncalibrated.predict(dataset) - -# Step 3: Configure model with windowed isotonic quantile calibration -model_calibrated = ForecastingModel( - forecaster=GBLinearForecaster( - horizons=[LeadTime.from_string("PT1H")], - quantiles=[Q(0.1), Q(0.5), Q(0.9)], - hyperparams=GBLinearHyperParams(n_steps=100), - verbosity=0, - ), - 
postprocessing=TransformPipeline( - transforms=[ - IsotonicQuantileCalibrator( - quantiles=[Q(0.1), Q(0.5), Q(0.9)], - use_local_quantile_estimation=True, # Enable windowed approach - # window_size uses adaptive sizing by default: max(_MIN_WINDOW_SIZE, n_samples // 10) - ), - ], - ), - target_column="load", -) - -# Step 4: Train and predict with calibration -pipeline_calibrated = CustomForecastingWorkflow(model_id="calibrated_forecaster", model=model_calibrated) -pipeline_calibrated.fit(dataset) -forecast_calibrated = pipeline_calibrated.predict(dataset) - - -# Step 5: Visualize calibration quality (before and after) -def plot_calibration_comparison( - forecast_before: ForecastDataset, forecast_after: ForecastDataset, actuals: pd.Series -) -> go.Figure: - """Plot expected vs observed quantile coverage before and after calibration. - - Returns: - A Plotly figure showing the calibration comparison. - """ - - def calculate_coverage(forecast: ForecastDataset) -> list[float]: - common_index = forecast.data.index.intersection(actuals.index) - forecast_aligned = forecast.data.loc[common_index] - actuals_aligned = actuals.loc[common_index] - return [ - (actuals_aligned <= forecast_aligned["quantile_P10"]).mean(), - (actuals_aligned <= forecast_aligned["quantile_P50"]).mean(), - (actuals_aligned <= forecast_aligned["quantile_P90"]).mean(), - ] - - expected = [0.1, 0.5, 0.9] - observed_before = calculate_coverage(forecast_before) - observed_after = calculate_coverage(forecast_after) - - fig = go.Figure() - - # expected == observed line - fig.add_trace( # pyright: ignore[reportUnknownMemberType] - go.Scatter( - x=[0, 1], - y=[0, 1], - mode="lines", - name="expected equals observed", - line={"color": "gray", "dash": "dash", "width": 2}, - ) - ) - - # Before calibration - fig.add_trace( # pyright: ignore[reportUnknownMemberType] - go.Scatter( - x=expected, - y=observed_before, - mode="markers+lines", - name="before isotonic calibration", - marker={"size": 12, "color": "red", 
"symbol": "x"}, - line={"color": "red", "width": 2, "dash": "dot"}, - ) - ) - - # After calibration - fig.add_trace( # pyright: ignore[reportUnknownMemberType] - go.Scatter( - x=expected, - y=observed_after, - mode="markers+lines", - name="after isotonic calibration", - marker={"size": 12, "color": "blue"}, - line={"color": "blue", "width": 2}, - ) - ) - - fig.update_layout( # pyright: ignore[reportUnknownMemberType] - title="Isontonic quantile calibration", - xaxis_title="expected quantile", - yaxis_title="observed quantile", - xaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]}, - yaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]}, - width=600, - height=600, - ) - return fig - - -calibration_fig = plot_calibration_comparison( - forecast_uncalibrated, forecast_calibrated, dataset.select_version().data["load"] -) -calibration_fig.write_html("calibration_plot.html") # pyright: ignore[reportUnknownMemberType] -print("Calibration plot saved to calibration_plot.html") diff --git a/examples/tutorials/backtesting_openstef_with_beam.ipynb b/examples/tutorials/backtesting_openstef_with_beam.ipynb deleted file mode 100644 index 3b48e1738..000000000 --- a/examples/tutorials/backtesting_openstef_with_beam.ipynb +++ /dev/null @@ -1,465 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "caf13084", - "metadata": {}, - "source": [ - "# 📊 Backtesting OpenSTEF Models with OpenSTEF-BEAM\n", - "\n", - "This tutorial demonstrates how to use **OpenSTEF-BEAM** (Backtesting, Evaluation, Analysis, Metrics) to systematically evaluate forecasting models. You'll learn how to:\n", - "\n", - "1. **Configure benchmark experiments** with multiple model types\n", - "2. **Run parallel backtests** across dozens of energy assets\n", - "3. **Compare model performance** with standardized metrics\n", - "4. 
**Generate analysis reports** with interactive visualizations\n", - "\n", - "> **BEAM** provides a rigorous framework for model evaluation, ensuring fair comparisons and reproducible results." - ] - }, - { - "cell_type": "markdown", - "id": "329ce2a3", - "metadata": {}, - "source": [ - "## 🔧 Environment Setup\n", - "\n", - "First, we configure thread settings to prevent conflicts with XGBoost's internal parallelization when running multiple processes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24d53eb6", - "metadata": {}, - "outputs": [], - "source": [ - "# --- Thread Configuration ---\n", - "# Prevent thread contention when running parallel backtests with XGBoost\n", - "import os\n", - "\n", - "os.environ[\"OMP_NUM_THREADS\"] = \"1\"\n", - "os.environ[\"OPENBLAS_NUM_THREADS\"] = \"1\"\n", - "os.environ[\"MKL_NUM_THREADS\"] = \"1\"\n", - "\n", - "# --- Standard Imports ---\n", - "import logging\n", - "import multiprocessing\n", - "from pathlib import Path\n", - "\n", - "logging.basicConfig(level=logging.INFO, format=\"[%(asctime)s][%(levelname)s] %(message)s\")" - ] - }, - { - "cell_type": "markdown", - "id": "0a2d9aed", - "metadata": {}, - "source": [ - "## ⚙️ Benchmark Configuration\n", - "\n", - "Configure the benchmark parameters:\n", - "- **Output paths** — where to store results for each model\n", - "- **Forecast horizons** — how far ahead to predict (using ISO 8601 duration format)\n", - "- **Quantiles** — prediction intervals for probabilistic evaluation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "99c03b80", - "metadata": {}, - "outputs": [], - "source": [ - "# Import types for configuration\n", - "from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category\n", - "from openstef_core.types import LeadTime, Q # LeadTime: forecast horizon, Q: quantile\n", - "\n", - "# --- Output Paths ---\n", - "OUTPUT_PATH = Path(\"./benchmark_results\")\n", - "BENCHMARK_RESULTS_PATH_XGBOOST = 
OUTPUT_PATH / \"XGBoost\"\n", - "BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / \"GBLinear\"\n", - "\n", - "# --- Parallelization ---\n", - "N_PROCESSES = multiprocessing.cpu_count() # Use all available CPU cores\n", - "print(f\"🖥️ Running with {N_PROCESSES} parallel processes\")\n", - "\n", - "# --- Forecast Configuration ---\n", - "FORECAST_HORIZONS = [LeadTime.from_string(\"P3D\")] # 3-day ahead forecast (ISO 8601: P3D)\n", - "\n", - "# Quantiles for probabilistic forecasting (7 quantiles covering 5th to 95th percentile)\n", - "PREDICTION_QUANTILES = [\n", - " Q(0.05),\n", - " Q(0.1),\n", - " Q(0.3), # Lower quantiles\n", - " Q(0.5), # Median\n", - " Q(0.7),\n", - " Q(0.9),\n", - " Q(0.95), # Upper quantiles\n", - "]\n", - "\n", - "# --- Benchmark Filter (optional) ---\n", - "# Set to None to run all categories, or specify categories like:\n", - "# BENCHMARK_FILTER = [Liander2024Category.TRANSFORMER, Liander2024Category.MV_FEEDER]\n", - "BENCHMARK_FILTER: list[Liander2024Category] | None = None" - ] - }, - { - "cell_type": "markdown", - "id": "a3618966", - "metadata": {}, - "source": [ - "## 🛠️ Model Configuration\n", - "\n", - "We define a **common configuration** that both models share, then create model-specific variants. 
This ensures fair comparison by keeping all settings identical except the model type.\n", - "\n", - "### Available Models:\n", - "- **XGBoost** — Gradient boosting trees (handles complex nonlinear patterns)\n", - "- **GBLinear** — Gradient boosted linear model (better extrapolation, faster)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a39b756", - "metadata": {}, - "outputs": [], - "source": [ - "# Import workflow configuration\n", - "from openstef_models.presets import ForecastingWorkflowConfig\n", - "\n", - "# Common configuration shared by all models\n", - "# This ensures fair comparison by keeping all settings identical\n", - "common_config = ForecastingWorkflowConfig(\n", - " model_id=\"benchmark_model_\",\n", - " run_name=None,\n", - " model=\"flatliner\", # Placeholder - will be overwritten per model\n", - " # Forecast settings\n", - " horizons=FORECAST_HORIZONS,\n", - " quantiles=PREDICTION_QUANTILES,\n", - " # Model reuse: reuse trained model for same target (speeds up backtesting)\n", - " model_reuse_enable=True,\n", - " mlflow_storage=None, # Disable MLflow for this demo\n", - " # Weather feature column mappings (match dataset column names)\n", - " radiation_column=\"shortwave_radiation\",\n", - " wind_speed_column=\"wind_speed_80m\", # 80m wind speed for better wind park predictions\n", - " pressure_column=\"surface_pressure\",\n", - " temperature_column=\"temperature_2m\",\n", - " relative_humidity_column=\"relative_humidity_2m\",\n", - " # Additional features\n", - " energy_price_column=\"EPEX_NL\", # Day-ahead electricity price\n", - " rolling_aggregate_features=[\"mean\", \"median\", \"max\", \"min\"], # Rolling window stats\n", - " # Logging\n", - " verbosity=0, # Quiet mode for batch processing\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ed202922", - "metadata": {}, - "outputs": [], - "source": [ - "# Create model-specific configurations by copying common config and updating model 
type\n", - "xgboost_config = common_config.model_copy(update={\"model\": \"xgboost\"})\n", - "gblinear_config = common_config.model_copy(update={\"model\": \"gblinear\"})\n", - "\n", - "print(\"✅ Model configurations created:\")\n", - "print(f\" - XGBoost: {xgboost_config.model}\")\n", - "print(f\" - GBLinear: {gblinear_config.model}\")" - ] - }, - { - "cell_type": "markdown", - "id": "4425a740", - "metadata": {}, - "source": [ - "## 💾 Storage Configuration\n", - "\n", - "**LocalBenchmarkStorage** manages the file structure for benchmark results:\n", - "```\n", - "benchmark_results/\n", - "├── XGBoost/\n", - "│ ├── backtest/ # Raw predictions\n", - "│ ├── evaluation/ # Metrics per target\n", - "│ └── analysis/ # Visualizations (HTML)\n", - "└── GBLinear/\n", - " └── ...\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c2e44656", - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize storage backends for each model\n", - "from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage\n", - "\n", - "storage_xgboost = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST)\n", - "storage_gblinear = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR)\n", - "\n", - "print(f\"📁 XGBoost results: {BENCHMARK_RESULTS_PATH_XGBOOST}\")\n", - "print(f\"📁 GBLinear results: {BENCHMARK_RESULTS_PATH_GBLINEAR}\")" - ] - }, - { - "cell_type": "markdown", - "id": "41e6b2e3", - "metadata": {}, - "source": [ - "## 🚀 Run Backtests\n", - "\n", - "Now we run the **Liander 2024 Benchmark** — a comprehensive evaluation suite that:\n", - "1. Downloads the benchmark dataset from HuggingFace Hub (if needed)\n", - "2. Runs backtests across 5 asset categories (transformers, feeders, solar/wind parks)\n", - "3. Computes metrics and generates analysis visualizations\n", - "\n", - "⚠️ **Note**: This may take several minutes depending on your hardware." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6aae871", - "metadata": {}, - "outputs": [], - "source": [ - "# Import benchmark components\n", - "from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster\n", - "from openstef_beam.benchmarking.benchmarks.liander2024 import create_liander2024_benchmark_runner\n", - "from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback\n", - "\n", - "# --- Run XGBoost Benchmark ---\n", - "print(\"🌲 Running XGBoost benchmark...\")\n", - "create_liander2024_benchmark_runner(\n", - " storage=storage_xgboost,\n", - " callbacks=[StrictExecutionCallback()], # Fail fast on errors\n", - ").run(\n", - " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", - " workflow_config=xgboost_config,\n", - " ),\n", - " run_name=\"xgboost\",\n", - " n_processes=N_PROCESSES,\n", - " filter_args=BENCHMARK_FILTER,\n", - ")\n", - "print(\"✅ XGBoost benchmark complete!\")\n", - "\n", - "# --- Run GBLinear Benchmark ---\n", - "print(\"\\n📈 Running GBLinear benchmark...\")\n", - "create_liander2024_benchmark_runner(\n", - " storage=storage_gblinear,\n", - " callbacks=[StrictExecutionCallback()],\n", - ").run(\n", - " forecaster_factory=create_openstef4_preset_backtest_forecaster(\n", - " workflow_config=gblinear_config,\n", - " ),\n", - " run_name=\"gblinear\",\n", - " n_processes=N_PROCESSES,\n", - " filter_args=BENCHMARK_FILTER,\n", - ")\n", - "print(\"✅ GBLinear benchmark complete!\")" - ] - }, - { - "cell_type": "markdown", - "id": "d1690a07", - "metadata": {}, - "source": [ - "## 📊 Compare Model Performance\n", - "\n", - "The **BenchmarkComparisonPipeline** generates side-by-side analysis of multiple models:\n", - "- Global metrics across all targets\n", - "- Per-category breakdowns (transformers, feeders, etc.)\n", - "- Time-windowed performance analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": 
"0a6bdfcf", - "metadata": {}, - "outputs": [], - "source": [ - "# Run model comparison analysis\n", - "from openstef_beam.benchmarking import BenchmarkComparisonPipeline\n", - "from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG\n", - "\n", - "# Create comparison pipeline\n", - "target_provider = create_liander2024_benchmark_runner(\n", - " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", - ").target_provider\n", - "\n", - "comparison_pipeline = BenchmarkComparisonPipeline(\n", - " analysis_config=LIANDER2024_ANALYSIS_CONFIG,\n", - " storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH),\n", - " target_provider=target_provider,\n", - ")\n", - "\n", - "# Generate comparison reports\n", - "print(\"📊 Generating comparison analysis...\")\n", - "comparison_pipeline.run(\n", - " run_data={\n", - " \"xgboost\": storage_xgboost,\n", - " \"gblinear\": storage_gblinear,\n", - " }\n", - ")\n", - "print(\"✅ Comparison analysis complete!\")" - ] - }, - { - "cell_type": "markdown", - "id": "c22c61f4", - "metadata": {}, - "source": [ - "## 📈 View Analysis Results\n", - "\n", - "The benchmark generates interactive HTML visualizations. 
Let's open the most important ones:\n", - "\n", - "### Key Metrics:\n", - "- **rCRPS** (relative Continuous Ranked Probability Score) — measures probabilistic forecast accuracy\n", - "- **rMAE** (relative Mean Absolute Error) — measures point forecast accuracy\n", - "- Lower values = better performance" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "af09be7e", - "metadata": {}, - "outputs": [], - "source": [ - "# Open key analysis plots in browser\n", - "# HTML visualizations are interactive and best viewed in a browser\n", - "import os\n", - "import webbrowser\n", - "\n", - "# Base path for analysis results\n", - "analysis_base = os.path.abspath(\"./benchmark_results/analysis/D-1T06:00\")\n", - "\n", - "# Define key visualizations to open\n", - "visualizations = [\n", - " (\"rCRPS Grouped by Category\", \"rCRPS_grouped.html\"),\n", - " (\"rCRPS Time-Windowed (7 days)\", \"rCRPS_windowed_7D.html\"),\n", - "]\n", - "\n", - "print(\"🌐 Opening analysis visualizations in browser...\\n\")\n", - "for name, filename in visualizations:\n", - " filepath = os.path.join(analysis_base, filename)\n", - " if Path(filepath).exists():\n", - " print(f\" 📊 {name}\")\n", - " webbrowser.open(f\"file://{filepath}\")\n", - " else:\n", - " print(f\" ⚠️ {name} not found at {filepath}\")" - ] - }, - { - "cell_type": "markdown", - "id": "59e8d779", - "metadata": {}, - "source": [ - "### 🔍 Explore Individual Target Results\n", - "\n", - "You can also view time series plots for individual targets. 
Let's look at a transformer forecast:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea2fd469", - "metadata": {}, - "outputs": [], - "source": [ - "# List available target-specific visualizations\n", - "import glob\n", - "\n", - "# Find all time series plots for individual targets\n", - "target_plots = glob.glob(\"./benchmark_results/XGBoost/analysis/*/*/time_series_plot*.html\")\n", - "\n", - "if target_plots:\n", - " print(\"📊 Available target-specific time series plots:\\n\")\n", - " for i, plot in enumerate(sorted(target_plots)[:5]): # Show first 5\n", - " parts = plot.split(\"/\")\n", - " category = parts[-3] # e.g., \"transformer\"\n", - " target = parts[-2] # e.g., \"OS Apeldoorn\"\n", - " print(f\" {i + 1}. {category}/{target}\")\n", - "\n", - " # Open the first transformer plot as an example\n", - " transformer_plots = [p for p in target_plots if \"transformer\" in p]\n", - " if transformer_plots:\n", - " example_plot = os.path.abspath(transformer_plots[0])\n", - " print(f\"\\n🌐 Opening example: {transformer_plots[0]}\")\n", - " webbrowser.open(f\"file://{example_plot}\")\n", - "else:\n", - " print(\"⚠️ No target-specific plots found. Run the benchmark first.\")" - ] - }, - { - "cell_type": "markdown", - "id": "e41df479", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 🎯 Summary\n", - "\n", - "In this tutorial, you learned how to:\n", - "\n", - "1. ✅ **Configure benchmark experiments** with `ForecastingWorkflowConfig`\n", - "2. ✅ **Run parallel backtests** using the Liander 2024 benchmark\n", - "3. ✅ **Compare models** (XGBoost vs GBLinear) with `BenchmarkComparisonPipeline`\n", - "4. 
✅ **Analyze results** with interactive HTML visualizations\n", - "\n", - "### 📁 Output Structure\n", - "\n", - "```\n", - "benchmark_results/\n", - "├── XGBoost/\n", - "│ ├── backtest/ # Raw predictions (parquet)\n", - "│ ├── evaluation/ # Metrics per target\n", - "│ └── analysis/ # HTML visualizations\n", - "├── GBLinear/\n", - "│ └── ...\n", - "└── analysis/ # Comparison analysis (both models)\n", - " └── D-1T06:00/\n", - " ├── rCRPS_grouped.html # Probabilistic accuracy by category\n", - " ├── rMAE_grouped.html # Point forecast accuracy\n", - " └── summary.html # Overall summary\n", - "```\n", - "\n", - "### 🚀 Next Steps\n", - "\n", - "- Experiment with different `FORECAST_HORIZONS` (e.g., `\"PT6H\"`, `\"P7D\"`)\n", - "- Add more quantiles for higher resolution prediction intervals\n", - "- Filter specific categories with `BENCHMARK_FILTER`\n", - "- Integrate MLflow for experiment tracking" - ] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/backtesting_openstef_with_beam.py b/examples/tutorials/backtesting_openstef_with_beam.py deleted file mode 100644 index 0c30580ec..000000000 --- a/examples/tutorials/backtesting_openstef_with_beam.py +++ /dev/null @@ -1,338 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: .venv -# language: python -# name: python3 -# --- - -# %% [markdown] -# # 📊 Backtesting OpenSTEF Models with OpenSTEF-BEAM -# -# This 
tutorial demonstrates how to use **OpenSTEF-BEAM** (Backtesting, Evaluation, Analysis, Metrics) to systematically evaluate forecasting models. You'll learn how to: -# -# 1. **Configure benchmark experiments** with multiple model types -# 2. **Run parallel backtests** across dozens of energy assets -# 3. **Compare model performance** with standardized metrics -# 4. **Generate analysis reports** with interactive visualizations -# -# > **BEAM** provides a rigorous framework for model evaluation, ensuring fair comparisons and reproducible results. - -# %% [markdown] -# ## 🔧 Environment Setup -# -# First, we configure thread settings to prevent conflicts with XGBoost's internal parallelization when running multiple processes. - -# %% -# --- Thread Configuration --- -# Prevent thread contention when running parallel backtests with XGBoost -import os - -os.environ["OMP_NUM_THREADS"] = "1" -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["MKL_NUM_THREADS"] = "1" - -# --- Standard Imports --- -import logging -import multiprocessing -from pathlib import Path - -logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") - -# %% [markdown] -# ## ⚙️ Benchmark Configuration -# -# Configure the benchmark parameters: -# - **Output paths** — where to store results for each model -# - **Forecast horizons** — how far ahead to predict (using ISO 8601 duration format) -# - **Quantiles** — prediction intervals for probabilistic evaluation - -# %% -# Import types for configuration -from openstef_beam.benchmarking.benchmarks.liander2024 import Liander2024Category -from openstef_core.types import LeadTime, Q # LeadTime: forecast horizon, Q: quantile - -# --- Output Paths --- -OUTPUT_PATH = Path("./benchmark_results") -BENCHMARK_RESULTS_PATH_XGBOOST = OUTPUT_PATH / "XGBoost" -BENCHMARK_RESULTS_PATH_GBLINEAR = OUTPUT_PATH / "GBLinear" - -# --- Parallelization --- -N_PROCESSES = multiprocessing.cpu_count() # Use all available CPU cores -print(f"🖥️ Running 
with {N_PROCESSES} parallel processes") - -# --- Forecast Configuration --- -FORECAST_HORIZONS = [LeadTime.from_string("P3D")] # 3-day ahead forecast (ISO 8601: P3D) - -# Quantiles for probabilistic forecasting (7 quantiles covering 5th to 95th percentile) -PREDICTION_QUANTILES = [ - Q(0.05), - Q(0.1), - Q(0.3), # Lower quantiles - Q(0.5), # Median - Q(0.7), - Q(0.9), - Q(0.95), # Upper quantiles -] - -# --- Benchmark Filter (optional) --- -# Set to None to run all categories, or specify categories like: -# BENCHMARK_FILTER = [Liander2024Category.TRANSFORMER, Liander2024Category.MV_FEEDER] -BENCHMARK_FILTER: list[Liander2024Category] | None = None - -# %% [markdown] -# ## 🛠️ Model Configuration -# -# We define a **common configuration** that both models share, then create model-specific variants. This ensures fair comparison by keeping all settings identical except the model type. -# -# ### Available Models: -# - **XGBoost** — Gradient boosting trees (handles complex nonlinear patterns) -# - **GBLinear** — Gradient boosted linear model (better extrapolation, faster) - -# %% -# Import workflow configuration -from openstef_models.presets import ForecastingWorkflowConfig - -# Common configuration shared by all models -# This ensures fair comparison by keeping all settings identical -common_config = ForecastingWorkflowConfig( - model_id="benchmark_model_", - run_name=None, - model="flatliner", # Placeholder - will be overwritten per model - # Forecast settings - horizons=FORECAST_HORIZONS, - quantiles=PREDICTION_QUANTILES, - # Model reuse: reuse trained model for same target (speeds up backtesting) - model_reuse_enable=True, - mlflow_storage=None, # Disable MLflow for this demo - # Weather feature column mappings (match dataset column names) - radiation_column="shortwave_radiation", - wind_speed_column="wind_speed_80m", # 80m wind speed for better wind park predictions - pressure_column="surface_pressure", - temperature_column="temperature_2m", - 
relative_humidity_column="relative_humidity_2m", - # Additional features - energy_price_column="EPEX_NL", # Day-ahead electricity price - rolling_aggregate_features=["mean", "median", "max", "min"], # Rolling window stats - # Logging - verbosity=0, # Quiet mode for batch processing -) - -# %% -# Create model-specific configurations by copying common config and updating model type -xgboost_config = common_config.model_copy(update={"model": "xgboost"}) -gblinear_config = common_config.model_copy(update={"model": "gblinear"}) - -print("✅ Model configurations created:") -print(f" - XGBoost: {xgboost_config.model}") -print(f" - GBLinear: {gblinear_config.model}") - -# %% [markdown] -# ## 💾 Storage Configuration -# -# **LocalBenchmarkStorage** manages the file structure for benchmark results: -# ``` -# benchmark_results/ -# ├── XGBoost/ -# │ ├── backtest/ # Raw predictions -# │ ├── evaluation/ # Metrics per target -# │ └── analysis/ # Visualizations (HTML) -# └── GBLinear/ -# └── ... -# ``` - -# %% -# Initialize storage backends for each model -from openstef_beam.benchmarking.storage.local_storage import LocalBenchmarkStorage - -storage_xgboost = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_XGBOOST) -storage_gblinear = LocalBenchmarkStorage(base_path=BENCHMARK_RESULTS_PATH_GBLINEAR) - -print(f"📁 XGBoost results: {BENCHMARK_RESULTS_PATH_XGBOOST}") -print(f"📁 GBLinear results: {BENCHMARK_RESULTS_PATH_GBLINEAR}") - -# %% [markdown] -# ## 🚀 Run Backtests -# -# Now we run the **Liander 2024 Benchmark** — a comprehensive evaluation suite that: -# 1. Downloads the benchmark dataset from HuggingFace Hub (if needed) -# 2. Runs backtests across 5 asset categories (transformers, feeders, solar/wind parks) -# 3. Computes metrics and generates analysis visualizations -# -# ⚠️ **Note**: This may take several minutes depending on your hardware. 
- -# %% -# Import benchmark components -from openstef_beam.benchmarking.baselines.openstef4 import create_openstef4_preset_backtest_forecaster -from openstef_beam.benchmarking.benchmarks.liander2024 import create_liander2024_benchmark_runner -from openstef_beam.benchmarking.callbacks.strict_execution_callback import StrictExecutionCallback - -# --- Run XGBoost Benchmark --- -print("🌲 Running XGBoost benchmark...") -create_liander2024_benchmark_runner( - storage=storage_xgboost, - callbacks=[StrictExecutionCallback()], # Fail fast on errors -).run( - forecaster_factory=create_openstef4_preset_backtest_forecaster( - workflow_config=xgboost_config, - ), - run_name="xgboost", - n_processes=N_PROCESSES, - filter_args=BENCHMARK_FILTER, -) -print("✅ XGBoost benchmark complete!") - -# --- Run GBLinear Benchmark --- -print("\n📈 Running GBLinear benchmark...") -create_liander2024_benchmark_runner( - storage=storage_gblinear, - callbacks=[StrictExecutionCallback()], -).run( - forecaster_factory=create_openstef4_preset_backtest_forecaster( - workflow_config=gblinear_config, - ), - run_name="gblinear", - n_processes=N_PROCESSES, - filter_args=BENCHMARK_FILTER, -) -print("✅ GBLinear benchmark complete!") - -# %% [markdown] -# ## 📊 Compare Model Performance -# -# The **BenchmarkComparisonPipeline** generates side-by-side analysis of multiple models: -# - Global metrics across all targets -# - Per-category breakdowns (transformers, feeders, etc.) 
-# - Time-windowed performance analysis - -# %% -# Run model comparison analysis -from openstef_beam.benchmarking import BenchmarkComparisonPipeline -from openstef_beam.benchmarking.benchmarks.liander2024 import LIANDER2024_ANALYSIS_CONFIG - -# Create comparison pipeline -target_provider = create_liander2024_benchmark_runner( - storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH), -).target_provider - -comparison_pipeline = BenchmarkComparisonPipeline( - analysis_config=LIANDER2024_ANALYSIS_CONFIG, - storage=LocalBenchmarkStorage(base_path=OUTPUT_PATH), - target_provider=target_provider, -) - -# Generate comparison reports -print("📊 Generating comparison analysis...") -comparison_pipeline.run( - run_data={ - "xgboost": storage_xgboost, - "gblinear": storage_gblinear, - } -) -print("✅ Comparison analysis complete!") - -# %% [markdown] -# ## 📈 View Analysis Results -# -# The benchmark generates interactive HTML visualizations. Let's open the most important ones: -# -# ### Key Metrics: -# - **rCRPS** (relative Continuous Ranked Probability Score) — measures probabilistic forecast accuracy -# - **rMAE** (relative Mean Absolute Error) — measures point forecast accuracy -# - Lower values = better performance - -# %% -# Open key analysis plots in browser -# HTML visualizations are interactive and best viewed in a browser -import os -import webbrowser - -# Base path for analysis results -analysis_base = os.path.abspath("./benchmark_results/analysis/D-1T06:00") - -# Define key visualizations to open -visualizations = [ - ("rCRPS Grouped by Category", "rCRPS_grouped.html"), - ("rCRPS Time-Windowed (7 days)", "rCRPS_windowed_7D.html"), -] - -print("🌐 Opening analysis visualizations in browser...\n") -for name, filename in visualizations: - filepath = os.path.join(analysis_base, filename) - if Path(filepath).exists(): - print(f" 📊 {name}") - webbrowser.open(f"file://{filepath}") - else: - print(f" ⚠️ {name} not found at {filepath}") - -# %% [markdown] -# ### 🔍 Explore 
Individual Target Results -# -# You can also view time series plots for individual targets. Let's look at a transformer forecast: - -# %% -# List available target-specific visualizations -import glob - -# Find all time series plots for individual targets -target_plots = glob.glob("./benchmark_results/XGBoost/analysis/*/*/time_series_plot*.html") - -if target_plots: - print("📊 Available target-specific time series plots:\n") - for i, plot in enumerate(sorted(target_plots)[:5]): # Show first 5 - parts = plot.split("/") - category = parts[-3] # e.g., "transformer" - target = parts[-2] # e.g., "OS Apeldoorn" - print(f" {i + 1}. {category}/{target}") - - # Open the first transformer plot as an example - transformer_plots = [p for p in target_plots if "transformer" in p] - if transformer_plots: - example_plot = os.path.abspath(transformer_plots[0]) - print(f"\n🌐 Opening example: {transformer_plots[0]}") - webbrowser.open(f"file://{example_plot}") -else: - print("⚠️ No target-specific plots found. Run the benchmark first.") - -# %% [markdown] -# --- -# -# ## 🎯 Summary -# -# In this tutorial, you learned how to: -# -# 1. ✅ **Configure benchmark experiments** with `ForecastingWorkflowConfig` -# 2. ✅ **Run parallel backtests** using the Liander 2024 benchmark -# 3. ✅ **Compare models** (XGBoost vs GBLinear) with `BenchmarkComparisonPipeline` -# 4. ✅ **Analyze results** with interactive HTML visualizations -# -# ### 📁 Output Structure -# -# ``` -# benchmark_results/ -# ├── XGBoost/ -# │ ├── backtest/ # Raw predictions (parquet) -# │ ├── evaluation/ # Metrics per target -# │ └── analysis/ # HTML visualizations -# ├── GBLinear/ -# │ └── ... 
-# └── analysis/ # Comparison analysis (both models) -# └── D-1T06:00/ -# ├── rCRPS_grouped.html # Probabilistic accuracy by category -# ├── rMAE_grouped.html # Point forecast accuracy -# └── summary.html # Overall summary -# ``` -# -# ### 🚀 Next Steps -# -# - Experiment with different `FORECAST_HORIZONS` (e.g., `"PT6H"`, `"P7D"`) -# - Add more quantiles for higher resolution prediction intervals -# - Filter specific categories with `BENCHMARK_FILTER` -# - Integrate MLflow for experiment tracking diff --git a/examples/tutorials/backtesting_quickstart.ipynb b/examples/tutorials/backtesting_quickstart.ipynb new file mode 100644 index 000000000..4b8304116 --- /dev/null +++ b/examples/tutorials/backtesting_quickstart.ipynb @@ -0,0 +1,435 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "712d20d4", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab1b247e", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " \"lightgbm\",\n", + " \"openstef_beam.backtesting\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "35a1520c", + "metadata": {}, + "source": [ + "# Backtesting Quickstart\n", + "\n", + "Backtesting simulates how a forecasting model would have performed in 
a\n", + "real operational setting. Unlike a simple train/test split, it respects\n", + "temporal constraints: models are retrained on a schedule and predictions\n", + "use only data that would have been available at prediction time.\n", + "\n", + "**What you will learn:**\n", + "\n", + "- How to set up a backtesting pipeline with [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)\n", + "- How to configure prediction and retraining schedules\n", + "- How to evaluate backtest results with standardized metrics\n", + "\n", + "```{note}\n", + "This tutorial shows the low-level backtesting API step by step.\n", + "For production use, the **benchmark framework** (`openstef_beam.benchmarking`)\n", + "wraps all of this into a single pipeline call — see\n", + "`examples/benchmarks/` for ready-to-run examples.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html)\n", + "· [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html)\n", + "· [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.backtest_forecaster.BacktestForecasterConfig.html)\n", + "· [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html)" + ] + }, + { + "cell_type": "markdown", + "id": "ef3b0032", + "metadata": {}, + "source": [ + "## How backtesting works\n", + "\n", + "A backtesting pipeline replays history as if it were happening in real-time:\n", + "\n", + "1. **Event generation** — the pipeline creates a schedule of prediction and\n", + " retraining events based on configured intervals.\n", + "2. **Training** — at each retraining event, the model is fitted on all data\n", + " available up to that point (no lookahead).\n", + "3. 
**Prediction** — at each prediction event, the model generates a forecast\n", + " using only data published before that moment.\n", + "4. **Collection** — all forecasts are gathered into a single dataset for\n", + " evaluation against ground truth." + ] + }, + { + "cell_type": "markdown", + "id": "39f81027", + "metadata": {}, + "source": [ + "## Load the versioned dataset\n", + "\n", + "Backtesting requires **versioned** data — each data point carries an\n", + "`available_at` timestamp indicating when it became known. This prevents\n", + "the model from accidentally using future information.\n", + "[`VersionedTimeSeriesDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.VersionedTimeSeriesDataset.html)\n", + "provides this out of the box." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74ff72b0", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "from openstef_core.datasets import VersionedTimeSeriesDataset\n", + "\n", + "data_dir = Path(\"liander_dataset\")\n", + "\n", + "# Ground truth: actual load measurements\n", + "ground_truth = VersionedTimeSeriesDataset.read_parquet(\n", + " data_dir / \"load_measurements\" / \"mv_feeder\" / \"OS Gorredijk.parquet\"\n", + ")\n", + "\n", + "# Predictors: versioned weather forecasts (available_at < forecast time)\n", + "predictors = VersionedTimeSeriesDataset.read_parquet(\n", + " data_dir / \"weather_forecasts_versioned\" / \"mv_feeder\" / \"OS Gorredijk.parquet\"\n", + ")\n", + "\n", + "print(f\"Ground truth: {len(ground_truth.index):,} timestamps, {len(ground_truth.feature_names)} features\")\n", + "print(f\"Predictors: {len(predictors.index):,} timestamps, {len(predictors.feature_names)} features\")" + ] + }, + { + "cell_type": "markdown", + "id": "52acd6ce", + "metadata": {}, + "source": [ + "## Configure the forecaster\n", + "\n", + "We wrap a standard 
[`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) in an\n", + "`OpenSTEF4BacktestForecaster` which implements the backtesting interface\n", + "(fit/predict with temporal constraints)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c1aa76d", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "\n", + "from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig\n", + "from openstef_beam.benchmarking.baselines.openstef4 import OpenSTEF4BacktestForecaster\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", + "\n", + "workflow_config = ForecastingWorkflowConfig(\n", + " model_id=\"backtest_demo\",\n", + " model=\"xgboost\",\n", + " horizons=[LeadTime.from_string(\"PT48H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", + " target_column=\"load\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " mlflow_storage=None,\n", + " verbosity=0,\n", + ")\n", + "\n", + "backtest_forecaster_config = BacktestForecasterConfig(\n", + " requires_training=True,\n", + " predict_length=timedelta(hours=48),\n", + " predict_min_length=timedelta(minutes=15),\n", + " predict_context_length=timedelta(days=14),\n", + " predict_context_min_coverage=0.5,\n", + " training_context_length=timedelta(days=90),\n", + " training_context_min_coverage=0.5,\n", + ")\n", + "\n", + "workflow = create_forecasting_workflow(workflow_config)\n", + "forecaster = OpenSTEF4BacktestForecaster(\n", + " config=backtest_forecaster_config,\n", + " workflow_template=workflow,\n", + " cache_dir=Path(\"cache/backtest_demo\"),\n", + ")\n", + 
"\n", + "print(f\"Model: {workflow_config.model}\")\n", + "print(f\"Training window: {backtest_forecaster_config.training_context_length}\")\n", + "print(f\"Predict horizon: {backtest_forecaster_config.predict_length}\")" + ] + }, + { + "cell_type": "markdown", + "id": "fc797d30", + "metadata": {}, + "source": [ + "## Run the backtest\n", + "\n", + "We configure the pipeline to predict every 6 hours and retrain weekly.\n", + "The backtest covers a short 5-day window for fast execution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11b0adeb", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "from openstef_beam.backtesting import BacktestConfig, BacktestPipeline\n", + "\n", + "backtest_config = BacktestConfig(\n", + " prediction_sample_interval=timedelta(minutes=15),\n", + " predict_interval=timedelta(hours=6),\n", + " train_interval=timedelta(days=7),\n", + ")\n", + "\n", + "pipeline = BacktestPipeline(config=backtest_config, forecaster=forecaster)\n", + "\n", + "# Short evaluation window: 5 days starting well into the dataset\n", + "backtest_start = datetime.fromisoformat(\"2024-05-01T00:00:00Z\")\n", + "backtest_end = datetime.fromisoformat(\"2024-05-06T00:00:00Z\")\n", + "\n", + "predictions = pipeline.run(\n", + " ground_truth=ground_truth,\n", + " predictors=predictors,\n", + " start=backtest_start,\n", + " end=backtest_end,\n", + ")\n", + "\n", + "print(f\"Predictions generated: {predictions.data.shape[0]:,} rows\")\n", + "print(f\"Time range: {predictions.data.index.min()} to {predictions.data.index.max()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94bdbdc2", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert predictions.data.shape[0] > 100, f\"Expected >100 prediction rows, got {predictions.data.shape[0]}\"" + ] + }, + { + "cell_type": "markdown", + "id": "37e479ab", + "metadata": {}, + "source": [ + "## 
Evaluate the results\n", + "\n", + "The [`EvaluationPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationPipeline.html) computes metrics over configurable time windows.\n", + "It filters predictions by lead time to produce meaningful comparisons\n", + "(e.g., day-ahead forecasts only).\n", + "\n", + "We use [rMAE](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RMAEProvider.html) (relative Mean Absolute Error) and [rCRPS](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) (relative Continuous\n", + "Ranked Probability Score) — both normalized by mean absolute actuals.\n", + "See the full list of [available metrics](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.html).\n", + "If your scores are suboptimal, {doc}`hyperparameter_tuning_with_optuna`\n", + "shows how to optimize model parameters before re-running the backtest." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e938dc3", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_beam.evaluation import EvaluationConfig, EvaluationPipeline, Window\n", + "from openstef_beam.evaluation.metric_providers import RCRPSProvider, RMAEProvider\n", + "\n", + "evaluation_config = EvaluationConfig(\n", + " windows=[Window(lag=timedelta(hours=0), size=timedelta(days=5))],\n", + " lead_times=[], # Only use available_at filtering (day-ahead)\n", + ")\n", + "\n", + "eval_pipeline = EvaluationPipeline(\n", + " config=evaluation_config,\n", + " quantiles=workflow_config.quantiles,\n", + " window_metric_providers=[\n", + " RMAEProvider(quantiles=[Q(0.5)]),\n", + " RCRPSProvider(),\n", + " ],\n", + " global_metric_providers=[\n", + " RMAEProvider(quantiles=[Q(0.5)]),\n", + " RCRPSProvider(),\n", + " ],\n", + ")\n", + "\n", + "report = eval_pipeline.run(\n", + " predictions=predictions,\n", + " ground_truth=ground_truth,\n", + " target_column=\"load\",\n", + ")\n", + "\n", + "print(\"Backtest evaluation metrics (day-ahead):\")\n", + "for subset_report in report.subset_reports:\n", + " print(f\"\\n Lead-time filter: {subset_report.filtering}\")\n", + " for metric in subset_report.metrics:\n", + " df = metric.to_dataframe()\n", + " print(f\" Window: {metric.window}\")\n", + " print(df.to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "id": "98c0b36d", + "metadata": {}, + "source": [ + "## Visualize predictions vs actuals\n", + "\n", + "The evaluation report contains a properly filtered [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) for\n", + "each lead-time subset. We use this directly for visualization — it\n", + "shows only day-ahead predictions aligned with their corresponding actuals." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b17332ed", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", + "\n", + "# The evaluation subset contains actuals + predictions filtered by lead time\n", + "subset = report.subset_reports[0].subset\n", + "\n", + "plotter = ForecastTimeSeriesPlotter()\n", + "plotter.add_measurements(measurements=subset.target_series)\n", + "plotter.add_model(\n", + " model_name=\"XGBoost (day-ahead)\",\n", + " forecast=subset.median_series,\n", + " quantiles=subset.quantiles_data,\n", + ")\n", + "\n", + "fig = plotter.plot()\n", + "fig.update_layout(\n", + " title=\"Backtest: Day-Ahead Forecast vs Actuals\",\n", + " xaxis_title=\"Time\",\n", + " yaxis_title=\"Load (W)\",\n", + " height=400,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "9e6e5ad7", + "metadata": {}, + "source": [ + "## The easy way: benchmark framework\n", + "\n", + "The code above demonstrates each backtesting step explicitly. 
In practice,\n", + "the **benchmark framework** handles all of this (data loading, target\n", + "management, evaluation, analysis) in a single pipeline:\n", + "\n", + "```python\n", + "from openstef_beam.benchmarking.benchmarks.liander2024 import (\n", + " create_liander2024_benchmark_runner,\n", + ")\n", + "from openstef_beam.benchmarking.baselines.openstef4 import (\n", + " create_openstef4_preset_backtest_forecaster,\n", + ")\n", + "\n", + "runner = create_liander2024_benchmark_runner()\n", + "forecaster_factory = create_openstef4_preset_backtest_forecaster(workflow_config)\n", + "runner.run(forecaster_factory, run_name=\"my_experiment\")\n", + "```\n", + "\n", + "The benchmark runner automatically:\n", + "- Downloads and manages the dataset\n", + "- Iterates over all targets (feeders, transformers, solar parks, etc.)\n", + "- Runs backtests with standardized configuration\n", + "- Computes metrics and generates analysis visualizations\n", + "\n", + "See `examples/benchmarks/` for complete benchmark notebooks that you can\n", + "run locally (they are not executed during the docs build)." + ] + }, + { + "cell_type": "markdown", + "id": "f9814713", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`hyperparameter_tuning_with_optuna` — optimize model parameters,\n", + " then re-run the backtest to measure improvement.\n", + "- {doc}`ensemble_forecasting` — backtest an ensemble of diverse models."
+ ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/backtesting_quickstart.py b/examples/tutorials/backtesting_quickstart.py new file mode 100644 index 000000000..cccc0d072 --- /dev/null +++ b/examples/tutorials/backtesting_quickstart.py @@ -0,0 +1,316 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% tags=["remove-cell"] +import warnings + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + "lightgbm", + "openstef_beam.backtesting", + ), +) + +# %% [markdown] +# # Backtesting Quickstart +# +# Backtesting simulates how a forecasting model would have performed in a +# real operational setting. Unlike a simple train/test split, it respects +# temporal constraints: models are retrained on a schedule and predictions +# use only data that would have been available at prediction time. 
+# +# **What you will learn:** +# +# - How to set up a backtesting pipeline with [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html) +# - How to configure prediction and retraining schedules +# - How to evaluate backtest results with standardized metrics +# +# ```{note} +# This tutorial shows the low-level backtesting API step by step. +# For production use, the **benchmark framework** (`openstef_beam.benchmarking`) +# wraps all of this into a single pipeline call — see +# `examples/benchmarks/` for ready-to-run examples. +# ``` +# +# **Key API references:** +# [`BacktestPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestPipeline.html) +# · [`BacktestConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.BacktestConfig.html) +# · [`BacktestForecasterConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.backtesting.backtest_forecaster.BacktestForecasterConfig.html) +# · [`EvaluationConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationConfig.html) + +# %% [markdown] +# ## How backtesting works +# +# A backtesting pipeline replays history as if it were happening in real-time: +# +# 1. **Event generation** — the pipeline creates a schedule of prediction and +# retraining events based on configured intervals. +# 2. **Training** — at each retraining event, the model is fitted on all data +# available up to that point (no lookahead). +# 3. **Prediction** — at each prediction event, the model generates a forecast +# using only data published before that moment. +# 4. **Collection** — all forecasts are gathered into a single dataset for +# evaluation against ground truth. + +# %% [markdown] +# ## Load the versioned dataset +# +# Backtesting requires **versioned** data — each data point carries an +# `available_at` timestamp indicating when it became known. 
This prevents +# the model from accidentally using future information. +# [`VersionedTimeSeriesDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.VersionedTimeSeriesDataset.html) +# provides this out of the box. + +# %% +from pathlib import Path + +from openstef_core.datasets import VersionedTimeSeriesDataset + +data_dir = Path("liander_dataset") + +# Ground truth: actual load measurements +ground_truth = VersionedTimeSeriesDataset.read_parquet( + data_dir / "load_measurements" / "mv_feeder" / "OS Gorredijk.parquet" +) + +# Predictors: versioned weather forecasts (available_at < forecast time) +predictors = VersionedTimeSeriesDataset.read_parquet( + data_dir / "weather_forecasts_versioned" / "mv_feeder" / "OS Gorredijk.parquet" +) + +print(f"Ground truth: {len(ground_truth.index):,} timestamps, {len(ground_truth.feature_names)} features") +print(f"Predictors: {len(predictors.index):,} timestamps, {len(predictors.feature_names)} features") + +# %% [markdown] +# ## Configure the forecaster +# +# We wrap a standard [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) in an +# `OpenSTEF4BacktestForecaster` which implements the backtesting interface +# (fit/predict with temporal constraints). 
+ +# %% +from datetime import timedelta + +from openstef_beam.backtesting.backtest_forecaster import BacktestForecasterConfig +from openstef_beam.benchmarking.baselines.openstef4 import OpenSTEF4BacktestForecaster +from openstef_core.types import LeadTime, Q +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow + +workflow_config = ForecastingWorkflowConfig( + model_id="backtest_demo", + model="xgboost", + horizons=[LeadTime.from_string("PT48H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], + target_column="load", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + mlflow_storage=None, + verbosity=0, +) + +backtest_forecaster_config = BacktestForecasterConfig( + requires_training=True, + predict_length=timedelta(hours=48), + predict_min_length=timedelta(minutes=15), + predict_context_length=timedelta(days=14), + predict_context_min_coverage=0.5, + training_context_length=timedelta(days=90), + training_context_min_coverage=0.5, +) + +workflow = create_forecasting_workflow(workflow_config) +forecaster = OpenSTEF4BacktestForecaster( + config=backtest_forecaster_config, + workflow_template=workflow, + cache_dir=Path("cache/backtest_demo"), +) + +print(f"Model: {workflow_config.model}") +print(f"Training window: {backtest_forecaster_config.training_context_length}") +print(f"Predict horizon: {backtest_forecaster_config.predict_length}") + +# %% [markdown] +# ## Run the backtest +# +# We configure the pipeline to predict every 6 hours and retrain weekly. +# The backtest covers a short 5-day window for fast execution. 
+ +# %% +from datetime import datetime + +from openstef_beam.backtesting import BacktestConfig, BacktestPipeline + +backtest_config = BacktestConfig( + prediction_sample_interval=timedelta(minutes=15), + predict_interval=timedelta(hours=6), + train_interval=timedelta(days=7), +) + +pipeline = BacktestPipeline(config=backtest_config, forecaster=forecaster) + +# Short evaluation window: 5 days starting well into the dataset +backtest_start = datetime.fromisoformat("2024-05-01T00:00:00Z") +backtest_end = datetime.fromisoformat("2024-05-06T00:00:00Z") + +predictions = pipeline.run( + ground_truth=ground_truth, + predictors=predictors, + start=backtest_start, + end=backtest_end, +) + +print(f"Predictions generated: {predictions.data.shape[0]:,} rows") +print(f"Time range: {predictions.data.index.min()} to {predictions.data.index.max()}") + +# %% tags=["remove-cell"] +assert predictions.data.shape[0] > 100, f"Expected >100 prediction rows, got {predictions.data.shape[0]}" + +# %% [markdown] +# ## Evaluate the results +# +# The [`EvaluationPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.EvaluationPipeline.html) computes metrics over configurable time windows. +# It filters predictions by lead time to produce meaningful comparisons +# (e.g., day-ahead forecasts only). +# +# We use [rMAE](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RMAEProvider.html) (relative Mean Absolute Error) and [rCRPS](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) (relative Continuous +# Ranked Probability Score) — both normalized by mean absolute actuals. +# See the full list of [available metrics](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.html). 
+# If your scores are suboptimal, {doc}`hyperparameter_tuning_with_optuna` +# shows how to optimize model parameters before re-running the backtest. + +# %% +from openstef_beam.evaluation import EvaluationConfig, EvaluationPipeline, Window +from openstef_beam.evaluation.metric_providers import RCRPSProvider, RMAEProvider + +evaluation_config = EvaluationConfig( + windows=[Window(lag=timedelta(hours=0), size=timedelta(days=5))], + lead_times=[], # Only use available_at filtering (day-ahead) +) + +eval_pipeline = EvaluationPipeline( + config=evaluation_config, + quantiles=workflow_config.quantiles, + window_metric_providers=[ + RMAEProvider(quantiles=[Q(0.5)]), + RCRPSProvider(), + ], + global_metric_providers=[ + RMAEProvider(quantiles=[Q(0.5)]), + RCRPSProvider(), + ], +) + +report = eval_pipeline.run( + predictions=predictions, + ground_truth=ground_truth, + target_column="load", +) + +print("Backtest evaluation metrics (day-ahead):") +for subset_report in report.subset_reports: + print(f"\n Lead-time filter: {subset_report.filtering}") + for metric in subset_report.metrics: + df = metric.to_dataframe() + print(f" Window: {metric.window}") + print(df.to_string(index=False)) + +# %% [markdown] +# ## Visualize predictions vs actuals +# +# The evaluation report contains a properly filtered [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) for +# each lead-time subset. We use this directly for visualization — it +# shows only day-ahead predictions aligned with their corresponding actuals. 
+ +# %% tags=["hide-input"] +from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter + +# The evaluation subset contains actuals + predictions filtered by lead time +subset = report.subset_reports[0].subset + +plotter = ForecastTimeSeriesPlotter() +plotter.add_measurements(measurements=subset.target_series) +plotter.add_model( + model_name="XGBoost (day-ahead)", + forecast=subset.median_series, + quantiles=subset.quantiles_data, +) + +fig = plotter.plot() +fig.update_layout( + title="Backtest: Day-Ahead Forecast vs Actuals", + xaxis_title="Time", + yaxis_title="Load (W)", + height=400, +) +fig.show() + +# %% [markdown] +# ## The easy way: benchmark framework +# +# The code above demonstrates each backtesting step explicitly. In practice, +# the **benchmark framework** handles all of this (data loading, target +# management, evaluation, analysis) in a single pipeline: +# +# ```python +# from openstef_beam.benchmarking.benchmarks.liander2024 import ( +# create_liander2024_benchmark_runner, +# ) +# from openstef_beam.benchmarking.baselines.openstef4 import ( +# create_openstef4_preset_backtest_forecaster, +# ) +# +# runner = create_liander2024_benchmark_runner() +# forecaster_factory = create_openstef4_preset_backtest_forecaster(workflow_config) +# runner.run(forecaster_factory, run_name="my_experiment") +# ``` +# +# The benchmark runner automatically: +# - Downloads and manages the dataset +# - Iterates over all targets (feeders, transformers, solar parks, etc.) +# - Runs backtests with standardized configuration +# - Computes metrics and generates analysis visualizations +# +# See `examples/benchmarks/` for complete benchmark notebooks that you can +# run locally (they are not executed during the docs build). + +# %% [markdown] +# ## Next steps +# +# - {doc}`hyperparameter_tuning_with_optuna` — optimize model parameters, +# then re-run the backtest to measure improvement. +# - {doc}`ensemble_forecasting` — backtest an ensemble of diverse models.
diff --git a/examples/tutorials/custom_pipeline.ipynb b/examples/tutorials/custom_pipeline.ipynb new file mode 100644 index 000000000..143609092 --- /dev/null +++ b/examples/tutorials/custom_pipeline.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "e932643b", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7068def", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "81d0d171", + "metadata": {}, + "source": [ + "# Building a Custom Pipeline\n", + "\n", + "The [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) preset handles pipeline assembly\n", + "automatically. 
When you need full control — custom transforms, different\n", + "feature engineering, or non-standard postprocessing — you can build a\n", + "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) from individual components.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Assemble preprocessing, forecaster, and postprocessing into a pipeline\n", + "- Select and configure individual transforms\n", + "- Train and predict with a hand-built pipeline\n", + "- Compare the custom pipeline against a preset\n", + "\n", + "```{note}\n", + "This tutorial is for advanced users who need to go beyond presets.\n", + "Start with {doc}`forecasting_quickstart` for the standard approach.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html)\n", + "· [`TransformPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.TransformPipeline.html)\n", + "· [`GBLinearForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)" + ] + }, + { + "cell_type": "markdown", + "id": "9418a6bc", + "metadata": {}, + "source": [ + "## Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7322debe", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "from openstef_core.types import LeadTime, Q\n", + "\n", + "dataset = load_liander_dataset()\n", + "\n", + "from datetime import datetime\n", + "\n", + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "predict_dataset = 
dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "print(f\"Training: {train_dataset.data.shape[0]:,} rows\")\n", + "print(f\"Predict: {predict_dataset.data.shape[0]:,} rows\")" + ] + }, + { + "cell_type": "markdown", + "id": "3c407256", + "metadata": {}, + "source": [ + "## Define pipeline components\n", + "\n", + "A [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) has three stages:\n", + "\n", + "1. **Preprocessing** — feature engineering and data cleaning transforms\n", + "2. **Forecaster** — the model that produces predictions\n", + "3. **Postprocessing** — transforms applied to the forecast output\n", + "\n", + "Below we build each stage explicitly." + ] + }, + { + "cell_type": "markdown", + "id": "431e2214", + "metadata": {}, + "source": [ + "### Preprocessing\n", + "\n", + "We select transforms from the available modules:\n", + "\n", + "| Module | Transforms |\n", + "|--------|-----------|\n", + "| `transforms.general` | Scaler, Imputer, NaNDropper, OutlierHandler, EmptyFeatureRemover |\n", + "| `transforms.time_domain` | HolidayFeatureAdder, DatetimeFeaturesAdder, CyclicFeaturesAdder, LagsAdder |\n", + "| `transforms.weather_domain` | AtmosphereDerivedFeaturesAdder, DaylightFeatureAdder, RadiationDerivedFeaturesAdder |\n", + "| `transforms.energy_domain` | WindPowerFeatureAdder |\n", + "| `transforms.validation` | CompletenessChecker, FlatlineChecker |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8ae7bb5", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_core.mixins import TransformPipeline\n", + "from openstef_models.transforms.general import EmptyFeatureRemover, Imputer, NaNDropper, Scaler\n", + "from openstef_models.transforms.time_domain import CyclicFeaturesAdder, HolidayFeatureAdder\n", + "from openstef_models.transforms.time_domain.lags_adder import LagsAdder\n", + 
"from openstef_models.utils.feature_selection import Exclude\n", + "\n", + "quantiles = [Q(0.1), Q(0.5), Q(0.9)]\n", + "horizons = [LeadTime.from_string(\"PT36H\")]\n", + "\n", + "preprocessing = TransformPipeline(\n", + " transforms=[\n", + " # Feature engineering\n", + " LagsAdder(\n", + " history_available=timedelta(days=14),\n", + " horizons=horizons,\n", + " add_trivial_lags=False,\n", + " target_column=\"load\",\n", + " custom_lags=[timedelta(days=7)],\n", + " lag_fallback_offset=timedelta(days=7),\n", + " ),\n", + " CyclicFeaturesAdder(),\n", + " HolidayFeatureAdder(country_code=\"NL\"),\n", + " # Standardization\n", + " Scaler(selection=Exclude(\"load\"), method=\"standard\"),\n", + " EmptyFeatureRemover(),\n", + " # Missing value handling\n", + " Imputer(selection=Exclude(\"load\"), imputation_strategy=\"mean\"),\n", + " NaNDropper(selection=Exclude(\"load\")),\n", + " ]\n", + ")\n", + "\n", + "print(f\"Preprocessing steps: {len(preprocessing.transforms)}\")\n", + "for t in preprocessing.transforms:\n", + " print(f\" - {type(t).__name__}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7e7a9dab", + "metadata": {}, + "source": [ + "### Forecaster\n", + "\n", + "We use `GBLinearForecaster` — a gradient-boosted linear model that works well\n", + "with the Imputer + NaNDropper preprocessing pattern above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdfcd4cb", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.models.forecasting.gblinear_forecaster import (\n", + " GBLinearForecaster,\n", + " GBLinearHyperParams,\n", + ")\n", + "\n", + "forecaster = GBLinearForecaster(\n", + " quantiles=quantiles,\n", + " horizons=horizons,\n", + " hyperparams=GBLinearHyperParams(\n", + " n_steps=100,\n", + " learning_rate=0.3,\n", + " ),\n", + " verbosity=0,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5ed31867", + "metadata": {}, + "source": [ + "### Postprocessing\n", + "\n", + "We add a [`QuantileSorter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.QuantileSorter.html) (ensures quantile ordering) and a\n", + "[`ConfidenceIntervalApplicator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.ConfidenceIntervalApplicator.html) (adds confidence interval columns)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f200e18", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.transforms.postprocessing import (\n", + " ConfidenceIntervalApplicator,\n", + " QuantileSorter,\n", + ")\n", + "\n", + "postprocessing = TransformPipeline(\n", + " transforms=[\n", + " QuantileSorter(),\n", + " ConfidenceIntervalApplicator(\n", + " quantiles=quantiles,\n", + " add_quantiles_from_std=False,\n", + " ),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3cd19602", + "metadata": {}, + "source": [ + "## Assemble the model\n", + "\n", + "[`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) combines all three stages. 
We wrap it in a\n", + "[`CustomForecastingWorkflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.workflows.custom_forecasting_workflow.CustomForecastingWorkflow.html) which adds train/predict orchestration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "055172cf", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.models.forecasting_model import ForecastingModel\n", + "from openstef_models.workflows import CustomForecastingWorkflow\n", + "\n", + "model = ForecastingModel(\n", + " preprocessing=preprocessing,\n", + " forecaster=forecaster,\n", + " postprocessing=postprocessing,\n", + " target_column=\"load\",\n", + ")\n", + "\n", + "workflow = CustomForecastingWorkflow(\n", + " model_id=\"custom_pipeline_demo\",\n", + " model=model,\n", + " callbacks=[],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "12bc07c5", + "metadata": {}, + "source": [ + "## Train and predict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4876ddc6", + "metadata": {}, + "outputs": [], + "source": [ + "result = workflow.fit(train_dataset)\n", + "forecast = workflow.predict(predict_dataset, forecast_start=train_end)\n", + "\n", + "print(f\"Forecast rows: {len(forecast.data)}\")\n", + "print(f\"Columns: {list(forecast.data.columns)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdabc2d1", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "56f58706", + "metadata": {}, + "source": [ + "## Visualize the result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec9920d7", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", + "\n", + "fig = (\n", + " 
ForecastTimeSeriesPlotter()\n", + " .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n", + " .add_model(\n", + " model_name=\"Custom GBLinear\",\n", + " forecast=forecast.median_series,\n", + " quantiles=forecast.quantiles_data,\n", + " )\n", + " .plot()\n", + ")\n", + "fig.update_layout(\n", + " title=\"Custom pipeline — forecast vs actuals\",\n", + " yaxis_title=\"Load (MW)\",\n", + " xaxis_title=\"Time\",\n", + " height=450,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ac970781", + "metadata": {}, + "source": [ + "## Using components individually\n", + "\n", + "`ForecastingModel` is convenient, but every component also works on its\n", + "own. You can run the preprocessing pipeline, inspect intermediate data,\n", + "and call the forecaster directly." + ] + }, + { + "cell_type": "markdown", + "id": "c5caef5f", + "metadata": {}, + "source": [ + "### Run preprocessing on raw data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "638c7673", + "metadata": {}, + "outputs": [], + "source": [ + "preprocessed = model.preprocessing.transform(train_dataset)\n", + "\n", + "print(f\"Before preprocessing: {train_dataset.data.shape[1]} columns\")\n", + "print(f\"After preprocessing: {preprocessed.data.shape[1]} columns\")\n", + "print(f\"\\nAdded features: {sorted(set(preprocessed.data.columns) - set(train_dataset.data.columns))[:8]}...\")" + ] + }, + { + "cell_type": "markdown", + "id": "2560845f", + "metadata": {}, + "source": [ + "### Run a single transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "926884f3", + "metadata": {}, + "outputs": [], + "source": [ + "single_transform = CyclicFeaturesAdder()\n", + "single_transform.fit(train_dataset)\n", + "result_single = single_transform.transform(train_dataset)\n", + "\n", + "print(\n", + " f\"CyclicFeaturesAdder added {len(single_transform.features_added())} columns: {single_transform.features_added()}\"\n", + ")" 
+ ] + }, + { + "cell_type": "markdown", + "id": "0baca441", + "metadata": {}, + "source": [ + "### Call the forecaster directly\n", + "\n", + "After preprocessing, you can pass the data to a [`ForecastInputDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastInputDataset.html)\n", + "and call the forecaster directly.\n", + "This is useful for debugging or integrating into custom workflows." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e610d16", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_core.datasets import ForecastInputDataset\n", + "\n", + "# Preprocess the prediction data\n", + "preprocessed_predict = model.preprocessing.transform(predict_dataset)\n", + "\n", + "# Convert to ForecastInputDataset (what the forecaster expects)\n", + "forecast_input = ForecastInputDataset(\n", + " data=preprocessed_predict.data,\n", + " sample_interval=preprocessed_predict.sample_interval,\n", + " target_column=\"load\",\n", + " forecast_start=train_end,\n", + ")\n", + "\n", + "# Call the forecaster directly\n", + "raw_forecast = model.forecaster.predict(forecast_input)\n", + "print(f\"Raw forecast shape: {raw_forecast.data.shape}\")\n", + "print(f\"Raw forecast columns: {list(raw_forecast.data.columns)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "064b3de5", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`ensemble_forecasting` — combine your custom pipeline with other\n", + " models into an ensemble for better accuracy.\n", + "- {doc}`quantile_calibration` — append isotonic calibration to your\n", + " postprocessing for more reliable confidence intervals." 
+ ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/custom_pipeline.py b/examples/tutorials/custom_pipeline.py new file mode 100644 index 000000000..0d623a607 --- /dev/null +++ b/examples/tutorials/custom_pipeline.py @@ -0,0 +1,320 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% tags=["remove-cell"] +import warnings + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + ), +) + +# %% [markdown] +# # Building a Custom Pipeline +# +# The [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) preset handles pipeline assembly +# automatically. When you need full control — custom transforms, different +# feature engineering, or non-standard postprocessing — you can build a +# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) from individual components. 
+#
+# **What you'll learn:**
+#
+# - Assemble preprocessing, forecaster, and postprocessing into a pipeline
+# - Select and configure individual transforms
+# - Train and predict with a hand-built pipeline
+# - Run the preprocessing pipeline, a single transform, and the forecaster on their own
+#
+# ```{note}
+# This tutorial is for advanced users who need to go beyond presets.
+# Start with {doc}`forecasting_quickstart` for the standard approach.
+# ```
+#
+# **Key API references:**
+# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html)
+# · [`TransformPipeline`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.TransformPipeline.html)
+# · [`GBLinearForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)
+
+# %% [markdown]
+# ## Load the dataset
+
+# %%
+from datetime import datetime, timedelta
+
+from openstef_core.testing import load_liander_dataset
+from openstef_core.types import LeadTime, Q
+
+dataset = load_liander_dataset()
+
+# 45 days of training data followed by a 7-day forecast window.
+# NOTE: the trailing "Z" is only parsed by `datetime.fromisoformat` on Python 3.11+.
+train_start = datetime.fromisoformat("2024-03-01T00:00:00Z")
+train_end = train_start + timedelta(days=45)
+forecast_end = train_end + timedelta(days=7)
+
+train_dataset = dataset.filter_by_range(start=train_start, end=train_end)
+# The prediction slice starts 14 days before `train_end`. Presumably this gives
+# the lag features enough history at the start of the forecast window (the
+# LagsAdder below is configured with `history_available=timedelta(days=14)`).
+predict_dataset = dataset.filter_by_range(
+    start=train_end - timedelta(days=14),
+    end=forecast_end,
+)
+
+print(f"Training: {train_dataset.data.shape[0]:,} rows")
+print(f"Predict: {predict_dataset.data.shape[0]:,} rows")
+
+# %% [markdown]
+# ## Define pipeline components
+#
+# A [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) has three stages:
+#
+# 1. **Preprocessing** — feature engineering and data cleaning transforms
+# 2. **Forecaster** — the model that produces predictions
+# 3. **Postprocessing** — transforms applied to the forecast output
+#
+# Below we build each stage explicitly.
+
+# %% [markdown]
+# ### Preprocessing
+#
+# We select transforms from the available modules:
+#
+# | Module | Transforms |
+# |--------|-----------|
+# | `transforms.general` | Scaler, Imputer, NaNDropper, OutlierHandler, EmptyFeatureRemover |
+# | `transforms.time_domain` | HolidayFeatureAdder, DatetimeFeaturesAdder, CyclicFeaturesAdder, LagsAdder |
+# | `transforms.weather_domain` | AtmosphereDerivedFeaturesAdder, DaylightFeatureAdder, RadiationDerivedFeaturesAdder |
+# | `transforms.energy_domain` | WindPowerFeatureAdder |
+# | `transforms.validation` | CompletenessChecker, FlatlineChecker |
+
+# %%
+from openstef_core.mixins import TransformPipeline
+from openstef_models.transforms.general import EmptyFeatureRemover, Imputer, NaNDropper, Scaler
+from openstef_models.transforms.time_domain import CyclicFeaturesAdder, HolidayFeatureAdder
+from openstef_models.transforms.time_domain.lags_adder import LagsAdder
+from openstef_models.utils.feature_selection import Exclude
+
+# Quantile levels 0.1, 0.5 and 0.9. The same list is passed to the forecaster
+# and to the postprocessing stage further down, so all stages agree.
+quantiles = [Q(0.1), Q(0.5), Q(0.9)]
+# A single lead time, written as the ISO 8601 duration "PT36H" (36 hours).
+horizons = [LeadTime.from_string("PT36H")]
+
+# The transforms are listed in the order in which they are printed below.
+# Scaler, Imputer and NaNDropper are all configured with `Exclude("load")`,
+# i.e. a selection built from the name of the target column.
+preprocessing = TransformPipeline(
+    transforms=[
+        # Feature engineering
+        LagsAdder(
+            history_available=timedelta(days=14),
+            horizons=horizons,
+            add_trivial_lags=False,
+            target_column="load",
+            # One explicit lag of 7 days; the fallback offset is 7 days as well.
+            custom_lags=[timedelta(days=7)],
+            lag_fallback_offset=timedelta(days=7),
+        ),
+        CyclicFeaturesAdder(),
+        HolidayFeatureAdder(country_code="NL"),
+        # Standardization
+        Scaler(selection=Exclude("load"), method="standard"),
+        EmptyFeatureRemover(),
+        # Missing value handling
+        Imputer(selection=Exclude("load"), imputation_strategy="mean"),
+        NaNDropper(selection=Exclude("load")),
+    ]
+)
+
+# Print the class name of every configured step as a quick sanity check.
+print(f"Preprocessing steps: {len(preprocessing.transforms)}")
+for t in preprocessing.transforms:
+    print(f" - {type(t).__name__}")
+
+# %% [markdown]
+# ### Forecaster
+#
+# We use `GBLinearForecaster` — a gradient-boosted linear model that works well
+# with the Imputer + NaNDropper preprocessing pattern above.
+
+# %%
+from openstef_models.models.forecasting.gblinear_forecaster import (
+    GBLinearForecaster,
+    GBLinearHyperParams,
+)
+
+# Reuses the `quantiles` and `horizons` defined in the preprocessing cell so
+# the forecaster and the LagsAdder are configured with the same lead time.
+forecaster = GBLinearForecaster(
+    quantiles=quantiles,
+    horizons=horizons,
+    hyperparams=GBLinearHyperParams(
+        n_steps=100,
+        learning_rate=0.3,
+    ),
+    # NOTE(review): presumably silences training output; confirm against the API docs.
+    verbosity=0,
+)
+
+# %% [markdown]
+# ### Postprocessing
+#
+# We add a [`QuantileSorter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.QuantileSorter.html) (ensures quantile ordering) and a
+# [`ConfidenceIntervalApplicator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.ConfidenceIntervalApplicator.html) (adds confidence interval columns).
+
+# %%
+from openstef_models.transforms.postprocessing import (
+    ConfidenceIntervalApplicator,
+    QuantileSorter,
+)
+
+# Postprocessing uses the same `TransformPipeline` container as preprocessing;
+# the sorter is listed first, the interval applicator second.
+postprocessing = TransformPipeline(
+    transforms=[
+        QuantileSorter(),
+        ConfidenceIntervalApplicator(
+            quantiles=quantiles,
+            add_quantiles_from_std=False,
+        ),
+    ]
+)
+
+# %% [markdown]
+# ## Assemble the model
+#
+# [`ForecastingModel`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.ForecastingModel.html) combines all three stages. We wrap it in a
+# [`CustomForecastingWorkflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.workflows.custom_forecasting_workflow.CustomForecastingWorkflow.html) which adds train/predict orchestration.
+
+# %%
+from openstef_models.models.forecasting_model import ForecastingModel
+from openstef_models.workflows import CustomForecastingWorkflow
+
+# Wire the three stages built in the previous cells into one model. The
+# `target_column` is the same "load" column used by the transforms.
+model = ForecastingModel(
+    preprocessing=preprocessing,
+    forecaster=forecaster,
+    postprocessing=postprocessing,
+    target_column="load",
+)
+
+# No callbacks are registered for this demo (empty list).
+workflow = CustomForecastingWorkflow(
+    model_id="custom_pipeline_demo",
+    model=model,
+    callbacks=[],
+)
+
+# %% [markdown]
+# ## Train and predict
+
+# %%
+# `result` keeps the return value of `fit`; it is not read again in this tutorial.
+result = workflow.fit(train_dataset)
+# Forecasting starts at `train_end`; `predict_dataset` also contains the
+# 14 days before that timestamp.
+forecast = workflow.predict(predict_dataset, forecast_start=train_end)
+
+print(f"Forecast rows: {len(forecast.data)}")
+print(f"Columns: {list(forecast.data.columns)}")
+
+# %% tags=["remove-cell"]
+# Smoke check for executed docs builds; the cell carries the `remove-cell` tag.
+assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}"
+
+# %% [markdown]
+# ## Visualize the result
+
+# %% tags=["hide-input"]
+from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter
+
+# Only measurements from `train_end` onwards are plotted, so the history
+# portion of `predict_dataset` is left out of the figure.
+fig = (
+    ForecastTimeSeriesPlotter()
+    .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:])
+    .add_model(
+        model_name="Custom GBLinear",
+        forecast=forecast.median_series,
+        quantiles=forecast.quantiles_data,
+    )
+    .plot()
+)
+fig.update_layout(
+    title="Custom pipeline — forecast vs actuals",
+    yaxis_title="Load (MW)",
+    xaxis_title="Time",
+    height=450,
+)
+fig.show()
+
+# %% [markdown]
+# ## Using components individually
+#
+# `ForecastingModel` is convenient, but every component also works on its
+# own. You can run the preprocessing pipeline, inspect intermediate data,
+# and call the forecaster directly.
+
+# %% [markdown]
+# ### Run preprocessing on raw data
+
+# %%
+# `transform` is called without a preceding `fit` here; presumably the pipeline
+# was already fitted by `workflow.fit(train_dataset)` in the training cell.
+preprocessed = model.preprocessing.transform(train_dataset)
+
+print(f"Before preprocessing: {train_dataset.data.shape[1]} columns")
+print(f"After preprocessing: {preprocessed.data.shape[1]} columns")
+# Show at most the first eight (alphabetically sorted) column names that exist
+# only in the preprocessed frame.
+print(f"\nAdded features: {sorted(set(preprocessed.data.columns) - set(train_dataset.data.columns))[:8]}...")
+
+# %% [markdown]
+# ### Run a single transform
+
+# %%
+# A fresh, unfitted instance, independent of the one inside `preprocessing`.
+single_transform = CyclicFeaturesAdder()
+single_transform.fit(train_dataset)
+# `result_single` holds the transformed dataset; only the names reported by
+# `features_added()` are printed below.
+result_single = single_transform.transform(train_dataset)
+
+print(
+    f"CyclicFeaturesAdder added {len(single_transform.features_added())} columns: {single_transform.features_added()}"
+)
+
+# %% [markdown]
+# ### Call the forecaster directly
+#
+# After preprocessing, you can pass the data to a [`ForecastInputDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastInputDataset.html)
+# and call the forecaster directly.
+# This is useful for debugging or integrating into custom workflows.
+
+# %%
+from openstef_core.datasets import ForecastInputDataset
+
+# Preprocess the prediction data
+preprocessed_predict = model.preprocessing.transform(predict_dataset)
+
+# Convert to ForecastInputDataset (what the forecaster expects)
+forecast_input = ForecastInputDataset(
+    data=preprocessed_predict.data,
+    sample_interval=preprocessed_predict.sample_interval,
+    target_column="load",
+    forecast_start=train_end,
+)
+
+# Call the forecaster directly. This bypasses the postprocessing stage, so the
+# output is the forecaster's raw prediction.
+raw_forecast = model.forecaster.predict(forecast_input)
+print(f"Raw forecast shape: {raw_forecast.data.shape}")
+print(f"Raw forecast columns: {list(raw_forecast.data.columns)}")
+
+# %% [markdown]
+# ## Next steps
+#
+# - {doc}`ensemble_forecasting` — combine your custom pipeline with other
+#   models into an ensemble for better accuracy.
+# - {doc}`quantile_calibration` — append isotonic calibration to your
+#   postprocessing for more reliable confidence intervals.
diff --git a/examples/tutorials/ensemble_forecasting.ipynb b/examples/tutorials/ensemble_forecasting.ipynb new file mode 100644 index 000000000..af7d492e4 --- /dev/null +++ b/examples/tutorials/ensemble_forecasting.ipynb @@ -0,0 +1,536 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d87cffae", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c12c212", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " \"lightgbm\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e283b926", + "metadata": {}, + "source": [ + "# Ensemble Forecasting\n", + "\n", + "OpenSTEF supports ensemble models that combine multiple base forecasters\n", + "into a single prediction. 
A **combiner** learns which base model performs\n", + "best under different conditions and weights their outputs accordingly.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Why combining tree-based and linear models improves forecasts\n", + "- How to configure and train an ensemble with [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)\n", + "- How to inspect combiner behavior (which model does it prefer?)\n", + "- How ensemble predictions compare to individual base models\n", + "\n", + "```{note}\n", + "This tutorial uses a small dataset for fast execution.\n", + "See `examples/benchmarks/` for production-scale ensemble runs.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html)\n", + "· [`create_ensemble_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.create_ensemble_forecasting_workflow.html)\n", + "· [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)" + ] + }, + { + "cell_type": "markdown", + "id": "4dbf4b68", + "metadata": {}, + "source": [ + "## Why ensemble forecasting?\n", + "\n", + "Different model types have complementary strengths:\n", + "\n", + "| Model | Strengths | Weaknesses |\n", + "|-------|-----------|------------|\n", + "| **Tree-based** (LightGBM, XGBoost) | Captures complex non-linear patterns, handles feature interactions well | Poor extrapolation — struggles with unseen peaks, seasonal shifts, or values outside training range |\n", + "| **Linear** (GBLinear) | Extrapolates naturally to new ranges, captures seasonal/solar trends | Cannot model non-linear interactions |\n", + "\n", + "In energy forecasting, load peaks during extreme weather or 
seasonal\n", + "transitions often fall outside the training distribution. A tree-based\n", + "model underestimates these peaks while a linear model captures the trend\n", + "but misses finer patterns. An **ensemble** combines both: the combiner\n", + "learns *when* each model is more reliable and weights accordingly.\n", + "\n", + "## How it works\n", + "\n", + "An ensemble workflow has three layers:\n", + "\n", + "1. **Common preprocessing** — shared feature engineering (lags, holidays,\n", + " cyclic features, scaling) applied once to raw data.\n", + "2. **Base forecasters** — multiple models each trained on the preprocessed\n", + " data, with optional per-model transforms (e.g. GBLinear gets fewer lags\n", + " to avoid collinearity).\n", + "3. **Combiner** — learns to aggregate base forecaster outputs. Two modes:\n", + " - *Learned weights*: a classifier predicts which base model will perform\n", + " best for each sample, then weights predictions accordingly.\n", + " - *Stacking*: a meta-regressor trained on base model outputs per quantile." 
+ ] + }, + { + "cell_type": "markdown", + "id": "3725ade6", + "metadata": {}, + "source": [ + "## Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f5936af", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "from openstef_core.types import LeadTime, Q\n", + "\n", + "dataset = load_liander_dataset()\n", + "\n", + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "predict_dataset = dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "print(f\"Training: {train_dataset.data.shape[0]:,} rows\")\n", + "print(f\"Predict: {predict_dataset.data.shape[0]:,} rows\")" + ] + }, + { + "cell_type": "markdown", + "id": "ac238dc8", + "metadata": {}, + "source": [ + "## Configure the ensemble\n", + "\n", + "[`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) sets up the full pipeline.\n", + "Key parameters:\n", + "\n", + "- `base_models` — which forecasters to include\n", + "- `ensemble_type` — how to combine them (`\"learned_weights\"` or `\"stacking\"`)\n", + "- `combiner_model` — the algorithm used by the combiner" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "791aa085", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_meta.presets import EnsembleForecastingWorkflowConfig, create_ensemble_forecasting_workflow\n", + "\n", + "ensemble_config = EnsembleForecastingWorkflowConfig(\n", + " model_id=\"ensemble_demo\",\n", + " # Ensemble architecture\n", + " ensemble_type=\"learned_weights\",\n", + " 
base_models=[\"lgbm\", \"gblinear\"],\n", + " combiner_model=\"lgbm\",\n", + " # Forecast settings\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", + " # Data columns\n", + " target_column=\"load\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " # Disable MLFlow for tutorial\n", + " mlflow_storage=None,\n", + ")\n", + "\n", + "print(f\"Base models: {list(ensemble_config.base_models)}\")\n", + "print(f\"Ensemble type: {ensemble_config.ensemble_type}\")\n", + "print(f\"Combiner: {ensemble_config.combiner_model}\")" + ] + }, + { + "cell_type": "markdown", + "id": "52685e3b", + "metadata": {}, + "source": [ + "## Create and train the ensemble workflow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "355268ad", + "metadata": {}, + "outputs": [], + "source": [ + "workflow = create_ensemble_forecasting_workflow(ensemble_config)\n", + "fit_result = workflow.fit(train_dataset)\n", + "\n", + "print(\"Ensemble trained successfully\")\n", + "print(\"\\nPer-model validation R2:\")\n", + "for name, child_result in fit_result.component_fit_results.items():\n", + " r2 = child_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n", + " print(f\" {name:12s}: {r2:.4f}\")\n", + "\n", + "# Get combiner (ensemble-level) R2\n", + "ensemble_r2 = fit_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n", + "print(f\" {'ensemble':12s}: {ensemble_r2:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ab6bba0", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert ensemble_r2 is not None and ensemble_r2 > 0.0, f\"Expected positive R2, got {ensemble_r2}\"" + ] + }, + { + "cell_type": "markdown", + "id": "08ab509c", 
+ "metadata": {}, + "source": [ + "## Generate forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cf50a50", + "metadata": {}, + "outputs": [], + "source": [ + "forecast = workflow.predict(predict_dataset, forecast_start=train_end)\n", + "\n", + "print(f\"Forecast rows: {len(forecast.data):,}\")\n", + "print(f\"Quantiles: {[float(q) for q in forecast.quantiles]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1f3f30", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "1a52947e", + "metadata": {}, + "source": [ + "## Compare: ensemble vs individual base models\n", + "\n", + "To show the benefit of ensembling, let's also train each base model\n", + "individually and compare their forecasts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d47ceef4", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", + "\n", + "individual_forecasts = {}\n", + "for model_type in [\"lgbm\", \"gblinear\"]:\n", + " config = ForecastingWorkflowConfig(\n", + " model_id=f\"{model_type}_solo\",\n", + " model=model_type,\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", + " target_column=\"load\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " mlflow_storage=None,\n", + " verbosity=0,\n", + " )\n", + " wf = create_forecasting_workflow(config)\n", + " wf.fit(train_dataset)\n", + " individual_forecasts[model_type] = wf.predict(predict_dataset, forecast_start=train_end)" + ] + }, + { + 
"cell_type": "markdown", + "id": "002117a5", + "metadata": {}, + "source": [ + "## Visualize the comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cfea55d", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", + "\n", + "plotter = ForecastTimeSeriesPlotter()\n", + "plotter.add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n", + "\n", + "# Add individual models\n", + "for name, fc in individual_forecasts.items():\n", + " plotter.add_model(model_name=name, forecast=fc.median_series, quantiles=fc.quantiles_data)\n", + "\n", + "# Add ensemble\n", + "plotter.add_model(model_name=\"Ensemble\", forecast=forecast.median_series, quantiles=forecast.quantiles_data)\n", + "\n", + "fig = plotter.plot()\n", + "fig.update_layout(\n", + " title=\"Ensemble vs Individual Models\",\n", + " xaxis_title=\"Time\",\n", + " yaxis_title=\"MW\",\n", + " height=450,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8f632fa8", + "metadata": {}, + "source": [ + "## Combiner insights\n", + "\n", + "The learned-weights combiner trains a classifier that predicts — for each\n", + "timestep — which base model will be most accurate. It then uses those\n", + "predicted probabilities as mixing weights.\n", + "\n", + "We can inspect this behavior at two levels:\n", + "\n", + "1. **Global feature importances** — which input signals the classifier\n", + " relies on most when deciding between models.\n", + "2. **Per-timestamp selection weights** — the actual mixing probabilities\n", + " assigned to each model during forecasting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d2093d0", + "metadata": {}, + "outputs": [], + "source": [ + "import plotly.graph_objects as go\n", + "\n", + "ensemble_model = workflow.model\n", + "combiner = ensemble_model.combiner\n", + "\n", + "# Global importances: shows how much the classifier attends to each base\n", + "# model's prediction value when deciding which model to trust.\n", + "importances = combiner.feature_importances\n", + "print(\"Combiner feature importances (per quantile):\")\n", + "print(importances.to_string())" + ] + }, + { + "cell_type": "markdown", + "id": "d0f83593", + "metadata": {}, + "source": [ + "With two base models, the feature importance tells us how much the\n", + "combiner's internal classifier *uses* each model's prediction to decide\n", + "who should contribute more. A higher importance means the classifier pays\n", + "more attention to that model's output when making the selection decision.\n", + "\n", + "More informative is the **per-timestamp weight** — the actual probability\n", + "the combiner assigns to each model at each point in time during forecasting." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cde7d87", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "# Reproduce the internal flow to extract per-timestamp weights\n", + "ensemble_dataset = ensemble_model._predict_forecasters(predict_dataset, forecast_start=train_end)\n", + "base_preds = ensemble_dataset.get_base_predictions_for_quantile(Q(0.5))\n", + "input_data = base_preds.input_data()\n", + "weights = combiner._predict_weights(input_data, Q(0.5))\n", + "\n", + "fig = go.Figure()\n", + "for col in weights.columns:\n", + " fig.add_trace(go.Scatter(x=weights.index, y=weights[col], mode=\"lines\", name=col, stackgroup=\"one\"))\n", + "fig.update_layout(\n", + " title=\"Combiner Model Selection Weights Over Time (q50)\",\n", + " xaxis_title=\"Time\",\n", + " yaxis_title=\"Weight (probability)\",\n", + " yaxis_range=[0, 1],\n", + " height=350,\n", + " legend_title=\"Base model\",\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "2c8cb8de", + "metadata": {}, + "source": [ + "The stacked area chart reveals *when* the combiner trusts each model.\n", + "Typical patterns:\n", + "\n", + "- **gblinear dominates at peaks/troughs** — its linear extrapolation\n", + " handles values near or beyond the training range better.\n", + "- **lgbm dominates during stable periods** — its tree-based flexibility\n", + " captures non-linear patterns (time-of-day effects, weather interactions)\n", + " more accurately when extrapolation is not needed.\n", + "\n", + "This adaptive selection is the core advantage of ensembling: neither model\n", + "alone achieves the accuracy of the dynamically-weighted combination." + ] + }, + { + "cell_type": "markdown", + "id": "0a012679", + "metadata": {}, + "source": [ + "## Metrics comparison\n", + "\n", + "Let's quantify the ensemble advantage with relative MAE (rMAE) on the\n", + "forecast period. 
rMAE normalizes the MAE by the range of actuals, making it\n", + "easier to compare across datasets with different scales. We use the\n", + "implementation from `openstef_beam.metrics`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b327d1e", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from openstef_beam.metrics import rmae\n", + "\n", + "actuals = predict_dataset.data[\"load\"].loc[train_end:forecast_end]\n", + "\n", + "models = {\"lgbm\": individual_forecasts[\"lgbm\"], \"gblinear\": individual_forecasts[\"gblinear\"], \"Ensemble\": forecast}\n", + "\n", + "print(f\"{'Model':<12} {'rMAE':>8}\")\n", + "print(f\"{'':-<12} {'':-^8}\")\n", + "for name, fc in models.items():\n", + " common = actuals.index.intersection(fc.median_series.index)\n", + " print(f\"{name:<12} {rmae(actuals.loc[common].to_numpy(), fc.median_series.loc[common].to_numpy()):>8.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "afc3f9ca", + "metadata": {}, + "source": [ + "The ensemble usually achieves the lowest rMAE by combining the\n", + "strengths of both models. In production with longer training windows and\n", + "more diverse base models (e.g. adding XGBoost or a neural forecaster),\n", + "the improvement typically grows larger. To validate ensemble gains over\n", + "longer periods, run a full {doc}`backtesting_quickstart`." + ] + }, + { + "cell_type": "markdown", + "id": "fc0bae78", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`hyperparameter_tuning_with_optuna` — tune each base model's\n", + " parameters before combining them.\n", + "- {doc}`quantile_calibration` — calibrate the ensemble's uncertainty\n", + " estimates for more reliable confidence intervals." 
+ ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/ensemble_forecasting.py b/examples/tutorials/ensemble_forecasting.py new file mode 100644 index 000000000..065003205 --- /dev/null +++ b/examples/tutorials/ensemble_forecasting.py @@ -0,0 +1,349 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% tags=["remove-cell"] +import warnings + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + "lightgbm", + ), +) + +# %% [markdown] +# # Ensemble Forecasting +# +# OpenSTEF supports ensemble models that combine multiple base forecasters +# into a single prediction. A **combiner** learns which base model performs +# best under different conditions and weights their outputs accordingly. +# +# **What you'll learn:** +# +# - Why combining tree-based and linear models improves forecasts +# - How to configure and train an ensemble with [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) +# - How to inspect combiner behavior (which model does it prefer?) 
+# - How ensemble predictions compare to individual base models +# +# ```{note} +# This tutorial uses a small dataset for fast execution. +# See `examples/benchmarks/` for production-scale ensemble runs. +# ``` +# +# **Key API references:** +# [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) +# · [`create_ensemble_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.create_ensemble_forecasting_workflow.html) +# · [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) + +# %% [markdown] +# ## Why ensemble forecasting? +# +# Different model types have complementary strengths: +# +# | Model | Strengths | Weaknesses | +# |-------|-----------|------------| +# | **Tree-based** (LightGBM, XGBoost) | Captures complex non-linear patterns, handles feature interactions well | Poor extrapolation — struggles with unseen peaks, seasonal shifts, or values outside training range | +# | **Linear** (GBLinear) | Extrapolates naturally to new ranges, captures seasonal/solar trends | Cannot model non-linear interactions | +# +# In energy forecasting, load peaks during extreme weather or seasonal +# transitions often fall outside the training distribution. A tree-based +# model underestimates these peaks while a linear model captures the trend +# but misses finer patterns. An **ensemble** combines both: the combiner +# learns *when* each model is more reliable and weights accordingly. +# +# ## How it works +# +# An ensemble workflow has three layers: +# +# 1. **Common preprocessing** — shared feature engineering (lags, holidays, +# cyclic features, scaling) applied once to raw data. +# 2. **Base forecasters** — multiple models each trained on the preprocessed +# data, with optional per-model transforms (e.g. 
GBLinear gets fewer lags +# to avoid collinearity). +# 3. **Combiner** — learns to aggregate base forecaster outputs. Two modes: +# - *Learned weights*: a classifier predicts which base model will perform +# best for each sample, then weights predictions accordingly. +# - *Stacking*: a meta-regressor trained on base model outputs per quantile. + +# %% [markdown] +# ## Load the dataset + +# %% +from datetime import datetime, timedelta + +from openstef_core.testing import load_liander_dataset +from openstef_core.types import LeadTime, Q + +dataset = load_liander_dataset() + +train_start = datetime.fromisoformat("2024-03-01T00:00:00Z") +train_end = train_start + timedelta(days=45) +forecast_end = train_end + timedelta(days=7) + +train_dataset = dataset.filter_by_range(start=train_start, end=train_end) +predict_dataset = dataset.filter_by_range( + start=train_end - timedelta(days=14), + end=forecast_end, +) + +print(f"Training: {train_dataset.data.shape[0]:,} rows") +print(f"Predict: {predict_dataset.data.shape[0]:,} rows") + +# %% [markdown] +# ## Configure the ensemble +# +# [`EnsembleForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_meta.presets.EnsembleForecastingWorkflowConfig.html) sets up the full pipeline. 
+# Key parameters: +# +# - `base_models` — which forecasters to include +# - `ensemble_type` — how to combine them (`"learned_weights"` or `"stacking"`) +# - `combiner_model` — the algorithm used by the combiner + +# %% +from openstef_meta.presets import EnsembleForecastingWorkflowConfig, create_ensemble_forecasting_workflow + +ensemble_config = EnsembleForecastingWorkflowConfig( + model_id="ensemble_demo", + # Ensemble architecture + ensemble_type="learned_weights", + base_models=["lgbm", "gblinear"], + combiner_model="lgbm", + # Forecast settings + horizons=[LeadTime.from_string("PT36H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], + # Data columns + target_column="load", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + # Disable MLFlow for tutorial + mlflow_storage=None, +) + +print(f"Base models: {list(ensemble_config.base_models)}") +print(f"Ensemble type: {ensemble_config.ensemble_type}") +print(f"Combiner: {ensemble_config.combiner_model}") + +# %% [markdown] +# ## Create and train the ensemble workflow + +# %% +workflow = create_ensemble_forecasting_workflow(ensemble_config) +fit_result = workflow.fit(train_dataset) + +print("Ensemble trained successfully") +print("\nPer-model validation R2:") +for name, child_result in fit_result.component_fit_results.items(): + r2 = child_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2") + print(f" {name:12s}: {r2:.4f}") + +# Get combiner (ensemble-level) R2 +ensemble_r2 = fit_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2") +print(f" {'ensemble':12s}: {ensemble_r2:.4f}") + +# %% tags=["remove-cell"] +assert ensemble_r2 is not None and ensemble_r2 > 0.0, f"Expected positive R2, got {ensemble_r2}" + +# %% [markdown] +# ## Generate forecasts + +# %% +forecast = workflow.predict(predict_dataset, forecast_start=train_end) + +print(f"Forecast 
rows: {len(forecast.data):,}") +print(f"Quantiles: {[float(q) for q in forecast.quantiles]}") + +# %% tags=["remove-cell"] +assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}" + +# %% [markdown] +# ## Compare: ensemble vs individual base models +# +# To show the benefit of ensembling, let's also train each base model +# individually and compare their forecasts. + +# %% +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow + +individual_forecasts = {} +for model_type in ["lgbm", "gblinear"]: + config = ForecastingWorkflowConfig( + model_id=f"{model_type}_solo", + model=model_type, + horizons=[LeadTime.from_string("PT36H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], + target_column="load", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + mlflow_storage=None, + verbosity=0, + ) + wf = create_forecasting_workflow(config) + wf.fit(train_dataset) + individual_forecasts[model_type] = wf.predict(predict_dataset, forecast_start=train_end) + +# %% [markdown] +# ## Visualize the comparison + +# %% tags=["hide-input"] +from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter + +plotter = ForecastTimeSeriesPlotter() +plotter.add_measurements(measurements=predict_dataset.data["load"].loc[train_end:]) + +# Add individual models +for name, fc in individual_forecasts.items(): + plotter.add_model(model_name=name, forecast=fc.median_series, quantiles=fc.quantiles_data) + +# Add ensemble +plotter.add_model(model_name="Ensemble", forecast=forecast.median_series, quantiles=forecast.quantiles_data) + +fig = plotter.plot() +fig.update_layout( + title="Ensemble vs Individual Models", + xaxis_title="Time", + yaxis_title="MW", + height=450, +) +fig.show() + +# %% [markdown] +# ## Combiner insights +# +# The learned-weights combiner trains a classifier 
that predicts — for each +# timestep — which base model will be most accurate. It then uses those +# predicted probabilities as mixing weights. +# +# We can inspect this behavior at two levels: +# +# 1. **Global feature importances** — which input signals the classifier +# relies on most when deciding between models. +# 2. **Per-timestamp selection weights** — the actual mixing probabilities +# assigned to each model during forecasting. + +# %% +import plotly.graph_objects as go + +ensemble_model = workflow.model +combiner = ensemble_model.combiner + +# Global importances: shows how much the classifier attends to each base +# model's prediction value when deciding which model to trust. +importances = combiner.feature_importances +print("Combiner feature importances (per quantile):") +print(importances.to_string()) + +# %% [markdown] +# With two base models, the feature importance tells us how much the +# combiner's internal classifier *uses* each model's prediction to decide +# who should contribute more. A higher importance means the classifier pays +# more attention to that model's output when making the selection decision. +# +# More informative is the **per-timestamp weight** — the actual probability +# the combiner assigns to each model at each point in time during forecasting. 
+ +# %% tags=["hide-input"] +# Reproduce the internal flow to extract per-timestamp weights +ensemble_dataset = ensemble_model._predict_forecasters(predict_dataset, forecast_start=train_end) +base_preds = ensemble_dataset.get_base_predictions_for_quantile(Q(0.5)) +input_data = base_preds.input_data() +weights = combiner._predict_weights(input_data, Q(0.5)) + +fig = go.Figure() +for col in weights.columns: + fig.add_trace(go.Scatter(x=weights.index, y=weights[col], mode="lines", name=col, stackgroup="one")) +fig.update_layout( + title="Combiner Model Selection Weights Over Time (q50)", + xaxis_title="Time", + yaxis_title="Weight (probability)", + yaxis_range=[0, 1], + height=350, + legend_title="Base model", +) +fig.show() + +# %% [markdown] +# The stacked area chart reveals *when* the combiner trusts each model. +# Typical patterns: +# +# - **gblinear dominates at peaks/troughs** — its linear extrapolation +# handles values near or beyond the training range better. +# - **lgbm dominates during stable periods** — its tree-based flexibility +# captures non-linear patterns (time-of-day effects, weather interactions) +# more accurately when extrapolation is not needed. +# +# This adaptive selection is the core advantage of ensembling: neither model +# alone achieves the accuracy of the dynamically-weighted combination. + +# %% [markdown] +# ## Metrics comparison +# +# Let's quantify the ensemble advantage with relative MAE (rMAE) on the +# forecast period. rMAE normalizes the MAE by the range of actuals, making it +# easier to compare across datasets with different scales. We use the +# implementation from `openstef_beam.metrics`. 
+ +# %% + +from openstef_beam.metrics import rmae + +actuals = predict_dataset.data["load"].loc[train_end:forecast_end] + +models = {"lgbm": individual_forecasts["lgbm"], "gblinear": individual_forecasts["gblinear"], "Ensemble": forecast} + +print(f"{'Model':<12} {'rMAE':>8}") +print(f"{'':-<12} {'':-^8}") +for name, fc in models.items(): + common = actuals.index.intersection(fc.median_series.index) + print(f"{name:<12} {rmae(actuals.loc[common].to_numpy(), fc.median_series.loc[common].to_numpy()):>8.4f}") + +# %% [markdown] +# The ensemble usually achieves the lowest rMAE by combining the +# strengths of both models. In production with longer training windows and +# more diverse base models (e.g. adding XGBoost or a neural forecaster), +# the improvement typically grows larger. To validate ensemble gains over +# longer periods, run a full {doc}`backtesting_quickstart`. + +# %% [markdown] +# ## Next steps +# +# - {doc}`hyperparameter_tuning_with_optuna` — tune each base model's +# parameters before combining them. +# - {doc}`quantile_calibration` — calibrate the ensemble's uncertainty +# estimates for more reliable confidence intervals. 
diff --git a/examples/tutorials/forecasting_quickstart.ipynb b/examples/tutorials/forecasting_quickstart.ipynb new file mode 100644 index 000000000..c9379ae3e --- /dev/null +++ b/examples/tutorials/forecasting_quickstart.ipynb @@ -0,0 +1,359 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "bff4101e", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "markdown", + "id": "eb73d789", + "metadata": {}, + "source": [ + "# Forecasting Quickstart\n", + "\n", + "Train a GBLinear model on real energy data and generate probabilistic forecasts\n", + "with confidence intervals — all in under a minute.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Load the Liander 2024 benchmark dataset\n", + "- Configure a forecasting workflow with `ForecastingWorkflowConfig`\n", + "- Train a model and inspect evaluation metrics\n", + "- Generate quantile forecasts (P10 / P50 / P90)\n", + "- Visualize predictions against actuals\n", + "\n", + "```{note}\n", + "This tutorial uses a small data slice for fast execution.\n", + "See `examples/benchmarks/` for production-scale runs.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)\n", + "· [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html)\n", + "· [`LeadTime`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.LeadTime.html)\n", + "· [`Q`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebcdecfb", + "metadata": { + "tags": [ + 
"remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "from typing import Any, cast\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6a1838eb", + "metadata": {}, + "source": [ + "## Load the dataset\n", + "\n", + "The [Liander 2024 benchmark](https://huggingface.co/datasets/Alliander/MSL_Benchmark_Dataset)\n", + "dataset contains load measurements, versioned weather forecasts, EPEX prices, and\n", + "load profiles for a medium-voltage feeder in the Netherlands.\n", + "\n", + "We split the data into:\n", + "\n", + "- **45 days** of training data\n", + "- **7 days** for forecasting\n", + "\n", + "The predict window includes **14 days of history** before the forecast start so\n", + "that lag features (e.g. `load_lag_P7D`) can be computed during prediction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac530157", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "\n", + "dataset = load_liander_dataset()\n", + "\n", + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "\n", + "# Include 14 days of history before forecast start for lag feature computation\n", + "predict_dataset = dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "print(\n", + " f\"Training: {train_dataset.data.shape[0]:,} rows, \"\n", + " f\"{train_dataset.data.index.min():%Y-%m-%d} to {train_dataset.data.index.max():%Y-%m-%d}\"\n", + ")\n", + "print(\n", + " f\"Predict: {predict_dataset.data.shape[0]:,} rows, \"\n", + " f\"{predict_dataset.data.index.min():%Y-%m-%d} to {predict_dataset.data.index.max():%Y-%m-%d}\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "810b4ff7", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "# Quick look at the target variable\n", + "fig = cast(Any, train_dataset.data[[\"load\"]].plot(title=\"Training period — energy load\"))\n", + "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d890079d", + "metadata": {}, + "source": [ + "## Configure the workflow\n", + "\n", + "[`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) bundles all settings — model type, horizons, quantiles,\n", + "and feature columns — into a single object. 
[`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) turns\n", + "it into a ready-to-use pipeline with preprocessing, training, and postprocessing.\n", + "\n", + "We pick **GBLinear** (gradient-boosted linear model) for its speed and\n", + "ability to extrapolate beyond training data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79ad62bd", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", + "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n", + "\n", + "workflow = create_forecasting_workflow(\n", + " config=ForecastingWorkflowConfig(\n", + " model_id=\"quickstart_gblinear\",\n", + " model=\"gblinear\",\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", + " target_column=\"load\",\n", + " # Weather features available in the Liander dataset\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " verbosity=0,\n", + " mlflow_storage=None,\n", + " gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "c3d0a581", + "metadata": {}, + "source": [ + "## Train the model\n", + "\n", + "`workflow.fit()` runs the full pipeline: feature engineering, data validation,\n", + "model training, and evaluation on a held-out test split." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b7f49e4", + "metadata": {}, + "outputs": [], + "source": [ + "result = workflow.fit(train_dataset)\n", + "\n", + "if result is not None:\n", + " print(\"Training metrics:\")\n", + " print(result.metrics_full.to_dataframe())\n", + "\n", + " if result.metrics_test is not None:\n", + " print(\"\\nTest-set metrics:\")\n", + " print(result.metrics_test.to_dataframe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4f6ea9e", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert result is not None, \"Training should produce a result\"\n", + "assert result.metrics_full is not None, \"Full metrics should be present\"" + ] + }, + { + "cell_type": "markdown", + "id": "d1d35a3d", + "metadata": {}, + "source": [ + "## Generate forecasts\n", + "\n", + "The trained workflow produces a [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) with point predictions and\n", + "quantile bands. The P10-P90 interval covers 80 % of expected outcomes.\n", + "To improve the reliability of these quantile estimates, see\n", + "{doc}`quantile_calibration`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "476f2918", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_core.datasets import ForecastDataset\n", + "\n", + "forecast: ForecastDataset = workflow.predict(predict_dataset, forecast_start=train_end)\n", + "\n", + "print(f\"Forecast rows: {len(forecast.data)}\")\n", + "print(f\"Quantiles: {forecast.quantiles}\")\n", + "forecast.data.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d892621d", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert len(forecast.data) > 100, f\"Expected >100 forecast rows, got {len(forecast.data)}\"\n", + "assert forecast.quantiles is not None, \"Quantile data should be present\"" + ] + }, + { + "cell_type": "markdown", + "id": "33f6f4dd", + "metadata": {}, + "source": [ + "## Visualize the results\n", + "\n", + "[`ForecastTimeSeriesPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.analysis.plots.ForecastTimeSeriesPlotter.html) overlays measurements and predictions with shaded\n", + "confidence bands in a single interactive chart." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6cc34a1", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", + "\n", + "fig = (\n", + " ForecastTimeSeriesPlotter()\n", + " .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n", + " .add_model(\n", + " model_name=\"GBLinear\",\n", + " forecast=forecast.median_series,\n", + " quantiles=forecast.quantiles_data,\n", + " )\n", + " .plot()\n", + ")\n", + "fig.update_layout(\n", + " title=\"Forecast vs actuals\",\n", + " yaxis_title=\"Load (MW)\",\n", + " xaxis_title=\"Time\",\n", + " height=500,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "96fe385b", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`backtesting_quickstart` — evaluate how this model performs on\n", + " historical data with realistic temporal constraints.\n", + "- {doc}`custom_pipeline` — build a model from individual transforms when\n", + " presets don't cover your use case." 
+ ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/forecasting_quickstart.py b/examples/tutorials/forecasting_quickstart.py new file mode 100644 index 000000000..676657948 --- /dev/null +++ b/examples/tutorials/forecasting_quickstart.py @@ -0,0 +1,228 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% [markdown] +# # Forecasting Quickstart +# +# Train a GBLinear model on real energy data and generate probabilistic forecasts +# with confidence intervals — all in under a minute. +# +# **What you'll learn:** +# +# - Load the Liander 2024 benchmark dataset +# - Configure a forecasting workflow with `ForecastingWorkflowConfig` +# - Train a model and inspect evaluation metrics +# - Generate quantile forecasts (P10 / P50 / P90) +# - Visualize predictions against actuals +# +# ```{note} +# This tutorial uses a small data slice for fast execution. +# See `examples/benchmarks/` for production-scale runs. 
+# ``` +# +# **Key API references:** +# [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) +# · [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) +# · [`LeadTime`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.LeadTime.html) +# · [`Q`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.types.Quantile.html) + +# %% tags=["remove-cell"] +import warnings +from typing import Any, cast + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + ), +) + +# %% [markdown] +# ## Load the dataset +# +# The [Liander 2024 benchmark](https://huggingface.co/datasets/Alliander/MSL_Benchmark_Dataset) +# dataset contains load measurements, versioned weather forecasts, EPEX prices, and +# load profiles for a medium-voltage feeder in the Netherlands. +# +# We split the data into: +# +# - **45 days** of training data +# - **7 days** for forecasting +# +# The predict window includes **14 days of history** before the forecast start so +# that lag features (e.g. `load_lag_P7D`) can be computed during prediction. 
+ +# %% +from datetime import datetime, timedelta + +from openstef_core.testing import load_liander_dataset + +dataset = load_liander_dataset() + +train_start = datetime.fromisoformat("2024-03-01T00:00:00Z") +train_end = train_start + timedelta(days=45) +forecast_end = train_end + timedelta(days=7) + +train_dataset = dataset.filter_by_range(start=train_start, end=train_end) + +# Include 14 days of history before forecast start for lag feature computation +predict_dataset = dataset.filter_by_range( + start=train_end - timedelta(days=14), + end=forecast_end, +) + +print( + f"Training: {train_dataset.data.shape[0]:,} rows, " + f"{train_dataset.data.index.min():%Y-%m-%d} to {train_dataset.data.index.max():%Y-%m-%d}" +) +print( + f"Predict: {predict_dataset.data.shape[0]:,} rows, " + f"{predict_dataset.data.index.min():%Y-%m-%d} to {predict_dataset.data.index.max():%Y-%m-%d}" +) + +# %% tags=["hide-input"] +# Quick look at the target variable +fig = cast(Any, train_dataset.data[["load"]].plot(title="Training period — energy load")) +fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time") +fig.show() + +# %% [markdown] +# ## Configure the workflow +# +# [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) bundles all settings — model type, horizons, quantiles, +# and feature columns — into a single object. [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) turns +# it into a ready-to-use pipeline with preprocessing, training, and postprocessing. +# +# We pick **GBLinear** (gradient-boosted linear model) for its speed and +# ability to extrapolate beyond training data. 
+ +# %% +from openstef_core.types import LeadTime, Q +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow +from openstef_models.presets.forecasting_workflow import GBLinearForecaster + +workflow = create_forecasting_workflow( + config=ForecastingWorkflowConfig( + model_id="quickstart_gblinear", + model="gblinear", + horizons=[LeadTime.from_string("PT36H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], + target_column="load", + # Weather features available in the Liander dataset + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + verbosity=0, + mlflow_storage=None, + gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50), + ) +) + +# %% [markdown] +# ## Train the model +# +# `workflow.fit()` runs the full pipeline: feature engineering, data validation, +# model training, and evaluation on a held-out test split. + +# %% +result = workflow.fit(train_dataset) + +if result is not None: + print("Training metrics:") + print(result.metrics_full.to_dataframe()) + + if result.metrics_test is not None: + print("\nTest-set metrics:") + print(result.metrics_test.to_dataframe()) + +# %% tags=["remove-cell"] +assert result is not None, "Training should produce a result" +assert result.metrics_full is not None, "Full metrics should be present" + +# %% [markdown] +# ## Generate forecasts +# +# The trained workflow produces a [`ForecastDataset`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.datasets.ForecastDataset.html) with point predictions and +# quantile bands. The P10-P90 interval covers 80 % of expected outcomes. +# To improve the reliability of these quantile estimates, see +# {doc}`quantile_calibration`. 
+ +# %% +from openstef_core.datasets import ForecastDataset + +forecast: ForecastDataset = workflow.predict(predict_dataset, forecast_start=train_end) + +print(f"Forecast rows: {len(forecast.data)}") +print(f"Quantiles: {forecast.quantiles}") +forecast.data.tail() + +# %% tags=["remove-cell"] +assert len(forecast.data) > 100, f"Expected >100 forecast rows, got {len(forecast.data)}" +assert forecast.quantiles is not None, "Quantile data should be present" + +# %% [markdown] +# ## Visualize the results +# +# [`ForecastTimeSeriesPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.analysis.plots.ForecastTimeSeriesPlotter.html) overlays measurements and predictions with shaded +# confidence bands in a single interactive chart. + +# %% tags=["hide-input"] +from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter + +fig = ( + ForecastTimeSeriesPlotter() + .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:]) + .add_model( + model_name="GBLinear", + forecast=forecast.median_series, + quantiles=forecast.quantiles_data, + ) + .plot() +) +fig.update_layout( + title="Forecast vs actuals", + yaxis_title="Load (MW)", + xaxis_title="Time", + height=500, +) +fig.show() + +# %% [markdown] +# ## Next steps +# +# - {doc}`backtesting_quickstart` — evaluate how this model performs on +# historical data with realistic temporal constraints. +# - {doc}`custom_pipeline` — build a model from individual transforms when +# presets don't cover your use case. 
diff --git a/examples/tutorials/forecasting_with_workflow_presets.ipynb b/examples/tutorials/forecasting_with_workflow_presets.ipynb deleted file mode 100644 index dd0910ee4..000000000 --- a/examples/tutorials/forecasting_with_workflow_presets.ipynb +++ /dev/null @@ -1,434 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e8eae4f7", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [], - "source": [ - "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", - "#\n", - "# SPDX-License-Identifier: MPL-2.0\n", - "\n", - "# pyright: basic" - ] - }, - { - "cell_type": "markdown", - "id": "20e1bef8", - "metadata": {}, - "source": [ - "# 🔮 Forecasting with OpenSTEF 4.0 Workflow Presets\n", - "\n", - "This tutorial demonstrates how to use **OpenSTEF 4.0** to create energy load forecasts\n", - "using the **Workflow Presets** pattern. You'll learn how to:\n", - "\n", - "1. **Load real-world energy data** from the Liander 2024 benchmark dataset\n", - "2. **Configure a forecasting workflow** with weather features and prediction quantiles\n", - "3. **Train a model** and inspect its performance\n", - "4. **Generate probabilistic forecasts** with confidence intervals\n", - "5. **Visualize results** and explain feature importance\n", - "\n", - "> **OpenSTEF** (Short-Term Energy Forecasting) is a modular library for creating\n", - "> accurate energy forecasts in the power grid domain." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a6b72dc5", - "metadata": {}, - "outputs": [], - "source": [ - "# --- Setup: Logging and Display Configuration ---\n", - "from typing import Any, cast\n", - "\n", - "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", - "\n", - "configure_notebook_display()\n", - "logger = setup_notebook_logging(__name__)" - ] - }, - { - "cell_type": "markdown", - "id": "fdddf593", - "metadata": {}, - "source": [ - "## 📦 Step 1: Download the Dataset\n", - "\n", - "We'll use the **Liander 2024 Energy Forecasting Benchmark** dataset from HuggingFace Hub. This dataset contains:\n", - "- **Load measurements** — historical energy consumption from various installations (mv feeders, transformers, etc.)\n", - "- **Weather forecasts** — versioned weather predictions (temperature, radiation, wind, etc.)\n", - "- **EPEX prices** — day-ahead electricity market prices\n", - "- **Profiles** — typical daily/weekly load patterns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3a97b4a1", - "metadata": {}, - "outputs": [], - "source": [ - "# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.\n", - "from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets\n", - "\n", - "dataset = load_liander_dataset()\n", - "\n", - "print(f\"Dataset shape: {dataset.data.shape}\")\n", - "print(f\"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}\")\n", - "dataset.data.head()" - ] - }, - { - "cell_type": "markdown", - "id": "c1f4660d", - "metadata": {}, - "source": [ - "## ✂️ Step 3: Split Data into Training and Forecast Periods\n", - "\n", - "We'll use:\n", - "- **90 days** of historical data for training\n", - "- **14 days** as the forecast period (where we'll generate predictions)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95f4a401", - "metadata": {}, - "outputs": [], - "source": 
[ - "# Split the dataset into training (90 days) and forecast (14 days) periods.\n", - "train_dataset, forecast_dataset = prepare_tutorial_datasets()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "162c0fc2", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize the training data\n", - "# The plot shows the 'load' column (energy consumption in MW) over time\n", - "# cast() needed: pandas returns plotly Figure at runtime (backend=\"plotly\") but typed as Axes\n", - "fig = cast(Any, train_dataset.data[[\"load\"]].plot(title=\"Training Data: Energy Load over Time\"))\n", - "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\")\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "id": "6c64db35", - "metadata": {}, - "source": [ - "## ⚙️ Step 4: Configure the Forecasting Workflow\n", - "\n", - "OpenSTEF uses a **ForecastingWorkflowConfig** to define all aspects of the forecasting pipeline:\n", - "- **Model type** — `gblinear` (gradient boosted linear model) or `xgboost`\n", - "- **Forecast horizons** — how far ahead to predict (e.g., 36 hours)\n", - "- **Quantiles** — prediction intervals for probabilistic forecasts\n", - "- **Feature columns** — which weather variables to use\n", - "\n", - "The **GBLinear** model is particularly good for energy forecasting because:\n", - "1. It can extrapolate beyond training data (important for rare events)\n", - "2. It provides interpretable feature importance\n", - "3. 
It's fast to train and predict" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5afd85eb", - "metadata": {}, - "outputs": [], - "source": [ - "# Import workflow components\n", - "from openstef_core.types import LeadTime, Q # LeadTime: forecast horizon, Q: quantile\n", - "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", - "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n", - "\n", - "# Configure the forecasting workflow\n", - "workflow = create_forecasting_workflow(\n", - " config=ForecastingWorkflowConfig(\n", - " # Model identification\n", - " model_id=\"gblinear_demo_v1\",\n", - " model=\"gblinear\", # Use gradient boosted linear model\n", - " # Forecast settings\n", - " horizons=[LeadTime.from_string(\"PT36H\")], # Predict up to 36 hours ahead\n", - " quantiles=[Q(0.5), Q(0.1), Q(0.9)], # Median + 80% prediction interval\n", - " # Target column (what we're predicting)\n", - " target_column=\"load\",\n", - " # Weather feature columns (from the dataset)\n", - " temperature_column=\"temperature_2m\",\n", - " relative_humidity_column=\"relative_humidity_2m\",\n", - " wind_speed_column=\"wind_speed_10m\",\n", - " radiation_column=\"shortwave_radiation\", # Solar radiation\n", - " pressure_column=\"surface_pressure\",\n", - " # Training settings\n", - " verbosity=1, # Show progress during training\n", - " mlflow_storage=None, # Disable MLflow tracking for this demo\n", - " # Model-specific hyperparameters\n", - " gblinear_hyperparams=GBLinearForecaster.HyperParams(\n", - " n_steps=50 # Number of boosting iterations\n", - " ),\n", - " )\n", - ")\n", - "\n", - "print(\"✅ Workflow configured successfully!\")" - ] - }, - { - "cell_type": "markdown", - "id": "293d3e51", - "metadata": {}, - "source": [ - "## 🏋️ Step 5: Train the Model\n", - "\n", - "The workflow's `fit()` method handles the entire training pipeline:\n", - "1. 
**Preprocessing** — feature engineering, data validation, scaling\n", - "2. **Training** — fit the model on historical data\n", - "3. **Evaluation** — compute metrics on training data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ab71aca", - "metadata": {}, - "outputs": [], - "source": [ - "# Train the model on historical data\n", - "logger.info(\"🏋️ Starting model training...\")\n", - "\n", - "result = workflow.fit(train_dataset)\n", - "\n", - "# Display training metrics\n", - "if result is not None:\n", - " logger.info(\"✅ Training complete!\")\n", - " print(\"\\n📊 Training Evaluation Metrics:\")\n", - " print(result.metrics_full.to_dataframe())\n", - "\n", - " if result.metrics_test is not None:\n", - " print(\"\\n📊 Test Set Metrics (held-out validation):\")\n", - " print(result.metrics_test.to_dataframe())" - ] - }, - { - "cell_type": "markdown", - "id": "95f5f18a", - "metadata": {}, - "source": [ - "## 🔮 Step 6: Generate Forecasts\n", - "\n", - "Now we use the trained model to predict energy load for the next 14 days.\n", - "The output is a **ForecastDataset** containing:\n", - "- **Median prediction** (`quantile_P50`)\n", - "- **Lower bound** (`quantile_P10`) — 10th percentile\n", - "- **Upper bound** (`quantile_P90`) — 90th percentile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b341466f", - "metadata": {}, - "outputs": [], - "source": [ - "# Generate probabilistic forecasts for the forecast period\n", - "from openstef_core.datasets import ForecastDataset\n", - "\n", - "logger.info(\"🔮 Generating forecasts...\")\n", - "forecast: ForecastDataset = workflow.predict(forecast_dataset)\n", - "\n", - "# Display forecast summary\n", - "print(f\"\\n📈 Forecast generated for {len(forecast.data)} timestamps\")\n", - "print(f\"📊 Quantiles: {forecast.quantiles}\")\n", - "print(\"\\n🔍 Last 5 forecast values:\")\n", - "print(forecast.data.tail())" - ] - }, - { - "cell_type": "markdown", - "id": "f3c5c3c6", - 
"metadata": {}, - "source": [ - "## 📈 Step 7: Visualize Forecast Results\n", - "\n", - "OpenSTEF-BEAM provides **ForecastTimeSeriesPlotter** for beautiful interactive visualizations:\n", - "- Actual measurements shown as a line\n", - "- Forecast median shown as another line\n", - "- Prediction intervals shown as shaded areas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17801cef", - "metadata": {}, - "outputs": [], - "source": [ - "# Create an interactive forecast visualization\n", - "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", - "\n", - "fig = (\n", - " ForecastTimeSeriesPlotter()\n", - " # Add actual measurements (ground truth)\n", - " .add_measurements(measurements=forecast_dataset.data[\"load\"])\n", - " # Add model predictions with confidence bands\n", - " .add_model(\n", - " model_name=\"GBLinear\",\n", - " forecast=forecast.median_series, # P50 prediction\n", - " quantiles=forecast.quantiles_data, # P10-P90 confidence band\n", - " )\n", - " .plot()\n", - ")\n", - "\n", - "# Update layout for better presentation\n", - "fig.update_layout(\n", - " title=\"🔮 Energy Load Forecast vs Actual\",\n", - " yaxis_title=\"Load (MW)\",\n", - " xaxis_title=\"Time\",\n", - " height=500,\n", - ")\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "id": "6f24ae06", - "metadata": {}, - "source": [ - "## 🔍 Step 8: Explain Feature Importance\n", - "\n", - "Understanding **why** the model makes certain predictions is crucial for trust\n", - "and debugging. GBLinear models provide clear feature importance rankings." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "932b62c8", - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize feature importance using the ExplainableForecaster interface\n", - "from typing import cast\n", - "\n", - "from openstef_models.explainability import ExplainableForecaster\n", - "from openstef_models.models.forecasting_model import ForecastingModel\n", - "\n", - "# The GBLinear model implements ExplainableForecaster, providing feature importance\n", - "forecaster = cast(ForecastingModel, workflow.model).forecaster\n", - "explainable_model = cast(ExplainableForecaster, forecaster)\n", - "\n", - "# Create an interactive treemap of feature importances\n", - "# Larger boxes = more important features\n", - "fig = explainable_model.plot_feature_importances()\n", - "fig.update_layout(title=\"🔍 Feature Importance Treemap\")\n", - "fig.show()" - ] - }, - { - "cell_type": "markdown", - "id": "609b7509", - "metadata": {}, - "source": [ - "## 🔬 Step 9: Visualize Feature Contributions (SHAP)\n", - "\n", - "While feature importance shows **which** features matter overall, **contributions**\n", - "show how each feature pushed the prediction up or down for every individual timestep.\n", - "GBLinear models expose these as SHAP values via `predict_contributions()`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1dddfe95", - "metadata": {}, - "outputs": [], - "source": [ - "# Compute per-timestep feature contributions for the forecast period\n", - "from openstef_models.explainability import ContributionsPlotter\n", - "\n", - "contributions = workflow.model.predict_contributions(forecast_dataset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "539e6156", - "metadata": {}, - "outputs": [], - "source": [ - "# Heatmap: contributions over time with prediction line\n", - "ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d68f4bbc", - "metadata": {}, - "outputs": [], - "source": [ - "# Waterfall: decompose a single timestep's prediction\n", - "ContributionsPlotter.plot_waterfall(contributions, timestep=0, top_n=10).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "19582854", - "metadata": {}, - "outputs": [], - "source": [ - "# Bar chart: mean absolute contribution per feature\n", - "ContributionsPlotter.plot_bar(contributions, top_n=10).show()" - ] - }, - { - "cell_type": "markdown", - "id": "1f53f172", - "metadata": {}, - "source": [ - "---\n", - "\n", - "## 🎯 Summary\n", - "\n", - "In this tutorial, you learned how to:\n", - "\n", - "1. ✅ **Load energy data** from the Liander 2024 benchmark dataset\n", - "2. ✅ **Configure a workflow** with `ForecastingWorkflowConfig`\n", - "3. ✅ **Train a GBLinear model** for probabilistic forecasting\n", - "4. ✅ **Generate forecasts** with confidence intervals\n", - "5. 
✅ **Visualize results** and feature importance\n", - "\n", - "### 🚀 Next Steps\n", - "\n", - "- Try different models: `\"xgboost\"` for more complex patterns\n", - "- Experiment with more quantiles for narrower prediction intervals\n", - "- Use the **backtesting notebook** to evaluate model performance systematically\n", - "- Explore MLflow integration for experiment tracking" - ] - } - ], - "metadata": { - "jupytext": { - "formats": "ipynb,py:percent" - }, - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/examples/tutorials/forecasting_with_workflow_presets.py b/examples/tutorials/forecasting_with_workflow_presets.py deleted file mode 100644 index c1d5526b3..000000000 --- a/examples/tutorials/forecasting_with_workflow_presets.py +++ /dev/null @@ -1,280 +0,0 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:percent -# text_representation: -# extension: .py -# format_name: percent -# format_version: '1.3' -# jupytext_version: 1.19.1 -# kernelspec: -# display_name: .venv -# language: python -# name: python3 -# --- - -# %% tags=["remove-cell"] -# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project -# -# SPDX-License-Identifier: MPL-2.0 - -# pyright: basic - -# %% [markdown] -# # 🔮 Forecasting with OpenSTEF 4.0 Workflow Presets -# -# This tutorial demonstrates how to use **OpenSTEF 4.0** to create energy load forecasts -# using the **Workflow Presets** pattern. You'll learn how to: -# -# 1. **Load real-world energy data** from the Liander 2024 benchmark dataset -# 2. **Configure a forecasting workflow** with weather features and prediction quantiles -# 3. **Train a model** and inspect its performance -# 4. **Generate probabilistic forecasts** with confidence intervals -# 5. 
**Visualize results** and explain feature importance -# -# > **OpenSTEF** (Short-Term Energy Forecasting) is a modular library for creating -# > accurate energy forecasts in the power grid domain. - -# %% -# --- Setup: Logging and Display Configuration --- -from typing import Any, cast - -from openstef_core.testing import configure_notebook_display, setup_notebook_logging - -configure_notebook_display() -logger = setup_notebook_logging(__name__) - -# %% [markdown] -# ## 📦 Step 1: Download the Dataset -# -# We'll use the **Liander 2024 Energy Forecasting Benchmark** dataset from HuggingFace Hub. This dataset contains: -# - **Load measurements** — historical energy consumption from various installations (mv feeders, transformers, etc.) -# - **Weather forecasts** — versioned weather predictions (temperature, radiation, wind, etc.) -# - **EPEX prices** — day-ahead electricity market prices -# - **Profiles** — typical daily/weekly load patterns - -# %% -# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset. -from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets - -dataset = load_liander_dataset() - -print(f"Dataset shape: {dataset.data.shape}") -print(f"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}") -dataset.data.head() - -# %% [markdown] -# ## ✂️ Step 3: Split Data into Training and Forecast Periods -# -# We'll use: -# - **90 days** of historical data for training -# - **14 days** as the forecast period (where we'll generate predictions) - -# %% -# Split the dataset into training (90 days) and forecast (14 days) periods. 
-train_dataset, forecast_dataset = prepare_tutorial_datasets() - -# %% -# Visualize the training data -# The plot shows the 'load' column (energy consumption in MW) over time -# cast() needed: pandas returns plotly Figure at runtime (backend="plotly") but typed as Axes -fig = cast(Any, train_dataset.data[["load"]].plot(title="Training Data: Energy Load over Time")) -fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time") -fig.show() - -# %% [markdown] -# ## ⚙️ Step 4: Configure the Forecasting Workflow -# -# OpenSTEF uses a **ForecastingWorkflowConfig** to define all aspects of the forecasting pipeline: -# - **Model type** — `gblinear` (gradient boosted linear model) or `xgboost` -# - **Forecast horizons** — how far ahead to predict (e.g., 36 hours) -# - **Quantiles** — prediction intervals for probabilistic forecasts -# - **Feature columns** — which weather variables to use -# -# The **GBLinear** model is particularly good for energy forecasting because: -# 1. It can extrapolate beyond training data (important for rare events) -# 2. It provides interpretable feature importance -# 3. 
It's fast to train and predict - -# %% -# Import workflow components -from openstef_core.types import LeadTime, Q # LeadTime: forecast horizon, Q: quantile -from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow -from openstef_models.presets.forecasting_workflow import GBLinearForecaster - -# Configure the forecasting workflow -workflow = create_forecasting_workflow( - config=ForecastingWorkflowConfig( - # Model identification - model_id="gblinear_demo_v1", - model="gblinear", # Use gradient boosted linear model - # Forecast settings - horizons=[LeadTime.from_string("PT36H")], # Predict up to 36 hours ahead - quantiles=[Q(0.5), Q(0.1), Q(0.9)], # Median + 80% prediction interval - # Target column (what we're predicting) - target_column="load", - # Weather feature columns (from the dataset) - temperature_column="temperature_2m", - relative_humidity_column="relative_humidity_2m", - wind_speed_column="wind_speed_10m", - radiation_column="shortwave_radiation", # Solar radiation - pressure_column="surface_pressure", - # Training settings - verbosity=1, # Show progress during training - mlflow_storage=None, # Disable MLflow tracking for this demo - # Model-specific hyperparameters - gblinear_hyperparams=GBLinearForecaster.HyperParams( - n_steps=50 # Number of boosting iterations - ), - ) -) - -print("✅ Workflow configured successfully!") - -# %% [markdown] -# ## 🏋️ Step 5: Train the Model -# -# The workflow's `fit()` method handles the entire training pipeline: -# 1. **Preprocessing** — feature engineering, data validation, scaling -# 2. **Training** — fit the model on historical data -# 3. 
**Evaluation** — compute metrics on training data - -# %% -# Train the model on historical data -logger.info("🏋️ Starting model training...") - -result = workflow.fit(train_dataset) - -# Display training metrics -if result is not None: - logger.info("✅ Training complete!") - print("\n📊 Training Evaluation Metrics:") - print(result.metrics_full.to_dataframe()) - - if result.metrics_test is not None: - print("\n📊 Test Set Metrics (held-out validation):") - print(result.metrics_test.to_dataframe()) - -# %% [markdown] -# ## 🔮 Step 6: Generate Forecasts -# -# Now we use the trained model to predict energy load for the next 14 days. -# The output is a **ForecastDataset** containing: -# - **Median prediction** (`quantile_P50`) -# - **Lower bound** (`quantile_P10`) — 10th percentile -# - **Upper bound** (`quantile_P90`) — 90th percentile - -# %% -# Generate probabilistic forecasts for the forecast period -from openstef_core.datasets import ForecastDataset - -logger.info("🔮 Generating forecasts...") -forecast: ForecastDataset = workflow.predict(forecast_dataset) - -# Display forecast summary -print(f"\n📈 Forecast generated for {len(forecast.data)} timestamps") -print(f"📊 Quantiles: {forecast.quantiles}") -print("\n🔍 Last 5 forecast values:") -print(forecast.data.tail()) - -# %% [markdown] -# ## 📈 Step 7: Visualize Forecast Results -# -# OpenSTEF-BEAM provides **ForecastTimeSeriesPlotter** for beautiful interactive visualizations: -# - Actual measurements shown as a line -# - Forecast median shown as another line -# - Prediction intervals shown as shaded areas - -# %% -# Create an interactive forecast visualization -from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter - -fig = ( - ForecastTimeSeriesPlotter() - # Add actual measurements (ground truth) - .add_measurements(measurements=forecast_dataset.data["load"]) - # Add model predictions with confidence bands - .add_model( - model_name="GBLinear", - forecast=forecast.median_series, # P50 prediction - 
quantiles=forecast.quantiles_data, # P10-P90 confidence band - ) - .plot() -) - -# Update layout for better presentation -fig.update_layout( - title="🔮 Energy Load Forecast vs Actual", - yaxis_title="Load (MW)", - xaxis_title="Time", - height=500, -) -fig.show() - -# %% [markdown] -# ## 🔍 Step 8: Explain Feature Importance -# -# Understanding **why** the model makes certain predictions is crucial for trust -# and debugging. GBLinear models provide clear feature importance rankings. - -# %% -# Visualize feature importance using the ExplainableForecaster interface -from typing import cast - -from openstef_models.explainability import ExplainableForecaster -from openstef_models.models.forecasting_model import ForecastingModel - -# The GBLinear model implements ExplainableForecaster, providing feature importance -forecaster = cast(ForecastingModel, workflow.model).forecaster -explainable_model = cast(ExplainableForecaster, forecaster) - -# Create an interactive treemap of feature importances -# Larger boxes = more important features -fig = explainable_model.plot_feature_importances() -fig.update_layout(title="🔍 Feature Importance Treemap") -fig.show() - -# %% [markdown] -# ## 🔬 Step 9: Visualize Feature Contributions (SHAP) -# -# While feature importance shows **which** features matter overall, **contributions** -# show how each feature pushed the prediction up or down for every individual timestep. -# GBLinear models expose these as SHAP values via `predict_contributions()`. 
- -# %% -# Compute per-timestep feature contributions for the forecast period -from openstef_models.explainability import ContributionsPlotter - -contributions = workflow.model.predict_contributions(forecast_dataset) - -# %% -# Heatmap: contributions over time with prediction line -ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True).show() - -# %% -# Waterfall: decompose a single timestep's prediction -ContributionsPlotter.plot_waterfall(contributions, timestep=0, top_n=10).show() - -# %% -# Bar chart: mean absolute contribution per feature -ContributionsPlotter.plot_bar(contributions, top_n=10).show() - -# %% [markdown] -# --- -# -# ## 🎯 Summary -# -# In this tutorial, you learned how to: -# -# 1. ✅ **Load energy data** from the Liander 2024 benchmark dataset -# 2. ✅ **Configure a workflow** with `ForecastingWorkflowConfig` -# 3. ✅ **Train a GBLinear model** for probabilistic forecasting -# 4. ✅ **Generate forecasts** with confidence intervals -# 5. ✅ **Visualize results** and feature importance -# -# ### 🚀 Next Steps -# -# - Try different models: `"xgboost"` for more complex patterns -# - Experiment with more quantiles for narrower prediction intervals -# - Use the **backtesting notebook** to evaluate model performance systematically -# - Explore MLflow integration for experiment tracking diff --git a/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb b/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb index ecf1de68d..39458e24b 100644 --- a/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb +++ b/examples/tutorials/hyperparameter_tuning_with_optuna.ipynb @@ -22,17 +22,72 @@ "cell_type": "code", "execution_count": null, "id": "31e74f93", - "metadata": {}, + "metadata": { + "tags": [ + "remove-cell" + ] + }, "outputs": [], "source": [ - "# --- Setup: Logging and Display Configuration ---\n", - "# Configure logging and display settings for the notebook\n", - "from typing import Literal\n", + "import warnings\n", 
+ "\n", + "warnings.filterwarnings(\"ignore\")\n", "\n", "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", "\n", "configure_notebook_display()\n", - "logger = setup_notebook_logging(__name__)" + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " \"optuna\",\n", + " \"lightgbm\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "44af0dd4", + "metadata": {}, + "source": [ + "# Hyperparameter Tuning with Optuna\n", + "\n", + "OpenSTEF integrates with [Optuna](https://optuna.org/) for Bayesian\n", + "hyperparameter optimization. Every forecaster in OpenSTEF declares\n", + "sensible search bounds on its hyperparameters — you just choose which\n", + "ones to activate for tuning.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Why models ship with built-in search spaces\n", + "- How to activate, deactivate, and customize tunable parameters\n", + "- How to change the optimization metric (e.g. 
rCRPS for probabilistic scoring)\n", + "- How to compare an untuned baseline against the tuned model\n", + "\n", + "```{note}\n", + "This tutorial runs only 5 trials for fast execution.\n", + "Increase `n_trials` for production use.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`HyperparameterTuner`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.integrations.optuna.HyperparameterTuner.html)\n", + "· [`XGBoostHyperParams`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html)\n", + "· [`FloatRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.FloatRange.html) / [`IntRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.IntRange.html)" + ] + }, + { + "cell_type": "markdown", + "id": "a8bdc792", + "metadata": {}, + "source": [ + "## Load the dataset" ] }, { @@ -42,267 +97,388 @@ "metadata": {}, "outputs": [], "source": [ - "# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset.\n", - "from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets\n", + "from datetime import datetime, timedelta\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "from openstef_core.types import LeadTime, Q\n", "\n", "dataset = load_liander_dataset()\n", "\n", - "print(f\"Dataset shape: {dataset.data.shape}\")\n", - "print(f\"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}\")\n", - "dataset.data.head()" + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "predict_dataset = dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "print(f\"Training: 
{train_dataset.data.shape[0]:,} rows\")\n", + "print(f\"Predict: {predict_dataset.data.shape[0]:,} rows\")" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "eee2c101", + "cell_type": "markdown", + "id": "4665d14e", "metadata": {}, - "outputs": [], "source": [ - "# Split the dataset into training (90 days) and forecast (14 days) periods.\n", - "train_dataset, forecast_dataset = prepare_tutorial_datasets()" + "## Understanding built-in search spaces\n", + "\n", + "Each forecaster's `HyperParams` class uses Python's `Annotated` type hints\n", + "to declare valid search bounds on every parameter. For example,\n", + "`XGBoostHyperParams` defines:\n", + "\n", + "```python\n", + "n_estimators: Annotated[int, IntRange(50, 500)] = 100\n", + "learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = 0.3\n", + "max_depth: Annotated[int, IntRange(1, 15)] = 6\n", + "subsample: Annotated[float, FloatRange(0.5, 1.0)] = 1.0\n", + "```\n", + "\n", + "These ranges define **where** Optuna can search, but tuning is **not active\n", + "by default**. 
The `tune=True` flag explicitly activates each parameter.\n", + "This design means you always get sensible bounds without accidentally\n", + "tuning everything.\n", + "\n", + "Let's see the default search space — with nothing activated:" ] }, { "cell_type": "code", "execution_count": null, - "id": "695ea7fd", + "id": "eee2c101", "metadata": {}, "outputs": [], "source": [ - "# Visualize the training data\n", - "# The plot shows the 'load' column (energy consumption in MW) over time\n", - "fig = train_dataset.data[[\"load\"]].plot(title=\"Training Data: Energy Load over Time\")\n", - "fig.update_layout(yaxis_title=\"Load (MW)\", xaxis_title=\"Time\") # type: ignore[union-attr] # plotly Figure\n", - "fig.show() # type: ignore[union-attr]" + "from openstef_core.mixins.param_ranges import FloatRange, IntRange\n", + "from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams\n", + "\n", + "default_hp = XGBoostHyperParams()\n", + "default_space = default_hp.get_search_space()\n", + "print(f\"Default tunable parameters: {len(default_space)}\")\n", + "print(\"(All parameters use their fixed defaults until you opt in with tune=True)\")" ] }, { "cell_type": "markdown", - "id": "ed1b233e", + "id": "4329d363", "metadata": {}, "source": [ - "## Define a base config with inline search space\n", + "## Customizing the search space\n", + "\n", + "To activate tuning on a parameter, pass a range with `tune=True`.\n", + "You can also narrow or widen the bounds, or leave bounds as `None` to\n", + "inherit the class-level defaults from the `Annotated` metadata.\n", + "\n", + "**Activate with custom bounds:**\n", + "```python\n", + "learning_rate=FloatRange(0.01, 0.3, log=True, tune=True)\n", + "```\n", "\n", - "Override default hyperparameters with `TuningRange(tune=True)` to mark them for tuning.\n", - "Any parameter left as a plain value keeps its default during tuning." 
+ "**Activate with default bounds** (inherits from Annotated metadata):\n", + "```python\n", + "subsample=FloatRange(tune=True)\n", + "```\n", + "\n", + "**Keep a parameter fixed** (don't pass a range — just a plain value or omit it):\n", + "```python\n", + "max_depth=6 # fixed, not tuned\n", + "```\n", + "\n", + "Let's configure XGBoost with 4 tunable parameters and keep `reg_alpha`\n", + "fixed at a known-good value:" ] }, { "cell_type": "code", "execution_count": null, - "id": "fc276f61", + "id": "695ea7fd", "metadata": {}, "outputs": [], "source": [ - "from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, RMAEProvider\n", - "from openstef_core.mixins.param_ranges import FloatRange, IntRange\n", - "from openstef_core.types import LeadTime, Q\n", - "from openstef_models.integrations.optuna import HyperparameterTuner\n", - "from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams\n", "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", "\n", "config = ForecastingWorkflowConfig(\n", " model_id=\"tuning_demo\",\n", " model=\"xgboost\",\n", - " # Forecast settings\n", - " horizons=[LeadTime.from_string(\"PT36H\")], # Predict up to 36 hours ahead\n", - " quantiles=[Q(0.5), Q(0.1), Q(0.9)], # Median + 80% prediction interval\n", - " # Target column (what we're predicting)\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", " target_column=\"load\",\n", - " # Weather feature columns (from the dataset)\n", " temperature_column=\"temperature_2m\",\n", " relative_humidity_column=\"relative_humidity_2m\",\n", " wind_speed_column=\"wind_speed_10m\",\n", - " radiation_column=\"shortwave_radiation\", # Solar radiation\n", + " radiation_column=\"shortwave_radiation\",\n", " pressure_column=\"surface_pressure\",\n", - " # Hyperparameters to tune\n", " xgboost_hyperparams=XGBoostHyperParams(\n", - " learning_rate=FloatRange(0.01, 0.3, 
log=True, tune=True), # pyright: ignore[reportCallIssue] # ranges accepted at runtime via Annotated\n", - " n_estimators=IntRange(50, 500, tune=True),\n", - " max_depth=IntRange(3, 10, tune=True),\n", - " subsample=FloatRange(0.5, 1.0, tune=True),\n", - " colsample_bytree=FloatRange(0.5, 1.0, tune=True),\n", + " # Tuned — custom bounds\n", + " learning_rate=FloatRange(0.01, 0.3, log=True, tune=True), # pyright: ignore[reportCallIssue]\n", + " n_estimators=IntRange(50, 300, tune=True),\n", + " # Tuned — inherits class-level bounds [1, 15]\n", + " max_depth=IntRange(tune=True),\n", + " # Tuned — custom narrower bounds\n", + " subsample=FloatRange(0.6, 1.0, tune=True),\n", + " # Fixed — not tuned\n", + " reg_alpha=0.1,\n", " ),\n", - " evaluation_metrics=[RMAEProvider(), ObservedProbabilityProvider()],\n", - " mlflow_storage=None, # Disable MLFlow tune to avoid reusing models between trials.\n", + " mlflow_storage=None,\n", + " verbosity=0,\n", ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "374108f1", + "metadata": {}, + "outputs": [], + "source": [ + "space = config.xgboost_hyperparams.get_search_space()\n", + "\n", + "print(f\"Active search space ({len(space)} parameters):\")\n", + "for name, param in space.items():\n", + " if isinstance(param, (FloatRange, IntRange)):\n", + " scale = \" [log]\" if param.log else \"\"\n", + " print(f\" {name:20s}: {type(param).__name__} [{param.low} — {param.high}]{scale}\")" + ] + }, { "cell_type": "markdown", - "id": "7168a1a8", + "id": "ed1b233e", "metadata": {}, "source": [ - "## Inspect the resolved search space\n" + "## Changing the tuning metric\n", + "\n", + "By default, `HyperparameterTuner` optimizes `R2` on the median quantile.\n", + "For probabilistic forecasts, the **relative Continuous Ranked Probability\n", + "Score (rCRPS)** is a better choice — it evaluates the full quantile\n", + "distribution, not just the median.\n", + "\n", + "To use rCRPS, add 
[`RCRPSProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) to the config's `evaluation_metrics`\n", + "(next cell), then create the tuner with `target_quantile=\"global\"`, `metric_name=\"rCRPS\"` and `direction=\"minimize\"` (lower is better):" ] }, { "cell_type": "code", "execution_count": null, - "id": "41f1b6ab", + "id": "fc276f61", "metadata": {}, "outputs": [], "source": [ + "from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, R2Provider, RCRPSProvider\n", "\n", - "# Get the search space from the hyperparams instance (resolve fills None bounds from class-level defaults).\n", - "resolved_space = config.xgboost_hyperparams.get_search_space()\n", - "\n", - "print(\"Resolved search space:\")\n", - "for name, param in resolved_space.items():\n", - " if isinstance(param, (FloatRange, IntRange)):\n", - " scale = \" [log]\" if param.log else \"\"\n", - " print(f\" {name:25s}: {type(param).__name__} [{param.low} — {param.high}]{scale}\")\n", - " else:\n", - " print(f\" {name:25s}: CategoricalRange {param.choices}\")" + "config_with_rcrps = config.model_copy(\n", + " update={\"evaluation_metrics\": [R2Provider(), ObservedProbabilityProvider(), RCRPSProvider()]}\n", + ")" ] }, { "cell_type": "markdown", - "id": "dabeaa30", + "id": "5a158b63", "metadata": {}, "source": [ - "## Run the Optuna study with `HyperparameterTuner`" + "## Train an untuned baseline\n", + "\n", + "Before tuning, let's train a model with the default hyperparameters (keeping\n", + "the same fixed `reg_alpha=0.1` as the tuning config) so we can measure the improvement."
] }, { "cell_type": "code", "execution_count": null, - "id": "8b170806", + "id": "cb569a95", + "metadata": {}, + "outputs": [], + "source": [ + "baseline_config = config_with_rcrps.model_copy(\n", + " update={\n", + " \"xgboost_hyperparams\": XGBoostHyperParams(reg_alpha=0.1),\n", + " }\n", + ")\n", + "baseline_workflow = create_forecasting_workflow(baseline_config)\n", + "baseline_result = baseline_workflow.fit(train_dataset)\n", + "baseline_forecast = baseline_workflow.predict(predict_dataset, forecast_start=train_end)\n", + "\n", + "baseline_r2 = baseline_result.metrics_val.get_metric(quantile=Q(0.5), metric_name=\"R2\")\n", + "baseline_rcrps = baseline_result.metrics_val.get_metric(quantile=\"global\", metric_name=\"rCRPS\")\n", + "print(f\"Baseline R2: {baseline_r2:.4f}\")\n", + "print(f\"Baseline rCRPS: {baseline_rcrps:.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cdfac18", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "remove-cell" + ] }, "outputs": [], + "source": [ + "assert baseline_r2 is not None and baseline_r2 > 0.0" + ] + }, + { + "cell_type": "markdown", + "id": "9b795a56", + "metadata": {}, + "source": [ + "## Run the Optuna study\n", + "\n", + "`HyperparameterTuner.fit_with_tuning()` runs the study and trains a final\n", + "workflow using [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) on the full training set with the best hyperparameters.\n", + "The first trial always evaluates the default values so the search starts\n", + "from a known baseline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c628cb4", + "metadata": {}, + "outputs": [], "source": [ "import optuna\n", "\n", - "optuna.logging.set_verbosity(optuna.logging.WARNING) # Suppress per-trial logs\n", + "from openstef_models.integrations.optuna import HyperparameterTuner\n", + "\n", + "optuna.logging.set_verbosity(optuna.logging.WARNING)\n", "\n", "tuner = HyperparameterTuner(\n", - " config=config,\n", + " config=config_with_rcrps,\n", " train_dataset=train_dataset,\n", " create_workflow=create_forecasting_workflow,\n", - " target_quantile=Q(0.5),\n", - " metric_name=\"rMAE\",\n", + " target_quantile=\"global\",\n", + " metric_name=\"rCRPS\",\n", " direction=\"minimize\",\n", - " n_trials=20,\n", + " n_trials=5,\n", " seed=42,\n", ")\n", - "tuning_result = tuner.fit_with_tuning()\n", + "tuning_result = tuner.fit_with_tuning(show_progress_bar=False)\n", "\n", - "print(f\"Study complete: {len(tuning_result.study.trials)} trials\")\n", - "print(f\"Best value: {tuning_result.study.best_value:.4f}\")\n", + "print(f\"Trials completed: {len(tuning_result.study.trials)}\")\n", + "print(f\"Best rCRPS: {tuning_result.study.best_value:.4f}\")\n", "print(f\"Best params: {tuning_result.study.best_params}\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "602f6b4f", + "id": "26bb13cc", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "remove-cell" + ] }, "outputs": [], "source": [ - "# Inspect which hyperparameters were tuned vs kept at their default.\n", - "best_config = tuning_result.best_config # type: ignore[union-attr] # known to be ForecastingWorkflowConfig\n", - "print(\"Final XGBoost hyperparameters (tuned values marked):\")\n", - "final_hp = best_config.xgboost_hyperparams\n", - "baseline_hp = config.xgboost_hyperparams\n", - "best_params = tuning_result.study.best_params\n", - "\n", - "for field in type(final_hp).model_fields:\n", - " value = getattr(final_hp, field)\n", - " baseline = getattr(baseline_hp, 
field)\n", - " marker: Literal[\" <- tuned\", \"\"] = \" <- tuned\" if field in best_params else \"\"\n", - " print(f\" {field:25s}: {value}{marker}\")" + "assert len(tuning_result.study.trials) == 5, f\"Expected 5 trials, got {len(tuning_result.study.trials)}\"" ] }, { "cell_type": "markdown", - "id": "2875de25", + "id": "7168a1a8", "metadata": {}, "source": [ - "## The fitted workflow\n", - "\n", - "`fit_with_tuning()` already trains a final workflow on the full training set using the best\n", - "hyperparameters — no separate fit step is needed. The result is in `tuning_result.workflow`.\n" + "## Inspect the best hyperparameters" ] }, { "cell_type": "code", "execution_count": null, - "id": "8d91be63", + "id": "41f1b6ab", "metadata": {}, "outputs": [], "source": [ - "workflow = tuning_result.workflow" + "best_hp = tuning_result.best_config.xgboost_hyperparams\n", + "best_params = tuning_result.study.best_params\n", + "\n", + "print(\"Final hyperparameters (tuned values marked):\")\n", + "for field in type(best_hp).model_fields:\n", + " value = getattr(best_hp, field)\n", + " marker = \" <- tuned\" if field in best_params else \"\"\n", + " print(f\" {field:20s}: {value}{marker}\")" ] }, { "cell_type": "markdown", - "id": "bb6ba8c7", + "id": "dabeaa30", "metadata": {}, "source": [ - "## Inspect the study and forecast\n", + "## Visualize optimization history\n", "\n", - "1. How did $rMAE$ improve over trials?\n", - "2. Which parameters had the most impact?\n", - "3. Final tuned model predictions on the held-out forecast window.\n" + "The optimization history shows how rCRPS decreased over trials. With only\n", + "5 trials results are noisy — increase `n_trials` for smoother convergence." 
] }, { "cell_type": "code", "execution_count": null, - "id": "2d34a7fa", + "id": "8b170806", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ - "from optuna.visualization import plot_optimization_history, plot_param_importances\n", - "\n", - "study = tuning_result.study\n", + "from optuna.visualization import plot_optimization_history\n", "\n", - "# How the best score evolved over trials\n", - "fig = plot_optimization_history(study)\n", - "fig.update_layout(title=\"Optimization History: rMAE over Trials\")\n", - "fig.show()\n", + "fig = plot_optimization_history(tuning_result.study)\n", + "fig.update_layout(title=\"Optimization History: rCRPS over Trials\", height=400)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4fbe1d32", + "metadata": {}, + "source": [ + "## Compare: untuned vs tuned\n", "\n", - "# Which hyperparameters mattered most (requires ≥ ~20 trials for reliable ranking)\n", - "fig2 = plot_param_importances(study)\n", - "fig2.update_layout(title=\"Hyperparameter Importances\")\n", - "fig2.show()" + "Plot both models on the same chart to visualize the improvement.\n", + "Once you're happy with the tuned parameters, run a\n", + "{doc}`backtesting_quickstart` to measure the gain over a realistic\n", + "operational timeline." 
] }, { "cell_type": "code", "execution_count": null, - "id": "1b2b1124", + "id": "602f6b4f", "metadata": { - "lines_to_next_cell": 2 + "tags": [ + "hide-input" + ] }, "outputs": [], "source": [ "from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter\n", "\n", - "forecast = workflow.predict(forecast_dataset)\n", + "tuned_forecast = tuning_result.workflow.predict(predict_dataset, forecast_start=train_end)\n", "\n", "fig = (\n", " ForecastTimeSeriesPlotter()\n", - " .add_measurements(measurements=forecast_dataset.data[\"load\"])\n", + " .add_measurements(measurements=predict_dataset.data[\"load\"].loc[train_end:])\n", + " .add_model(\n", + " model_name=\"XGBoost (default)\",\n", + " forecast=baseline_forecast.median_series,\n", + " quantiles=baseline_forecast.quantiles_data,\n", + " )\n", " .add_model(\n", " model_name=\"XGBoost (tuned)\",\n", - " forecast=forecast.median_series,\n", - " quantiles=forecast.quantiles_data,\n", + " forecast=tuned_forecast.median_series,\n", + " quantiles=tuned_forecast.quantiles_data,\n", " )\n", " .plot()\n", ")\n", + "\n", "fig.update_layout(\n", - " title=\"Tuned XGBoost Forecast vs Actual\",\n", - " yaxis_title=\"Load (MW)\",\n", + " title=\"Hyperparameter Tuning: Default vs Tuned XGBoost\",\n", " xaxis_title=\"Time\",\n", - " height=500,\n", + " yaxis_title=\"MW\",\n", + " height=400,\n", ")\n", "fig.show()" ] @@ -310,10 +486,31 @@ { "cell_type": "code", "execution_count": null, - "id": "f4a861db", + "id": "3a388a0c", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "tuned_rcrps = tuning_result.study.best_value\n", + "\n", + "print(f\"{'Model':<20} {'rCRPS':>10}\")\n", + "print(f\"{'':-<20} {'':-^10}\")\n", + "print(f\"{'XGBoost (default)':<20} {baseline_rcrps:>10.4f}\")\n", + "print(f\"{'XGBoost (tuned)':<20} {tuned_rcrps:>10.4f}\")\n", + "print(f\"{'Improvement':<20} {baseline_rcrps - tuned_rcrps:>10.4f}\")" + ] + }, + { + "cell_type": "markdown", + "id": "2875de25", + "metadata": {}, + "source": [ + "## 
Next steps\n", + "\n", + "- {doc}`ensemble_forecasting` — combine tuned models into an ensemble\n", + " for even better accuracy.\n", + "- {doc}`backtesting_quickstart` — validate tuned parameters on longer\n", + " historical windows." + ] } ], "metadata": { diff --git a/examples/tutorials/hyperparameter_tuning_with_optuna.py b/examples/tutorials/hyperparameter_tuning_with_optuna.py index 6014e1afa..90d973f46 100644 --- a/examples/tutorials/hyperparameter_tuning_with_optuna.py +++ b/examples/tutorials/hyperparameter_tuning_with_optuna.py @@ -20,190 +20,323 @@ # pyright: basic -# %% -# --- Setup: Logging and Display Configuration --- -# Configure logging and display settings for the notebook -from typing import Literal +# %% tags=["remove-cell"] +import warnings + +warnings.filterwarnings("ignore") from openstef_core.testing import configure_notebook_display, setup_notebook_logging configure_notebook_display() -logger = setup_notebook_logging(__name__) +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + "optuna", + "lightgbm", + ), +) + +# %% [markdown] +# # Hyperparameter Tuning with Optuna +# +# OpenSTEF integrates with [Optuna](https://optuna.org/) for Bayesian +# hyperparameter optimization. Every forecaster in OpenSTEF declares +# sensible search bounds on its hyperparameters — you just choose which +# ones to activate for tuning. +# +# **What you'll learn:** +# +# - Why models ship with built-in search spaces +# - How to activate, deactivate, and customize tunable parameters +# - How to change the optimization metric (e.g. rCRPS for probabilistic scoring) +# - How to compare an untuned baseline against the tuned model +# +# ```{note} +# This tutorial runs only 5 trials for fast execution. +# Increase `n_trials` for production use. 
+# ``` +# +# **Key API references:** +# [`HyperparameterTuner`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.integrations.optuna.HyperparameterTuner.html) +# · [`XGBoostHyperParams`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.models.forecasting.html) +# · [`FloatRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.FloatRange.html) / [`IntRange`](https://openstef.github.io/openstef/v4/api/generated/openstef_core.mixins.param_ranges.IntRange.html) + +# %% [markdown] +# ## Load the dataset # %% -# Download and combine the Liander benchmark dataset into a single TimeSeriesDataset. -from openstef_core.testing import load_liander_dataset, prepare_tutorial_datasets +from datetime import datetime, timedelta + +from openstef_core.testing import load_liander_dataset +from openstef_core.types import LeadTime, Q dataset = load_liander_dataset() -print(f"Dataset shape: {dataset.data.shape}") -print(f"Date range: {dataset.data.index.min()} to {dataset.data.index.max()}") -dataset.data.head() +train_start = datetime.fromisoformat("2024-03-01T00:00:00Z") +train_end = train_start + timedelta(days=45) +forecast_end = train_end + timedelta(days=7) -# %% -# Split the dataset into training (90 days) and forecast (14 days) periods. 
-train_dataset, forecast_dataset = prepare_tutorial_datasets() +train_dataset = dataset.filter_by_range(start=train_start, end=train_end) +predict_dataset = dataset.filter_by_range( + start=train_end - timedelta(days=14), + end=forecast_end, +) -# %% -# Visualize the training data -# The plot shows the 'load' column (energy consumption in MW) over time -fig = train_dataset.data[["load"]].plot(title="Training Data: Energy Load over Time") -fig.update_layout(yaxis_title="Load (MW)", xaxis_title="Time") # type: ignore[union-attr] # plotly Figure -fig.show() # type: ignore[union-attr] +print(f"Training: {train_dataset.data.shape[0]:,} rows") +print(f"Predict: {predict_dataset.data.shape[0]:,} rows") # %% [markdown] -# ## Define a base config with inline search space +# ## Understanding built-in search spaces +# +# Each forecaster's `HyperParams` class uses Python's `Annotated` type hints +# to declare valid search bounds on every parameter. For example, +# `XGBoostHyperParams` defines: # -# Override default hyperparameters with `TuningRange(tune=True)` to mark them for tuning. -# Any parameter left as a plain value keeps its default during tuning. +# ```python +# n_estimators: Annotated[int, IntRange(50, 500)] = 100 +# learning_rate: Annotated[float, FloatRange(0.01, 0.5, log=True)] = 0.3 +# max_depth: Annotated[int, IntRange(1, 15)] = 6 +# subsample: Annotated[float, FloatRange(0.5, 1.0)] = 1.0 +# ``` +# +# These ranges define **where** Optuna can search, but tuning is **not active +# by default**. The `tune=True` flag explicitly activates each parameter. +# This design means you always get sensible bounds without accidentally +# tuning everything. 
+# +# Let's see the default search space — with nothing activated: # %% -from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, RMAEProvider from openstef_core.mixins.param_ranges import FloatRange, IntRange -from openstef_core.types import LeadTime, Q -from openstef_models.integrations.optuna import HyperparameterTuner from openstef_models.models.forecasting.xgboost_forecaster import XGBoostHyperParams + +default_hp = XGBoostHyperParams() +default_space = default_hp.get_search_space() +print(f"Default tunable parameters: {len(default_space)}") +print("(All parameters use their fixed defaults until you opt in with tune=True)") + +# %% [markdown] +# ## Customizing the search space +# +# To activate tuning on a parameter, pass a range with `tune=True`. +# You can also narrow or widen the bounds, or leave bounds as `None` to +# inherit the class-level defaults from the `Annotated` metadata. +# +# **Activate with custom bounds:** +# ```python +# learning_rate=FloatRange(0.01, 0.3, log=True, tune=True) +# ``` +# +# **Activate with default bounds** (inherits from Annotated metadata): +# ```python +# subsample=FloatRange(tune=True) +# ``` +# +# **Keep a parameter fixed** (don't pass a range — just a plain value or omit it): +# ```python +# max_depth=6 # fixed, not tuned +# ``` +# +# Let's configure XGBoost with 4 tunable parameters and keep `reg_alpha` +# fixed at a known-good value: + +# %% from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow config = ForecastingWorkflowConfig( model_id="tuning_demo", model="xgboost", - # Forecast settings - horizons=[LeadTime.from_string("PT36H")], # Predict up to 36 hours ahead - quantiles=[Q(0.5), Q(0.1), Q(0.9)], # Median + 80% prediction interval - # Target column (what we're predicting) + horizons=[LeadTime.from_string("PT36H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], target_column="load", - # Weather feature columns (from the dataset) 
temperature_column="temperature_2m", relative_humidity_column="relative_humidity_2m", wind_speed_column="wind_speed_10m", - radiation_column="shortwave_radiation", # Solar radiation + radiation_column="shortwave_radiation", pressure_column="surface_pressure", - # Hyperparameters to tune xgboost_hyperparams=XGBoostHyperParams( - learning_rate=FloatRange(0.01, 0.3, log=True, tune=True), # pyright: ignore[reportCallIssue] # ranges accepted at runtime via Annotated - n_estimators=IntRange(50, 500, tune=True), - max_depth=IntRange(3, 10, tune=True), - subsample=FloatRange(0.5, 1.0, tune=True), - colsample_bytree=FloatRange(0.5, 1.0, tune=True), + # Tuned — custom bounds + learning_rate=FloatRange(0.01, 0.3, log=True, tune=True), # pyright: ignore[reportCallIssue] + n_estimators=IntRange(50, 300, tune=True), + # Tuned — inherits class-level bounds [1, 15] + max_depth=IntRange(tune=True), + # Tuned — custom narrower bounds + subsample=FloatRange(0.6, 1.0, tune=True), + # Fixed — not tuned + reg_alpha=0.1, ), - evaluation_metrics=[RMAEProvider(), ObservedProbabilityProvider()], - mlflow_storage=None, # Disable MLFlow tune to avoid reusing models between trials. + mlflow_storage=None, + verbosity=0, ) +# %% +space = config.xgboost_hyperparams.get_search_space() + +print(f"Active search space ({len(space)} parameters):") +for name, param in space.items(): + if isinstance(param, (FloatRange, IntRange)): + scale = " [log]" if param.log else "" + print(f" {name:20s}: {type(param).__name__} [{param.low} — {param.high}]{scale}") + # %% [markdown] -# ## Inspect the resolved search space +# ## Changing the tuning metric +# +# By default, `HyperparameterTuner` optimizes `R2` on the median quantile. +# For probabilistic forecasts, the **relative Continuous Ranked Probability +# Score (rCRPS)** is a better choice — it evaluates the full quantile +# distribution, not just the median. 
# +# To use rCRPS, add [`RCRPSProvider`](https://openstef.github.io/openstef/v4/api/generated/openstef_beam.evaluation.metric_providers.RCRPSProvider.html) to the config's `evaluation_metrics` +# (next cell), then create the tuner with `target_quantile="global"`, `metric_name="rCRPS"` and `direction="minimize"` (lower is better): # %% +from openstef_beam.evaluation.metric_providers import ObservedProbabilityProvider, R2Provider, RCRPSProvider -# Get the search space from the hyperparams instance (resolve fills None bounds from class-level defaults). -resolved_space = config.xgboost_hyperparams.get_search_space() +config_with_rcrps = config.model_copy( + update={"evaluation_metrics": [R2Provider(), ObservedProbabilityProvider(), RCRPSProvider()]} +) -print("Resolved search space:") -for name, param in resolved_space.items(): - if isinstance(param, (FloatRange, IntRange)): - scale = " [log]" if param.log else "" - print(f" {name:25s}: {type(param).__name__} [{param.low} — {param.high}]{scale}") - else: - print(f" {name:25s}: CategoricalRange {param.choices}") +# %% [markdown] +# ## Train an untuned baseline +# +# Before tuning, let's train a model with the default hyperparameters (keeping +# the same fixed `reg_alpha=0.1` as the tuning config) so we can measure the improvement.
+ +# %% +baseline_config = config_with_rcrps.model_copy( + update={ + "xgboost_hyperparams": XGBoostHyperParams(reg_alpha=0.1), + } +) +baseline_workflow = create_forecasting_workflow(baseline_config) +baseline_result = baseline_workflow.fit(train_dataset) +baseline_forecast = baseline_workflow.predict(predict_dataset, forecast_start=train_end) + +baseline_r2 = baseline_result.metrics_val.get_metric(quantile=Q(0.5), metric_name="R2") +baseline_rcrps = baseline_result.metrics_val.get_metric(quantile="global", metric_name="rCRPS") +print(f"Baseline R2: {baseline_r2:.4f}") +print(f"Baseline rCRPS: {baseline_rcrps:.4f}") + +# %% tags=["remove-cell"] +assert baseline_r2 is not None and baseline_r2 > 0.0 # %% [markdown] -# ## Run the Optuna study with `HyperparameterTuner` +# ## Run the Optuna study +# +# `HyperparameterTuner.fit_with_tuning()` runs the study and trains a final +# workflow using [`create_forecasting_workflow`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.create_forecasting_workflow.html) on the full training set with the best hyperparameters. +# The first trial always evaluates the default values so the search starts +# from a known baseline. 
# %% import optuna -optuna.logging.set_verbosity(optuna.logging.WARNING) # Suppress per-trial logs +from openstef_models.integrations.optuna import HyperparameterTuner + +optuna.logging.set_verbosity(optuna.logging.WARNING) tuner = HyperparameterTuner( - config=config, + config=config_with_rcrps, train_dataset=train_dataset, create_workflow=create_forecasting_workflow, - target_quantile=Q(0.5), - metric_name="rMAE", + target_quantile="global", + metric_name="rCRPS", direction="minimize", - n_trials=20, + n_trials=5, seed=42, ) -tuning_result = tuner.fit_with_tuning() +tuning_result = tuner.fit_with_tuning(show_progress_bar=False) -print(f"Study complete: {len(tuning_result.study.trials)} trials") -print(f"Best value: {tuning_result.study.best_value:.4f}") +print(f"Trials completed: {len(tuning_result.study.trials)}") +print(f"Best rCRPS: {tuning_result.study.best_value:.4f}") print(f"Best params: {tuning_result.study.best_params}") - -# %% -# Inspect which hyperparameters were tuned vs kept at their default. -best_config = tuning_result.best_config # type: ignore[union-attr] # known to be ForecastingWorkflowConfig -print("Final XGBoost hyperparameters (tuned values marked):") -final_hp = best_config.xgboost_hyperparams -baseline_hp = config.xgboost_hyperparams -best_params = tuning_result.study.best_params - -for field in type(final_hp).model_fields: - value = getattr(final_hp, field) - baseline = getattr(baseline_hp, field) - marker: Literal[" <- tuned", ""] = " <- tuned" if field in best_params else "" - print(f" {field:25s}: {value}{marker}") - +# %% tags=["remove-cell"] +assert len(tuning_result.study.trials) == 5, f"Expected 5 trials, got {len(tuning_result.study.trials)}" # %% [markdown] -# ## The fitted workflow -# -# `fit_with_tuning()` already trains a final workflow on the full training set using the best -# hyperparameters — no separate fit step is needed. The result is in `tuning_result.workflow`. 
-# +# ## Inspect the best hyperparameters # %% -workflow = tuning_result.workflow +best_hp = tuning_result.best_config.xgboost_hyperparams +best_params = tuning_result.study.best_params + +print("Final hyperparameters (tuned values marked):") +for field in type(best_hp).model_fields: + value = getattr(best_hp, field) + marker = " <- tuned" if field in best_params else "" + print(f" {field:20s}: {value}{marker}") # %% [markdown] -# ## Inspect the study and forecast -# -# 1. How did $rMAE$ improve over trials? -# 2. Which parameters had the most impact? -# 3. Final tuned model predictions on the held-out forecast window. +# ## Visualize optimization history # +# The optimization history shows how rCRPS decreased over trials. With only +# 5 trials results are noisy — increase `n_trials` for smoother convergence. -# %% -from optuna.visualization import plot_optimization_history, plot_param_importances - -study = tuning_result.study +# %% tags=["hide-input"] +from optuna.visualization import plot_optimization_history -# How the best score evolved over trials -fig = plot_optimization_history(study) -fig.update_layout(title="Optimization History: rMAE over Trials") +fig = plot_optimization_history(tuning_result.study) +fig.update_layout(title="Optimization History: rCRPS over Trials", height=400) fig.show() -# Which hyperparameters mattered most (requires ≥ ~20 trials for reliable ranking) -fig2 = plot_param_importances(study) -fig2.update_layout(title="Hyperparameter Importances") -fig2.show() - +# %% [markdown] +# ## Compare: untuned vs tuned +# +# Plot both models on the same chart to visualize the improvement. +# Once you're happy with the tuned parameters, run a +# {doc}`backtesting_quickstart` to measure the gain over a realistic +# operational timeline. 
-# %% +# %% tags=["hide-input"] from openstef_beam.analysis.plots import ForecastTimeSeriesPlotter -forecast = workflow.predict(forecast_dataset) +tuned_forecast = tuning_result.workflow.predict(predict_dataset, forecast_start=train_end) fig = ( ForecastTimeSeriesPlotter() - .add_measurements(measurements=forecast_dataset.data["load"]) + .add_measurements(measurements=predict_dataset.data["load"].loc[train_end:]) + .add_model( + model_name="XGBoost (default)", + forecast=baseline_forecast.median_series, + quantiles=baseline_forecast.quantiles_data, + ) .add_model( model_name="XGBoost (tuned)", - forecast=forecast.median_series, - quantiles=forecast.quantiles_data, + forecast=tuned_forecast.median_series, + quantiles=tuned_forecast.quantiles_data, ) .plot() ) + fig.update_layout( - title="Tuned XGBoost Forecast vs Actual", - yaxis_title="Load (MW)", + title="Hyperparameter Tuning: Default vs Tuned XGBoost", xaxis_title="Time", - height=500, + yaxis_title="MW", + height=400, ) fig.show() - # %% +tuned_rcrps = tuning_result.study.best_value + +print(f"{'Model':<20} {'rCRPS':>10}") +print(f"{'':-<20} {'':-^10}") +print(f"{'XGBoost (default)':<20} {baseline_rcrps:>10.4f}") +print(f"{'XGBoost (tuned)':<20} {tuned_rcrps:>10.4f}") +print(f"{'Improvement':<20} {baseline_rcrps - tuned_rcrps:>10.4f}") + +# %% [markdown] +# ## Next steps +# +# - {doc}`ensemble_forecasting` — combine tuned models into an ensemble +# for even better accuracy. +# - {doc}`backtesting_quickstart` — validate tuned parameters on longer +# historical windows. 
diff --git a/examples/tutorials/model_explainability.ipynb b/examples/tutorials/model_explainability.ipynb new file mode 100644 index 000000000..4d493d99f --- /dev/null +++ b/examples/tutorials/model_explainability.ipynb @@ -0,0 +1,347 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "7efd95d4", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f565776d", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "from typing import cast\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "9047f142", + "metadata": {}, + "source": [ + "# Model Explainability\n", + "\n", + "Understand why a forecasting model makes the predictions it does, using\n", + "feature importance scores and per-timestep SHAP contributions.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Inspect global feature importance with an interactive treemap\n", + "- Compute per-timestep feature contributions (SHAP values)\n", + "- Visualize contributions with heatmaps, waterfall charts, and bar charts\n", + "\n", + "```{note}\n", + "This tutorial uses a small data slice for fast execution.\n", + "See `examples/benchmarks/` for production-scale runs.\n", + "```\n", + "\n", + "**Key API references:**\n", + 
"[`ExplainableForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ExplainableForecaster.html)\n", + "· [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)\n", + "· [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html)" + ] + }, + { + "cell_type": "markdown", + "id": "79a5bbf9", + "metadata": {}, + "source": [ + "## Train a model\n", + "\n", + "We reuse the same setup as the {doc}`forecasting_quickstart` — train a GBLinear\n", + "model on 45 days of Liander data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "862a46b8", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", + "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n", + "\n", + "dataset = load_liander_dataset()\n", + "\n", + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "predict_dataset = dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "workflow = create_forecasting_workflow(\n", + " config=ForecastingWorkflowConfig(\n", + " model_id=\"explainability_gblinear\",\n", + " model=\"gblinear\",\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=[Q(0.5), Q(0.1), Q(0.9)],\n", + " target_column=\"load\",\n", + " temperature_column=\"temperature_2m\",\n", + " 
relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " verbosity=0,\n", + " mlflow_storage=None,\n", + " gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n", + " )\n", + ")\n", + "\n", + "result = workflow.fit(train_dataset)\n", + "print(\"Training complete.\")\n", + "print(result.metrics_full.to_dataframe())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "622af3db", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert result is not None, \"Training should produce a result\"" + ] + }, + { + "cell_type": "markdown", + "id": "0680e73c", + "metadata": {}, + "source": [ + "## Feature importance\n", + "\n", + "Feature importance scores rank features by their overall impact on the model's\n", + "predictions. The [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html) treemap visualization groups features by magnitude — larger\n", + "tiles represent more influential features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca38989f", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "from openstef_models.explainability import ExplainableForecaster\n", + "from openstef_models.models.forecasting_model import ForecastingModel\n", + "\n", + "forecaster = cast(ForecastingModel, workflow.model).forecaster\n", + "explainable_model = cast(ExplainableForecaster, forecaster)\n", + "\n", + "fig = explainable_model.plot_feature_importances()\n", + "fig.update_layout(title=\"Feature importance (treemap)\", height=500)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "bf6cec05", + "metadata": {}, + "source": [ + "## Feature contributions\n", + "\n", + "While feature importance is a global summary, **feature contributions** explain\n", + "individual predictions. For each timestep, they decompose the prediction into\n", + "additive terms: one per feature plus a bias.\n", + "\n", + "GBLinear models provide exact SHAP values, making this decomposition faithful\n", + "to the model's internal logic. Use [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html)\n", + "to visualize contributions as heatmaps, bar charts, or waterfall charts." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63e99143", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.explainability import ContributionsPlotter\n", + "\n", + "contributions = workflow.model.predict_contributions(predict_dataset, forecast_start=train_end)\n", + "\n", + "print(f\"Contributions shape: {contributions.data.shape}\")\n", + "print(f\"Features: {contributions.data.columns.tolist()[:5]} ... 
({len(contributions.data.columns)} total)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8c89b86", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert contributions.data.shape[0] > 100, f\"Expected >100 rows, got {contributions.data.shape[0]}\"\n", + "assert \"bias\" in contributions.data.columns, \"Contributions should include bias column\"" + ] + }, + { + "cell_type": "markdown", + "id": "06217b33", + "metadata": {}, + "source": [ + "### Heatmap — contributions over time\n", + "\n", + "Each row is a feature, each column is a timestep. Red cells indicate positive\n", + "contributions (pushing the prediction up), blue cells indicate negative ones.\n", + "The prediction line overlays the total." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b15c2e1a", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "fig = ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True)\n", + "fig.update_layout(height=500)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "35727a5e", + "metadata": {}, + "source": [ + "### Bar chart — average feature impact\n", + "\n", + "Mean absolute contribution per feature, ranked from most to least impactful.\n", + "This gives a complementary view to global importance — here you see which\n", + "features actively moved predictions during the forecast window. If certain\n", + "features dominate unexpectedly, consider adjusting the pipeline via\n", + "{doc}`custom_pipeline`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a61dc558", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "fig = ContributionsPlotter.plot_bar(contributions, top_n=12)\n", + "fig.update_layout(title=\"Mean absolute contribution per feature\", height=450)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8a2abed2", + "metadata": {}, + "source": [ + "### Waterfall — single timestep decomposition\n", + "\n", + "The waterfall chart breaks down one specific prediction into its components.\n", + "Starting from the bias (baseline prediction), each feature adds or subtracts\n", + "from the final value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0a44aa28", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "fig = ContributionsPlotter.plot_waterfall(contributions, timestep=48, top_n=10)\n", + "fig.update_layout(title=\"Prediction decomposition (timestep 48)\", height=500)\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "dc95f759", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`hyperparameter_tuning_with_optuna` — use explainability insights\n", + " to guide which parameters to tune.\n", + "- {doc}`custom_pipeline` — fine-tune feature engineering based on what\n", + " the contributions reveal." 
+ ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/model_explainability.py b/examples/tutorials/model_explainability.py new file mode 100644 index 000000000..e8f7ead18 --- /dev/null +++ b/examples/tutorials/model_explainability.py @@ -0,0 +1,203 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% tags=["remove-cell"] +import warnings +from typing import cast + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + ), +) + +# %% [markdown] +# # Model Explainability +# +# Understand why a forecasting model makes the predictions it does, using +# feature importance scores and per-timestep SHAP contributions. +# +# **What you'll learn:** +# +# - Inspect global feature importance with an interactive treemap +# - Compute per-timestep feature contributions (SHAP values) +# - Visualize contributions with heatmaps, waterfall charts, and bar charts +# +# ```{note} +# This tutorial uses a small data slice for fast execution. +# See `examples/benchmarks/` for production-scale runs. 
+# ``` +# +# **Key API references:** +# [`ExplainableForecaster`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ExplainableForecaster.html) +# · [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html) +# · [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html) + +# %% [markdown] +# ## Train a model +# +# We reuse the same setup as the {doc}`forecasting_quickstart` — train a GBLinear +# model on 45 days of Liander data. + +# %% +from datetime import datetime, timedelta + +from openstef_core.testing import load_liander_dataset +from openstef_core.types import LeadTime, Q +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow +from openstef_models.presets.forecasting_workflow import GBLinearForecaster + +dataset = load_liander_dataset() + +train_start = datetime.fromisoformat("2024-03-01T00:00:00Z") +train_end = train_start + timedelta(days=45) +forecast_end = train_end + timedelta(days=7) + +train_dataset = dataset.filter_by_range(start=train_start, end=train_end) +predict_dataset = dataset.filter_by_range( + start=train_end - timedelta(days=14), + end=forecast_end, +) + +workflow = create_forecasting_workflow( + config=ForecastingWorkflowConfig( + model_id="explainability_gblinear", + model="gblinear", + horizons=[LeadTime.from_string("PT36H")], + quantiles=[Q(0.5), Q(0.1), Q(0.9)], + target_column="load", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + verbosity=0, + mlflow_storage=None, + gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50), + ) +) + +result = workflow.fit(train_dataset) +print("Training complete.") 
+print(result.metrics_full.to_dataframe()) + +# %% tags=["remove-cell"] +assert result is not None, "Training should produce a result" + +# %% [markdown] +# ## Feature importance +# +# Feature importance scores rank features by their overall impact on the model's +# predictions. The [`FeatureImportancePlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.FeatureImportancePlotter.html) treemap visualization groups features by magnitude — larger +# tiles represent more influential features. + +# %% tags=["hide-input"] +from openstef_models.explainability import ExplainableForecaster +from openstef_models.models.forecasting_model import ForecastingModel + +forecaster = cast(ForecastingModel, workflow.model).forecaster +explainable_model = cast(ExplainableForecaster, forecaster) + +fig = explainable_model.plot_feature_importances() +fig.update_layout(title="Feature importance (treemap)", height=500) +fig.show() + +# %% [markdown] +# ## Feature contributions +# +# While feature importance is a global summary, **feature contributions** explain +# individual predictions. For each timestep, they decompose the prediction into +# additive terms: one per feature plus a bias. +# +# GBLinear models provide exact SHAP values, making this decomposition faithful +# to the model's internal logic. Use [`ContributionsPlotter`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.explainability.ContributionsPlotter.html) +# to visualize contributions as heatmaps, bar charts, or waterfall charts. + +# %% +from openstef_models.explainability import ContributionsPlotter + +contributions = workflow.model.predict_contributions(predict_dataset, forecast_start=train_end) + +print(f"Contributions shape: {contributions.data.shape}") +print(f"Features: {contributions.data.columns.tolist()[:5]} ... 
({len(contributions.data.columns)} total)") + +# %% tags=["remove-cell"] +assert contributions.data.shape[0] > 100, f"Expected >100 rows, got {contributions.data.shape[0]}" +assert "bias" in contributions.data.columns, "Contributions should include bias column" + +# %% [markdown] +# ### Heatmap — contributions over time +# +# Each row is a feature, each column is a timestep. Red cells indicate positive +# contributions (pushing the prediction up), blue cells indicate negative ones. +# The prediction line overlays the total. + +# %% tags=["hide-input"] +fig = ContributionsPlotter.plot_heatmap(contributions, top_n=10, show_prediction=True) +fig.update_layout(height=500) +fig.show() + +# %% [markdown] +# ### Bar chart — average feature impact +# +# Mean absolute contribution per feature, ranked from most to least impactful. +# This gives a complementary view to global importance — here you see which +# features actively moved predictions during the forecast window. If certain +# features dominate unexpectedly, consider adjusting the pipeline via +# {doc}`custom_pipeline`. + +# %% tags=["hide-input"] +fig = ContributionsPlotter.plot_bar(contributions, top_n=12) +fig.update_layout(title="Mean absolute contribution per feature", height=450) +fig.show() + +# %% [markdown] +# ### Waterfall — single timestep decomposition +# +# The waterfall chart breaks down one specific prediction into its components. +# Starting from the bias (baseline prediction), each feature adds or subtracts +# from the final value. + +# %% tags=["hide-input"] +fig = ContributionsPlotter.plot_waterfall(contributions, timestep=48, top_n=10) +fig.update_layout(title="Prediction decomposition (timestep 48)", height=500) +fig.show() + +# %% [markdown] +# ## Next steps +# +# - {doc}`hyperparameter_tuning_with_optuna` — use explainability insights +# to guide which parameters to tune. +# - {doc}`custom_pipeline` — fine-tune feature engineering based on what +# the contributions reveal. 
diff --git a/examples/tutorials/quantile_calibration.ipynb b/examples/tutorials/quantile_calibration.ipynb new file mode 100644 index 000000000..a772e9db5 --- /dev/null +++ b/examples/tutorials/quantile_calibration.ipynb @@ -0,0 +1,376 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "28ab33c6", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project \n", + "#\n", + "# SPDX-License-Identifier: MPL-2.0\n", + "\n", + "# pyright: basic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d1ac2cd", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "from openstef_core.testing import configure_notebook_display, setup_notebook_logging\n", + "\n", + "configure_notebook_display()\n", + "logger = setup_notebook_logging(\n", + " __name__,\n", + " suppress=(\n", + " \"choreographer\",\n", + " \"kaleido\",\n", + " \"httpx\",\n", + " \"huggingface_hub\",\n", + " \"fsspec\",\n", + " \"filelock\",\n", + " \"openstef_core.datasets\",\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2297aa0d", + "metadata": {}, + "source": [ + "# Quantile Calibration\n", + "\n", + "Improve the reliability of probabilistic forecasts using isotonic quantile\n", + "calibration. 
A well-calibrated P10 quantile should exceed actual values\n", + "roughly 10 % of the time — this tutorial shows how to measure and correct\n", + "deviations.\n", + "\n", + "**What you'll learn:**\n", + "\n", + "- Measure quantile calibration with observed coverage\n", + "- Add isotonic calibration as a postprocessing step\n", + "- Compare before/after calibration on real data\n", + "\n", + "```{note}\n", + "This tutorial uses a small data slice for fast execution.\n", + "See `examples/benchmarks/` for production-scale runs.\n", + "```\n", + "\n", + "**Key API references:**\n", + "[`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html)\n", + "· [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)" + ] + }, + { + "cell_type": "markdown", + "id": "4c98c476", + "metadata": {}, + "source": [ + "## Load data and train an uncalibrated model\n", + "\n", + "We start with the same GBLinear setup as the {doc}`forecasting_quickstart` and\n", + "measure how well its predicted quantiles match observed coverage.\n", + "The [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html)\n", + "defines the model architecture and quantile levels." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf2cde43", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "\n", + "import pandas as pd\n", + "import plotly.graph_objects as go\n", + "\n", + "from openstef_core.testing import load_liander_dataset\n", + "from openstef_core.types import LeadTime, Q\n", + "from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow\n", + "from openstef_models.presets.forecasting_workflow import GBLinearForecaster\n", + "\n", + "dataset = load_liander_dataset()\n", + "\n", + "train_start = datetime.fromisoformat(\"2024-03-01T00:00:00Z\")\n", + "train_end = train_start + timedelta(days=45)\n", + "forecast_end = train_end + timedelta(days=7)\n", + "\n", + "train_dataset = dataset.filter_by_range(start=train_start, end=train_end)\n", + "predict_dataset = dataset.filter_by_range(\n", + " start=train_end - timedelta(days=14),\n", + " end=forecast_end,\n", + ")\n", + "\n", + "quantiles = [Q(0.1), Q(0.5), Q(0.9)]\n", + "\n", + "config = ForecastingWorkflowConfig(\n", + " model_id=\"uncalibrated_gblinear\",\n", + " model=\"gblinear\",\n", + " horizons=[LeadTime.from_string(\"PT36H\")],\n", + " quantiles=quantiles,\n", + " target_column=\"load\",\n", + " temperature_column=\"temperature_2m\",\n", + " relative_humidity_column=\"relative_humidity_2m\",\n", + " wind_speed_column=\"wind_speed_10m\",\n", + " radiation_column=\"shortwave_radiation\",\n", + " pressure_column=\"surface_pressure\",\n", + " verbosity=0,\n", + " mlflow_storage=None,\n", + " gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50),\n", + ")\n", + "\n", + "workflow_uncal = create_forecasting_workflow(config=config)\n", + "workflow_uncal.fit(train_dataset)\n", + "forecast_uncal = workflow_uncal.predict(predict_dataset, forecast_start=train_end)\n", + "\n", + "print(f\"Forecast rows: {len(forecast_uncal.data)}\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "018066a2", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert len(forecast_uncal.data) > 100, f\"Expected >100 forecast rows, got {len(forecast_uncal.data)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "390de36e", + "metadata": {}, + "source": [ + "## Measure calibration quality\n", + "\n", + "For a perfectly calibrated forecast at quantile $p$, the fraction of\n", + "observations falling below the predicted value should equal $p$. We compute\n", + "the **observed coverage** for each quantile and compare it to the expected\n", + "level." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0229c1c", + "metadata": {}, + "outputs": [], + "source": [ + "actuals = predict_dataset.data[\"load\"].loc[train_end:].reindex(forecast_uncal.data.index).dropna()\n", + "forecast_aligned = forecast_uncal.data.loc[actuals.index]\n", + "\n", + "expected = [float(q) for q in quantiles]\n", + "observed_uncal = [float((actuals <= forecast_aligned[f\"quantile_P{int(float(q) * 100)}\"]).mean()) for q in quantiles]\n", + "\n", + "calibration_df = pd.DataFrame({\n", + " \"quantile\": [f\"P{int(float(q) * 100)}\" for q in quantiles],\n", + " \"expected\": expected,\n", + " \"observed\": observed_uncal,\n", + " \"error\": [o - e for o, e in zip(observed_uncal, expected, strict=True)],\n", + "})\n", + "print(\"Calibration before isotonic correction:\")\n", + "print(calibration_df.to_string(index=False))" + ] + }, + { + "cell_type": "markdown", + "id": "c0b8a434", + "metadata": {}, + "source": [ + "## Add isotonic calibration\n", + "\n", + "[`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html) is a postprocessing transform that learns a\n", + "monotonic mapping from predicted quantiles to observed quantile levels.\n", + "During training it fits on the validation split; during prediction 
it\n", + "corrects each quantile value.\n", + "\n", + "We create a second workflow identical to the first, but with the calibrator\n", + "appended to its postprocessing pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcd100f9", + "metadata": {}, + "outputs": [], + "source": [ + "from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator\n", + "\n", + "config_cal = config.model_copy(update={\"model_id\": \"calibrated_gblinear\"})\n", + "workflow_cal = create_forecasting_workflow(config=config_cal)\n", + "\n", + "# Append isotonic calibration to the existing postprocessing pipeline\n", + "workflow_cal.model.postprocessing.transforms.append(\n", + " IsotonicQuantileCalibrator(\n", + " quantiles=quantiles,\n", + " use_local_quantile_estimation=True,\n", + " )\n", + ")\n", + "\n", + "workflow_cal.fit(train_dataset)\n", + "forecast_cal = workflow_cal.predict(predict_dataset, forecast_start=train_end)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7d92a1c", + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "assert len(forecast_cal.data) > 100, f\"Expected >100 calibrated forecast rows, got {len(forecast_cal.data)}\"" + ] + }, + { + "cell_type": "markdown", + "id": "0dc57ac6", + "metadata": {}, + "source": [ + "## Compare calibration before and after" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9b28cbc3", + "metadata": {}, + "outputs": [], + "source": [ + "forecast_cal_aligned = forecast_cal.data.loc[actuals.index]\n", + "\n", + "observed_cal = [float((actuals <= forecast_cal_aligned[f\"quantile_P{int(float(q) * 100)}\"]).mean()) for q in quantiles]\n", + "\n", + "comparison_df = pd.DataFrame({\n", + " \"quantile\": [f\"P{int(float(q) * 100)}\" for q in quantiles],\n", + " \"expected\": expected,\n", + " \"observed (before)\": observed_uncal,\n", + " \"observed (after)\": observed_cal,\n", + " \"error (before)\": [o - e for o, 
e in zip(observed_uncal, expected, strict=True)],\n", + " \"error (after)\": [o - e for o, e in zip(observed_cal, expected, strict=True)],\n", + "})\n", + "print(comparison_df.to_string(index=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aea63869", + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "fig = go.Figure()\n", + "\n", + "fig.add_trace( # pyright: ignore[reportUnknownMemberType]\n", + " go.Scatter(\n", + " x=[0, 1],\n", + " y=[0, 1],\n", + " mode=\"lines\",\n", + " name=\"Perfect calibration\",\n", + " line={\"color\": \"gray\", \"dash\": \"dash\", \"width\": 2},\n", + " )\n", + ")\n", + "\n", + "fig.add_trace( # pyright: ignore[reportUnknownMemberType]\n", + " go.Scatter(\n", + " x=expected,\n", + " y=observed_uncal,\n", + " mode=\"markers+lines\",\n", + " name=\"Before calibration\",\n", + " marker={\"size\": 12, \"color\": \"red\", \"symbol\": \"x\"},\n", + " line={\"color\": \"red\", \"width\": 2, \"dash\": \"dot\"},\n", + " )\n", + ")\n", + "\n", + "fig.add_trace( # pyright: ignore[reportUnknownMemberType]\n", + " go.Scatter(\n", + " x=expected,\n", + " y=observed_cal,\n", + " mode=\"markers+lines\",\n", + " name=\"After calibration\",\n", + " marker={\"size\": 12, \"color\": \"blue\"},\n", + " line={\"color\": \"blue\", \"width\": 2},\n", + " )\n", + ")\n", + "\n", + "fig.update_layout( # pyright: ignore[reportUnknownMemberType]\n", + " title=\"Quantile calibration: expected vs observed coverage\",\n", + " xaxis_title=\"Expected quantile level\",\n", + " yaxis_title=\"Observed coverage\",\n", + " xaxis={\"range\": [0, 1], \"tickvals\": [0, 0.1, 0.5, 0.9, 1]},\n", + " yaxis={\"range\": [0, 1], \"tickvals\": [0, 0.1, 0.5, 0.9, 1]},\n", + " height=500,\n", + " width=600,\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fb2c92a8", + "metadata": {}, + "source": [ + "Points closer to the diagonal indicate better calibration. 
The isotonic\n", + "correction pulls the observed coverage towards the expected level, improving\n", + "the reliability of uncertainty estimates. To measure calibration stability\n", + "over longer time horizons, combine this with a {doc}`backtesting_quickstart`." + ] + }, + { + "cell_type": "markdown", + "id": "187cc8df", + "metadata": {}, + "source": [ + "## Next steps\n", + "\n", + "- {doc}`backtesting_quickstart` — measure calibration consistency over\n", + " realistic operational periods.\n", + "- {doc}`ensemble_forecasting` — apply calibration to ensemble models\n", + " for combined accuracy and reliable uncertainty." + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:percent" + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/tutorials/quantile_calibration.py b/examples/tutorials/quantile_calibration.py new file mode 100644 index 000000000..bf8976e07 --- /dev/null +++ b/examples/tutorials/quantile_calibration.py @@ -0,0 +1,256 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:percent +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.19.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% tags=["remove-cell"] +# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project +# +# SPDX-License-Identifier: MPL-2.0 + +# pyright: basic + +# %% tags=["remove-cell"] +import warnings + +warnings.filterwarnings("ignore") + +from openstef_core.testing import configure_notebook_display, setup_notebook_logging + +configure_notebook_display() +logger = setup_notebook_logging( + __name__, + suppress=( + "choreographer", + "kaleido", + "httpx", + "huggingface_hub", + "fsspec", + "filelock", + "openstef_core.datasets", + ), +) + +# %% [markdown] +# # Quantile Calibration +# +# Improve the reliability of probabilistic forecasts using 
isotonic quantile +# calibration. A well-calibrated P10 quantile should exceed actual values +# roughly 10 % of the time — this tutorial shows how to measure and correct +# deviations. +# +# **What you'll learn:** +# +# - Measure quantile calibration with observed coverage +# - Add isotonic calibration as a postprocessing step +# - Compare before/after calibration on real data +# +# ```{note} +# This tutorial uses a small data slice for fast execution. +# See `examples/benchmarks/` for production-scale runs. +# ``` +# +# **Key API references:** +# [`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html) +# · [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) + +# %% [markdown] +# ## Load data and train an uncalibrated model +# +# We start with the same GBLinear setup as the {doc}`forecasting_quickstart` and +# measure how well its predicted quantiles match observed coverage. +# The [`ForecastingWorkflowConfig`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.presets.ForecastingWorkflowConfig.html) +# defines the model architecture and quantile levels. 
+ +# %% +from datetime import datetime, timedelta + +import pandas as pd +import plotly.graph_objects as go + +from openstef_core.testing import load_liander_dataset +from openstef_core.types import LeadTime, Q +from openstef_models.presets import ForecastingWorkflowConfig, create_forecasting_workflow +from openstef_models.presets.forecasting_workflow import GBLinearForecaster + +dataset = load_liander_dataset() + +train_start = datetime.fromisoformat("2024-03-01T00:00:00Z") +train_end = train_start + timedelta(days=45) +forecast_end = train_end + timedelta(days=7) + +train_dataset = dataset.filter_by_range(start=train_start, end=train_end) +predict_dataset = dataset.filter_by_range( + start=train_end - timedelta(days=14), + end=forecast_end, +) + +quantiles = [Q(0.1), Q(0.5), Q(0.9)] + +config = ForecastingWorkflowConfig( + model_id="uncalibrated_gblinear", + model="gblinear", + horizons=[LeadTime.from_string("PT36H")], + quantiles=quantiles, + target_column="load", + temperature_column="temperature_2m", + relative_humidity_column="relative_humidity_2m", + wind_speed_column="wind_speed_10m", + radiation_column="shortwave_radiation", + pressure_column="surface_pressure", + verbosity=0, + mlflow_storage=None, + gblinear_hyperparams=GBLinearForecaster.HyperParams(n_steps=50), +) + +workflow_uncal = create_forecasting_workflow(config=config) +workflow_uncal.fit(train_dataset) +forecast_uncal = workflow_uncal.predict(predict_dataset, forecast_start=train_end) + +print(f"Forecast rows: {len(forecast_uncal.data)}") + +# %% tags=["remove-cell"] +assert len(forecast_uncal.data) > 100, f"Expected >100 forecast rows, got {len(forecast_uncal.data)}" + +# %% [markdown] +# ## Measure calibration quality +# +# For a perfectly calibrated forecast at quantile $p$, the fraction of +# observations falling below the predicted value should equal $p$. We compute +# the **observed coverage** for each quantile and compare it to the expected +# level. 
+
+# %%
+# Align observed load with the forecast index; timestamps without an observation are dropped.
+actuals = predict_dataset.data["load"].loc[train_end:].reindex(forecast_uncal.data.index).dropna()
+forecast_aligned = forecast_uncal.data.loc[actuals.index]
+# Percent labels use round(), not int(): 0.29 * 100 == 28.999999999999996, which int() would truncate to 28.
+expected = [float(q) for q in quantiles]
+observed_uncal = [float((actuals <= forecast_aligned[f"quantile_P{round(float(q) * 100)}"]).mean()) for q in quantiles]
+calibration_df = pd.DataFrame({
+    "quantile": [f"P{round(float(q) * 100)}" for q in quantiles],
+    "expected": expected,
+    "observed": observed_uncal,
+    "error": [o - e for o, e in zip(observed_uncal, expected, strict=True)],
+})
+print("Calibration before isotonic correction:")
+print(calibration_df.to_string(index=False))
+
+# %% [markdown]
+# ## Add isotonic calibration
+#
+# [`IsotonicQuantileCalibrator`](https://openstef.github.io/openstef/v4/api/generated/openstef_models.transforms.postprocessing.IsotonicQuantileCalibrator.html) is a postprocessing transform that learns a
+# monotonic mapping from predicted quantiles to observed quantile levels.
+# During training it fits on the validation split; during prediction it
+# corrects each quantile value.
+#
+# We create a second workflow identical to the first, but with the calibrator
+# appended to its postprocessing pipeline.
+ +# %% +from openstef_models.transforms.postprocessing import IsotonicQuantileCalibrator + +config_cal = config.model_copy(update={"model_id": "calibrated_gblinear"}) +workflow_cal = create_forecasting_workflow(config=config_cal) + +# Append isotonic calibration to the existing postprocessing pipeline +workflow_cal.model.postprocessing.transforms.append( + IsotonicQuantileCalibrator( + quantiles=quantiles, + use_local_quantile_estimation=True, + ) +) + +workflow_cal.fit(train_dataset) +forecast_cal = workflow_cal.predict(predict_dataset, forecast_start=train_end) + +# %% tags=["remove-cell"] +assert len(forecast_cal.data) > 100, f"Expected >100 calibrated forecast rows, got {len(forecast_cal.data)}" + +# %% [markdown] +# ## Compare calibration before and after + +# %% +forecast_cal_aligned = forecast_cal.data.loc[actuals.index] + +observed_cal = [float((actuals <= forecast_cal_aligned[f"quantile_P{int(float(q) * 100)}"]).mean()) for q in quantiles] + +comparison_df = pd.DataFrame({ + "quantile": [f"P{int(float(q) * 100)}" for q in quantiles], + "expected": expected, + "observed (before)": observed_uncal, + "observed (after)": observed_cal, + "error (before)": [o - e for o, e in zip(observed_uncal, expected, strict=True)], + "error (after)": [o - e for o, e in zip(observed_cal, expected, strict=True)], +}) +print(comparison_df.to_string(index=False)) + +# %% tags=["hide-input"] +fig = go.Figure() + +fig.add_trace( # pyright: ignore[reportUnknownMemberType] + go.Scatter( + x=[0, 1], + y=[0, 1], + mode="lines", + name="Perfect calibration", + line={"color": "gray", "dash": "dash", "width": 2}, + ) +) + +fig.add_trace( # pyright: ignore[reportUnknownMemberType] + go.Scatter( + x=expected, + y=observed_uncal, + mode="markers+lines", + name="Before calibration", + marker={"size": 12, "color": "red", "symbol": "x"}, + line={"color": "red", "width": 2, "dash": "dot"}, + ) +) + +fig.add_trace( # pyright: ignore[reportUnknownMemberType] + go.Scatter( + x=expected, + 
y=observed_cal, + mode="markers+lines", + name="After calibration", + marker={"size": 12, "color": "blue"}, + line={"color": "blue", "width": 2}, + ) +) + +fig.update_layout( # pyright: ignore[reportUnknownMemberType] + title="Quantile calibration: expected vs observed coverage", + xaxis_title="Expected quantile level", + yaxis_title="Observed coverage", + xaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]}, + yaxis={"range": [0, 1], "tickvals": [0, 0.1, 0.5, 0.9, 1]}, + height=500, + width=600, +) +fig.show() + +# %% [markdown] +# Points closer to the diagonal indicate better calibration. The isotonic +# correction pulls the observed coverage towards the expected level, improving +# the reliability of uncertainty estimates. To measure calibration stability +# over longer time horizons, combine this with a {doc}`backtesting_quickstart`. + +# %% [markdown] +# ## Next steps +# +# - {doc}`backtesting_quickstart` — measure calibration consistency over +# realistic operational periods. +# - {doc}`ensemble_forecasting` — apply calibration to ensemble models +# for combined accuracy and reliable uncertainty. diff --git a/packages/openstef-core/src/openstef_core/testing.py b/packages/openstef-core/src/openstef_core/testing.py index 3c8c4e8b2..30109c167 100644 --- a/packages/openstef-core/src/openstef_core/testing.py +++ b/packages/openstef-core/src/openstef_core/testing.py @@ -8,10 +8,11 @@ DataFrames and Series with equality semantics. 
""" +import logging from collections.abc import Sequence from datetime import datetime, timedelta from pathlib import Path -from typing import TYPE_CHECKING, Any, override +from typing import Any, override import numpy as np import pandas as pd @@ -19,9 +20,6 @@ from openstef_core.constants import LIANDER_DATASET_REPO_ID from openstef_core.datasets import TimeSeriesDataset, VersionedTimeSeriesDataset -if TYPE_CHECKING: - import logging - class IsSamePandas: """Utility class to allow comparison of pandas DataFrames in assertion / calls.""" @@ -185,6 +183,7 @@ def load_liander_dataset( """ try: from huggingface_hub import hf_hub_download # pyright: ignore[reportUnknownVariableType] # noqa: PLC0415 + from huggingface_hub.utils import logging as hf_logging # noqa: PLC0415 except ImportError: msg = "huggingface-hub is required for benchmark datasets: pip install openstef-core[benchmark]" raise ImportError(msg) from None @@ -197,13 +196,14 @@ def load_liander_dataset( *(extra_files or []), ] + # Suppress HF Hub noise (unauthenticated requests warning, progress bars) + hf_logging.set_verbosity_error() for filename in files_to_download: hf_hub_download( # pyright: ignore[reportCallIssue] repo_id=repo_id, filename=filename, repo_type="dataset", local_dir=local_dir, - local_dir_use_symlinks=False, ) datasets = [VersionedTimeSeriesDataset.read_parquet(local_dir / f) for f in files_to_download] @@ -238,33 +238,46 @@ def configure_notebook_display(renderer: str = "png") -> None: pio.renderers.default = renderer -_DEFAULT_NOISY_LOGGERS: tuple[str, ...] = ("choreographer", "kaleido") +_DEFAULT_NOISY_LOGGERS: tuple[str, ...] = ( + "choreographer", + "kaleido", + "huggingface_hub", + "huggingface_hub.utils._http", + "openstef_core.datasets.timeseries_dataset", +) def setup_notebook_logging( name: str | None = None, suppress: Sequence[str] | None = None, -) -> "logging.Logger": +) -> logging.Logger: """Configure logging for tutorial notebooks and return a named logger. 
Sets the root logger to INFO level and silences the loggers in *suppress* - by raising their level to ERROR and disabling them entirely. + by raising their level to ERROR and disabling propagation. Child loggers + sharing a prefix are also silenced. Args: name: Logger name, typically ``__name__`` of the calling module. suppress: Sequence of logger names to silence. Defaults to - ``("choreographer", "kaleido")``. + ``_DEFAULT_NOISY_LOGGERS``. Returns: Configured Logger instance. """ - import logging # noqa: PLC0415 - noisy = suppress if suppress is not None else _DEFAULT_NOISY_LOGGERS logging.basicConfig(level=logging.INFO, format="[%(asctime)s][%(levelname)s] %(message)s") for logger_name in noisy: - logging.getLogger(logger_name).setLevel(logging.ERROR) - logging.getLogger(logger_name).disabled = True + lgr = logging.getLogger(logger_name) + lgr.setLevel(logging.ERROR) + lgr.propagate = False + # Also silence any existing child loggers + prefix = logger_name + "." + for key in logging.Logger.manager.loggerDict: + if key.startswith(prefix): + child = logging.getLogger(key) + child.setLevel(logging.ERROR) + child.propagate = False return logging.getLogger(name) diff --git a/pyproject.toml b/pyproject.toml index bdb01fff2..e0e52efbd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,16 +127,33 @@ lint.ignore = [ "TC006", # Let's not force quoting the first param of typing.cast "TRY003", # simplify exception messages ] -lint.per-file-ignores."./examples/tutorials/*" = [ - "D100", # Notebooks don't need module docstrings +lint.per-file-ignores."./examples/benchmarks/custom/*" = [ "E402", # Imports not at top — notebook cells have natural ordering - "E501", # Long lines in notebooks — readability is cell-scoped + "E501", # Long lines in markdown cells (API doc URLs) + "ERA001", # Jupytext YAML frontmatter looks like commented-out code + "T201", # Benchmarks may use print for demonstration +] +lint.per-file-ignores."./examples/benchmarks/liander2024/*" = [ + 
"D100", # Module docstring is in a remove-cell for clean notebook rendering + "E402", # Imports not at top — os.environ must be set before imports + "E501", # Long lines in markdown cells (API doc URLs) "ERA001", # Jupytext YAML frontmatter looks like commented-out code - "F821", # Cell-scoped names appear undefined to Ruff's module-level analysis - "INP001", # Not a namespace package — notebooks don't need __init__.py - "PTH", # Tutorials may use os.path for simplicity - "S101", # Tutorials may use assert for runtime checks - "T201", # Tutorials may use print for demonstration + "T201", # Benchmarks may use print for demonstration +] +lint.per-file-ignores."./examples/tutorials/*" = [ + "D100", # Notebooks don't need module docstrings + "D103", # Inline helpers in notebooks don't need docstrings + "E402", # Imports not at top — notebook cells have natural ordering + "E501", # Long lines in notebooks — readability is cell-scoped + "ERA001", # Jupytext YAML frontmatter looks like commented-out code + "F821", # Cell-scoped names appear undefined to Ruff's module-level analysis + "INP001", # Not a namespace package — notebooks don't need __init__.py + "PLR2004", # Magic values in assertions are fine for notebook checks + "PT018", # Assertion style doesn't matter in notebooks + "PTH", # Tutorials may use os.path for simplicity + "S101", # Tutorials may use assert for runtime checks + "SLF001", # Private member access needed to demonstrate internals + "T201", # Tutorials may use print for demonstration ] lint.per-file-ignores."./packages/*/tests/*" = [ "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... 
@@ -285,17 +302,24 @@ cmd = "pytest --numprocesses=auto --doctest-modules packages/*/src --maxfail=1" [tool.poe.tasks.notebooks] help = "Sync jupytext .py percent sources → .ipynb notebooks (and vice versa)" -cmd = "jupytext --sync examples/tutorials/*.py" +sequence = [ + { cmd = "jupytext --sync examples/tutorials/*.py" }, + { cmd = "jupytext --sync examples/benchmarks/custom/*.py examples/benchmarks/liander2024/*.py" }, +] [tool.poe.tasks.notebooks-clear] help = "Strip outputs from all .ipynb notebooks" -cmd = "jupyter nbconvert --clear-output --inplace examples/tutorials/*.ipynb" +sequence = [ + { cmd = "jupyter nbconvert --clear-output --inplace examples/tutorials/*.ipynb" }, + { cmd = "jupyter nbconvert --clear-output --inplace examples/benchmarks/custom/*.ipynb examples/benchmarks/liander2024/*.ipynb" }, +] [tool.poe.tasks.notebooks-check] help = "Check that .ipynb notebooks are in sync with their jupytext .py sources and have no outputs" sequence = [ { cmd = "jupytext --sync examples/tutorials/*.py" }, - { cmd = "git diff --exit-code -- examples/tutorials/" }, + { cmd = "jupytext --sync examples/benchmarks/custom/*.py examples/benchmarks/liander2024/*.py" }, + { cmd = "git diff --exit-code -- examples/tutorials/ examples/benchmarks/" }, { script = "tools.check_notebook_outputs:main" }, ] @@ -386,8 +410,8 @@ sequence = [ ] [tool.poe.tasks._docs_sync] -help = "Sync tutorial sources into docs/source for Sphinx" -cmd = "python -c \"import shutil; shutil.rmtree('docs/source/tutorials', ignore_errors=True); shutil.copytree('examples/tutorials', 'docs/source/tutorials')\"" +help = "Sync tutorial and benchmark sources into docs/source for Sphinx" +cmd = "python -c \"import shutil; shutil.rmtree('docs/source/tutorials', ignore_errors=True); shutil.copytree('examples/tutorials', 'docs/source/tutorials'); shutil.rmtree('docs/source/benchmarks', ignore_errors=True); shutil.copytree('examples/benchmarks', 'docs/source/benchmarks')\"" [tool.poe.tasks.docs] help = "Build 
the documentation" diff --git a/sonar-project.properties b/sonar-project.properties index 28abab60f..5361f74cb 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -2,10 +2,15 @@ # # SPDX-License-Identifier: MPL-2.0 -# Exclude generated Jupyter notebook files from copy-paste detection. -# Each .ipynb is generated from its paired .py jupytext source of truth, -# so SonarCloud reports their content as duplication — it is a false positive. -# The paired .py jupytext source files share the same boilerplate dataset setup, -# so they are also excluded for the same reason. -# Test files are excluded because fixture and helper duplication is expected and intentional. -sonar.cpd.exclusions=**/*.ipynb,examples/tutorials/*.py,**/tests/**/*.py +# Exclude generated Jupyter notebooks and example scripts from SonarQube analysis. +# .ipynb files are generated from paired .py jupytext sources — analysing both is redundant. +# Tutorial and benchmark .py files are interactive notebook scripts, not library code; +# SonarQube flags patterns like user-toggles (USE_MLFLOW_STORAGE = False; if USE_MLFLOW_STORAGE) +# as "constant expressions", which are false positives for configurable notebook cells. +sonar.exclusions=**/*.ipynb,examples/tutorials/*.py,examples/benchmarks/**/*.py,docs/source/tutorials/**,docs/source/benchmarks/** + +# Target Python version for precise analysis. +sonar.python.version=3.12 + +# Exclude expected duplication in tests (fixtures, helpers, parametrized patterns). 
+sonar.cpd.exclusions=**/tests/**/*.py diff --git a/tools/check_notebook_outputs.py b/tools/check_notebook_outputs.py index 37f039822..1a75bba91 100644 --- a/tools/check_notebook_outputs.py +++ b/tools/check_notebook_outputs.py @@ -8,25 +8,29 @@ from pathlib import Path TUTORIALS_DIR = Path("examples/tutorials") +BENCHMARKS_DIRS = [Path("examples/benchmarks/custom"), Path("examples/benchmarks/liander2024")] def main() -> None: - """Validate that no .ipynb in tutorials has stored outputs.""" + """Validate that no .ipynb in tutorials or benchmarks has stored outputs.""" failures: list[str] = [] + checked = 0 - for nb_path in sorted(TUTORIALS_DIR.glob("*.ipynb")): - nb = json.loads(nb_path.read_text(encoding="utf-8")) - for i, cell in enumerate(nb.get("cells", [])): - if cell.get("cell_type") == "code" and cell.get("outputs"): - failures.append(f" {nb_path.name}: cell {i} has outputs") - break + for search_dir in [TUTORIALS_DIR, *BENCHMARKS_DIRS]: + for nb_path in sorted(search_dir.glob("*.ipynb")): + checked += 1 + nb = json.loads(nb_path.read_text(encoding="utf-8")) + for i, cell in enumerate(nb.get("cells", [])): + if cell.get("cell_type") == "code" and cell.get("outputs"): + failures.append(f" {nb_path.relative_to('.')}: cell {i} has outputs") + break if failures: print("ERROR: Notebooks with outputs found (run `poe notebooks-clear`):") print("\n".join(failures)) sys.exit(1) - print(f"OK: {len(list(TUTORIALS_DIR.glob('*.ipynb')))} notebooks checked, no outputs found") + print(f"OK: {checked} notebooks checked, no outputs found") if __name__ == "__main__":