From 28d4dd9ab5604909bfd84ca845c0e448d20ce8f4 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:11:17 +0200 Subject: [PATCH 01/11] add forecast datasets to atlite - restructure cutout creation - add new datasets --- atlite/convert.py | 11 +- atlite/cutout.py | 117 +- atlite/data.py | 40 +- atlite/datasets/__init__.py | 12 +- atlite/datasets/era5.py | 187 ++- atlite/datasets/gebco.py | 97 ++ atlite/datasets/icon.py | 1309 ++++++++++++++++++++ atlite/datasets/icon_d2.py | 1171 +++++++++++++++++ atlite/datasets/icon_eu.py | 1171 +++++++++++++++++ atlite/datasets/meteo_forecast.py | 990 +++++++++++++++ atlite/datasets/meteo_historic.py | 966 +++++++++++++++ atlite/datasets/meteo_historic_forecast.py | 964 ++++++++++++++ atlite/datasets/sarah.py | 96 ++ 13 files changed, 7064 insertions(+), 67 deletions(-) create mode 100644 atlite/datasets/icon.py create mode 100644 atlite/datasets/icon_d2.py create mode 100644 atlite/datasets/icon_eu.py create mode 100644 atlite/datasets/meteo_forecast.py create mode 100644 atlite/datasets/meteo_historic.py create mode 100644 atlite/datasets/meteo_historic_forecast.py diff --git a/atlite/convert.py b/atlite/convert.py index 96a2974e..a899ed9d 100644 --- a/atlite/convert.py +++ b/atlite/convert.py @@ -148,7 +148,7 @@ def convert_and_aggregate( if matrix is not None: if shapes is not None: raise ValueError( - "Passing matrix and shapes is ambiguous. Pass only one of them." + "Passing matrix and shapes is ambiguous. Pass " "only one of them." ) if isinstance(matrix, xr.DataArray): @@ -400,7 +400,6 @@ def heat_demand(cutout, threshold=15.0, a=1.0, constant=0.0, hour_shift=0.0, **p **params, ) - # cooling demand def convert_cooling_demand(ds, threshold, a, constant, hour_shift): # Temperature is in Kelvin; take daily average @@ -817,7 +816,7 @@ def pv(cutout, panel, orientation, tracking=None, clearsky_model=None, **params) Eurosun (ISES Europe Solar Congress). 
""" - if isinstance(panel, (str | Path)): + if isinstance(panel, (str, Path)): panel = get_solarpanelconfig(panel) if not callable(orientation): orientation = get_orientation(orientation) @@ -906,7 +905,7 @@ def csp(cutout, installation, technology=None, **params): URL: https://www.dlr.de/sf/en/desktopdefault.aspx/tabid-11126/19467_read-48251/ """ - if isinstance(installation, (str | Path)): + if isinstance(installation, (str, Path)): installation = get_cspinstallationconfig(installation) # Overwrite technology @@ -1038,7 +1037,9 @@ def hydro( # The hydrological parameters are in units of "m of water per day" and so # they should be multiplied by 1000 and the basin area to convert to m3 # d-1 = m3 h-1 / 24 - runoff *= xr.DataArray(basins.shapes.to_crs(dict(proj="cea")).area) + runoff *= (1000.0 / 24.0) * xr.DataArray( + basins.shapes.to_crs(dict(proj="cea")).area + ) return hydrom.shift_and_aggregate_runoff_for_plants( basins, runoff, flowspeed, show_progress diff --git a/atlite/cutout.py b/atlite/cutout.py index 5dceec1f..7f5bbf8c 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -66,10 +66,10 @@ class Cutout: This class builds the starting point for most atlite functionalities. """ - + def __init__(self, path, **cutoutparams): """ - Provide an atlite cutout object. + Provide an Atlite cutout object. Create a cutout object to use atlite operations on it. Based on the provided parameters, atlite first checks whether this cutout already @@ -112,9 +112,17 @@ def __init__(self, path, **cutoutparams): dt : str, optional Frequency of the time coordinate. The default is 'h'. Valid are all pandas offset aliases. + interp_s : string, optional + Interpolation method for spatial interpolation. + The default is 'linear'. Valid are all xarray interpolation + aliases (such as: 'quadratic', 'cubic',...) + interp_t : string, optional + Interpolation method for temporal interpolation. + The default is 'linear'. 
Valid are all xarray interpolation + aliases (such as: 'quadratic', 'cubic',...) chunks : dict - Chunks when opening NetCDF files. For cutout preparation it is recommended - to chunk only along the time dimension. Defaults to {'time': 100} + Chunks when opening netcdf files. + Defaults to {'time': 100, 'y': 100, 'x': 100} data : xr.Dataset User provided cutout data. Save the cutout using `Cutout.to_file()` afterwards. @@ -132,15 +140,15 @@ def __init__(self, path, **cutoutparams): sarah data which has missing data for areas where dawn and nightfall happens (ca. 30 min gap). gebco_path: str - Path to find the gebco NetCDF file. Only necessary when including + Path to find the gebco netcdf file. Only necessary when including the gebco module. parallel : bool, default False Whether to open dataset in parallel mode. Take effect for all xr.open_mfdataset usages. - """ + path = Path(path).with_suffix(".nc") - chunks = cutoutparams.pop("chunks", {"time": 100}) + chunks = cutoutparams.pop("chunks", {"time": 100, "y": 100, "x": 100}) if isinstance(chunks, dict): storable_chunks = {f"chunksize_{k}": v for k, v in (chunks or {}).items()} else: @@ -154,7 +162,7 @@ def __init__(self, path, **cutoutparams): data.attrs.update(storable_chunks) if cutoutparams: warn( - f"Arguments {', '.join(cutoutparams)} are ignored, since " + f'Arguments {", ".join(cutoutparams)} are ignored, since ' "cutout is already built." 
) elif "data" in cutoutparams: @@ -170,6 +178,14 @@ def __init__(self, path, **cutoutparams): x = cutoutparams.pop("x") y = cutoutparams.pop("y") time = cutoutparams.pop("time") + + dx = cutoutparams.pop("dx", 0.25) + dy = cutoutparams.pop("dy", 0.25) + dt = cutoutparams.pop("dt", "1h") + + interp_s = cutoutparams.pop("interp_s", "linear") + interp_t = cutoutparams.pop("interp_t", "linear") + module = cutoutparams.pop("module") except KeyError as exc: raise TypeError( @@ -178,12 +194,79 @@ def __init__(self, path, **cutoutparams): "passed via argument 'bounds' or 'x' and 'y'." ) from exc - # TODO: check for dx, dy, x, y fine with module requirements - coords = get_coords(x, y, time, **cutoutparams) + # Convert different time inputs to a valid time slice with + # pd.Timestamps as data + + # convert string or timestamp to slice + if isinstance(time, str) or isinstance(time, pd.Timestamp): + time = pd.Timestamp(time) + # Create a time slice using pandas datetime + time = slice(pd.Timestamp(f'{time.year}-{time.month}-{time.day} {time.hour}:{time.minute}'), + pd.Timestamp(f'{time.year}-12-31 23:00'), + pd.Timedelta(dt)) + + # convert list of timestamps to slice + if isinstance(time, list): + freq = pd.Timedelta(pd.infer_freq(time)) + if freq is not pd.Timedelta(None): + time = slice(time[0], time[-1], freq) + else: + time = slice(time[0], time[-1], pd.Timedelta(dt)) + + # check if time slices has a timestep (dt) information + if isinstance(time, slice): + if (time.step is None) or (time.step is pd.Timedelta(None)): + time = slice(pd.Timestamp(time.start), pd.Timestamp(time.stop), pd.Timedelta(dt)) + else: + time = slice(pd.Timestamp(time.start), pd.Timestamp(time.stop), pd.Timedelta(time.step)) + + # check if time slices is valid, assume if start == stop a whole + # the data till the end of the year is requested. 
+ if time.start == time.stop: + if time.start < pd.Timestamp(f'{time.start.year}-12-31 23:00'): + time = slice(time.start, + pd.Timestamp(f'{time.start.year}-12-31 23:00'), + time.step) + else: + time = slice(time.start, time.start + 2*time.step, time.step) + + if x.step is None: + x = slice(x.start, x.stop, dx) + + if y.step is None: + y = slice(y.start, y.stop, dy) + + # TODO: check for dx, dy, x, y fine with module requirements + # A check for module requirements is added, in case multiple modules + # are used the first module requirements are considered + # Nevertheless this should be done differently and variable to consider + # cases that for example combine forecast with historic data + # In additiona a flag if parallel calculations are possible is included + time_now = pd.Timestamp.utcnow().replace(tzinfo=None).floor("h") + + if isinstance(module, list): + logger.info(f"Module requirements are set for the first module {module[0]}.") + x, y, time, parallel = datamodules[module[0]]._checkModuleRequirements(x, y, time, time_now) + else: + logger.info(f"Module requirements for module {module} are set.") + x, y, time, parallel = datamodules[module]._checkModuleRequirements(x, y, time, time_now) + + # In get coords forecast times up to 4 weeks are included + coords = get_coords(x, y, time, x.step, y.step, time.step, **cutoutparams) + + # additional attributes interpolation and parallel computing are included attrs = { "module": module, "prepared_features": [], + "tz": f'{time.start.tz}', + "dx": f'{x.step}', + "dy": f'{y.step}', + "dt": f'{time.step}', + "interp_s":f'{interp_s}', + "interp_t":f'{interp_t}', + "parallel":parallel, + "init_time": f'{time_now.strftime("%Y-%m-%d %H:%M:%S")}', **storable_chunks, **cutoutparams, } @@ -196,6 +279,7 @@ def __init__(self, path, **cutoutparams): self.path = path self.data = data + @property def name(self): @@ -441,7 +525,7 @@ def merge(self, other, path=None, **kwargs): def to_file(self, fn=None): """ - Save cutout to a 
NetCDF file. + Save cutout to a netcdf file. Parameters ---------- @@ -577,15 +661,6 @@ def uniform_density_layout(self, capacity_density, crs=None): """ return capacity_density * self.area(crs) - def equals(self, other): - """ - It overrides xarray.Dataset.equals and ignores the path attribute in the comparison - """ - if not isinstance(other, Cutout): - return NotImplemented - # Compare cutouts data attributes - return self.data.equals(other.data) - def layout_from_capacity_list(self, data, col="Capacity"): """ Get a capacity layout aligned to the cutout based on a capacity list. @@ -645,8 +720,6 @@ def layout_from_capacity_list(self, data, col="Capacity"): convert_and_aggregate = convert_and_aggregate - cooling_demand = cooling_demand - heat_demand = heat_demand temperature = temperature diff --git a/atlite/data.py b/atlite/data.py index b027509e..67a3a03e 100644 --- a/atlite/data.py +++ b/atlite/data.py @@ -41,19 +41,33 @@ def get_features( lock = SerializableLock() datasets = [] get_data = datamodules[module].get_data - - for feature in features: - feature_data = delayed(get_data)( - cutout, - feature, - tmpdir=tmpdir, - lock=lock, - monthly_requests=monthly_requests, - concurrent_requests=concurrent_requests, - **parameters, - ) - datasets.append(feature_data) - + + if cutout.data.attrs['parallel']: + for feature in features: + feature_data = delayed(get_data)( + cutout, + feature, + tmpdir=tmpdir, + lock=lock, + monthly_requests=monthly_requests, + concurrent_requests=concurrent_requests, + **parameters, + ) + datasets.append(feature_data) + + else: + for feature in features: + feature_data = get_data( + cutout, + feature, + tmpdir=tmpdir, + lock=lock, + monthly_requests=monthly_requests, + concurrent_requests=concurrent_requests, + **parameters, + ) + datasets.append(feature_data) + datasets = compute(*datasets) ds = xr.merge(datasets, compat="equals") diff --git a/atlite/datasets/__init__.py b/atlite/datasets/__init__.py index 045c59d8..cbd951ba 100644 --- 
a/atlite/datasets/__init__.py +++ b/atlite/datasets/__init__.py @@ -6,6 +6,14 @@ atlite datasets. """ -from atlite.datasets import era5, gebco, sarah +from atlite.datasets import era5, gebco, sarah, meteo_forecast, meteo_historic_forecast, meteo_historic, icon_d2, icon_eu, icon -modules = {"era5": era5, "sarah": sarah, "gebco": gebco} +modules = {"era5": era5, + "sarah": sarah, + "gebco": gebco, + "meteo_forecast": meteo_forecast, + "meteo_historic_forecast": meteo_historic_forecast, + "meteo_historic": meteo_historic, + "icon_d2": icon_d2, + "icon_eu": icon_eu, + "icon": icon} diff --git a/atlite/datasets/era5.py b/atlite/datasets/era5.py index 2b5d547c..dbdbb02e 100644 --- a/atlite/datasets/era5.py +++ b/atlite/datasets/era5.py @@ -10,6 +10,8 @@ import logging import os +import io +import zipfile import warnings import weakref from tempfile import mkstemp @@ -39,6 +41,10 @@ def nullcontext(): logger = logging.getLogger(__name__) +# Set url for data download, this allows to switch to different data +# sources more easily. +era5_url = 'https://cds.climate.copernicus.eu/api' + # Model and CRS Settings crs = 4326 @@ -59,6 +65,101 @@ def nullcontext(): static_features = {"height"} +requirements = {'x': slice(-90, 90, 0.25), + 'y': slice(-90, 90, 0.25), + 'offset': (pd.Timestamp('1940-01-01')-pd.Timestamp.utcnow().replace(tzinfo=None).floor("h")), + 'forecast': pd.Timedelta(hours=-5*24), + 'dt': pd.Timedelta(hours=1), + 'parallel': True, + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. + y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. 
+ """ + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be < {feasible_end}.") + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + + # Ensure time_stop is greater than time_start + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + def _add_height(ds): """ @@ -106,6 +207,7 @@ def get_data_wind(retrieval_params): Get wind data for given retrieval parameters. """ ds = retrieve_data( + url=era5_url, variable=[ "10m_u_component_of_wind", "10m_v_component_of_wind", @@ -148,6 +250,7 @@ def get_data_influx(retrieval_params): Get influx data for given retrieval parameters. 
""" ds = retrieve_data( + url=era5_url, variable=[ "surface_net_solar_radiation", "surface_solar_radiation_downwards", @@ -204,6 +307,7 @@ def get_data_temperature(retrieval_params): Get wind temperature for given retrieval parameters. """ ds = retrieve_data( + url=era5_url, variable=[ "2m_temperature", "soil_temperature_level_4", @@ -228,7 +332,10 @@ def get_data_runoff(retrieval_params): """ Get runoff data for given retrieval parameters. """ - ds = retrieve_data(variable=["runoff"], **retrieval_params) + ds = retrieve_data( + url=era5_url, + variable=["runoff"], + **retrieval_params) ds = _rename_and_clean_coords(ds) ds = ds.rename({"ro": "runoff"}) @@ -248,7 +355,10 @@ def get_data_height(retrieval_params): """ Get height data for given retrieval parameters. """ - ds = retrieve_data(variable="geopotential", **retrieval_params) + ds = retrieve_data( + url=era5_url, + variable=["geopotential"], + **retrieval_params) ds = _rename_and_clean_coords(ds) ds = _add_height(ds) @@ -290,9 +400,9 @@ def retrieval_times(coords, static=False, monthly_requests=False): time = coords["time"].to_index() if static: return { - "year": str(time[0].year), - "month": str(time[0].month), - "day": str(time[0].day), + "year": [str(time[0].year)], + "month": [str(time[0].month).zfill(2)], + "day": [str(time[0].day).zfill(2)], "time": time[0].strftime("%H:00"), } @@ -304,17 +414,17 @@ def retrieval_times(coords, static=False, monthly_requests=False): for month in t.month.unique(): query = { "year": str(year), - "month": str(month), - "day": list(t[t.month == month].day.unique()), - "time": [f"{h:02d}:00" for h in t[t.month == month].hour.unique()], + "month": [str(month).zfill(2)], + "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], } times.append(query) else: query = { - "year": str(year), - "month": list(t.month.unique()), - "day": list(t.day.unique()), - "time": [f"{h:02d}:00" for h in 
t.hour.unique()], + "year": [str(year)], + "month": list(t.month.unique().astype(str).str.zfill(2)), + "day": list(t.day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t.hour.unique()], } times.append(query) return times @@ -331,22 +441,27 @@ def noisy_unlink(path): logger.error(f"Unable to delete file {path}, as it is still in use.") -def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates): +def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): """ Download data like ERA5 from the Climate Data Store (CDS). If you want to track the state of your request go to https://cds-beta.climate.copernicus.eu/requests?tab=all - """ - request = {"product_type": "reanalysis", "format": "netcdf"} + """ + request = {"product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip"} + request.update(updates) - assert {"year", "month", "variable"}.issubset(request), ( - "Need to specify at least 'variable', 'year' and 'month'" - ) + assert {"year", "month", "variable"}.issubset( + request + ), "Need to specify at least 'variable', 'year' and 'month'" client = cdsapi.Client( - info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level + url = url, + info_callback=logger.debug, + debug=logging.DEBUG >= logging.root.level ) result = client.retrieve(product, request) @@ -354,7 +469,7 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates): lock = nullcontext() with lock: - fd, target = mkstemp(suffix=".nc", dir=tmpdir) + fd, target_zip = mkstemp(suffix=".zip", dir=tmpdir) os.close(fd) # Inform user about data being downloaded as "* variable (year-month)" @@ -362,12 +477,34 @@ def retrieve_data(product, chunks=None, tmpdir=None, lock=None, **updates): variables = atleast_1d(request["variable"]) varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) logger.info(f"CDS: Downloading variables\n\t{varstr}\n") - result.download(target) - - ds = 
xr.open_dataset(target, chunks=chunks or {}) + result.download(target_zip) + + # Open the .zip file in memory + with zipfile.ZipFile(target_zip, "r") as zf: + # Identify .nc files inside the .zip + nc_files = [name for name in zf.namelist() if name.endswith(".nc")] + + if not nc_files: + raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") + + if len(nc_files) == 1: + # If there's only one .nc file, read it into memory + with zf.open(nc_files[0]) as nc_file: + # Pass the in-memory file-like object to Xarray + ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) + + else: + # If multiple .nc files, combine them using Xarray + datasets = [] + for nc_file in nc_files: + with zf.open(nc_file) as file: + datasets.append(xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {})) + # Combine datasets along temporal dimension + ds = xr.merge(datasets) + if tmpdir is None: - logger.debug(f"Adding finalizer for {target}") - weakref.finalize(ds._file_obj._manager, noisy_unlink, target) + logging.debug(f"Adding finalizer for {target_zip}") + weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) return ds diff --git a/atlite/datasets/gebco.py b/atlite/datasets/gebco.py index 948e862c..90c590bd 100755 --- a/atlite/datasets/gebco.py +++ b/atlite/datasets/gebco.py @@ -11,6 +11,7 @@ import rasterio as rio import xarray as xr +import pandas as pd from pandas import to_numeric from rasterio.warp import Resampling @@ -20,6 +21,102 @@ features = {"height": ["height"]} +requirements = {'x': slice(-90, 90, 0.15), + 'y': slice(-90, 90, 0.15), + 'offset': pd.Timestamp('1940-01-01'), + 'forecast': pd.Timestamp('2050-01-01'), + 'dt': pd.Timedelta(hours=1), + 'parallel': True, + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. 
+ y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. + """ + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be >= {feasible_start}.") + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + + # Ensure time_stop is greater than time_start + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. 
Must be <= {feasible_end}.") + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + + def get_data_gebco_height(xs, ys, gebco_path): x, X = xs.data[[0, -1]] y, Y = ys.data[[0, -1]] diff --git a/atlite/datasets/icon.py b/atlite/datasets/icon.py new file mode 100644 index 00000000..9c3b6475 --- /dev/null +++ b/atlite/datasets/icon.py @@ -0,0 +1,1309 @@ +# -*- coding: utf-8 -*- + +# SPDX-FileCopyrightText: 2016-2021 The Atlite Authors +# +# SPDX-License-Identifier: GPL-3.0-or-later + +""" +Module for downloading and curating data from DWD ICON dataset (via ODS). 
+ +For further reference see: +https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html +""" + +import os +import warnings +import numpy as np +import xarray as xr +import pandas as pd +import requests +import logging + +from cdo import Cdo +from pathlib import Path +from retry import retry +from bz2 import decompress +from bs4 import BeautifulSoup + +from ..gis import maybe_swap_spatial_dims +from ..pv.solar_position import SolarPosition + +# Null context for running a with statement without any context +try: + from contextlib import nullcontext +except ImportError: + # for Python versions < 3.7: + import contextlib + + @contextlib.contextmanager + def nullcontext(): + yield + + +logger = logging.getLogger(__name__) + +# Local Resource Data Folder +RESOURCE_DIRECTORY = Path(__file__).parent.parent / "resources" +GRID_DIRECTORY = RESOURCE_DIRECTORY / "grid" + +# Local Data Grid Folder +dwd_icon_grid = "icon_grid_0026_R03B07_G" +dwd_icon_grid_description_folder = "ICON_GLOBAL2WORLD_0125_EASY" + +# General Open Data DWD URL +# URL for accessing Open Data from DWD (Deutscher Wetterdienst) +dwd_url = "https://opendata.dwd.de/weather/nwp/icon/grib/" +dwd_grid_url = f"https://opendata.dwd.de/weather/lib/cdo/{dwd_icon_grid}.nc.bz2" +dwd_grid_description_url = "https://opendata.dwd.de/weather/lib/cdo/{dwd_icon_grid_description_folder}.tar.bz2" + +# ICON model runs are available at fixed intervals: 00, 06, 12, 18 UTC +model_run_hours = np.array([0, 6, 12, 18]) + +# Averaging window of different model runs +averaging_window = 24 #hours + +# Coordinate Reference System (CRS) used for geospatial data +crs = 4326 + +# Dictionary defining available meteorological features and their associated data fields +features = { + "height": ["height"], # Elevation data + "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, direction, and surface roughness + "influx": [ + "influx_toa", # Top-of-atmosphere solar radiation 
+ "influx_direct", # Direct solar radiation + "influx_diffuse", # Diffuse solar radiation + "albedo", # Surface reflectivity + "solar_altitude", # Solar altitude angle + "solar_azimuth", # Solar azimuth angle + ], + "temperature": ["temperature", "soil temperature"], # Air and soil temperature + "runoff": ["runoff"], # Surface water runoff +} + +# Features that remain constant over time +static_features = {"height"} + +# Model requirements specifying spatial and temporal constraints +requirements = { + 'x': slice(-180, 180, 0.125), # Longitude range with resolution + 'y': slice(-90, 90, 0.125), # Latitude range with resolution + 'offset': pd.Timedelta(hours=-18), # Time offset for forecast initialization + 'forecast': pd.Timedelta(hours=180), # Maximum forecast range + 'dt': pd.Timedelta(hours=1), # Temporal resolution of data + 'parallel': False, # Flag for enabling parallel processing + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. + y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. 
+ """ + + # Download reference grid to allow regular lat lon conversion + _download_reference_grid(grid=dwd_icon_grid, + model='ICON', + reference_grid_url=dwd_grid_url) + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be >= {feasible_start}.") + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + # time_start = time_now + requirements['offset'] + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + # time_start = time_now + requirements['offset'] + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + + # Ensure time_stop is greater than time_start + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. 
Must be <= {feasible_end}.") + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + + # # Check if forecast hours exceed limits + # forecastHours = (time_stop - time_now) + # if forecastHours > requirements['forecast']: + # logger.error(f"The end time of the forecast {time_stop} exceedes the model requirements.") + # logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + # logger.error(f"The required forecast horizon {forecastHours} exceeds the maximum forecast horzion {requirements['forecast']}.") + # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") + # # forecastHours = requirements['forecast'] + # # time_stop = time_now + forecastHours + + # # Check if offset is within required limits + # offset = (time_start - time_now) + # if offset < requirements['offset']: + # logger.error(f"The start time of the forecast {time_start} exceeds model requirements.") + # logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + # logger.warning(f"Forecast offset of {offset} hours is below model requirements.") + # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") + # offset = requirements['offset'] + # time_start = time_now + offset + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + + 
def _createDownloadUrl(url, var, field, run, hours):
    '''
    Generate the list of download URLs for one variable of one model run.

    The directory listing at ``{url}{run}/{var}/`` is fetched and every
    hyperlink found in it is turned into an absolute URL. The URLs are then
    filtered by grid type, field type, forecast step and (optionally) level.

    This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main

    Parameters
    ----------
    url : string
        Base URL of the DWD data server (expected to end with '/').
    var : string
        The variable name, optionally followed by integer levels separated
        by '/', e.g. ``"u/62/63"``.
    field : string
        The field parameter: 'time-invariant' (static), 'soil-level' (162cm),
        'model-level' (62;63), or 'single-level' (2D field). Only URLs
        containing this string are kept.
    run : string
        Model run identifier (used as a path component of the listing URL).
    hours : int
        Maximum forecast step to retrieve. Ignored for 'time-invariant'.

    Returns
    -------
    list
        List of filtered download URLs.

    Raises
    ------
    requests.HTTPError
        If the directory listing cannot be fetched.
    ValueError
        If a level given in ``var`` is not an integer.
    '''

    # Split "name/level/level" into the variable name and its integer levels
    var, *level_tokens = var.split('/')
    levels = [int(token) for token in level_tokens]

    # Directory on the server that lists all files of this run and variable
    data_url = f"{url}{run}/{var}/"

    # A timeout keeps an unresponsive server from blocking the whole
    # retrieval indefinitely (the file download itself also uses a timeout)
    response = requests.get(data_url, timeout=30)
    response.raise_for_status()

    # Collect the target of every anchor tag in the HTML listing
    soup = BeautifulSoup(response.content, 'html.parser')
    hrefs = (tag.get('href') for tag in soup.find_all('a'))

    # Explicit object dtype so the .str accessor also works on an empty listing
    urls = pd.Series([data_url + href for href in hrefs if href], dtype=object)

    # Keep only files on the native icosahedral (triangular) grid
    urls = urls[urls.str.contains('icosahedral')]

    # Keep only files of the requested field type
    urls = urls[urls.str.contains(field)]

    # Apply forecast horizon filter (time-invariant files carry no step);
    # the first three-digit token enclosed in underscores is read as the step
    if field != 'time-invariant':
        urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours]

    # Keep a file if it matches any of the requested levels
    if levels:
        level_mask = pd.Series(False, index=urls.index)
        for level in levels:
            # logical OR on booleans; '+' on bool Series is not a supported idiom
            level_mask |= urls.str.contains(f"_{level}_")
        urls = urls[level_mask]

    return list(urls)
def _deaccumulate(da):
    '''
    Turn values accumulated along ``time`` into per-step increments.

    Each step becomes the difference to its predecessor. The first step has
    no predecessor and is therefore masked with NaN.

    Parameters
    ----------
    da : xarray.DataArray
        Input data array with a time dimension containing accumulated values.

    Returns
    -------
    xarray.DataArray
        Data array with de-accumulated values (time-step differences).
    '''

    # Predecessor of every step; the slot before the first step is filled with 0
    previous = da.shift(time=1, fill_value=0)

    # Ψ_inst(t) = Ψ(t) - Ψ(t-1)
    increments = da - previous

    # Mask the first timestep, for which no real predecessor exists
    first_timestamp = increments.time[0]
    return increments.where(increments.time != first_timestamp, np.nan)
def _mainDataCollector(url, var, field, forecast, offset, coords, area, grid, tmpdir):
    '''
    Download one variable from the latest model run and from earlier runs,
    and blend them into a single dataset along the time axis.

    The first entry of ``forecast`` is treated as the latest model run. Entries
    of ``offset`` that lie before it and fall on a model run hour are treated
    as previous runs; one additional run preceding the earliest of them is
    added. Each run is downloaded via `_createDownloadUrl` and `_download`,
    the results are concatenated along ``time`` and timestamps that occur in
    more than one run are averaged by `_average_duplicate_times`.

    Relies on the module-level names ``model_run_hours`` and
    ``averaging_window``, which are defined outside this function.

    Parameters
    ----------
    url : string
        The base URL for downloading data.
    var : string
        The variable name to be downloaded.
    field : string
        The field type, e.g., 'time-invariant' (static), 'soil-level' (e.g., 162cm),
        'model-level' (e.g., 62;63), or 'single-level' (2D field).
    forecast : sequence of timestamps
        Forecast time steps; ``forecast[0]`` is used as the latest run and
        ``len(forecast)`` as the forecast horizon passed on for that run.
        (presumably a pandas.DatetimeIndex as returned by `retrieval_times`
        - confirm against caller)
    offset : pandas.DatetimeIndex
        Time steps before the latest run (it is boolean-masked and its
        ``.hour`` accessor is used).
    coords : atlite.Cutout.coords
        The spatial coordinates where data is required (passed to `_download`).
    area : sequence
        Bounding box of the cutout (passed to `_download`).
    grid : sequence
        Target grid resolution (passed to `_download`).
    tmpdir : string
        Path to the temporary directory where downloaded files are stored.

    Returns
    -------
    xarray.Dataset
        Dataset concatenated over all runs with duplicated timestamps averaged.
    '''

    # Extract the most recent forecast run time
    latestRun = forecast[0]

    # Filter only the previous runs before the latest run
    previousRuns = offset[offset < latestRun]

    # Keep only entries that align with ICON model run hours
    previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True)

    if len(previousRuns) > 0:
        # Get the hour of the earliest previous run
        first_prev_hour = previousRuns[0].hour

        # Find the previous index in `model_run_hours`.
        # NOTE(review): for the first entry this yields -1, i.e. the last run
        # hour is selected while the date below stays unchanged. Only the hour
        # string is used further down, so the date is not evaluated here.
        prev_idx = np.where(model_run_hours == first_prev_hour)[0][0] - 1

        # Compute the adjusted previous run
        previousRun = pd.DatetimeIndex([previousRuns[0].replace(
            hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0
        )])

        # Add previousRun to previousRuns, ensuring uniqueness
        previousRuns = previousRuns.union(previousRun).sort_values(ascending=True)

    # Create a list of (run hour, forecast hours) pairs: every previous run is
    # requested for `averaging_window` hours, the latest run for the full horizon
    runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [(latestRun.strftime("%H"), len(forecast))]

    ds_temps = []  # List to store temporary datasets

    for run, hours in runs:

        # Generate download URLs for the specified variable and field
        urls = _createDownloadUrl(url, var, field, run, hours)

        # Download and collect the main dataset for the given variable
        ds_temps.append(_download(urls, var, coords, area, grid, tmpdir))

    # Concatenate along the time dimension, keeping also duplicated timestamps
    ds = xr.concat(ds_temps, dim="time")

    # Average duplicates for smooth forecast transitioning
    ds = _average_duplicate_times(ds)

    # Return the processed dataset
    return ds
@retry(tries=5, delay=5, backoff=2, logger=logger)
def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', regrid=False, **kwargs):
    '''
    Download a compressed file, store it decompressed and open it as a dataset.

    The function is wrapped by the ``retry`` decorator (tries=5, delay=5,
    backoff=2), so a raised exception triggers another attempt.

    Parameters
    ----------
    data_url : string
        The URL from which data should be downloaded.
    tmpfp : string
        The file path where the decompressed content is written.
    engine : string, optional
        Engine handed to ``xr.open_dataset`` when ``regrid`` is False.
        The default is 'cfgrib'.
    regrid : bool, optional
        If True, the stored file is passed to `_regrid_data` instead of
        being opened directly. The default is False.
    **kwargs
        Forwarded to `_regrid_data` (only used when ``regrid`` is True).

    Returns
    -------
    tuple
        - the ``requests`` response object of the download
        - the dataset opened from (or regridded from) the stored file

    Raises
    ------
    ValueError
        If the server answers with a status code other than 200.
    '''

    # Fetch the file; give up on this attempt after 5 seconds
    response = requests.get(data_url, timeout=5)

    # Anything but HTTP 200 counts as a failed attempt
    if response.status_code != 200:
        raise ValueError(f"Error in response: {response.reason}, status code: {response.status_code}")

    # Store the decompressed payload at the requested location
    with open(tmpfp, 'wb') as handle:
        handle.write(decompress(response.content))

    if not regrid:
        # Open the stored file directly with the requested engine
        return response, xr.open_dataset(tmpfp, engine=engine)

    # Regrid the stored file to a regular lon-lat grid
    return response, _regrid_data(tmpfp, **kwargs)
+ + Returns + ------- + xarray.Dataset + Merged dataset containing the collected meteorological data. + ''' + + # # Extract the most recent forecast run time + # latestRun = forecast[0] + + # # Filter only the previous runs before the latest run + # previousRuns = offset[offset < latestRun] + + # # Keep only entries that align with ICON model run hours + # previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)] + + # # Create a list of runs including the latest run and previous runs + # runs = [(latestRun.strftime("%H"), len(forecast))] + [(run.strftime("%H"), 3) for run in previousRuns] + + # # Generate download URLs for the specified variable and field + # urls = [] + # for run, hours in runs: + # urls = urls + _createDownloadUrl(url, var, field, run, hours) + + # urls = pd.Series(urls).unique() + + ds_temps = [] # List to store temporary datasets + + # Iterate over generated URLs and process each file + for data_url in urls: + logger.info("ICON data -> Processing file: {f}".format(f=data_url)) + + # Extract filename from URL and construct temporary file path + tmpfn = os.path.basename(data_url) + tmpfn = Path(tmpfn).with_suffix('') + tmpfp = "{p}/{tmpfn}".format(tmpfn=tmpfn, p=tmpdir) + + # Attempt to download and extract the dataset + try: + resp, ds_temp = _urlopen_with_retry(data_url, + tmpfp, + regrid=True, + engine='netcdf4', + var=var, + coords=coords, + area=area, + grid=grid, + tmpdir=tmpdir) + except Exception as err: + logger.info("Could not get {url}: {err}".format(err=err, url=data_url)) + continue # Skip to next URL if download fails + + # Check if the dataset contains other coordinate + ds_coords = list(ds_temp.coords) + ds_coords_to_keep = ["valid_time", "longitude", "latitude", "generalVerticalLayer"] + ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] + + # Remove unwanted coordinates + ds_temp = ds_temp.drop_vars(ds_coords_to_drop) + + ds_temps.append(ds_temp) + + # Merge all collected 
def _regrid_data(tmp_data_filepath, var, coords, area, grid, tmpdir):
    """
    Remap a downloaded ICON GRIB file onto a regular lon-lat target grid.

    The file is read once with cfgrib (to recover variable names and GRIB
    attributes) and once through CDO: the ICON reference grid is attached
    with ``setgrid``, remapping weights are generated with ``gennn``
    (nearest-neighbour weights) and applied with ``remap``. The target grid
    description and the weight file are written to ``tmpdir`` and reused by
    later calls with the same ``area`` and ``grid``.

    Relies on the module-level names ``GRID_DIRECTORY``, ``dwd_icon_grid``
    and ``dwd_grid_url``; the reference grid file is expected to exist
    already.

    Parameters
    ----------
    tmp_data_filepath : string
        Path of the decompressed GRIB file to regrid.
    var : string
        Name of the requested variable (currently unused here).
    coords : mapping
        Target coordinates; the lengths of ``coords['x']`` and ``coords['y']``
        define the size of the target grid.
    area : sequence
        Bounding box; ``area[1]`` is used as first longitude and ``area[2]``
        as first latitude of the target grid, all four entries enter the
        cache file names.
    grid : sequence
        Longitude increment ``grid[0]`` and latitude increment ``grid[1]``.
    tmpdir : string
        Directory for CDO temporary files, the target grid description and
        the remapping weights.

    Returns
    -------
    xarray.Dataset
        Regridded data with dimensions renamed to 'valid_time', 'longitude'
        and 'latitude', variables named as in the cfgrib dataset and the
        attributes 'GRIB_stepType', 'GRIB_missingValue' and 'GRIB_gridType'
        set on every variable.
    """

    # Data Logging of the ICON Grid
    logger.info(f"ICON Grid Data -> Processing file: {dwd_grid_url}")

    # Load original dataset to later process names and attributes
    ds_original = xr.load_dataset(tmp_data_filepath, engine='cfgrib')

    # One CDO instance, writing its temporary files to tmpdir
    cdo = Cdo(tempdir=tmpdir)

    # Reference grid as downloaded in the requirements step
    reference_grid = f"{GRID_DIRECTORY}/{Path(dwd_icon_grid).with_suffix('.nc')}"

    # Attach the reference grid data to the input dataset
    ds = cdo.setgrid(reference_grid,
                     input=tmp_data_filepath,
                     returnXDataset=True)

    # Target grid description, cached per extent and resolution
    target_grid_name = os.path.basename(
        f"target_grid_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}"
    )
    target_grid_path = f"{tmpdir}/{Path(target_grid_name + '.txt')}"
    if not os.path.isfile(target_grid_path):
        grid_description = (
            "gridtype = lonlat\n"
            f"xsize = {len(coords['x'])}\n"   # number of longitudes
            f"ysize = {len(coords['y'])}\n"   # number of latitudes
            f"xfirst = {area[1]}\n"           # first longitude
            f"xinc = {grid[0]}\n"             # longitude increment
            f"yfirst = {area[2]}\n"           # first latitude
            f"yinc = {grid[1]}\n"             # latitude increment
        )
        with open(target_grid_path, "w") as f:
            f.write(grid_description)

    # Remapping weights from the reference grid to the target grid,
    # cached under the same extent/resolution key
    weight_name = os.path.basename(
        f"weight_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}"
    )
    weight_path = f"{tmpdir}/{Path(weight_name + '.nc')}"
    if not os.path.isfile(weight_path):
        cdo.gennn(target_grid_path,
                  input=ds,
                  output=weight_path)

    # Regrid the data to the target grid (triangular to latlon grid)
    ds = cdo.remap(f"{target_grid_path},{weight_path}",
                   input=ds,
                   returnXDataset=True)

    # Drop 'bnds' and all associated variables from the dataset
    if 'bnds' in ds.sizes:
        ds = ds.drop_dims('bnds')

    # Map the vertical coordinate of the CDO output back to what cfgrib reported
    original_coords = list(ds_original.coords)

    if 'generalVerticalLayer' in original_coords:
        ds = ds.rename({'height': 'generalVerticalLayer'})

    if 'heightAboveGround' in original_coords:
        ds = ds.sel({'height': 2.0})

    if 'depthBelowLandLayer' in original_coords:
        ds = ds.isel({'depth': 0})

    # Rename variable dimensions accordingly
    ds = ds.rename({'time': "valid_time", "lon": "longitude", "lat": "latitude"})

    # Rename all variables positionally to the names used by cfgrib
    ds = ds.rename(dict(zip(ds.data_vars, ds_original.data_vars)))

    # Carry over the GRIB attributes needed downstream
    for data_var in list(ds_original.data_vars):
        original_attrs = ds_original[data_var].attrs
        ds[data_var].attrs['GRIB_stepType'] = original_attrs['GRIB_stepType']
        ds[data_var].attrs['GRIB_missingValue'] = original_attrs['GRIB_missingValue']
        ds[data_var].attrs['GRIB_gridType'] = "latlon"

    return ds
def _interpolate_to_cutout_resolution(ds, retrieval_params, static):
    """
    Interpolate every variable of a dataset to the cutout resolution.

    Each data variable is passed separately through `_interpolate`, the
    results are merged, 'lon'/'lat' coordinates are attached along 'x'/'y'
    and the dataset is re-chunked.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose data variables are interpolated.
    retrieval_params : dict
        Must provide 'coords', 'grid', 'interp_s', 'interp_t' and 'chunks'.
    static : bool
        Forwarded to `_interpolate`.

    Returns
    -------
    xarray.Dataset
        Merged, interpolated and chunked dataset.
    """
    # Interpolate the data spatially and temporally, one variable at a time
    interpolated = [
        _interpolate(
            ds[name],
            static,
            retrieval_params['coords'],
            retrieval_params['grid'],
            retrieval_params['interp_s'],
            retrieval_params['interp_t'],
        )
        for name in ds.data_vars
    ]

    merged = xr.merge(interpolated)

    # Expose the spatial coordinates additionally as lon / lat
    merged = merged.assign_coords(
        lon=("x", merged.x.values), lat=("y", merged.y.values)
    )

    # Fall back to an empty chunk specification when no chunks are configured
    return merged.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {})
def sanitize_wind(ds):
    """
    Sanitize retrieved wind data.

    Roughness values that are not >= 0 are replaced by 2e-4; the dataset is
    modified in place and returned.
    """
    roughness = ds["roughness"]
    ds["roughness"] = roughness.where(roughness >= 0.0, 2e-4)
    return ds
# Retrieve single-level data + retrieval_params['field'] = ['single-level', 'single-level', 'single-level', 'single-level'] + ds = retrieve_data( + url=dwd_url, + variable=[ + "asob_t", + "aswdir_s", + "aswdifd_s", + "alb_rad", + ], + **retrieval_params, + ) + + ds = ds.rename({"avg_tnswrf": "influx_toa", + "ASWDIR_S": "influx_direct", + "ASWDIFD_S": "influx_diffuse", + "al": "albedo"}) + + ds["albedo"] = (ds["albedo"]/100).assign_attrs(units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation") + ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(units="W m**-2", long_name="Surface down solar diffuse radiation") + ds["influx_direct"] = ds["influx_direct"].assign_attrs(units="W m**-2", long_name="Surface down solar direct radiation") + ds["influx_toa"] = ds["influx_toa"].assign_attrs(units="W m**-2", long_name="Net short-wave radiation flux at top of atmosphere (TOA)") + + # # Interpolate the data spatially and temporally to the wanted cutout resolution + # ds_temps = [] + # for idx, var in enumerate(ds): + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], + # retrieval_params['grid'], + # retrieval_params['interp_s'], + # retrieval_params['interp_t']) + # ) + + # ds = xr.merge(ds_temps) + # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + # ICON-EU variables are mean values for previous hour, i.e. 13:01 to 14:00 are labelled as "14:00" + # account by calculating the SolarPosition for the center of the interval for aggregation happens + # see https://github.com/PyPSA/atlite/issues/158 + # Do not show DeprecationWarning from new SolarPosition calculation (#199) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Convert dt / time frequency to timedelta and shift solar position by half + # (freqs like ["H","30T"] do not work with pd.to_timedelta(...) 
def sanitize_influx(ds):
    """
    Sanitize retrieved influx data.

    Negative values of the direct, diffuse and top-of-atmosphere influx are
    clipped to zero; the dataset is modified in place and returned.
    """
    radiation_fields = ("influx_direct", "influx_diffuse", "influx_toa")
    for field_name in radiation_fields:
        ds[field_name] = ds[field_name].clip(min=0.0)
    return ds
def retrieval_times(coords, tz, static=False):
    """
    Get retrieval time dimension of the forecast.

    The time axis of ``coords`` is split at the most recent available model
    run, as returned by ``_getCurrentRun`` for the current UTC time.

    Parameters
    ----------
    coords : atlite.Cutout.coords
        Coordinate object containing the time dimension.
    tz : timezone
        Timezone information of the input time and date (currently unused here).
    static : bool, optional (default=False)
        If True, return only the first forecast time step and an empty offset.

    Returns
    -------
    dict
        Dictionary with keys:
        - 'forecast': pd.DatetimeIndex of timestamps at or after the current
          model run time (including any filled gaps)
        - 'offset': pd.DatetimeIndex of timestamps before the current model
          run time
    """
    # Convert xarray time coordinate to pandas index
    time = coords["time"].to_index()

    # Current model run as a timezone-naive UTC timestamp.
    # ``Timestamp.utcnow`` is deprecated in recent pandas versions.
    currentRunTime = _getCurrentRun(pd.Timestamp.now(tz="UTC").tz_localize(None))

    # Split times into forecast (>= currentRunTime) and offset (< currentRunTime)
    forecast_times = time[time >= currentRunTime]
    offset_times = time[time < currentRunTime]

    # If the forecast does not start at the run time, fill the missing range
    if len(forecast_times) > 0 and forecast_times[0] > currentRunTime:
        freq = time.freq
        if freq is None:
            try:
                freq = pd.infer_freq(time)
            except ValueError:
                # infer_freq needs at least three timestamps
                freq = None

        # infer_freq returns an alias string (e.g. "h"); a string cannot be
        # subtracted from a Timestamp, so turn it into an offset object first.
        if isinstance(freq, str):
            freq = pd.tseries.frequencies.to_offset(freq)

        # Irregular or very short axis: fall back to the first time difference
        if freq is None and len(time) > 1:
            freq = time[1] - time[0]

        if freq is None:
            # A single timestamp carries no resolution: only add the run time
            fill = pd.DatetimeIndex([currentRunTime])
        else:
            # Steps from the run time up to just before the first forecast time
            fill = pd.date_range(
                currentRunTime, forecast_times[0] - freq, freq=freq
            )

        # Prepend the filled range to the forecast times
        forecast_times = fill.append(forecast_times)

    # Static data only needs a single time step and no offset
    if static:
        forecast_times = forecast_times[:1]
        offset_times = pd.DatetimeIndex([])

    return {
        "forecast": forecast_times,
        "offset": offset_times,
    }
def get_data(cutout, feature, tmpdir,
             lock=None,
             monthly_requests=False,
             concurrent_requests=False,
             **creation_parameters):
    """
    Retrieve data from DWDs ICON-EU Model dataset (via ODS).

    This front-end function downloads data for a specific feature and formats
    it to match the given Cutout.

    Parameters
    ----------
    cutout : atlite.Cutout
    feature : str
        Name of the feature data to retrieve. A retrieval function named
        ``get_data_<feature>`` must be defined in this module.
    tmpdir : str/Path
        Directory where the temporary files are stored.
    lock : optional
        Passed on unchanged to the retrieval function as ``lock``.
    monthly_requests : bool
        Takes no effect, only here for consistency with other dataset modules.
    concurrent_requests : bool
        Takes no effect, only here for consistency with other dataset modules.
    **creation_parameters :
        Additional keyword arguments. The only effective argument is 'sanitize'
        (default True) which sets sanitization of the data on or off.

    Returns
    -------
    xarray.Dataset
        Dataset of dask arrays of the retrieved variables.

    Raises
    ------
    ValueError
        If no retrieval function exists for ``feature``.
    """
    coords = cutout.coords

    sanitize = creation_parameters.get("sanitize", True)

    retrieval_params = {
        "product": "dwd_icon_eu",
        "area": _area(coords),
        "chunks": cutout.chunks,
        "grid": [cutout.dx, cutout.dy],
        "tmpdir": tmpdir,
        "lock": lock,
        "tz": cutout.data.tz,
        "interp_s": cutout.data.interp_s,
        "interp_t": cutout.data.interp_t,
        "coords": coords,
    }

    # Retrieval and sanitize functions are looked up by naming convention
    func = globals().get(f"get_data_{feature}")
    if func is None:
        # Fail early with a clear message instead of calling None later on
        raise ValueError(
            f"Feature '{feature}' is not supported: no function "
            f"'get_data_{feature}' is defined in this module."
        )
    sanitize_func = globals().get(f"sanitize_{feature}")

    logger.info(f"Requesting data for feature {feature} for ICON-EU from open-dwd...")

    def retrieve_once(time, static=False):
        # A fresh dict per call, so retrieval functions may modify it freely
        ds = func({**retrieval_params, **time})
        ds = _interpolate_to_cutout_resolution(ds, retrieval_params, static)
        # Sanitize the data after interpolation to remove residuals
        if sanitize and sanitize_func is not None:
            ds = sanitize_func(ds)
        return ds

    if feature in static_features:
        static_times = retrieval_times(coords, cutout.data.tz, True)
        return retrieve_once(static_times, True).squeeze()

    dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False)

    # Restrict the result to the time steps of the cutout
    return dataset.sel(time=coords["time"])
+ +For further reference see: +https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html +""" + +import os +import warnings +import numpy as np +import xarray as xr +import pandas as pd +import requests +import logging + +from pathlib import Path +from retry import retry +from bz2 import decompress +from bs4 import BeautifulSoup + +from ..gis import maybe_swap_spatial_dims +from ..pv.solar_position import SolarPosition + +# Null context for running a with statements wihout any context +try: + from contextlib import nullcontext +except ImportError: + # for Python verions < 3.7: + import contextlib + + @contextlib.contextmanager + def nullcontext(): + yield + + +logger = logging.getLogger(__name__) + +# URL for accessing Open Data from DWD (Deutscher Wetterdienst) +dwd_url = "https://opendata.dwd.de/weather/nwp/icon-d2/grib/" + +# ICON-D2 model runs are available at fixed intervals: 00, 03, 06, 09, 12, 15, 18, 21 UTC +model_run_hours = np.array([0, 3, 6, 9, 12, 15, 18, 21]) + +# Averaging window of different model runs +averaging_window = 24 #hours + +# Coordinate Reference System (CRS) used for geospatial data +crs = 4326 + +# Dictionary defining available meteorological features and their associated data fields +features = { + "height": ["height"], # Elevation data + "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, direction, and surface roughness + "influx": [ + "influx_toa", # Top-of-atmosphere solar radiation + "influx_direct", # Direct solar radiation + "influx_diffuse", # Diffuse solar radiation + "albedo", # Surface reflectivity + "solar_altitude", # Solar altitude angle + "solar_azimuth", # Solar azimuth angle + ], + "temperature": ["temperature", "soil temperature"], # Air and soil temperature + "runoff": ["runoff"], # Surface water runoff +} + +# Features that remain constant over time +static_features = {"height"} + +# Model requirements specifying spatial and temporal constraints +requirements = { 
def _clampToModelGrid(requested, available):
    """Clamp a requested x/y slice to the extent and finest step the model offers."""
    start, stop, step = requested.start, requested.stop, requested.step

    # A missing bound or step falls back to the value from the requirements
    if start is None or available.start > start:
        start = available.start
    if stop is None or available.stop < stop:
        stop = available.stop
    if step is None or available.step > step:
        step = available.step

    return slice(start, stop, step)


def _checkModuleRequirements(x, y, time, time_now, **kwargs):
    """
    Load and check the data requirements for a given module.

    The spatial slices are clamped to the module ``requirements``. The time
    slice is validated against the feasible window
    ``[time_now + requirements['offset'], time_now + requirements['forecast']]``.

    Parameters
    ----------
    x : slice
        Defines the start, stop, and step values for the x-dimension.
    y : slice
        Defines the start, stop, and step values for the y-dimension.
    time : slice
        Defines the start, stop, and step values for the time dimension.
    time_now : timestamp
        Reference time the offset and forecast horizon are added to.
    **kwargs :
        Additional optional parameters (unused).

    Returns
    -------
    tuple
        ``(x, y, time, parallel)`` with the adjusted slices and the
        ``parallel`` flag taken from ``requirements``.

    Raises
    ------
    ValueError
        If the requested start or end time lies outside the feasible window
        or the end time is not later than the start time.
    """
    x = _clampToModelGrid(x, requirements['x'])
    y = _clampToModelGrid(y, requirements['y'])

    # Extract time range parameters
    time_start = time.start
    time_stop = time.stop
    time_step = time.step

    # Feasible window of the forecast relative to time_now
    feasible_start = time_now + requirements['offset']
    feasible_end = time_now + requirements['forecast']

    # Start must not lie before the maximum historical offset
    if time_start < feasible_start:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.")
        logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.")

    # Start must lie before the end of the forecast horizon
    if time_start >= feasible_end:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.")

    # End must be later than start
    if time_stop <= time_start:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.")

    # End must not exceed the forecast horizon
    if time_stop > feasible_end:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.")
        logger.error(f"The maximum forecast horizon is {requirements['forecast']}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.")

    # A slice without a step carries None (or NaT); both mean "no resolution
    # given" and must be caught before the comparison, which would fail on them.
    step_missing = time_step is None or pd.isna(time_step)
    if step_missing or time_step < requirements['dt']:
        logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.")
        logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.")
        logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.")
        time_step = requirements['dt']

    time = slice(time_start, time_stop, time_step)

    # Retrieve parallel processing setting from requirements
    parallel = requirements['parallel']

    return x, y, time, parallel
def _createDownloadUrl(url, var, field, run, hours):
    '''
    Generates a list of download URLs for meteorological data from the DWD server.
    The function scrapes the available files for a given variable and model run,
    filtering based on field type, forecast hours, and model levels.

    This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main

    Parameters
    ----------
    url : string
        Base URL of the DWD data server.
    var : string
        The variable name, optionally including levels separated by '/'
        (e.g. "u/62/63").
    field : string
        The field parameter: 'time-invariant' (static), 'soil-level' (162cm),
        'model-level' (62;63), or 'single-level' (2D field).
    run : string
        Model run identifier.
    hours : int
        Maximum forecast hours to retrieve.

    Returns
    -------
    list
        List of filtered download URLs.

    Raises
    ------
    requests.HTTPError
        If the directory listing cannot be retrieved.
    '''

    # Split "name/level1/level2" into the variable name and integer levels
    var_name, *level_tokens = var.split('/')
    levels = [int(token) for token in level_tokens]

    # Directory listing of the requested run and variable
    data_url = "{url}{run}/{var}/".format(url=url, var=var_name, run=run)

    # A timeout keeps a stalled server from blocking the retrieval forever
    response = requests.get(data_url, timeout=30)
    response.raise_for_status()

    # Collect the targets of all anchor tags of the listing
    soup = BeautifulSoup(response.content, 'html.parser')
    links = [tag.get('href') for tag in soup.find_all('a')]

    # Explicit object dtype so the string accessor also works on an empty listing
    urls = pd.Series([data_url + link for link in links if link], dtype=object)

    # Keep only files on the regular lat/lon grid and of the requested field
    urls = urls[urls.str.contains('regular-lat-lon', regex=False)]
    urls = urls[urls.str.contains(field, regex=False)]

    # Apply forecast time horizon filter (excluding 'time-invariant' fields)
    if field != 'time-invariant':
        # The three-digit token between underscores is the forecast step;
        # links without such a token become NaN and are dropped by the comparison.
        step = pd.to_numeric(
            urls.str.extract(r"_(\d{3})_", expand=False), errors="coerce"
        )
        urls = urls[step <= hours]

    # Keep only the requested model levels, if any were specified
    if levels:
        level_mask = pd.Series(False, index=urls.index)
        for level in levels:
            level_mask |= urls.str.contains(f"_{level}_", regex=False)
        urls = urls[level_mask]

    return list(urls)
def _deaccumulate(da):
    '''
    Turn values accumulated along ``time`` into per-step increments.

    Every entry is replaced by its difference to the preceding time step;
    the first time step is set to NaN afterwards.

    Parameters
    ----------
    da : xarray.DataArray
        Input data array with a time dimension containing accumulated values.

    Returns
    -------
    xarray.DataArray
        Data array with de-accumulated values (time-step differences).
    '''
    # Value of the preceding step; the first step is paired with zero
    previous = da.shift(time=1, fill_value=0)
    increments = da - previous

    # Mask the first time step, keep all others
    keep = increments.time != increments.time[0]
    return increments.where(keep, np.nan)
def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir):
    '''
    Downloads meteorological data of one variable for all required model runs.

    The latest run is taken from the first forecast timestamp. Earlier runs
    are derived from the offset timestamps that fall on a model run hour,
    plus the run directly before the earliest of them. The data of all runs
    is concatenated along time and duplicated timestamps are averaged.

    Parameters
    ----------
    url : string
        The base URL for downloading data.
    var : string
        The variable name to be downloaded.
    field : string
        The field type, e.g., 'time-invariant' (static), 'soil-level' (e.g., 162cm),
        'model-level' (e.g., 62;63), or 'single-level' (2D field).
    forecast : pandas.DatetimeIndex
        Forecast timestamps; the first entry is used as the latest model run
        and the number of entries as its forecast horizon.
    offset : pandas.DatetimeIndex
        Timestamps before the latest model run.
    coords : atlite.Cutout.coords
        The spatial coordinates where data is required (currently unused here).
    tmpdir : string
        Path to the temporary directory where downloaded files are stored.

    Returns
    -------
    xarray.Dataset
        Dataset containing the collected meteorological data.
    '''

    # The first forecast timestamp marks the latest model run
    latestRun = forecast[0]

    # Offsets before the latest run that coincide with a model run hour
    previousRuns = offset[offset < latestRun]
    previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True)

    if len(previousRuns) > 0:
        earliest = previousRuns[0]

        # Position of the run directly before the earliest previous run
        prev_idx = int(np.where(model_run_hours == earliest.hour)[0][0]) - 1

        extraRun = earliest.replace(
            hour=int(model_run_hours[prev_idx]), minute=0, second=0, microsecond=0
        )
        # Index -1 wraps to the last run hour, which belongs to the day before
        if prev_idx < 0:
            extraRun -= pd.Timedelta(days=1)

        # Add the extra run, ensuring uniqueness
        previousRuns = previousRuns.union(pd.DatetimeIndex([extraRun])).sort_values(ascending=True)

    # Runs are addressed by their hour only. Previous runs use the
    # averaging window as horizon, the latest run the forecast length.
    runs = [(run.strftime("%H"), averaging_window) for run in previousRuns]
    runs.append((latestRun.strftime("%H"), len(forecast)))

    # Download the files of every run into one dataset per run
    ds_temps = [
        _download(_createDownloadUrl(url, var, field, run, hours), tmpdir)
        for run, hours in runs
    ]

    # Concatenate along the time dimension, keeping also duplicated timestamps
    ds = xr.concat(ds_temps, dim="time")

    # Average duplicates for smooth forecast transitioning
    return _average_duplicate_times(ds)
+ ''' + + # Perform temporal interpolation if the data is not static + if not static: + try: + ds = ds.interp(time=coords['time'].values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}) + except ValueError: + logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info("Interpolation method is set to 'nearest' instead.") + ds = ds.interp(time=coords['time'].values, + method="nearest", + kwargs={"fill_value": "extrapolate"}) + + # Create bin edges and labels for x-coordinates + x_bins = coords['x'].values + x_bins = np.insert(x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0) # Extend bin range + x_bins_label = np.round(x_bins[:-1] + grid[0], 8) # Compute bin centers + + # Create bin edges and labels for y-coordinates + y_bins = coords['y'].values + y_bins = np.insert(y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0) # Extend bin range + y_bins_label = np.round(y_bins[:-1] + grid[1], 8) # Compute bin centers + + # Store original dataset attributes + attrs = ds.attrs + + # Perform spatial binning by grouping data into bins along x and y dimensions and computing the mean + ds = ds.groupby_bins("x", x_bins, labels=x_bins_label).mean(dim="x") + ds = ds.groupby_bins("y", y_bins, labels=y_bins_label).mean(dim="y") + + # Rename bins to standard coordinate names + ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'}) + + # Reassign original dataset attributes + ds = ds.assign_attrs(attrs) + + return ds + + +@retry(tries=5, delay=5, backoff=2, logger=logger) +def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', **kwargs): + ''' + Attempts to download and decompress a dataset file with automatic retry on failure. + + This function fetches data from a given URL, retries up to five times in case of failure, + and decompresses the response content before saving it to a temporary file. + + Parameters + ---------- + data_url : string + The URL from which data should be downloaded. 
def _download(urls, tmpdir=None):
    '''
    Downloads a list of GRIB files and merges them into a single dataset.

    Every URL is fetched through ``_urlopen_with_retry`` and stored under its
    file name without the last suffix. Files that cannot be retrieved are
    skipped with a warning. The downloaded files are not removed here. After
    merging, coordinates are renamed via ``_rename_and_clean_coords`` and
    variables whose ``GRIB_stepType`` attribute is 'avg' or 'accum' are
    de-averaged or de-accumulated.

    Parameters
    ----------
    urls : iterable of string
        Download URLs of the files to collect.
    tmpdir : string, optional
        Directory for storing downloaded files. If None, the system
        temporary directory is used.

    Returns
    -------
    xarray.Dataset
        Merged dataset containing the collected meteorological data.

    Raises
    ------
    ValueError
        If none of the files could be downloaded.
    '''
    import tempfile

    # Without a directory the files would end up in a folder called "None"
    target_dir = Path(tmpdir) if tmpdir is not None else Path(tempfile.gettempdir())

    ds_temps = []  # List to store temporary datasets

    for data_url in urls:
        logger.info("ICON-D2 data -> Processing file: {f}".format(f=data_url))

        # File name of the URL without its last suffix
        tmpfn = Path(os.path.basename(data_url)).with_suffix('')
        tmpfp = str(target_dir / tmpfn)

        # Best effort: a failing file is reported and skipped
        try:
            _, ds_temp = _urlopen_with_retry(data_url, tmpfp)
        except Exception as err:
            logger.warning("Could not get {url}: {err}".format(err=err, url=data_url))
            continue

        has_layer = 'generalVerticalLayer' in ds_temp.coords

        # Coordinates that survive; everything else is dropped
        ds_coords_to_keep = ["valid_time", "longitude", "latitude"]
        if has_layer:
            ds_coords_to_keep.append("generalVerticalLayer")
        ds_coords_to_drop = [
            ds_coord for ds_coord in list(ds_temp.coords)
            if ds_coord not in ds_coords_to_keep
        ]

        # Make 'valid_time' (and the vertical layer) a dimension
        if has_layer:
            ds_temp = ds_temp.expand_dims(dim=["valid_time", "generalVerticalLayer"])
        elif "step" in ds_temp.dims:
            ds_temp = ds_temp.swap_dims({"step": "valid_time"})
        else:
            ds_temp = ds_temp.expand_dims(dim="valid_time")
        ds_temp = ds_temp.drop_vars(ds_coords_to_drop)

        # Assign coordinate values back to dataset
        ds_temp = ds_temp.assign_coords(
            {name: ds_temp[name] for name in ds_coords_to_keep}
        )
        ds_temps.append(ds_temp)

    if not ds_temps:
        # Fail with a clear message instead of an unrelated rename error
        raise ValueError("None of the requested files could be downloaded.")

    # Merge all collected datasets into a single dataset
    ds = xr.merge(ds_temps)

    # Rename and clean coordinate labels for consistency
    ds = _rename_and_clean_coords(ds)

    # Convert averaged and accumulated variables to per-step values
    for ds_var in list(ds.data_vars):
        step_type = ds[ds_var].attrs.get('GRIB_stepType')
        if step_type == 'avg':
            ds[ds_var] = _deaverage(ds[ds_var])
        elif step_type == 'accum':
            ds[ds_var] = _deaccumulate(ds[ds_var])

    return ds
+ """ + ds = ds.rename({"longitude": "x", "latitude": "y"}) + if "valid_time" in ds.sizes: + ds = ds.rename({"valid_time": "time"}).unify_chunks() + # round coords since cds coords are float32 which would lead to mismatches + ds = ds.assign_coords( + x=np.round(ds.x.astype(float), 5), y=np.round(ds.y.astype(float), 5) + ) + ds = maybe_swap_spatial_dims(ds) + if add_lon_lat: + ds = ds.assign_coords(lon=ds.coords["x"], lat=ds.coords["y"]) + + return ds + +def _interpolate_to_cutout_resolution(ds, retrieval_params, static): + + # Interpolate the data spatially and temporally to the wanted cutout resolution + ds_temps = [] + for idx, var in enumerate(ds.data_vars): + ds_temps.append(_interpolate(ds[var], static, + retrieval_params['coords'], + retrieval_params['grid'], + retrieval_params['interp_s'], + retrieval_params['interp_t']) + ) + + ds = xr.merge(ds_temps) + ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + ds = ds.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {}) + + return ds + + +def get_data_wind(retrieval_params): + ''' + Retrieves and processes wind data from the DWD server. + + The function collects wind speed and direction data at 100m above ground level, + as well as surface roughness data. It then processes and interpolates this data + to match the desired spatial and temporal resolution. + + Parameters + ---------- + retrieval_params : dict + Dictionary containing parameters for data retrieval, including coordinates, + grid resolution, and interpolation methods. + + Returns + ------- + xarray.Dataset + Processed dataset containing wind speed, wind direction, and surface roughness. 
def sanitize_wind(ds):
    """Replace roughness values that are not >= 0 by the default 2e-4."""
    roughness = ds["roughness"]
    ds["roughness"] = roughness.where(roughness >= 0.0, 2e-4)
    return ds


def get_data_influx(retrieval_params):
    """
    Get influx data for given retrieval parameters.

    Parameters
    ----------
    retrieval_params : dict
        Keyword arguments forwarded to `retrieve_data`. The dict is not
        modified; the 'field' entry is set on a copy.

    Returns
    -------
    xarray.Dataset
        Dataset with 'influx_toa', 'influx_direct', 'influx_diffuse',
        'albedo' and the solar position variables prefixed with 'solar_'.
    """
    params = {**retrieval_params, "field": ["single-level"] * 4}
    ds = retrieve_data(
        url=dwd_url,
        variable=[
            "asob_t",
            "aswdir_s",
            "aswdifd_s",
            "alb_rad",
        ],
        **params,
    )

    ds = ds.rename(
        {
            "avg_tnswrf": "influx_toa",
            "ASWDIR_S": "influx_direct",
            "ASWDIFD_S": "influx_diffuse",
            "al": "albedo",
        }
    )

    # The albedo is divided by 100 to obtain the 0-1 range stated in `units`.
    ds["albedo"] = (ds["albedo"] / 100).assign_attrs(
        units="(0 - 1)",
        long_name="Shortwave broadband albedo for diffuse radiation",
    )
    ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(
        units="W m**-2", long_name="Surface down solar diffuse radiation"
    )
    ds["influx_direct"] = ds["influx_direct"].assign_attrs(
        units="W m**-2", long_name="Surface down solar direct radiation"
    )
    ds["influx_toa"] = ds["influx_toa"].assign_attrs(
        units="W m**-2",
        long_name="Net short-wave radiation flux at top of atmosphere (TOA)",
    )

    # The values are means over the previous interval (e.g. 13:01 to 14:00 is
    # labelled "14:00"), so the solar position is evaluated at the interval
    # centre, i.e. shifted by minus half a time step.
    # See https://github.com/PyPSA/atlite/issues/158
    # DeprecationWarnings of the SolarPosition calculation are silenced (#199).
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        # Go through date_range(...).freq because frequency strings such as
        # "H" or "30T" are not accepted by pd.to_timedelta directly.
        time_shift = (
            -1
            / 2
            * pd.to_timedelta(
                pd.date_range(
                    "1970-01-01", periods=1, freq=pd.infer_freq(ds["time"])
                ).freq
            )
        )
        sp = SolarPosition(ds, time_shift=time_shift)

    sp = sp.rename({v: f"solar_{v}" for v in sp.data_vars})

    return xr.merge([ds, sp])
def sanitize_influx(ds):
    """Clip the influx variables of `ds` at zero from below."""
    for name in ("influx_direct", "influx_diffuse", "influx_toa"):
        ds[name] = ds[name].clip(min=0.0)
    return ds


def get_data_temperature(retrieval_params):
    """
    Get air and soil temperature for given retrieval parameters.

    Parameters
    ----------
    retrieval_params : dict
        Keyword arguments forwarded to `retrieve_data`. The dict is not
        modified; the 'field' entry is set on a copy.

    Returns
    -------
    xarray.Dataset
        Dataset with the variables 'temperature' and 'soil temperature'.
    """
    params = {**retrieval_params, "field": ["single-level", "soil-level"]}
    ds = retrieve_data(
        url=dwd_url,
        variable=["t_2m", "t_so/162"],
        **params,
    )

    ds = ds.rename({"t2m": "temperature", "T_SO": "soil temperature"})

    ds["temperature"] = ds["temperature"].assign_attrs(
        units="K", long_name="Temperature at 2m above ground"
    )
    ds["soil temperature"] = ds["soil temperature"].assign_attrs(
        units="K", long_name="Soil temperature in 162 cm depth "
    )

    return ds


def get_data_runoff(retrieval_params):
    """
    Get runoff data for given retrieval parameters.

    Surface and ground runoff are summed into the single variable 'runoff'.

    Parameters
    ----------
    retrieval_params : dict
        Keyword arguments forwarded to `retrieve_data`. The dict is not
        modified; the 'field' entry is set on a copy.

    Returns
    -------
    xarray.Dataset
        Dataset with the variable 'runoff'.
    """
    params = {**retrieval_params, "field": ["single-level", "single-level"]}
    ds = retrieve_data(
        url=dwd_url,
        variable=["runoff_s", "runoff_g"],
        **params,
    )

    ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(
        units="kg m**-2",
        long_name="Surface and Soil water runoff (accumulated since model start)",
    )

    return ds.drop_vars(["RUNOFF_S", "RUNOFF_G"])
ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(units="kg m**-2", long_name="Surface and Soil water runoff (accumulated since model start)") + + ds = ds.drop_vars(["RUNOFF_S", "RUNOFF_G"]) + + return ds + + +def sanitize_runoff(ds): + """Sanitize retrieved runoff data.""" + ds["runoff"] = ds["runoff"].clip(min=0.0) + return ds + + +def get_data_height(retrieval_params): + """Get height data for given retrieval parameters.""" + # Retrieve time-invariant data + retrieval_params['field'] = ['time-invariant'] + ds = retrieve_data(url=dwd_url, + variable=["hsurf"], + **retrieval_params) + + ds = ds.rename({"HSURF": "height"}) + ds["height"] = ds["height"].assign_attrs( + units="m", + long_name="Geometric Height of the earths surface above sea level (2D field)" + ) + + return ds + + +def _area(coords): + # North, West, South, East. Default: global + x0, x1 = coords["x"].min().item(), coords["x"].max().item() + y0, y1 = coords["y"].min().item(), coords["y"].max().item() + return [y1, x0, y0, x1] + + +def retrieval_times(coords, tz, static=False): + """ + Get retrieval time dimension of the forecast. + + Parameters + ---------- + coords : atlite.Cutout.coords + Coordinate object containing the time dimension. + tz : timezone + Timezone information of the input time and date (currently unused here). + static : bool, optional (default=False) + If True, return only the first forecast time step and an empty offset. 
def noisy_unlink(path):
    """Delete the file at `path`; log an error if it is still in use."""
    logger.debug(f"Deleting file {path}")
    try:
        os.unlink(path)
    except PermissionError:
        logger.error(f"Unable to delete file {path}, as it is still in use.")


def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates):
    """
    Download data from the ICON-D2 Model from the Open Data Server (ODS) of DWD.

    If you want to manually download the data go to:
    https://opendata.dwd.de/weather/nwp/icon-d2/grib/

    Parameters
    ----------
    url : str
        Base URL of the data server.
    product : str
        Unused; accepted so that the retrieval parameters can be passed on
        unchanged.
    chunks : dict, optional
        Chunking applied to the merged dataset. None is treated as {}.
    tmpdir : str or Path, optional
        Directory for temporary files.
    lock : optional
        Unused.
    **updates
        Must contain 'variable' and 'field' (parallel lists) as well as
        'forecast', 'offset' and 'coords'.

    Returns
    -------
    xarray.Dataset
        All requested variables merged into one dataset.

    Raises
    ------
    IndexError
        If fewer fields than variables are given.
    """
    request = {"product_type": "icon_d2", "format": "direct-download"}
    request.update(updates)

    variables = request["variable"]
    fields = request["field"]
    # Fail before any download starts instead of in the middle of the loop.
    if len(fields) < len(variables):
        raise IndexError(
            f"{len(variables)} variables requested but only {len(fields)} fields given."
        )

    # Download each variable individually, then merge into one dataset.
    logger.info(f"open-dwd: Downloading variables\n\t{request['variable']}\n")
    ds_temps = [
        _mainDataCollector(
            url,
            var,
            field,
            request["forecast"],
            request["offset"],
            request["coords"],
            tmpdir,
        )
        for var, field in zip(variables, fields)
    ]

    # `chunks=None` is deprecated in xarray; an empty mapping is the
    # supported equivalent.
    return xr.merge(ds_temps).chunk(chunks=chunks or {})
def get_data(cutout, feature, tmpdir,
             lock=None,
             monthly_requests=False,
             concurrent_requests=False,
             **creation_parameters):
    """
    Retrieve data from DWDs ICON-D2 Model dataset (via ODS).

    Front-end function: downloads the data of one feature, interpolates it
    to the cutout resolution and optionally sanitizes it.

    Parameters
    ----------
    cutout : atlite.Cutout
    feature : str
        Name of the feature data to retrieve. Must be in
        `atlite.datasets.icon_d2.features`
    tmpdir : str/Path
        Directory where the temporary files are stored.
    lock : optional
        Passed on in the retrieval parameters.
    monthly_requests : bool
        Takes no effect, only here for consistency with other dataset modules.
    concurrent_requests : bool
        Takes no effect, only here for consistency with other dataset modules.
    **creation_parameters :
        Additional keyword arguments. The only effective argument is 'sanitize'
        (default True) which sets sanitization of the data on or off.

    Returns
    -------
    xarray.Dataset
        Dataset of the retrieved variables on the cutout grid.
    """
    coords = cutout.coords
    sanitize = creation_parameters.get("sanitize", True)

    retrieval_params = {
        "product": "dwd_icon_d2",
        "area": _area(coords),
        "chunks": cutout.chunks,
        "grid": [cutout.dx, cutout.dy],
        "tmpdir": tmpdir,
        "lock": lock,
        "tz": cutout.data.tz,
        "interp_s": cutout.data.interp_s,
        "interp_t": cutout.data.interp_t,
        "coords": coords,
    }

    # Feature-specific functions are looked up by name in this module.
    module_names = globals()
    func = module_names.get(f"get_data_{feature}")
    sanitize_func = module_names.get(f"sanitize_{feature}")

    logger.info(f"Requesting data for feature {feature} for ICON-D2 from open-dwd...")

    def retrieve_once(time, static=False):
        ds = func({**retrieval_params, **time})
        ds = _interpolate_to_cutout_resolution(ds, retrieval_params, static)
        # Sanitize after interpolation to remove residuals.
        if sanitize and sanitize_func is not None:
            ds = sanitize_func(ds)
        return ds

    is_static = feature in static_features
    times = retrieval_times(coords, cutout.data.tz, is_static)
    dataset = retrieve_once(times, is_static)

    if is_static:
        return dataset.squeeze()
    return dataset.sel(time=coords["time"])
+ + """ + coords = cutout.coords + + sanitize = creation_parameters.get("sanitize", True) + + retrieval_params = { + "product": "dwd_icon_d2", + "area": _area(coords), + "chunks": cutout.chunks, + "grid": [cutout.dx, cutout.dy], + "tmpdir": tmpdir, + "lock": lock, + "tz": cutout.data.tz, + "interp_s": cutout.data.interp_s, + "interp_t": cutout.data.interp_t, + "coords": coords, + } + + func = globals().get(f"get_data_{feature}") + sanitize_func = globals().get(f"sanitize_{feature}") + + logger.info(f"Requesting data for feature {feature} for ICON-D2 from open-dwd...") + + def retrieve_once(time, static=False): + ds = func({**retrieval_params, **time}) + ds = _interpolate_to_cutout_resolution(ds, retrieval_params, static) + # Sanitize the data after interpolation to remove residuals + if sanitize and sanitize_func is not None: + ds = sanitize_func(ds) + return ds + + if feature in static_features: + return retrieve_once(retrieval_times(coords, cutout.data.tz, True), True).squeeze() + + dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False) + + return dataset.sel(time=coords["time"]) diff --git a/atlite/datasets/icon_eu.py b/atlite/datasets/icon_eu.py new file mode 100644 index 00000000..090886ef --- /dev/null +++ b/atlite/datasets/icon_eu.py @@ -0,0 +1,1171 @@ +# -*- coding: utf-8 -*- + +# SPDX-FileCopyrightText: 2016-2021 The Atlite Authors +# +# SPDX-License-Identifier: GPL-3.0-or-later + +""" +Module for downloading and curating data from DWD ICON-EU dataset (via ODS). 
+ +For further reference see: +https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html +""" + +import os +import warnings +import numpy as np +import xarray as xr +import pandas as pd +import requests +import logging + +from pathlib import Path +from retry import retry +from bz2 import decompress +from bs4 import BeautifulSoup + +from ..gis import maybe_swap_spatial_dims +from ..pv.solar_position import SolarPosition + +# Null context for running a with statements wihout any context +try: + from contextlib import nullcontext +except ImportError: + # for Python verions < 3.7: + import contextlib + + @contextlib.contextmanager + def nullcontext(): + yield + + +logger = logging.getLogger(__name__) + +# URL for accessing Open Data from DWD (Deutscher Wetterdienst) +dwd_url = "https://opendata.dwd.de/weather/nwp/icon-eu/grib/" + +# ICON-EU model runs are available at fixed intervals: 00, 03, 06, 09, 12, 15, 18, 21 UTC +model_run_hours = np.array([0, 3, 6, 9, 12, 15, 18, 21]) + +# Averaging window of different model runs +averaging_window = 24 #hours + +# Coordinate Reference System (CRS) used for geospatial data +crs = 4326 + +# Dictionary defining available meteorological features and their associated data fields +features = { + "height": ["height"], # Elevation data + "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, direction, and surface roughness + "influx": [ + "influx_toa", # Top-of-atmosphere solar radiation + "influx_direct", # Direct solar radiation + "influx_diffuse", # Diffuse solar radiation + "albedo", # Surface reflectivity + "solar_altitude", # Solar altitude angle + "solar_azimuth", # Solar azimuth angle + ], + "temperature": ["temperature", "soil temperature"], # Air and soil temperature + "runoff": ["runoff"], # Surface water runoff +} + +# Features that remain constant over time +static_features = {"height"} + +# Model requirements specifying spatial and temporal constraints +requirements = { 
+ 'x': slice(-23.5, 62.5, 0.0625), # Longitude range with resolution + 'y': slice(29.5, 70.5, 0.0625), # Latitude range with resolution + 'offset': pd.Timedelta(hours=-18), # Time offset for forecast initialization + 'forecast': pd.Timedelta(hours=120), # Maximum forecast range + 'dt': pd.Timedelta(hours=1), # Temporal resolution of data + 'parallel': True, # Flag for enabling parallel processing + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. + y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. + """ + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + 
logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + # time_start = time_now + requirements['offset'] + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + # time_start = time_now + requirements['offset'] + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + + # Ensure time_stop is greater than time_start + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. 
Must be <= {feasible_end}.") + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + + # # Check if forecast hours exceed limits + # forecastHours = (time_stop - time_now) + # if forecastHours > requirements['forecast']: + # logger.error(f"The end time of the forecast {time_stop} exceedes the model requirements.") + # logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + # logger.error(f"The required forecast horizon {forecastHours} exceeds the maximum forecast horzion {requirements['forecast']}.") + # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") + # # forecastHours = requirements['forecast'] + # # time_stop = time_now + forecastHours + + # # Check if offset is within required limits + # offset = (time_start - time_now) + # if offset < requirements['offset']: + # logger.error(f"The start time of the forecast {time_start} exceeds model requirements.") + # logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + # logger.warning(f"Forecast offset of {offset} hours is below model requirements.") + # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") + # offset = requirements['offset'] + # time_start = time_now + offset + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + + 
+def _getCurrentRun(time): + ''' + Determines the most recent available model run based on the current time. + The latest run is fully available approximately 2 hours after initialization. + To ensure the model run is successfully uploaded, the download delay time is set to 3 hours. + + This code was adapted from: https://github.com/prayer007/dwdGribExtractor/tree/main + + Parameters + ---------- + time : datetime + The current datetime in UTC. + + Returns + ------- + datetime + The timestamp of the most recent available model run, floored to the hour. + ''' + download_delay = 3 # Delay in hours before the run is fully available + + # Adjust the current time by the delay to ensure availability + adjusted_time = time - pd.Timedelta(hours=download_delay) + + # Find the most recent available run by flooring to the nearest model run hour + run_hour = max(hour for hour in model_run_hours if hour <= adjusted_time.hour) + + # Construct the correct model run time + run_time = adjusted_time.replace(hour=run_hour, minute=0, second=0, microsecond=0) + + return run_time + + + +def _createDownloadUrl(url, var, field, run, hours): + ''' + Generates a list of download URLs for meteorological data from the DWD server. + The function scrapes the available files for a given variable and model run, + filtering based on field type, forecast hours, and model levels. + + This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main + + Parameters + ---------- + url : string + Base URL of the DWD data server. + var : string + The variable name, optionally including levels separated by '/'. + field : string + The field parameter: 'time-invariant' (static), 'soil-level' (162cm), + 'model-level' (62;63), or 'single-level' (2D field). + run : string + Model run identifier. + hours : int + Maximum forecast hours to retrieve. + + Returns + ------- + list + List of filtered download URLs. 
def _deaverage(da):
    """
    Convert averages over a growing window into per-step values.

    With n the 1-based position of a time step, the per-step value is
    reconstructed as n * avg(n) - (n - 1) * avg(n - 1).

    NOTE(review): the weights are the positions along the time axis, which
    presumes equally spaced steps with the first step one interval after the
    start of the averaging window -- confirm against the downloaded files.

    Parameters
    ----------
    da : xarray.DataArray
        Data array with a 'time' dimension holding the running averages.

    Returns
    -------
    xarray.DataArray
        De-averaged values; the first time step is set to NaN.
    """
    n_steps = da.sizes["time"]
    step_number = xr.DataArray(
        np.arange(1, n_steps + 1), dims="time", coords={"time": da.time}
    )

    previous = da.shift(time=1, fill_value=0)
    per_step = step_number * da - (step_number - 1) * previous

    # The first step has no predecessor and is therefore masked.
    first_time = per_step.time[0]
    return per_step.where(per_step.time != first_time, np.nan)
def _deaccumulate(da):
    """
    Convert accumulated values into differences between consecutive steps.

    Parameters
    ----------
    da : xarray.DataArray
        Data array with a 'time' dimension holding accumulated values.

    Returns
    -------
    xarray.DataArray
        Step-to-step increments; the first time step is set to NaN.
    """
    increments = da - da.shift(time=1, fill_value=0)

    # The first step has no predecessor and is therefore masked.
    first_time = increments.time[0]
    return increments.where(increments.time != first_time, np.nan)


def _average_duplicate_times(ds):
    """
    Average all entries of a dataset that share the same time stamp.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose 'time' index may contain duplicates, e.g. after
        concatenating several model runs.

    Returns
    -------
    xarray.Dataset
        Dataset with one entry per time stamp. Dataset and variable
        attributes are taken from `ds`.
    """
    averaged = ds.groupby("time").mean(dim="time", keep_attrs=True)

    # Restore the attributes of the input explicitly.
    averaged.attrs = ds.attrs
    for name in averaged.data_vars:
        averaged[name].attrs = ds[name].attrs

    return averaged
+ + Parameters + ---------- + ds : xarray.Dataset + An xarray Dataset that contains duplicated time indices. + + Returns + ------- + xarray.Dataset + A dataset where duplicate timestamps are averaged and unique timestamps are preserved. + """ + + # Step 1: Compute the mean for duplicate timestamps while keeping unique ones + ds_mean = ds.groupby("time").mean(dim="time", keep_attrs=True) + + # Step 2: Preserve dataset and variable attributes from the original dataset + ds_mean.attrs = ds.attrs # Preserve global dataset attributes + for var in ds_mean.data_vars: + ds_mean[var].attrs = ds[var].attrs # Preserve variable attributes + + return ds_mean + + +def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): + ''' + Downloads meteorological data for a given variable and processes it accordingly. + + This function retrieves data from the specified URL, processes it to rename and clean + coordinates, and applies de-averaging or de-accumulation where necessary based on + the GRIB step type. + + Parameters + ---------- + url : string + The base URL for downloading data. + var : string + The variable name to be downloaded. + field : string + The field type, e.g., 'time-invariant' (static), 'soil-level' (e.g., 162cm), + 'model-level' (e.g., 62;63), or 'single-level' (2D field). + forecast : int + The number of forecast hours to retrieve. + offset : int + The forecast offset time in hours. + coords : atlite.Cutout.coords + The spatial coordinates where data is required. + tmpdir : string + Path to the temporary directory where downloaded files are stored. + + Returns + ------- + xarray.Dataset + Processed dataset containing the collected meteorological data. 
+ ''' + + # Extract the most recent forecast run time + latestRun = forecast[0] + + # Filter only the previous runs before the latest run + previousRuns = offset[offset < latestRun] + + # Keep only entries that align with ICON model run hours + previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True) + + if len(previousRuns) > 0: + # Get the hour of the earliest previous run + first_prev_hour = previousRuns[0].hour + + # Find the previous index in `model_run_hours` + prev_idx = np.where(model_run_hours == first_prev_hour)[0][0] - 1 + + # Compute the adjusted previous run + previousRun = pd.DatetimeIndex([previousRuns[0].replace( + hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 + )]) + + # Add previousRun to previousRuns, ensuring uniqueness + previousRuns = previousRuns.union(previousRun).sort_values(ascending=True) + + # Create a list of runs including the latest run and previous runs + # Use an averaging_window of X hours for all previous runs to average the results + runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [(latestRun.strftime("%H"), len(forecast))] + + # # Generate download URLs for the specified variable and field + # urls = [] + # for run, hours in runs: + # urls = urls + _createDownloadUrl(url, var, field, run, hours) + + # urls = pd.Series(urls).unique() + + ds_temps = [] # List to store temporary datasets + + for run, hours in runs: + + # Generate download URLs for the specified variable and field + urls = _createDownloadUrl(url, var, field, run, hours) + + # Download and collect the main dataset for the given variable + ds_temps.append(_download(urls, tmpdir)) + + + # Concatenate along the time dimension, keeping also duplicated timestamps + ds = xr.concat(ds_temps, dim="time") + + # Average duplicates for smooth forecast transitioning + ds = _average_duplicate_times(ds) + + # # Download and collect the main dataset for the given variable + # ds_temp = 
def _interpolate(ds, static, coords, grid, interp_s, interp_t):
    """
    Bring data onto the target time steps and the target x/y grid.

    Non-static data is first interpolated in time. Afterwards the data is
    averaged spatially: every target coordinate c collects the source cells
    falling into the bin ending at c (lower edge c - grid step) and is
    assigned their mean.

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        The input data with 'x' and 'y' (and, if not static, 'time').
    static : bool
        If True, the temporal interpolation is skipped.
    coords : dict-like
        Target coordinates; 'x', 'y' and (if not static) 'time' are read.
    grid : sequence
        Target grid step in x (grid[0]) and y (grid[1]) direction.
    interp_s : string
        Spatial interpolation method. Currently unused by this function.
    interp_t : string
        Temporal interpolation method.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Data on the target grid, carrying the attributes of the input.
    """

    def _edges_and_labels(targets, step):
        # Prepend one extra lower edge; every bin is labelled by its upper
        # edge, which is the target coordinate itself.
        edges = np.insert(targets, 0, np.round(targets[0] - step, 8), axis=0)
        labels = np.round(edges[:-1] + step, 8)
        return edges, labels

    if not static:
        target_time = coords['time'].values
        extrapolate = {"fill_value": "extrapolate"}
        try:
            ds = ds.interp(time=target_time, method=interp_t, kwargs=extrapolate)
        except ValueError:
            logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.")
            logger.info("Interpolation method is set to 'nearest' instead.")
            ds = ds.interp(time=target_time, method="nearest", kwargs=extrapolate)

    x_bins, x_bins_label = _edges_and_labels(coords['x'].values, grid[0])
    y_bins, y_bins_label = _edges_and_labels(coords['y'].values, grid[1])

    # Attributes are restored after the group operations.
    attrs = ds.attrs

    for dim, edges, labels in (
        ("x", x_bins, x_bins_label),
        ("y", y_bins, y_bins_label),
    ):
        ds = ds.groupby_bins(dim, edges, labels=labels).mean(dim=dim)

    ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'})
    return ds.assign_attrs(attrs)
+ ''' + + # Perform temporal interpolation if the data is not static + if not static: + try: + ds = ds.interp(time=coords['time'].values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}) + except ValueError: + logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info("Interpolation method is set to 'nearest' instead.") + ds = ds.interp(time=coords['time'].values, + method="nearest", + kwargs={"fill_value": "extrapolate"}) + + # Create bin edges and labels for x-coordinates + x_bins = coords['x'].values + x_bins = np.insert(x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0) # Extend bin range + x_bins_label = np.round(x_bins[:-1] + grid[0], 8) # Compute bin centers + + # Create bin edges and labels for y-coordinates + y_bins = coords['y'].values + y_bins = np.insert(y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0) # Extend bin range + y_bins_label = np.round(y_bins[:-1] + grid[1], 8) # Compute bin centers + + # Store original dataset attributes + attrs = ds.attrs + + # Perform spatial binning by grouping data into bins along x and y dimensions and computing the mean + ds = ds.groupby_bins("x", x_bins, labels=x_bins_label).mean(dim="x") + ds = ds.groupby_bins("y", y_bins, labels=y_bins_label).mean(dim="y") + + # Rename bins to standard coordinate names + ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'}) + + # Reassign original dataset attributes + ds = ds.assign_attrs(attrs) + + return ds + + +@retry(tries=5, delay=5, backoff=2, logger=logger) +def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', **kwargs): + ''' + Attempts to download and decompress a dataset file with automatic retry on failure. + + This function fetches data from a given URL, retries up to five times in case of failure, + and decompresses the response content before saving it to a temporary file. + + Parameters + ---------- + data_url : string + The URL from which data should be downloaded. 
+ tmpfp : string + The file path where the downloaded content will be temporarily stored. + + Returns + ------- + tuple + - resp (requests.Response): The HTTP response object from the request. + - ds (xarray.Dataset): The dataset extracted from the downloaded file. + ''' + + # Send an HTTP GET request to the data URL with a timeout of 5 seconds + resp = requests.get(data_url, timeout=5) + + # Check if the request was successful (HTTP 200 OK) + if resp.status_code == 200: + # Open the specified temporary file and write the decompressed response content + with open(tmpfp, 'wb') as f: + f.write(decompress(resp.content)) + else: + # Raise an error if the response was unsuccessful + raise ValueError(f"Error in response: {resp.reason}, status code: {resp.status_code}") + + # Load the downloaded file as an xarray dataset using the requested engine ('cfgrib' by default) + ds = xr.open_dataset(tmpfp, engine=engine) + + # Return both the HTTP response object and the loaded dataset + return resp, ds + + +def _download(urls, tmpdir=None): + ''' + Downloads the data files behind the given URLs and merges them into one dataset. + + This function retrieves the data files, processes them, and merges them into + a single dataset. URLs that cannot be downloaded or opened are logged and skipped; + averaged and accumulated variables are passed through `_deaverage` / `_deaccumulate`. + + Parameters + ---------- + urls : iterable of string + The download URLs of the files to fetch, e.g. as returned by `_createDownloadUrl`. + tmpdir : string, optional + Temporary directory for storing downloaded files. + + Returns + ------- + xarray.Dataset + Merged dataset containing the collected meteorological data.
+ ''' + + # # Extract the most recent forecast run time + # latestRun = forecast[0] + + # # Filter only the previous runs before the latest run + # previousRuns = offset[offset < latestRun] + + # # Keep only entries that align with ICON model run hours + # previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)] + + # # Create a list of runs including the latest run and previous runs + # runs = [(latestRun.strftime("%H"), len(forecast))] + [(run.strftime("%H"), 3) for run in previousRuns] + + # # Generate download URLs for the specified variable and field + # urls = [] + # for run, hours in runs: + # urls = urls + _createDownloadUrl(url, var, field, run, hours) + + # urls = pd.Series(urls).unique() + + ds_temps = [] # List to store temporary datasets + + # Iterate over generated URLs and process each file + for data_url in urls: + logger.info("ICON-D2 data -> Processing file: {f}".format(f=data_url)) + + # Extract filename from URL and construct temporary file path + tmpfn = os.path.basename(data_url) + tmpfn = Path(tmpfn).with_suffix('') + tmpfp = "{p}/{tmpfn}".format(tmpfn=tmpfn, p=tmpdir) + + # Attempt to download and extract the dataset + try: + resp, ds_temp = _urlopen_with_retry(data_url, tmpfp) + except Exception as err: + logger.info("Could not get {url}: {err}".format(err=err, url=data_url)) + continue # Skip to next URL if download fails + + # Check if the dataset contains a 'generalVerticalLayer' coordinate + if 'generalVerticalLayer' in ds_temp.coords: + ds_coords = list(ds_temp.coords) + ds_coords_to_keep = ["valid_time", "longitude", "latitude", "generalVerticalLayer"] + ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] + + # Expand dataset dimensions and remove unwanted coordinates + ds_temp = ds_temp.expand_dims(dim=["valid_time", "generalVerticalLayer"]).drop_vars(ds_coords_to_drop) + + # Assign coordinate values back to dataset + ds_temp = ds_temp.assign_coords({"valid_time": 
ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude, + "generalVerticalLayer": ds_temp.generalVerticalLayer}) + ds_temps.append(ds_temp) + + else: + ds_coords = list(ds_temp.coords) + ds_coords_to_keep = ["valid_time", "longitude", "latitude"] + ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] + + # Swap 'step' dimension with 'valid_time' if applicable + if "step" in ds_temp.dims: + ds_temp = ds_temp.swap_dims({"step": "valid_time"}).drop_vars(ds_coords_to_drop) + else: + ds_temp = ds_temp.expand_dims(dim="valid_time").drop_vars(ds_coords_to_drop) + + # Assign coordinate values back to dataset + ds_temp = ds_temp.assign_coords({"valid_time": ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude}) + ds_temps.append(ds_temp) + + # Merge all collected datasets into a single dataset + ds = xr.merge(ds_temps) + + # Rename and clean coordinate labels for consistency + ds = _rename_and_clean_coords(ds) + + # Iterate through all data variables in the dataset + for ds_var in list(ds.data_vars): + # If the variable is an averaged quantity, apply de-averaging + if ds[ds_var].attrs['GRIB_stepType'] == 'avg': + ds[ds_var] = _deaverage(ds[ds_var]) + + # If the variable is an accumulated quantity, apply de-accumulation + elif ds[ds_var].attrs['GRIB_stepType'] == 'accum': + ds[ds_var] = _deaccumulate(ds[ds_var]) + + return ds + + +def _rename_and_clean_coords(ds, add_lon_lat=True): + """ + Rename 'longitude' and 'latitude' columns to 'x' and 'y' and fix roundings. + + Optionally (add_lon_lat, default:True) preserves latitude and + longitude columns as 'lat' and 'lon'. 
+ """ + ds = ds.rename({"longitude": "x", "latitude": "y"}) + if "valid_time" in ds.sizes: + ds = ds.rename({"valid_time": "time"}).unify_chunks() + # round coords since cds coords are float32 which would lead to mismatches + ds = ds.assign_coords( + x=np.round(ds.x.astype(float), 5), y=np.round(ds.y.astype(float), 5) + ) + ds = maybe_swap_spatial_dims(ds) + if add_lon_lat: + ds = ds.assign_coords(lon=ds.coords["x"], lat=ds.coords["y"]) + + return ds + +def _interpolate_to_cutout_resolution(ds, retrieval_params, static): + + # Interpolate the data spatially and temporally to the wanted cutout resolution + ds_temps = [] + for idx, var in enumerate(ds.data_vars): + ds_temps.append(_interpolate(ds[var], static, + retrieval_params['coords'], + retrieval_params['grid'], + retrieval_params['interp_s'], + retrieval_params['interp_t']) + ) + + ds = xr.merge(ds_temps) + ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + ds = ds.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {}) + + return ds + + +def get_data_wind(retrieval_params): + ''' + Retrieves and processes wind data from the DWD server. + + The function collects wind speed and direction data at 100m above ground level, + as well as surface roughness data. It then processes and interpolates this data + to match the desired spatial and temporal resolution. + + Parameters + ---------- + retrieval_params : dict + Dictionary containing parameters for data retrieval, including coordinates, + grid resolution, and interpolation methods. + + Returns + ------- + xarray.Dataset + Processed dataset containing wind speed, wind direction, and surface roughness. 
+ ''' + + # Retrieve wind data from model levels 62 and 63 + retrieval_params['field'] = ['model-level', 'model-level'] + ds = retrieve_data( + url=dwd_url, + variable=[ + "u/62/63", # Zonal (east-west) wind component at levels 62 and 63 + "v/62/63", # Meridional (north-south) wind component at levels 62 and 63 + ], + **retrieval_params, + ) + + # Compute the mean wind values across the general vertical layers + ds["u"] = ds["u"].mean('generalVerticalLayer') + ds["v"] = ds["v"].mean('generalVerticalLayer') + ds = ds.drop_dims('generalVerticalLayer') # Remove the dimension after averaging + ds = ds.rename({"u": "u_100m", "v": "v_100m"}) # Rename variables for clarity + + + # Retrieve surface roughness data from single-level data + retrieval_params['field'] = ['single-level'] + ds2 = retrieve_data( + url=dwd_url, + variable=["z0"], # Surface roughness length + **retrieval_params, + ) + + # Merge wind data with roughness data into a single dataset + ds = xr.merge([ds, ds2]) + + # Rename roughness variable for clarity + ds = ds.rename({"fsr": "roughness"}) + ds["roughness"] = ds["roughness"].assign_attrs( + units="m", + long_name="Surface roughness" + ) + + # Compute wind speed at 100m using the Pythagorean theorem + ds["wnd100m"] = np.sqrt(ds["u_100m"] ** 2 + ds["v_100m"] ** 2).assign_attrs( + units="m/s", long_name="100 metre wind speed" + ) + + # Compute wind direction azimuth (0 = North, π/2 = East, π = South, 3π/2 = West) + azimuth = np.arctan2(ds["u_100m"], ds["v_100m"]) + + # Ensure wind azimuth is within the 0 to 2π range + ds["wnd_azimuth"] = azimuth.where(azimuth >= 0, azimuth + 2 * np.pi) + + # Remove intermediate wind component variables after processing + ds = ds.drop_vars(["u_100m", "v_100m"]) + + return ds + + +def sanitize_wind(ds): + """Sanitize retrieved wind data.""" + ds["roughness"] = ds["roughness"].where(ds["roughness"] >= 0.0, 2e-4) + return ds + + +def get_data_influx(retrieval_params): + """Get influx data for given retrieval parameters.""" + 
# Retrieve single-level data + retrieval_params['field'] = ['single-level', 'single-level', 'single-level', 'single-level'] + ds = retrieve_data( + url=dwd_url, + variable=[ + "asob_t", + "aswdir_s", + "aswdifd_s", + "alb_rad", + ], + **retrieval_params, + ) + + ds = ds.rename({"avg_tnswrf": "influx_toa", + "ASWDIR_S": "influx_direct", + "ASWDIFD_S": "influx_diffuse", + "al": "albedo"}) + + ds["albedo"] = (ds["albedo"]/100).assign_attrs(units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation") + ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(units="W m**-2", long_name="Surface down solar diffuse radiation") + ds["influx_direct"] = ds["influx_direct"].assign_attrs(units="W m**-2", long_name="Surface down solar direct radiation") + ds["influx_toa"] = ds["influx_toa"].assign_attrs(units="W m**-2", long_name="Net short-wave radiation flux at top of atmosphere (TOA)") + + # # Interpolate the data spatially and temporally to the wanted cutout resolution + # ds_temps = [] + # for idx, var in enumerate(ds): + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], + # retrieval_params['grid'], + # retrieval_params['interp_s'], + # retrieval_params['interp_t']) + # ) + + # ds = xr.merge(ds_temps) + # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + # ICON-EU variables are mean values for previous hour, i.e. 13:01 to 14:00 are labelled as "14:00" + # account by calculating the SolarPosition for the center of the interval for aggregation happens + # see https://github.com/PyPSA/atlite/issues/158 + # Do not show DeprecationWarning from new SolarPosition calculation (#199) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + # Convert dt / time frequency to timedelta and shift solar position by half + # (freqs like ["H","30T"] do not work with pd.to_timedelta(...) 
+ time_shift = ( + -1 + / 2 + * pd.to_timedelta( + pd.date_range( + "1970-01-01", periods=1, freq=pd.infer_freq(ds["time"]) + ).freq + ) + ) + sp = SolarPosition(ds, time_shift=time_shift) + + sp = sp.rename({v: f"solar_{v}" for v in sp.data_vars}) + + ds = xr.merge([ds, sp]) + + # # Interpolate the data spatially and temporally to the wanted cutout resolution + # ds_temps = [] + # for idx, var in enumerate(ds): + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], + # retrieval_params['grid'], + # retrieval_params['interp_s'], + # retrieval_params['interp_t']) + # ) + + # ds = xr.merge(ds_temps) + # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + # ds = ds.unify_chunks.chunk(chunks=retrieval_params['chunks'] or {}) + + + + # ds = ds.drop_vars(['lon','lat']) + # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) + + return ds + + +def sanitize_influx(ds): + """Sanitize retrieved influx data.""" + for a in ("influx_direct", "influx_diffuse", "influx_toa"): + ds[a] = ds[a].clip(min=0.0) + return ds + + +def get_data_temperature(retrieval_params): + """Get temperature data for given retrieval parameters.""" + # Retrieve single-level and soil-level data + retrieval_params['field'] = ['single-level','soil-level'] + ds = retrieve_data( + url=dwd_url, + variable=["t_2m", + "t_so/162"], + **retrieval_params + ) + + ds = ds.rename({"t2m": "temperature", + "T_SO": "soil temperature"}) + + ds["temperature"] = ds["temperature"].assign_attrs(units="K", long_name="Temperature at 2m above ground") + ds["soil temperature"] = ds["soil temperature"].assign_attrs(units="K", long_name="Soil temperature in 162 cm depth ") + + return ds + + +def get_data_runoff(retrieval_params): + """Get runoff data for given retrieval parameters.""" + # Retrieve single-level data + retrieval_params['field'] = ['single-level','single-level'] + ds = retrieve_data(url=dwd_url, + variable=["runoff_s", + "runoff_g"], + **retrieval_params) + +
ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(units="kg m**-2", long_name="Surface and Soil water runoff (accumulated since model start)") + + ds = ds.drop_vars(["RUNOFF_S", "RUNOFF_G"]) + + return ds + + +def sanitize_runoff(ds): + """Sanitize retrieved runoff data.""" + ds["runoff"] = ds["runoff"].clip(min=0.0) + return ds + + +def get_data_height(retrieval_params): + """Get height data for given retrieval parameters.""" + # Retrieve time-invariant data + retrieval_params['field'] = ['time-invariant'] + ds = retrieve_data(url=dwd_url, + variable=["hsurf"], + **retrieval_params) + + ds = ds.rename({"HSURF": "height"}) + ds["height"] = ds["height"].assign_attrs( + units="m", + long_name="Geometric Height of the earths surface above sea level (2D field)" + ) + + return ds + + +def _area(coords): + # North, West, South, East. Default: global + x0, x1 = coords["x"].min().item(), coords["x"].max().item() + y0, y1 = coords["y"].min().item(), coords["y"].max().item() + return [y1, x0, y0, x1] + + +def retrieval_times(coords, tz, static=False): + """ + Get retrieval time dimension of the forecast. + + Parameters + ---------- + coords : atlite.Cutout.coords + Coordinate object containing the time dimension. + tz : timezone + Timezone information of the input time and date (currently unused here). + static : bool, optional (default=False) + If True, return only the first forecast time step and an empty offset. 
+ + Returns + ------- + dict + Dictionary with keys: + - 'forecast': pd.DatetimeIndex of forecast timestamps (including any filled gaps) + - 'offset': pd.DatetimeIndex of past timestamps before current model run time + """ + # Convert xarray time coordinate to pandas index + time = coords["time"].to_index() + + # Get the current model run time (e.g., most recent 6-hourly forecast release) + currentRunTime = _getCurrentRun(pd.Timestamp.utcnow().replace(tzinfo=None)) + + # Split times into forecast (≥ currentRunTime) and offset (< currentRunTime) + forecast_times = time[time >= currentRunTime] + offset_times = time[time < currentRunTime] + + # If the forecast doesn't include currentRunTime explicitly, fill the missing range + if len(forecast_times) > 0 and forecast_times[0] > currentRunTime: + # Infer the time resolution as an offset object: pd.infer_freq returns an alias string, which cannot be subtracted from a Timestamp below + freq = pd.tseries.frequencies.to_offset(time.freq or pd.infer_freq(time)) + if freq is None: + freq = time[1] - time[0] # fallback to timedelta if no freq is inferable + + # Fill the missing time steps from currentRunTime up to just before the first forecast time + fill = pd.date_range(currentRunTime, forecast_times[0] - freq, freq=freq) + + # Prepend the filled range to the forecast times + forecast_times = fill.append(forecast_times) + + # If static mode is requested, return only the first forecast time and no offset + if static: + forecast_times = forecast_times[:1] + offset_times = pd.DatetimeIndex([]) + + # Return dictionary with forecast and offset times + return { + "forecast": forecast_times, + "offset": offset_times, + } + + +def noisy_unlink(path): + """Delete file at given path.""" + logger.debug(f"Deleting file {path}") + try: + os.unlink(path) + except PermissionError: + logger.error(f"Unable to delete file {path}, as it is still in use.") + + +def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): + + """ + Download data from the ICON-EU Model from the Open Data Server (ODS) of DWD.
+ + If you want to manually download the data go to: + https://opendata.dwd.de/weather/nwp/icon-eu/grib/ + """ + + request = {"product_type": "icon_eu", "format": "direct-download"} + request.update(updates) + + ds_temps = [] + # Download data for each variable individually and then merge all in one xarray + logger.info(f"open-dwd: Downloading variables\n\t{request['variable']}\n") + for idx, var in enumerate(request['variable']): + ds_temps.append(_mainDataCollector(url, + var, + request['field'][idx], + request['forecast'], + request['offset'], + request['coords'], + tmpdir) + ) + + ds = xr.merge(ds_temps).chunk(chunks=chunks) + + return ds + + +def get_data(cutout, feature, tmpdir, + lock=None, + monthly_requests=False, + concurrent_requests=False, + **creation_parameters): + """ + Retrieve data from DWD's ICON-EU Model dataset (via ODS). + + This front-end function downloads data for a specific feature and formats + it to match the given Cutout. + + Parameters + ---------- + cutout : atlite.Cutout + feature : str + Name of the feature data to retrieve. Must be in + `atlite.datasets.icon_eu.features` + monthly_requests : bool + Has no effect, only here for consistency with other dataset modules. + concurrent_requests : bool + Has no effect, only here for consistency with other dataset modules. + tmpdir : str/Path + Directory where the temporary download files are stored. + **creation_parameters : + Additional keyword arguments. The only effective argument is 'sanitize' + (default True) which sets sanitization of the data on or off. + + Returns + ------- + xarray.Dataset + Dataset of dask arrays of the retrieved variables.
+ + """ + coords = cutout.coords + + sanitize = creation_parameters.get("sanitize", True) + + retrieval_params = { + "product": "dwd_icon_eu", + "area": _area(coords), + "chunks": cutout.chunks, + "grid": [cutout.dx, cutout.dy], + "tmpdir": tmpdir, + "lock": lock, + "tz": cutout.data.tz, + "interp_s": cutout.data.interp_s, + "interp_t": cutout.data.interp_t, + "coords": coords, + } + + func = globals().get(f"get_data_{feature}") + sanitize_func = globals().get(f"sanitize_{feature}") + + logger.info(f"Requesting data for feature {feature} for ICON-EU from open-dwd...") + + def retrieve_once(time, static=False): + ds = func({**retrieval_params, **time}) + ds = _interpolate_to_cutout_resolution(ds, retrieval_params, static) + # Sanitize the data after interpolation to remove residuals + if sanitize and sanitize_func is not None: + ds = sanitize_func(ds) + return ds + + if feature in static_features: + return retrieve_once(retrieval_times(coords, cutout.data.tz, True), True).squeeze() + + dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False) + + return dataset.sel(time=coords["time"]) diff --git a/atlite/datasets/meteo_forecast.py b/atlite/datasets/meteo_forecast.py new file mode 100644 index 00000000..1da22a94 --- /dev/null +++ b/atlite/datasets/meteo_forecast.py @@ -0,0 +1,990 @@ +# -*- coding: utf-8 -*- + +# SPDX-FileCopyrightText: 2016-2021 The Atlite Authors +# +# SPDX-License-Identifier: GPL-3.0-or-later + +""" +Module for downloading and curating forecast data from different weather prediction models (via Open-Meteo). 
+ +For further reference see +https://open-meteo.com/en/docs +""" + +import logging +import os +import io +import re +import zipfile +import warnings +import weakref +import time +import xarray as xr +import datetime +import openmeteo_requests +import requests_cache +import cdsapi +import numpy as np +import pandas as pd + +from numpy import atleast_1d +from dask import compute, delayed +from tempfile import mkstemp +from retry_requests import retry + +from ..gis import maybe_swap_spatial_dims +from ..pv.solar_position import SolarPosition + +# Null context for running a with statement without any context +try: + from contextlib import nullcontext +except ImportError: + # for Python versions < 3.7: + import contextlib + + @contextlib.contextmanager + def nullcontext(): + yield + + +logger = logging.getLogger(__name__) + +# Set up Open-Meteo client with 7-day cache and retry on failure +# Cache duration: -1 = never expire, 0 = no caching, timedelta = expire after set time +cache_window = datetime.timedelta(days=7) +cache_session = requests_cache.CachedSession('.meteo.cache', backend='sqlite', expire_after=cache_window) +retry_session = retry(cache_session, retries=5, backoff_factor=3) +openmeteo = openmeteo_requests.Client(session=retry_session) + +# Set url for data download, this allows to switch to different data +# sources more easily.
+era5_url = 'https://cds.climate.copernicus.eu/api' +meteo_url1 = "https://api.open-meteo.com/v1/forecast" +meteo_url2 = "https://api.open-meteo.com/v1/ecmwf" + +# Open-Meteo request limits +MINUTE_LIMIT = 600 +HOUR_LIMIT = 5000 +DAY_LIMIT = 10000 + +# Delay of ERA5 data upload +# For Open-Meteo slow changing variables from ERA5 are always interpolated +# Starting from data of at least 7 days before nowtime +ERA5_DELAY = pd.Timedelta(hours=-6.0*24) + +# Model and CRS Settings +crs = 4326 + +features = { + "height": ["height"], + "wind": ["wnd80m", "wnd_azimuth", "roughness"], + "influx": [ + "influx_toa", + "influx_direct", + "influx_diffuse", + "albedo", + "solar_altitude", + "solar_azimuth", + ], + "temperature": ["temperature", "soil temperature"], + "runoff": ["runoff"] +} + +static_features = {"height"} + +requirements = {'x': slice(-90, 90, 0.1), + 'y': slice(-90, 90, 0.1), + 'offset': pd.Timedelta(hours=-92.0*24), + 'forecast': pd.Timedelta(hours=16.0*24), + 'dt': pd.Timedelta(hours=1), + 'parallel': False, + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. + y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. 
+ """ + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be < {feasible_end}.") + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + + # Ensure time_stop does not exceed the feasible forecast horizon + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + + +def _add_height(ds): + """Convert geopotential 'z' to geopotential height following [1].
+ + References + ---------- + [1] ERA5: surface elevation and orography, retrieved: 10.02.2019 + https://confluence.ecmwf.int/display/CKB/ERA5%3A+surface+elevation+and+orography + + """ + g0 = 9.80665 + z = ds["z"] + if "time" in z.coords: + z = z.isel(time=0, drop=True) + ds["height"] = z / g0 + ds = ds.drop_vars("z") + return ds + + +def _rename_and_clean_coords(ds, add_lon_lat=True): + """ + Rename 'longitude' and 'latitude' columns to 'x' and 'y' and fix roundings. + + Optionally (add_lon_lat, default:True) preserves latitude and + longitude columns as 'lat' and 'lon'. + """ + ds = ds.rename({"longitude": "x", "latitude": "y"}) + if "valid_time" in ds.sizes: + ds = ds.rename({"valid_time": "time"}).unify_chunks() + # round coords since cds coords are float32 which would lead to mismatches + ds = ds.assign_coords( + x=np.round(ds.x.astype(float), 5), y=np.round(ds.y.astype(float), 5) + ) + ds = maybe_swap_spatial_dims(ds) + if add_lon_lat: + ds = ds.assign_coords(lon=ds.coords["x"], lat=ds.coords["y"]) + + # Combine ERA5 and ERA5T data into a single dimension. + # See https://github.com/PyPSA/atlite/issues/190 + if "expver" in ds.coords: + unique_expver = np.unique(ds["expver"].values) + if len(unique_expver) > 1: + expver_dim = xr.DataArray( + unique_expver, dims=["expver"], coords={"expver": unique_expver} + ) + ds = ( + ds.assign_coords({"expver_dim": expver_dim}) + .drop_vars("expver") + .rename({"expver_dim": "expver"}) + .set_index(expver="expver") + ) + for var in ds.data_vars: + ds[var] = ds[var].expand_dims("expver") + # expver=1 is ERA5 data, expver=5 is ERA5T data This combines both + # by filling in NaNs from ERA5 data with values from ERA5T. 
+ ds = ds.sel(expver="0001").combine_first(ds.sel(expver="0005")) + ds = ds.drop_vars(["expver", "number"], errors="ignore") + + return ds + + +def _interpolate(ds, ds_ref, static, interp_s, interp_t): + + #Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) + + if not static: + try: + ds = ds.interp( + time=ds_ref.time.values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) + except ValueError: + logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info("Interpolation method is set to 'nearest' instead.") + ds = ds.interp( + time=ds_ref.time.values, + method='nearest', + kwargs={"fill_value": "extrapolate"}, + ) + + try: + ds = ds.interp( + x=ds_ref.x.values, + y=ds_ref.y.values, + method=interp_s, + kwargs={"fill_value": "extrapolate"}, + ) + except ValueError: + logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_s}.") + logger.info("Interpolation method is set to 'nearest' instead.") + ds = ds.interp( + x=ds_ref.x.values, + y=ds_ref.y.values, + method='nearest', + kwargs={"fill_value": "extrapolate"}, + ) + + return ds + +def get_data_meteo_wind(retrieval_params): + """Get all data from meteo API for given retrieval parameters at once to save requests and runtime.""" + + ds = retrieve_meteo_data( + url=meteo_url1, + variable=[ + "windspeed_80m", + "winddirection_80m", + ], + **retrieval_params, + ) + + return ds + + +def get_data_meteo_influx(retrieval_params): + """Get all data from meteo API for given retrieval parameters at once to save requests and runtime.""" + + ds = retrieve_meteo_data( + url=meteo_url1, + variable=[ + # "shortwave_radiation", + "direct_radiation", + "diffuse_radiation", + # "direct_normal_irradiance", + # "terrestrial_radiation", + ], + **retrieval_params, + ) + + return ds + + +def get_data_meteo_temperature(retrieval_params): + """Get all data from meteo API for 
def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t):
    """
    Merge Open-Meteo radiation data with interpolated ERA5 influx data.

    The ERA5 dataset is interpolated onto the coordinates of ``ds_meteo``,
    both datasets are merged and renamed to the atlite variable names,
    ``influx_toa`` is converted from J m**-2 to W m**-2 and the solar
    position variables (prefixed with ``solar_``) are appended.

    Parameters
    ----------
    ds_meteo : xarray.Dataset
        Dataset holding 'direct_radiation' and 'diffuse_radiation'.
    ds_era5 : xarray.Dataset
        Dataset holding 'tisr' and 'fal'.
    interp_s : str
        Spatial interpolation method.
    interp_t : str
        Temporal interpolation method.

    Returns
    -------
    xarray.Dataset
    """
    era5_on_meteo_grid = _interpolate(
        ds=ds_era5,
        ds_ref=ds_meteo,
        static=False,
        interp_s=interp_s,
        interp_t=interp_t,
    )

    atlite_names = {
        "direct_radiation": "influx_direct",
        "diffuse_radiation": "influx_diffuse",
        "tisr": "influx_toa",
        "fal": "albedo",
    }
    merged = xr.merge([ds_meteo, era5_on_meteo_grid]).rename(atlite_names)

    # Convert from energy to power: J m**-2 -> W m**-2
    seconds_per_hour = 60.0 * 60.0
    merged["influx_toa"] = merged["influx_toa"] / seconds_per_hour

    long_names = {
        "influx_direct": "Surface direct solar radiation downwards",
        "influx_diffuse": "Surface diffuse solar radiation downwards",
        "influx_toa": "TOA incident solar radiation",
    }
    for variable, long_name in long_names.items():
        merged[variable].attrs.update(units="W m**-2", long_name=long_name)

    # unify_chunks() is necessary to avoid a bug in xarray
    merged = merged.unify_chunks()

    # ERA5 variables are mean values for the previous hour, i.e. 13:01 to
    # 14:00 are labelled as "14:00". Account for this by calculating the
    # SolarPosition for the centre of the interval,
    # see https://github.com/PyPSA/atlite/issues/158
    # Do not show DeprecationWarning from new SolarPosition calculation (#199)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        half_hour_back = pd.to_timedelta("-30 minutes")
        solar_position = SolarPosition(merged, time_shift=half_hour_back)
    solar_position = solar_position.rename(
        {name: f"solar_{name}" for name in solar_position.data_vars}
    )

    return xr.merge([merged, solar_position])
def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **updates):
    """
    Download hourly weather data for a grid of points from the Open-Meteo API.

    The grid is requested in blocks of locations. A block whose request
    fails is retried once after a rate-limit dependent pause; if the retry
    fails as well, the block is logged and skipped.

    Parameters
    ----------
    url : str
        API endpoint passed to the Open-Meteo client.
    product : str
        Product identifier. Not used by this function; accepted so that the
        same retrieval parameters can be passed as for other retrievals.
    chunks : dict, optional
        Chunking applied to the returned dataset.
    tmpdir : str, optional
        Not used by this function.
    lock : optional
        Not used by this function.
    **updates
        Request parameters. Must contain 'coords' (providing 'x' and 'y'),
        'start' and 'end' (objects providing ``strftime``) and 'variable'
        (list of hourly variable names).

    Returns
    -------
    xarray.Dataset
        Retrieved variables, renamed and cleaned by
        ``_rename_and_clean_coords`` and chunked as requested.

    Raises
    ------
    ValueError
        If not a single block of locations could be retrieved.
    """
    # Build request from base + user overrides
    request = {"product_type": "meteo_api", "format": "direct_download"}
    request.update(updates)

    # Generate the table of (lon, lat) coordinate pairs
    lon_grid, lat_grid = np.meshgrid(request['coords']['x'], request['coords']['y'])
    coords = pd.DataFrame(
        {"longitude": lon_grid.ravel(), "latitude": lat_grid.ravel()}
    )

    # Calculate time and variable counts
    start_date = request['start'].strftime('%Y-%m-%d')
    end_date = request['end'].strftime('%Y-%m-%d')
    nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days)
    nr_variables = len(request['variable'])
    nr_locations = len(coords)

    # Estimate the weight of requesting everything at once
    weight_of_full_api_request = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations

    # Determine the number of locations per request from the rate limits
    if weight_of_full_api_request <= MINUTE_LIMIT:
        chunk_size = MINUTE_LIMIT / 10
    elif weight_of_full_api_request <= HOUR_LIMIT:
        chunk_size = HOUR_LIMIT / 10
    else:
        chunk_size = DAY_LIMIT / 10

    chunk_size = int(max(1, chunk_size))  # Ensure chunk_size >= 1 and integer

    logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n")
    logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}")

    # Loop through coordinate grid in blocks and request data
    data = []
    for i in range(0, nr_locations, chunk_size):
        coord_chunk = coords.iloc[i:i + chunk_size]

        # Prepare API parameters for the current chunk
        params = {
            "longitude": coord_chunk['longitude'].tolist(),
            "latitude": coord_chunk['latitude'].tolist(),
            "hourly": request['variable'],
            "wind_speed_unit": "ms",
            "start_date": start_date,
            "end_date": end_date
        }

        try:
            responses = openmeteo.weather_api(url, params=params)
        except Exception as first_error:
            logger.info(f"{first_error}")

            # Classify the rate limiting error. The payload is parsed
            # defensively so that an unexpected error shape still leads to
            # the fallback pause and a retry instead of skipping the block.
            reason = ""
            if first_error.args and isinstance(first_error.args[0], dict):
                reason = str(first_error.args[0].get("reason", ""))
            match = re.search(r"(Minutely|Hourly|Daily) API request limit", reason)
            apply_rate_limiting(error=match[0] if match else None)

            try:
                # Retry after delay
                responses = openmeteo.weather_api(url, params=params)
            except Exception as retry_error:
                # Skip this chunk on repeated failure. The chunk keeps the
                # row labels of the full table, so its first row has to be
                # addressed by position rather than by label 0.
                first_point = coord_chunk.iloc[0]
                logger.error(f"Meteo-API: Failed to fetch data for block starting at "
                             f"({first_point['longitude']}, {first_point['latitude']}): {retry_error}")
                continue

        # Parse chunked response and append results
        data.extend(parse_meteo_responses(responses, params))

    if not data:
        raise ValueError(
            "Meteo-API: No data could be retrieved for any of the requested locations."
        )

    # Combine all parsed DataFrames into a single xarray dataset
    ds = pd.concat(data).to_xarray()
    ds = _rename_and_clean_coords(ds)
    ds = ds.chunk(chunks=chunks)

    return ds
+ """ + + data = [] + for j, response in enumerate(responses): + # Reconstruct time index based on interval + range_start = pd.to_datetime(response.Hourly().Time(), unit="s") + range_end = pd.to_datetime(response.Hourly().TimeEnd(), unit="s") + date_range = pd.date_range(start=range_start, end=range_end, + freq=pd.Timedelta(seconds=response.Hourly().Interval()), + inclusive="left") + + # Initialize empty DataFrame for the current location + response_df = pd.DataFrame(columns=params["hourly"]) + response_df["time"] = date_range + response_df["latitude"] = params["latitude"][j] + response_df["longitude"] = params["longitude"][j] + response_df = response_df.set_index(["time", "latitude", "longitude"]) + + # Fill in variable values + for i, param in enumerate(params["hourly"]): + response_df[param] = response.Hourly().Variables(i).ValuesAsNumpy() + + data.append(response_df) + + return data + + +def apply_rate_limiting(error=None): + """ + Apply appropriate sleep duration based on API rate-limiting error. + + Parameters + ---------- + error : str or None + One of 'Minutely API request limit', 'Hourly API request limit', + 'Daily API request limit', or None (fallback delay). 
def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **updates):
    """
    Download data like ERA5 from the Climate Data Store (CDS).

    The request is downloaded as a zip archive into a temporary file. All
    netCDF members are read into memory, opened with xarray and merged if
    there is more than one.

    If you want to track the state of your request go to
    https://cds-beta.climate.copernicus.eu/requests?tab=all

    Parameters
    ----------
    url : str
        URL of the CDS API.
    product : str
        Name of the CDS product to retrieve.
    chunks : dict, optional
        Chunking passed to ``xarray.open_dataset``.
    tmpdir : str, optional
        Directory for the downloaded archive. If None, the archive is
        created in the default temporary directory and deleted by this
        function once it has been read.
    lock : context manager, optional
        Held while the archive is created and downloaded.
    **updates
        Additional request parameters. 'variable', 'year' and 'month' are
        required.

    Returns
    -------
    xarray.Dataset

    Raises
    ------
    FileNotFoundError
        If the downloaded archive contains no '.nc' file.
    """
    request = {
        "product_type": ["reanalysis"],
        "data_format": "netcdf",
        "download_format": "zip",
    }
    request.update(updates)

    assert {"year", "month", "variable"}.issubset(
        request
    ), "Need to specify at least 'variable', 'year' and 'month'"

    client = cdsapi.Client(
        url=url,
        info_callback=logger.debug,
        debug=logging.DEBUG >= logging.root.level,
    )
    result = client.retrieve(product, request)

    if lock is None:
        lock = nullcontext()

    with lock:
        fd, target_zip = mkstemp(suffix=".zip", dir=tmpdir)
        os.close(fd)

        # Inform user about data being downloaded as "* variable (year-month)"
        timestr = f"{request['year']}-{request['month']}"
        variables = atleast_1d(request["variable"])
        varstr = "\n\t".join([f"{v} ({timestr})" for v in variables])
        logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
        result.download(target_zip)

    try:
        with zipfile.ZipFile(target_zip, "r") as zf:
            # Identify .nc files inside the .zip
            nc_files = [name for name in zf.namelist() if name.endswith(".nc")]

            if not nc_files:
                raise FileNotFoundError("No .nc files found in the downloaded .zip archive.")

            datasets = []
            for name in nc_files:
                # Each member is read fully into memory, so the datasets do
                # not depend on the archive on disk afterwards.
                dataset = xr.open_dataset(io.BytesIO(zf.read(name)), chunks=chunks or {})

                if len(nc_files) > 1 and 'expver' in dataset.variables:
                    dataset = dataset.drop_vars(["expver", "number"], errors="ignore")

                datasets.append(dataset)
    finally:
        # The archive content lives in memory, so the temporary file can be
        # removed right away instead of registering a finalizer on the
        # dataset. With a user supplied tmpdir the file is left in place.
        if tmpdir is None:
            noisy_unlink(target_zip)

    if len(datasets) == 1:
        return datasets[0]

    return xr.merge(datasets)
The + default is False + + Returns + ------- + list of dicts witht retrieval arguments + + """ + + # Convert time coordinates to a pandas Index + time = coords["time"].to_index() + frequency = time.freq + + # Determine the latest available ERA5 data time based on initialization time and required delay + latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY + + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency + latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) + + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) + minimum_era5_time_horizon = pd.date_range( + start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + end=latest_era5_time, # Up to the latest available time + freq=frequency # Maintain original time frequency + ) + + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates + time = time.union(minimum_era5_time_horizon) + + # Ensure a continuous time index by filling missing values based on the determined frequency + complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) + + # Keep only timestamps up to the latest available ERA5 time + time = complete_time_range[complete_time_range <= latest_era5_time] + + if static: + return { + "year": [str(time[0].year)], + "month": [str(time[0].month).zfill(2)], + "day": [str(time[0].day).zfill(2)], + "time": time[0].strftime("%H:00"), + } + + # Prepare request for all months and years + times = [] + for year in time.year.unique(): + t = time[time.year == year] + if monthly_requests: + for month in t.month.unique(): + query = { + "year": str(year), + "month": [str(month).zfill(2)], + "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], + } + times.append(query) + else: + query = { + "year": [str(year)], + "month": 
def get_data(
    cutout,
    feature,
    tmpdir,
    lock=None,
    monthly_requests=False,
    concurrent_requests=False,
    **creation_parameters,
):
    """
    Retrieve data for one feature from the Open-Meteo API and ERA5.

    This front-end function looks up the module level functions
    ``get_data_meteo_<feature>``, ``get_data_era5_<feature>``,
    ``combine_data_<feature>`` and ``sanitize_<feature>`` and applies those
    that exist to build a dataset matching the given Cutout.

    Parameters
    ----------
    cutout : atlite.Cutout
    feature : str
        Name of the feature data to retrieve.
    tmpdir : str/Path
        Directory where the temporary files are stored.
    lock : optional
        Lock passed on to the retrieval functions.
    monthly_requests : bool, optional
        If True, the data is requested on a monthly basis in ERA5. This is useful for
        large cutouts, where the data is requested in smaller chunks. The
        default is False
    concurrent_requests : bool, optional
        If True, the ERA5 requests are posted concurrently.
    **creation_parameters :
        Additional keyword arguments. 'init_time' is required and is used
        to determine the ERA5 retrieval times. 'sanitize' (default True)
        sets sanitization of the data on or off.

    Returns
    -------
    xarray.Dataset
        Dataset of the retrieved variables. For static features the ERA5
        dataset is returned directly.

    Raises
    ------
    KeyError
        If 'init_time' is missing from ``creation_parameters``.
    """
    coords = cutout.coords

    try:
        initialization_time = creation_parameters['init_time']
    except KeyError:
        raise KeyError(
            "The creation parameter 'init_time' is required to retrieve data from this dataset."
        ) from None

    sanitize = creation_parameters.get("sanitize", True)

    shared_params = {
        "area": _area(coords),
        "chunks": cutout.chunks,
        "grid": [cutout.dx, cutout.dy],
        "tmpdir": tmpdir,
        "lock": lock,
    }
    retrieval_params_meteo = {
        "product": "meteo_historic_forecast_api_data",
        **shared_params,
    }
    retrieval_params_era5 = {
        "product": "reanalysis-era5-single-levels",
        **shared_params,
    }

    # Get fast changing variables from meteo forecast
    func_meteo = globals().get(f"get_data_meteo_{feature}")

    if func_meteo is not None:
        logger.info(f"Requesting data for feature {feature} from meteo...")
        time_index = coords["time"].to_index()
        datasets_meteo = func_meteo(
            {
                **retrieval_params_meteo,
                "start": time_index[0],
                "end": time_index[-1],
                "coords": coords,
            }
        )
    else:
        datasets_meteo = xr.Dataset()

    # Get missing and slow changing variables from era5 data and interpolation
    func_era5 = globals().get(f"get_data_era5_{feature}")

    if func_era5 is not None:
        logger.info(f"Requesting addtional data for feature {feature} from era5...")

        def retrieve_once(time):
            return func_era5({**retrieval_params_era5, **time})

        if feature in static_features:
            # Static features need a single time step only and are returned
            # without being combined with the meteo data.
            static_time = retrieval_times_era5_forecast(
                coords, initialization_time, static=True
            )
            return retrieve_once(static_time).squeeze()

        time_chunks = retrieval_times_era5_forecast(
            coords, initialization_time, monthly_requests=monthly_requests
        )
        if concurrent_requests:
            delayed_datasets = [delayed(retrieve_once)(chunk) for chunk in time_chunks]
            datasets_era5 = compute(*delayed_datasets)
        else:
            datasets_era5 = [retrieve_once(chunk) for chunk in time_chunks]

        datasets_era5 = xr.concat(datasets_era5, dim="time")
    else:
        datasets_era5 = xr.Dataset()

    # Combine datasets and calculate the required variables
    combine_func = globals().get(f"combine_data_{feature}")

    logger.info(f"Combine meteo and era5 datasets for feature {feature}...")

    if combine_func is not None:
        datasets = combine_func(
            datasets_meteo,
            datasets_era5,
            cutout.data.interp_s,
            cutout.data.interp_t,
        )
    else:
        datasets = xr.merge([datasets_meteo, datasets_era5])

    sanitize_func = globals().get(f"sanitize_{feature}")
    if sanitize and sanitize_func is not None:
        # Sanitize the data after interpolation to remove residuals
        datasets = sanitize_func(datasets)

    return datasets.sel(time=coords["time"])
def _clip_slice_to_requirements(requested, allowed):
    """Clip ``requested`` to the extent and minimum step of ``allowed``."""
    # A bound or step that is not given (None) falls back to the module
    # requirement instead of failing in the comparison.
    start = allowed.start if requested.start is None else max(requested.start, allowed.start)
    stop = allowed.stop if requested.stop is None else min(requested.stop, allowed.stop)
    step = allowed.step if requested.step is None else max(requested.step, allowed.step)
    return slice(start, stop, step)


def _checkModuleRequirements(x, y, time, time_now, **kwargs):
    """
    Check the requested extent against the module level ``requirements``.

    The spatial slices are clipped to the supported extent and minimum
    resolution. The time slice is validated against the feasible time
    window relative to ``time_now``; a missing or too small time step is
    replaced by the minimum temporal resolution.

    Parameters
    ----------
    x : slice
        Start, stop and step values for the x-dimension.
    y : slice
        Start, stop and step values for the y-dimension.
    time : slice
        Start, stop and step values for the time dimension.
    time_now : pandas.Timestamp
        Reference time to which the 'offset' and 'forecast' requirements
        are added.
    **kwargs
        Additional optional parameters (ignored).

    Returns
    -------
    tuple
        ``(x, y, time, parallel)`` with the adjusted slices and the
        'parallel' setting of the module requirements.

    Raises
    ------
    ValueError
        If the requested time range lies outside the feasible window or
        its end is not after its start.
    """
    x = _clip_slice_to_requirements(x, requirements['x'])
    y = _clip_slice_to_requirements(y, requirements['y'])

    # Extract time range parameters
    time_start = time.start
    time_stop = time.stop
    time_step = time.step

    # Feasible time window of the module relative to time_now
    feasible_start = time_now + requirements['offset']
    feasible_end = time_now + requirements['forecast']

    # Ensure time_start is within feasible bounds
    if time_start < feasible_start:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.")
        logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.")

    if time_start >= feasible_end:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.")

    # Ensure time_stop is greater than time_start
    if time_stop <= time_start:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.")

    # Ensure time_stop does not exceed the feasible end
    if time_stop > feasible_end:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.")
        logger.error(f"The maximum forecast horizon is {requirements['forecast']}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.")

    # Ensure time step is within required limits. pd.isna covers both a
    # missing step (None) and NaT, which cannot be compared to a Timedelta.
    if pd.isna(time_step) or time_step < requirements['dt']:
        logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.")
        logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.")
        logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.")
        time_step = requirements['dt']

    time = slice(time_start, time_stop, time_step)

    # Retrieve parallel processing setting from requirements
    parallel = requirements['parallel']

    return x, y, time, parallel
def _rename_and_clean_coords(ds, add_lon_lat=True):
    """
    Rename 'longitude' and 'latitude' columns to 'x' and 'y' and fix roundings.

    Optionally (add_lon_lat, default:True) preserves latitude and
    longitude columns as 'lat' and 'lon'.

    In addition, a 'valid_time' dimension is renamed to 'time', the spatial
    dimensions are passed through ``maybe_swap_spatial_dims`` and the
    'expver' and 'number' variables are dropped if present.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with 'longitude' and 'latitude' coordinates.
    add_lon_lat : bool, optional
        If True, 'lon' and 'lat' coordinates are added as copies of 'x'
        and 'y'. The default is True.

    Returns
    -------
    xarray.Dataset
    """
    ds = ds.rename({"longitude": "x", "latitude": "y"})
    if "valid_time" in ds.sizes:
        ds = ds.rename({"valid_time": "time"}).unify_chunks()
    # round coords since cds coords are float32 which would lead to mismatches
    ds = ds.assign_coords(
        x=np.round(ds.x.astype(float), 5), y=np.round(ds.y.astype(float), 5)
    )
    ds = maybe_swap_spatial_dims(ds)
    if add_lon_lat:
        ds = ds.assign_coords(lon=ds.coords["x"], lat=ds.coords["y"])

    # Combine ERA5 and ERA5T data into a single dimension.
    # See https://github.com/PyPSA/atlite/issues/190
    if "expver" in ds.coords:
        unique_expver = np.unique(ds["expver"].values)
        # Only a mix of several expver values needs to be combined.
        if len(unique_expver) > 1:
            # Replace the 'expver' coordinate by an indexed dimension that
            # holds the unique expver values.
            expver_dim = xr.DataArray(
                unique_expver, dims=["expver"], coords={"expver": unique_expver}
            )
            ds = (
                ds.assign_coords({"expver_dim": expver_dim})
                .drop_vars("expver")
                .rename({"expver_dim": "expver"})
                .set_index(expver="expver")
            )
            for var in ds.data_vars:
                ds[var] = ds[var].expand_dims("expver")
            # expver=1 is ERA5 data, expver=5 is ERA5T data. This combines both
            # by filling in NaNs from ERA5 data with values from ERA5T.
            ds = ds.sel(expver="0001").combine_first(ds.sel(expver="0005"))
    # errors="ignore": neither variable has to be present.
    ds = ds.drop_vars(["expver", "number"], errors="ignore")

    return ds
def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t):
    """
    Merge Open-Meteo wind data with interpolated ERA5 surface roughness.

    Parameters
    ----------
    ds_meteo : xarray.Dataset
        Dataset holding 'windspeed_100m' and 'winddirection_100m'.
    ds_era5 : xarray.Dataset
        Dataset holding 'fsr'; it is interpolated onto the coordinates of
        ``ds_meteo`` before merging.
    interp_s : str
        Spatial interpolation method.
    interp_t : str
        Temporal interpolation method.

    Returns
    -------
    xarray.Dataset
        Dataset with the variables 'wnd100m', 'wnd_azimuth' and 'roughness'.
    """
    roughness_on_meteo_grid = _interpolate(
        ds=ds_era5,
        ds_ref=ds_meteo,
        static=False,
        interp_s=interp_s,
        interp_t=interp_t,
    )

    atlite_names = {
        "windspeed_100m": "wnd100m",
        "winddirection_100m": "wnd_azimuth",
        "fsr": "roughness",
    }
    combined = xr.merge([ds_meteo, roughness_on_meteo_grid]).rename(atlite_names)

    metadata = {
        "wnd100m": {"units": "m s**-1", "long_name": "Wind speed at 100m above ground"},
        "wnd_azimuth": {"units": "degree", "long_name": "Wind direction at 100m above ground"},
    }
    for variable, attributes in metadata.items():
        combined[variable].attrs.update(**attributes)

    # unify_chunks() is necessary to avoid a bug in xarray
    return combined.unify_chunks()
def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t):
    """
    Merge the temperature datasets and convert the temperatures to Kelvin.

    Parameters
    ----------
    ds_meteo : xarray.Dataset
        Dataset holding 'temperature_2m' and 'soil_temperature_54cm' in
        degree Celsius.
    ds_era5 : xarray.Dataset
        Additional dataset merged into the result without interpolation.
    interp_s : str
        Not used; accepted for a signature consistent with the other
        ``combine_data_*`` functions.
    interp_t : str
        Not used; accepted for a signature consistent with the other
        ``combine_data_*`` functions.

    Returns
    -------
    xarray.Dataset
        Dataset with 'temperature' and 'soil temperature' in Kelvin.
    """
    ds = xr.merge([ds_meteo, ds_era5])

    ds = ds.rename(
        {"temperature_2m": "temperature",
         "soil_temperature_54cm": "soil temperature"}
    )

    # Convert from Celsius to Kelvin C -> K, by adding 273.15. Only the two
    # temperature variables are shifted, so that any other variable of the
    # merged dataset keeps its values.
    long_names = {
        "temperature": "2 metre temperature",
        "soil temperature": "Soil temperature 54cm",
    }
    for name, long_name in long_names.items():
        ds[name] = ds[name] + 273.15
        ds[name].attrs.update(units="K", long_name=long_name)

    # unify_chunks() is necessary to avoid a bug in xarray
    ds = ds.unify_chunks()

    return ds
ds = retrieve_era5_data( + url=era5_url, + variable=["geopotential"], + **retrieval_params) + + ds = _rename_and_clean_coords(ds) + ds = _add_height(ds) + + return ds + + +def _area(coords): + # North, West, South, East. Default: global + x0, x1 = coords["x"].min().item(), coords["x"].max().item() + y0, y1 = coords["y"].min().item(), coords["y"].max().item() + return [y1, x0, y0, x1] + + +def noisy_unlink(path): + """Delete file at given path.""" + logger.debug(f"Deleting file {path}") + try: + os.unlink(path) + except PermissionError: + logger.error(f"Unable to delete file {path}, as it is still in use.") + + +def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): + """ + Download meteorological data (e.g., ERA5-style) from the Open-Meteo API. + + Parameters + ---------- + url : str + API endpoint (typically `meteo_url`). + product : str + Product identifier (not used here but reserved for compatibility). + chunks : dict, optional + Chunking configuration for final xarray dataset. + tmpdir : str, optional + Temporary storage path (not used here). + lock : threading.Lock, optional + Lock for parallel-safe writing (not used here). + updates : dict + Additional request parameters including 'coords', 'start', 'end', and 'variable'. + + Returns + ------- + xarray.Dataset + Weather data with time, latitude, and longitude dimensions, chunked as requested. 
+ """ + + # Build request from base + user overrides + request = {"product_type": "meteo_api", "format": "direct_download"} + request.update(updates) + + # Generate list of (lon, lat) coordinate pairs + grid = np.meshgrid(request['coords']['x'], request['coords']['y']) + coords = pd.DataFrame(zip(grid[0].flatten(), grid[1].flatten()), columns=['longitude', 'latitude']) + + # Calculate time and variable counts + start_date = request['start'].strftime('%Y-%m-%d') + end_date = request['end'].strftime('%Y-%m-%d') + nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days) + nr_variables = len(request['variable']) + nr_locations = len(coords) + + # Estimate request weight based on Open-Meteo's internal model + weight_of_full_api_request = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + + # Dynamically determine chunk size based on rate limit thresholds + if weight_of_full_api_request <= MINUTE_LIMIT: + chunk_size = MINUTE_LIMIT / 10 + elif weight_of_full_api_request <= HOUR_LIMIT: + chunk_size = HOUR_LIMIT / 10 + else: + chunk_size = DAY_LIMIT / 10 + + chunk_size = int(max(1, chunk_size)) # Ensure chunk_size ≥ 1 and integer + + logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n") + logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}") + + # Loop through coordinate grid in blocks and request data + data = [] + for i in range(0, len(coords), chunk_size): + coord_chunk = coords.iloc[i:i + chunk_size] + + # Prepare API parameters for the current chunk + params = { + "longitude": coord_chunk['longitude'].tolist(), + "latitude": coord_chunk['latitude'].tolist(), + "hourly": request['variable'], + "wind_speed_unit": "ms", + "start_date": start_date, + "end_date": end_date + } + + try: + responses = openmeteo.weather_api(url, params=params) + except Exception as e: + logger.info(f"{e}") + try: + # Extract and classify rate limiting error + rate_limiting_error = 
def parse_meteo_responses(responses, params):
    """
    Parse raw Open-Meteo API responses into a list of DataFrames.

    Parameters
    ----------
    responses : list
        Response objects, one per requested coordinate. Each must provide
        ``Hourly()`` with ``Time()``, ``TimeEnd()``, ``Interval()`` and
        ``Variables(i).ValuesAsNumpy()``.
    params : dict
        Parameters used in the API call. ``params["hourly"]`` lists the
        variable names; ``params["latitude"]`` and ``params["longitude"]``
        hold the requested coordinates in the same order as ``responses``.

    Returns
    -------
    list of pd.DataFrame
        One frame per response, indexed by ``(time, latitude, longitude)``
        with one column per requested variable.
    """
    variables = list(params["hourly"])

    frames = []
    for position, response in enumerate(responses):
        hourly = response.Hourly()

        # Time() / TimeEnd() are epoch seconds; the end is exclusive.
        timestamps = pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s"),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s"),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )

        # Label the rows with the *requested* coordinates so that all frames
        # line up on the cutout grid when they are concatenated later.
        index = pd.MultiIndex.from_product(
            [
                timestamps,
                [params["latitude"][position]],
                [params["longitude"][position]],
            ],
            names=["time", "latitude", "longitude"],
        )

        # Build the frame in one step instead of enlarging an empty
        # object-dtype frame column by column.
        values = {
            name: hourly.Variables(k).ValuesAsNumpy()
            for k, name in enumerate(variables)
        }
        frames.append(pd.DataFrame(values, index=index, columns=variables))

    return frames
def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **updates):
    """
    Download data like ERA5 from the Climate Data Store (CDS).

    If you want to track the state of your request go to
    https://cds-beta.climate.copernicus.eu/requests?tab=all

    Parameters
    ----------
    url : str
        CDS API endpoint passed to ``cdsapi.Client``.
    product : str
        CDS product name passed to ``client.retrieve``.
    chunks : dict, optional
        Chunking passed to ``xarray.open_dataset`` (``{}`` if not given).
    tmpdir : str, optional
        Directory for the downloaded zip archive. If given, the archive is
        left in place for the caller to clean up; if ``None``, the archive is
        created in the default temp directory and deleted before returning.
    lock : context manager, optional
        Held while the temporary file is created and the download runs.
    **updates
        Additional request entries; must provide at least ``variable``,
        ``year`` and ``month``.

    Returns
    -------
    xarray.Dataset
        Dataset opened from the netCDF member(s) of the downloaded archive.

    Raises
    ------
    FileNotFoundError
        If the downloaded archive contains no ``.nc`` file.
    """
    request = {
        "product_type": ["reanalysis"],
        "data_format": "netcdf",
        "download_format": "zip",
    }
    request.update(updates)

    assert {"year", "month", "variable"}.issubset(
        request
    ), "Need to specify at least 'variable', 'year' and 'month'"

    client = cdsapi.Client(
        url=url,
        info_callback=logger.debug,
        debug=logging.DEBUG >= logging.root.level,
    )
    result = client.retrieve(product, request)

    if lock is None:
        lock = nullcontext()

    with lock:
        fd, target_zip = mkstemp(suffix=".zip", dir=tmpdir)
        os.close(fd)

        # Inform user about data being downloaded as "* variable (year-month)"
        timestr = f"{request['year']}-{request['month']}"
        variables = atleast_1d(request["variable"])
        varstr = "\n\t".join([f"{v} ({timestr})" for v in variables])
        logger.info(f"CDS: Downloading variables\n\t{varstr}\n")
        result.download(target_zip)

    try:
        with zipfile.ZipFile(target_zip, "r") as zf:

            def open_member(member):
                # Read the member fully into memory so the dataset does not
                # depend on the zip archive staying on disk.
                with zf.open(member) as handle:
                    return xr.open_dataset(
                        io.BytesIO(handle.read()), chunks=chunks or {}
                    )

            nc_files = [name for name in zf.namelist() if name.endswith(".nc")]
            if not nc_files:
                raise FileNotFoundError(
                    "No .nc files found in the downloaded .zip archive."
                )

            if len(nc_files) == 1:
                ds = open_member(nc_files[0])
            else:
                # Several netCDF members: drop the bookkeeping variables that
                # would conflict and merge everything into one dataset.
                datasets = []
                for member in nc_files:
                    dataset = open_member(member)
                    if "expver" in dataset.variables:
                        dataset = dataset.drop_vars(
                            ["expver", "number"], errors="ignore"
                        )
                    datasets.append(dataset)
                ds = xr.merge(datasets)
    finally:
        # All members were copied into memory above, so the archive is no
        # longer needed. Delete it right away (also on errors) instead of
        # attaching a finalizer to private xarray internals.
        if tmpdir is None:
            logger.debug(f"Removing temporary archive {target_zip}")
            noisy_unlink(target_zip)

    return ds
The + default is False + + Returns + ------- + list of dicts witht retrieval arguments + + """ + + # Convert time coordinates to a pandas Index + time = coords["time"].to_index() + frequency = time.freq + + # Determine the latest available ERA5 data time based on initialization time and required delay + latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY + + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency + latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) + + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) + minimum_era5_time_horizon = pd.date_range( + start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + end=latest_era5_time, # Up to the latest available time + freq=frequency # Maintain original time frequency + ) + + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates + time = time.union(minimum_era5_time_horizon) + + # Ensure a continuous time index by filling missing values based on the determined frequency + complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) + + # Keep only timestamps up to the latest available ERA5 time + time = complete_time_range[complete_time_range <= latest_era5_time] + + if static: + return { + "year": [str(time[0].year)], + "month": [str(time[0].month).zfill(2)], + "day": [str(time[0].day).zfill(2)], + "time": time[0].strftime("%H:00"), + } + + # Prepare request for all months and years + times = [] + for year in time.year.unique(): + t = time[time.year == year] + if monthly_requests: + for month in t.month.unique(): + query = { + "year": str(year), + "month": [str(month).zfill(2)], + "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], + } + times.append(query) + else: + query = { + "year": [str(year)], + "month": 
def get_data(
    cutout,
    feature,
    tmpdir,
    lock=None,
    monthly_requests=False,
    concurrent_requests=False,
    **creation_parameters,
):
    """
    Retrieve the data of one feature from the meteo API and ERA5.

    The helpers ``get_data_meteo_<feature>``, ``get_data_era5_<feature>``,
    ``combine_data_<feature>`` and ``sanitize_<feature>`` are looked up by
    name in this module; each of them is optional.

    Parameters
    ----------
    cutout : atlite.Cutout
    feature : str
        Name of the feature to retrieve.
    tmpdir : str/Path
        Directory where the temporary download files are stored.
    lock : context manager, optional
        Forwarded to the retrieval functions.
    monthly_requests : bool, optional
        If True, the ERA5 data is requested month by month. The default is
        False.
    concurrent_requests : bool, optional
        If True, the ERA5 requests are wrapped in dask ``delayed`` and
        evaluated together via ``compute``. The default is False.
    **creation_parameters :
        Must contain ``init_time`` (a missing key raises ``KeyError``).
        ``sanitize`` (default True) switches sanitization on or off.

    Returns
    -------
    xarray.Dataset
        For static features the squeezed ERA5 dataset; otherwise the combined
        dataset selected on the time coordinate of the cutout.
    """
    coords = cutout.coords
    initialization_time = creation_parameters['init_time']
    sanitize = creation_parameters.get("sanitize", True)

    # Both back-ends receive the same spatial / chunking / tmpdir settings and
    # differ only in the product name.
    shared_params = {
        "area": _area(coords),
        "chunks": cutout.chunks,
        "grid": [cutout.dx, cutout.dy],
        "tmpdir": tmpdir,
        "lock": lock,
    }
    retrieval_params_meteo = {
        "product": "meteo_historic_forecast_api_data",
        **shared_params,
    }
    retrieval_params_era5 = {
        "product": "reanalysis-era5-single-levels",
        **shared_params,
    }

    # Fast changing variables come from the meteo forecast
    meteo_getter = globals().get(f"get_data_meteo_{feature}")
    logger.info(f"Requesting data for feature {feature} from meteo...")

    if meteo_getter is None:
        datasets_meteo = xr.Dataset()
    else:
        time_index = coords["time"].to_index()
        datasets_meteo = meteo_getter(
            {
                **retrieval_params_meteo,
                "start": time_index[0],
                "end": time_index[-1],
                "coords": coords,
            }
        )

    # Missing and slow changing variables come from ERA5
    era5_getter = globals().get(f"get_data_era5_{feature}")

    def retrieve_once(time_query):
        return era5_getter({**retrieval_params_era5, **time_query})

    if era5_getter is None:
        datasets_era5 = xr.Dataset()
    else:
        logger.info(f"Requesting addtional data for feature {feature} from era5...")

        if feature in static_features:
            static_query = retrieval_times_era5_forecast(
                coords, initialization_time, static=True
            )
            return retrieve_once(static_query).squeeze()

        time_chunks = retrieval_times_era5_forecast(
            coords, initialization_time, monthly_requests=monthly_requests
        )
        if concurrent_requests:
            pending = [delayed(retrieve_once)(chunk) for chunk in time_chunks]
            datasets_era5 = compute(*pending)
        else:
            datasets_era5 = map(retrieve_once, time_chunks)

        datasets_era5 = xr.concat(datasets_era5, dim="time")

    # Combine both sources; fall back to a plain merge without a combiner
    combine_func = globals().get(f"combine_data_{feature}")
    logger.info(f"Combine meteo and era5 datasets for feature {feature}...")

    if combine_func is None:
        datasets = xr.merge([datasets_meteo, datasets_era5])
    else:
        datasets = combine_func(
            datasets_meteo,
            datasets_era5,
            cutout.data.interp_s,
            cutout.data.interp_t,
        )

    sanitize_func = globals().get(f"sanitize_{feature}")
    if sanitize and sanitize_func is not None:
        # Sanitize the data after interpolation to remove residuals
        datasets = sanitize_func(datasets)

    return datasets.sel(time=coords["time"])
+ +For further reference see +https://open-meteo.com/en/docs/historical-forecast-api +""" + +import logging +import os +import io +import re +import zipfile +import warnings +import weakref +import time +import xarray as xr +import datetime +import openmeteo_requests +import requests_cache +import cdsapi +import numpy as np +import pandas as pd + +from numpy import atleast_1d +from dask import compute, delayed +from tempfile import mkstemp +from retry_requests import retry + +from ..gis import maybe_swap_spatial_dims +from ..pv.solar_position import SolarPosition + +# Null context for running a with statements wihout any context +try: + from contextlib import nullcontext +except ImportError: + # for Python verions < 3.7: + import contextlib + + @contextlib.contextmanager + def nullcontext(): + yield + + +logger = logging.getLogger(__name__) + + +# Setup Open-Meteo client with 7-day cache and retry on failure +# Cache duration: -1 = never expire, 0 = no caching, timedelta = expire after set time +cache_window = datetime.timedelta(days=7) +cache_session = requests_cache.CachedSession('.meteo.cache', backend='sqlite', expire_after=cache_window) +retry_session = retry(cache_session, retries=5, backoff_factor=3) +openmeteo = openmeteo_requests.Client(session=retry_session) + +# Define data source URLs (easy to switch if needed) +era5_url = 'https://cds.climate.copernicus.eu/api' +meteo_url = "https://historical-forecast-api.open-meteo.com/v1/forecast" + +# Open-Meteo request limits +MINUTE_LIMIT = 600 +HOUR_LIMIT = 5000 +DAY_LIMIT = 10000 + +# Delay of ERA5 data upload +# For Open-Meteo slow changing variables from ERA5 are always interpolated +# Starting from data of at least 7 days before nowtime +ERA5_DELAY = pd.Timedelta(hours=-6.0*24) + +# Model and CRS Settings +crs = 4326 + +features = { + "height": ["height"], + "wind": ["wnd80m", "wnd_azimuth", "roughness"], + "influx": [ + "influx_toa", + "influx_direct", + "influx_diffuse", + "albedo", + "solar_altitude", + 
def _checkModuleRequirements(x, y, time, time_now, **kwargs):
    """
    Check the requested cutout extent against the module requirements.

    The spatial slices are clamped to the module-level ``requirements``; the
    time slice is validated against the feasible forecast window around
    ``time_now``.

    Parameters
    ----------
    x : slice
        Start, stop and step for the x-dimension. ``None`` entries are
        replaced by the corresponding requirement value.
    y : slice
        Start, stop and step for the y-dimension. ``None`` entries are
        replaced by the corresponding requirement value.
    time : slice
        Start, stop and step for the time dimension. Start and stop must be
        set; a missing (``None``/``NaT``) or too small step is replaced by
        ``requirements['dt']``.
    time_now : timestamp
        Reference time the feasible window is computed from
        (``time_now + requirements['offset']`` up to
        ``time_now + requirements['forecast']``).
    **kwargs
        Additional optional parameters (ignored).

    Returns
    -------
    tuple
        ``(x, y, time, parallel)`` with the adjusted slices and the
        ``parallel`` flag from ``requirements``.

    Raises
    ------
    ValueError
        If the requested start or end time lies outside the feasible window
        or the end is not after the start.
    """

    def clamp(requested, allowed):
        # Restrict the requested range to the allowed one and never go below
        # the allowed step; ``None`` means "use the allowed value".
        start = (
            allowed.start
            if requested.start is None
            else max(requested.start, allowed.start)
        )
        stop = (
            allowed.stop
            if requested.stop is None
            else min(requested.stop, allowed.stop)
        )
        step = (
            allowed.step
            if requested.step is None
            else max(requested.step, allowed.step)
        )
        return slice(start, stop, step)

    x = clamp(x, requirements['x'])
    y = clamp(y, requirements['y'])

    # Extract time range parameters
    time_start = time.start
    time_stop = time.stop
    time_step = time.step

    # Feasible forecast window
    feasible_start = time_now + requirements['offset']
    feasible_end = time_now + requirements['forecast']

    # Ensure time_start is within feasible bounds
    if time_start < feasible_start:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.")
        logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.")

    if time_start >= feasible_end:
        logger.error(f"The required forecast start time {time_start} exceeds the model requirements.")
        logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.")
        raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.")

    # Ensure time_stop is greater than time_start
    if time_stop <= time_start:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.")

    # Ensure time_stop does not exceed the feasible forecast horizon
    if time_stop > feasible_end:
        logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.")
        logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.")
        logger.error(f"The maximum forecast horizon is {requirements['forecast']}.")
        raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.")

    # Ensure time step is within required limits. pd.isna covers both a
    # missing step (None, the default of a slice) and NaT.
    if pd.isna(time_step) or time_step < requirements['dt']:
        logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.")
        logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.")
        logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.")
        time_step = requirements['dt']

    time = slice(time_start, time_stop, time_step)

    # Retrieve parallel processing setting from requirements
    parallel = requirements['parallel']

    return x, y, time, parallel
def _interpolate(ds, ds_ref, static, interp_s, interp_t):
    """
    Interpolate ``ds`` onto the coordinates of ``ds_ref``.

    Unless ``static`` is set, ``ds`` is first interpolated onto the time
    values of ``ds_ref`` with method ``interp_t``. Afterwards it is
    interpolated onto the ``x``/``y`` values of ``ds_ref`` with method
    ``interp_s``. Whenever the requested method raises a ``ValueError``, the
    step is repeated with ``'nearest'``.
    """

    def interp_with_fallback(data, method, **targets):
        options = {"fill_value": "extrapolate"}
        try:
            return data.interp(method=method, kwargs=options, **targets)
        except ValueError:
            logger.info(f"Interpolation: Not enough supporting points for used interpolation method {method}.")
            logger.info("Interpolation method is set to 'nearest' instead.")
            return data.interp(method='nearest', kwargs=options, **targets)

    if not static:
        ds = interp_with_fallback(ds, interp_t, time=ds_ref.time.values)

    return interp_with_fallback(
        ds, interp_s, x=ds_ref.x.values, y=ds_ref.y.values
    )
def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t):
    """
    Combine the meteo wind data with the ERA5 data.

    The ERA5 dataset is interpolated (in time and space) onto the meteo
    dataset, both are merged, and the variables are renamed to ``wnd80m``,
    ``wnd_azimuth`` and ``roughness``.
    """
    era5_on_meteo_grid = _interpolate(
        ds=ds_era5,
        ds_ref=ds_meteo,
        static=False,
        interp_s=interp_s,
        interp_t=interp_t,
    )

    name_map = {
        "windspeed_80m": "wnd80m",
        "winddirection_80m": "wnd_azimuth",
        "fsr": "roughness",
    }
    combined = xr.merge([ds_meteo, era5_on_meteo_grid]).rename(name_map)

    combined.wnd80m.attrs.update(
        units="m s**-1", long_name="Wind speed at 80m above ground"
    )
    combined.wnd_azimuth.attrs.update(
        units="degree", long_name="Wind direction at 80m above ground"
    )

    # unify_chunks() is necessary to avoid a bug in xarray
    return combined.unify_chunks()
def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t):
    """
    Combine the meteo radiation data with the ERA5 data.

    The ERA5 dataset is interpolated (in time and space) onto the meteo
    dataset, both are merged and renamed, ``influx_toa`` is divided by 3600,
    and the solar position variables (prefixed with ``solar_``) are added.
    Negative fluxes are not clipped here.
    """
    era5_on_meteo_grid = _interpolate(
        ds=ds_era5,
        ds_ref=ds_meteo,
        static=False,
        interp_s=interp_s,
        interp_t=interp_t,
    )

    name_map = {
        "direct_radiation": "influx_direct",
        "diffuse_radiation": "influx_diffuse",
        "tisr": "influx_toa",
        "fal": "albedo",
    }
    ds = xr.merge([ds_meteo, era5_on_meteo_grid]).rename(name_map)

    # Convert from energy to power J m**-2 -> W m**-2
    seconds_per_hour = 60.0 * 60.0
    ds["influx_toa"] = ds["influx_toa"] / seconds_per_hour

    ds.influx_direct.attrs.update(
        units="W m**-2", long_name="Surface direct solar radiation downwards"
    )
    ds.influx_diffuse.attrs.update(
        units="W m**-2", long_name="Surface diffuse solar radiation downwards"
    )
    ds.influx_toa.attrs.update(
        units="W m**-2", long_name="TOA incident solar radiation"
    )

    # unify_chunks() is necessary to avoid a bug in xarray
    ds = ds.unify_chunks()

    # ERA5 variables are mean values for the previous hour, i.e. 13:01 to
    # 14:00 are labelled as "14:00". Account for this by calculating the
    # SolarPosition for the center of the interval,
    # see https://github.com/PyPSA/atlite/issues/158
    # Do not show DeprecationWarning from new SolarPosition calculation (#199)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", DeprecationWarning)
        half_hour_back = pd.to_timedelta("-30 minutes")
        solar = SolarPosition(ds, time_shift=half_hour_back)
    solar = solar.rename({name: f"solar_{name}" for name in solar.data_vars})

    return xr.merge([ds, solar])
def _rate_limit_kind(error):
    """Return the rate-limit message contained in ``error``, or None."""
    detail = error.args[0] if error.args else ""
    # The original code read the text from ``error.args[0]['reason']``; fall
    # back to the plain message for any other payload.
    reason = detail.get("reason", "") if isinstance(detail, dict) else detail
    match = re.search(
        r"(Minutely|Hourly|Daily) API request limit", str(reason)
    )
    return match[0] if match else None


def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **updates):
    """
    Download meteorological data from the Open-Meteo API.

    The cutout grid is flattened into (longitude, latitude) pairs that are
    requested in blocks. A failing block is retried once after the delay
    chosen by ``apply_rate_limiting``; if the retry fails as well, the block
    is logged and skipped.

    Parameters
    ----------
    url : str
        API endpoint (typically `meteo_url`).
    product : str
        Product identifier (not used here but reserved for compatibility).
    chunks : dict, optional
        Chunking configuration for final xarray dataset.
    tmpdir : str, optional
        Temporary storage path (not used here).
    lock : threading.Lock, optional
        Lock for parallel-safe writing (not used here).
    updates : dict
        Additional request parameters including 'coords', 'start', 'end', and 'variable'.

    Returns
    -------
    xarray.Dataset
        Weather data with time, latitude, and longitude dimensions, chunked as requested.

    Raises
    ------
    ValueError
        If no block could be retrieved at all.
    """
    # Build request from base + user overrides
    request = {"product_type": "meteo_api", "format": "direct_download"}
    request.update(updates)

    # Flatten the grid into one row per (lon, lat) pair
    lon_grid, lat_grid = np.meshgrid(request['coords']['x'], request['coords']['y'])
    coords = pd.DataFrame(
        {"longitude": lon_grid.flatten(), "latitude": lat_grid.flatten()}
    )

    # Calculate time and variable counts
    start_date = request['start'].strftime('%Y-%m-%d')
    end_date = request['end'].strftime('%Y-%m-%d')
    nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days)
    nr_variables = len(request['variable'])
    nr_locations = len(coords)

    # Estimated weight of the whole request (variables, days and locations)
    weight_per_location = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14))
    weight_of_full_api_request = weight_per_location * nr_locations

    # Pick the block size from the smallest rate limit the request fits into
    if weight_of_full_api_request <= MINUTE_LIMIT:
        chunk_size = MINUTE_LIMIT / 10
    elif weight_of_full_api_request <= HOUR_LIMIT:
        chunk_size = HOUR_LIMIT / 10
    else:
        chunk_size = DAY_LIMIT / 10

    chunk_size = int(max(1, chunk_size))  # Ensure chunk_size ≥ 1 and integer

    logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n")
    logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}")

    # Loop through coordinate grid in blocks and request data
    data = []
    for offset in range(0, nr_locations, chunk_size):
        coord_chunk = coords.iloc[offset:offset + chunk_size]

        # Prepare API parameters for the current chunk
        params = {
            "longitude": coord_chunk['longitude'].tolist(),
            "latitude": coord_chunk['latitude'].tolist(),
            "hourly": request['variable'],
            "wind_speed_unit": "ms",
            "start_date": start_date,
            "end_date": end_date,
        }

        try:
            responses = openmeteo.weather_api(url, params=params)
        except Exception as first_error:
            logger.info(f"{first_error}")
            # Wait according to the reported limit (fallback delay for
            # anything else), then retry once.
            apply_rate_limiting(error=_rate_limit_kind(first_error))
            try:
                responses = openmeteo.weather_api(url, params=params)
            except Exception as retry_error:
                # Skip this chunk on repeated failure. Use positional access:
                # label 0 only exists in the very first chunk.
                first_point = coord_chunk.iloc[0]
                logger.error(f"Meteo-API: Failed to fetch data for block starting at "
                             f"({first_point['longitude']}, {first_point['latitude']}): {retry_error}")
                continue

        # Parse chunked response and append results
        data.extend(parse_meteo_responses(responses, params))

    if not data:
        raise ValueError(
            "Meteo-API: No data could be retrieved for any of the requested locations."
        )

    # Combine all parsed DataFrames into a single xarray dataset
    ds = pd.concat(data).to_xarray()
    ds = _rename_and_clean_coords(ds)
    ds = ds.chunk(chunks=chunks)

    return ds
+ """ + + data = [] + for j, response in enumerate(responses): + # Reconstruct time index based on interval + range_start = pd.to_datetime(response.Hourly().Time(), unit="s") + range_end = pd.to_datetime(response.Hourly().TimeEnd(), unit="s") + date_range = pd.date_range(start=range_start, end=range_end, + freq=pd.Timedelta(seconds=response.Hourly().Interval()), + inclusive="left") + + # Initialize empty DataFrame for the current location + response_df = pd.DataFrame(columns=params["hourly"]) + response_df["time"] = date_range + response_df["latitude"] = params["latitude"][j] + response_df["longitude"] = params["longitude"][j] + response_df = response_df.set_index(["time", "latitude", "longitude"]) + + # Fill in variable values + for i, param in enumerate(params["hourly"]): + response_df[param] = response.Hourly().Variables(i).ValuesAsNumpy() + + data.append(response_df) + + return data + +def apply_rate_limiting(error=None): + """ + Apply appropriate sleep duration based on API rate-limiting error. + + Parameters + ---------- + error : str or None + One of 'Minutely API request limit', 'Hourly API request limit', + 'Daily API request limit', or None (fallback delay). 
+ + Behavior + -------- + - Sleeps 60s for minutely errors + - Sleeps 1h for hourly errors + - Sleeps until 00:05 next day for daily errors + - Sleeps 2 minutes as fallback + """ + + now = datetime.datetime.now() + midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta(days=1, minutes=5) + time_until_midnight = (midnight - now).total_seconds() + + sleep_times = { + None: 120, # Fallback for unknown errors + "Minutely API request limit": 60, + "Hourly API request limit": 60 * 60, + "Daily API request limit": time_until_midnight + } + + sleep_time = sleep_times[error] + logger.info(f"Sleeping for {sleep_time / 60:.2f} minutes.") + time.sleep(sleep_time) + + +def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): + """ + Download data like ERA5 from the Climate Data Store (CDS). + + If you want to track the state of your request go to + https://cds-beta.climate.copernicus.eu/requests?tab=all + """ + request = {"product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip"} + + request.update(updates) + + assert {"year", "month", "variable"}.issubset( + request + ), "Need to specify at least 'variable', 'year' and 'month'" + + client = cdsapi.Client( + url = url, + info_callback=logger.debug, + debug=logging.DEBUG >= logging.root.level + ) + result = client.retrieve(product, request) + + if lock is None: + lock = nullcontext() + + with lock: + fd, target_zip = mkstemp(suffix=".zip", dir=tmpdir) + os.close(fd) + + # Inform user about data being downloaded as "* variable (year-month)" + timestr = f"{request['year']}-{request['month']}" + variables = atleast_1d(request["variable"]) + varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) + logger.info(f"CDS: Downloading variables\n\t{varstr}\n") + result.download(target_zip) + + # Open the .zip file in memory + with zipfile.ZipFile(target_zip, "r") as zf: + # Identify .nc files inside the .zip + nc_files = [name for name in zf.namelist() 
if name.endswith(".nc")] + + if not nc_files: + raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") + + if len(nc_files) == 1: + # If there's only one .nc file, read it into memory + with zf.open(nc_files[0]) as nc_file: + # Pass the in-memory file-like object to Xarray + ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) + + else: + # If multiple .nc files, combine them using Xarray + datasets = [] + for nc_file in nc_files: + with zf.open(nc_file) as file: + dataset = xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {}) + + if 'expver' in dataset.variables: + dataset = dataset.drop_vars(["expver", "number"], errors="ignore") + + datasets.append(dataset) + + ds = xr.merge(datasets) + + if tmpdir is None: + logging.debug(f"Adding finalizer for {target_zip}") + weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) + + return ds + + +def retrieval_times_era5_forecast(coords, initialization_time, static=False, monthly_requests=False): + """ + Get list of retrieval cdsapi arguments for time dimension in coordinates. + + If static is False, this function creates a query for each month and year + in the time axis in coords. This ensures not running into size query limits + of the cdsapi even with very (spatially) large cutouts. + If static is True, the function return only one set of parameters + for the very first time point. + + Parameters + ---------- + coords : atlite.Cutout.coords + static : bool, optional + monthly_requests : bool, optional + If True, the data is requested on a monthly basis. This is useful for + large cutouts, where the data is requested in smaller chunks. 
The + default is False + + Returns + ------- + list of dicts witht retrieval arguments + + """ + + # Convert time coordinates to a pandas Index + time = coords["time"].to_index() + frequency = time.freq + + # Determine the latest available ERA5 data time based on initialization time and required delay + latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY + + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency + latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) + + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) + minimum_era5_time_horizon = pd.date_range( + start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + end=latest_era5_time, # Up to the latest available time + freq=frequency # Maintain original time frequency + ) + + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates + time = time.union(minimum_era5_time_horizon) + + # Ensure a continuous time index by filling missing values based on the determined frequency + complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) + + # Keep only timestamps up to the latest available ERA5 time + time = complete_time_range[complete_time_range <= latest_era5_time] + + if static: + return { + "year": [str(time[0].year)], + "month": [str(time[0].month).zfill(2)], + "day": [str(time[0].day).zfill(2)], + "time": time[0].strftime("%H:00"), + } + + # Prepare request for all months and years + times = [] + for year in time.year.unique(): + t = time[time.year == year] + if monthly_requests: + for month in t.month.unique(): + query = { + "year": str(year), + "month": [str(month).zfill(2)], + "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], + } + times.append(query) + else: + query = { + "year": [str(year)], + "month": 
list(t.month.unique().astype(str).str.zfill(2)), + "day": list(t.day.unique().astype(str).str.zfill(2)), + "time": ["%02d:00" % h for h in t.hour.unique()], + } + times.append(query) + return times + + +def get_data( + cutout, + feature, + tmpdir, + lock=None, + monthly_requests=False, + concurrent_requests=False, + **creation_parameters, +): + """ + Retrieve data from Meteo API. + + This front-end function downloads data for a specific feature and formats + it to match the given Cutout. + + Parameters + ---------- + cutout : atlite.Cutout + feature : str + Name of the feature data to retrieve. Must be in + `atlite.datasets.era5.features` + tmpdir : str/Path + Directory where the temporary netcdf files are stored. + monthly_requests : bool, optional + If True, the data is requested on a monthly basis in ERA5. This is useful for + large cutouts, where the data is requested in smaller chunks. The + default is False + concurrent_requests : bool, optional + If True, the monthly data requests are posted concurrently. + Only has an effect if `monthly_requests` is True. + **creation_parameters : + Additional keyword arguments. The only effective argument is 'sanitize' + (default True) which sets sanitization of the data on or off. + + Returns + ------- + xarray.Dataset + Dataset of dask arrays of the retrieved variables. 
+ + """ + coords = cutout.coords + initialization_time = creation_parameters['init_time'] + + sanitize = creation_parameters.get("sanitize", True) + + retrieval_params_meteo = { + "product": "meteo_historic_forecast_api_data", + "area": _area(coords), + "chunks": cutout.chunks, + "grid": [cutout.dx, cutout.dy], + "tmpdir": tmpdir, + "lock": lock, + } + + retrieval_params_era5 = { + "product": "reanalysis-era5-single-levels", + "area": _area(coords), + "chunks": cutout.chunks, + "grid": [cutout.dx, cutout.dy], + "tmpdir": tmpdir, + "lock": lock, + } + + + # Get fast changing variabels from meteo forecast + func_meteo = globals().get(f"get_data_meteo_{feature}") + logger.info(f"Requesting data for feature {feature} from meteo...") + + + if func_meteo is not None: + datasets_meteo = func_meteo({**retrieval_params_meteo, + **{"start": coords["time"].to_index()[0], + "end": coords["time"].to_index()[-1], + "coords": coords}}) + else: + datasets_meteo = xr.Dataset() + + + def retrieve_once(time): + ds = func_era5({**retrieval_params_era5, **time}) + return ds + + + # Get missing and slow changing variabels from era5 data and interpolation + func_era5 = globals().get(f"get_data_era5_{feature}") + + if func_era5 is not None: + logger.info(f"Requesting addtional data for feature {feature} from era5...") + + if feature in static_features: + return retrieve_once(retrieval_times_era5_forecast(coords, initialization_time, static=True)).squeeze() + + time_chunks = retrieval_times_era5_forecast(coords, initialization_time, monthly_requests=monthly_requests) + if concurrent_requests: + delayed_datasets = [delayed(retrieve_once)(chunk) for chunk in time_chunks] + datasets_era5 = compute(*delayed_datasets) + else: + datasets_era5 = map(retrieve_once, time_chunks) + + datasets_era5 = xr.concat(datasets_era5, dim="time") + + else: + datasets_era5 = xr.Dataset() + + # Combine datasets and calculate the required variables + combine_func = globals().get(f"combine_data_{feature}") + + 
logger.info(f"Combine meteo and era5 datasets for feature {feature}...") + + if combine_func is not None: + datasets = combine_func(datasets_meteo, + datasets_era5, + cutout.data.interp_s, + cutout.data.interp_t) + else: + datasets = xr.merge([datasets_meteo, datasets_era5]) + + sanitize_func = globals().get(f"sanitize_{feature}") + if sanitize and sanitize_func is not None: + # Sanitize the data after interpolation to remove residuals + datasets = sanitize_func(datasets) + + return datasets.sel(time=coords["time"]) diff --git a/atlite/datasets/sarah.py b/atlite/datasets/sarah.py index ec7851ff..c4f1ee12 100644 --- a/atlite/datasets/sarah.py +++ b/atlite/datasets/sarah.py @@ -39,6 +39,102 @@ static_features = {} +requirements = {'x': slice(-65.0, 65.0, 0.05), + 'y': slice(-65.0, 65.0, 0.05), + 'offset': pd.Timestamp('1983-01-01'), + 'forecast': pd.Timestamp('2017-12-31 23:00'), + 'dt': pd.Timedelta(hours=0.5), + 'parallel': True, + } + + +def _checkModuleRequirements(x, y, time, time_now, **kwargs): + """ + Load and check the data requirements for a given module. + + Parameters: + x (slice): Defines the start, stop, and step values for the x-dimension. + y (slice): Defines the start, stop, and step values for the y-dimension. + time (slice): Defines the start, stop, and step values for the time dimension. + **kwargs: Additional optional parameters. 
+ """ + + # Extract start, stop, and step values for x + x_start, x_stop, x_step = x.start, x.stop, x.step + + # Adjust x range based on module requirements + if requirements['x'].start > x.start: + x_start = requirements['x'].start + if requirements['x'].stop < x.stop: + x_stop = requirements['x'].stop + if requirements['x'].step > x.step: + x_step = requirements['x'].step + + x = slice(x_start, x_stop, x_step) + + # Extract start, stop, and step values for y + y_start, y_stop, y_step = y.start, y.stop, y.step + + # Adjust y range based on module requirements + if requirements['y'].start > y.start: + y_start = requirements['y'].start + if requirements['y'].stop < y.stop: + y_stop = requirements['y'].stop + if requirements['y'].step > y.step: + y_step = requirements['y'].step + + y = slice(y_start, y_stop, y_step) + + + # Extract time range parameters + time_start = time.start + time_stop = time.stop + time_step = time.step + + # Check forecast feasibility + feasible_start = time_now + requirements['offset'] + feasible_end = time_now + requirements['forecast'] + + # Ensure time_start is within feasible bounds + if time_start < feasible_start: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") + logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") + raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") + + if time_start >= feasible_end: + logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") + logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") + raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be < {feasible_end}.") + + # Ensure time_stop is greater than time_start + if time_stop <= time_start: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") + + # Ensure time_stop is greater than time_start + if time_stop > feasible_end: + logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") + logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") + raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") + + # Ensure time step is within required limits + if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): + logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") + logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") + logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") + time_step = requirements['dt'] + + time = slice(time_start, time_stop, time_step) + + # Retrieve parallel processing setting from requirements + parallel = requirements['parallel'] + + return x, y, time, parallel + + def get_filenames(sarah_dir, coords): """ Get all files in directory `sarah_dir` relevent for coordinates `coords`. 
From 81cbc72292b78d2ba990fd6c407377735929e646 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 13:24:28 +0000 Subject: [PATCH 02/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- atlite/convert.py | 3 +- atlite/cutout.py | 101 ++- atlite/data.py | 12 +- atlite/datasets/__init__.py | 32 +- atlite/datasets/era5.py | 222 +++-- atlite/datasets/gebco.py | 151 ++-- atlite/datasets/icon.py | 935 ++++++++++++--------- atlite/datasets/icon_d2.py | 818 ++++++++++-------- atlite/datasets/icon_eu.py | 818 ++++++++++-------- atlite/datasets/meteo_forecast.py | 509 ++++++----- atlite/datasets/meteo_historic.py | 516 +++++++----- atlite/datasets/meteo_historic_forecast.py | 513 ++++++----- atlite/datasets/sarah.py | 149 ++-- 13 files changed, 2725 insertions(+), 2054 deletions(-) diff --git a/atlite/convert.py b/atlite/convert.py index a899ed9d..cad6f62f 100644 --- a/atlite/convert.py +++ b/atlite/convert.py @@ -148,7 +148,7 @@ def convert_and_aggregate( if matrix is not None: if shapes is not None: raise ValueError( - "Passing matrix and shapes is ambiguous. Pass " "only one of them." + "Passing matrix and shapes is ambiguous. Pass only one of them." ) if isinstance(matrix, xr.DataArray): @@ -400,6 +400,7 @@ def heat_demand(cutout, threshold=15.0, a=1.0, constant=0.0, hour_shift=0.0, **p **params, ) + # cooling demand def convert_cooling_demand(ds, threshold, a, constant, hour_shift): # Temperature is in Kelvin; take daily average diff --git a/atlite/cutout.py b/atlite/cutout.py index 7f5bbf8c..77d5c9c4 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -32,7 +32,6 @@ from atlite.convert import ( coefficient_of_performance, convert_and_aggregate, - cooling_demand, csp, dewpoint_temperature, heat_demand, @@ -66,7 +65,7 @@ class Cutout: This class builds the starting point for most atlite functionalities. 
""" - + def __init__(self, path, **cutoutparams): """ Provide an Atlite cutout object. @@ -121,7 +120,7 @@ def __init__(self, path, **cutoutparams): The default is 'linear'. Valid are all xarray interpolation aliases (such as: 'quadratic', 'cubic',...) chunks : dict - Chunks when opening netcdf files. + Chunks when opening netcdf files. Defaults to {'time': 100, 'y': 100, 'x': 100} data : xr.Dataset User provided cutout data. Save the cutout using `Cutout.to_file()` @@ -162,7 +161,7 @@ def __init__(self, path, **cutoutparams): data.attrs.update(storable_chunks) if cutoutparams: warn( - f'Arguments {", ".join(cutoutparams)} are ignored, since ' + f"Arguments {', '.join(cutoutparams)} are ignored, since " "cutout is already built." ) elif "data" in cutoutparams: @@ -178,14 +177,14 @@ def __init__(self, path, **cutoutparams): x = cutoutparams.pop("x") y = cutoutparams.pop("y") time = cutoutparams.pop("time") - + dx = cutoutparams.pop("dx", 0.25) dy = cutoutparams.pop("dy", 0.25) dt = cutoutparams.pop("dt", "1h") - + interp_s = cutoutparams.pop("interp_s", "linear") interp_t = cutoutparams.pop("interp_t", "linear") - + module = cutoutparams.pop("module") except KeyError as exc: raise TypeError( @@ -194,46 +193,59 @@ def __init__(self, path, **cutoutparams): "passed via argument 'bounds' or 'x' and 'y'." 
) from exc - # Convert different time inputs to a valid time slice with # pd.Timestamps as data - + # convert string or timestamp to slice - if isinstance(time, str) or isinstance(time, pd.Timestamp): + if isinstance(time, str) or isinstance(time, pd.Timestamp): time = pd.Timestamp(time) # Create a time slice using pandas datetime - time = slice(pd.Timestamp(f'{time.year}-{time.month}-{time.day} {time.hour}:{time.minute}'), - pd.Timestamp(f'{time.year}-12-31 23:00'), - pd.Timedelta(dt)) - + time = slice( + pd.Timestamp( + f"{time.year}-{time.month}-{time.day} {time.hour}:{time.minute}" + ), + pd.Timestamp(f"{time.year}-12-31 23:00"), + pd.Timedelta(dt), + ) + # convert list of timestamps to slice if isinstance(time, list): freq = pd.Timedelta(pd.infer_freq(time)) if freq is not pd.Timedelta(None): time = slice(time[0], time[-1], freq) else: - time = slice(time[0], time[-1], pd.Timedelta(dt)) - + time = slice(time[0], time[-1], pd.Timedelta(dt)) + # check if time slices has a timestep (dt) information - if isinstance(time, slice): + if isinstance(time, slice): if (time.step is None) or (time.step is pd.Timedelta(None)): - time = slice(pd.Timestamp(time.start), pd.Timestamp(time.stop), pd.Timedelta(dt)) + time = slice( + pd.Timestamp(time.start), + pd.Timestamp(time.stop), + pd.Timedelta(dt), + ) else: - time = slice(pd.Timestamp(time.start), pd.Timestamp(time.stop), pd.Timedelta(time.step)) - + time = slice( + pd.Timestamp(time.start), + pd.Timestamp(time.stop), + pd.Timedelta(time.step), + ) + # check if time slices is valid, assume if start == stop a whole # the data till the end of the year is requested. 
if time.start == time.stop: - if time.start < pd.Timestamp(f'{time.start.year}-12-31 23:00'): - time = slice(time.start, - pd.Timestamp(f'{time.start.year}-12-31 23:00'), - time.step) + if time.start < pd.Timestamp(f"{time.start.year}-12-31 23:00"): + time = slice( + time.start, + pd.Timestamp(f"{time.start.year}-12-31 23:00"), + time.step, + ) else: - time = slice(time.start, time.start + 2*time.step, time.step) - + time = slice(time.start, time.start + 2 * time.step, time.step) + if x.step is None: x = slice(x.start, x.stop, dx) - + if y.step is None: y = slice(y.start, y.stop, dy) @@ -244,29 +256,35 @@ def __init__(self, path, **cutoutparams): # cases that for example combine forecast with historic data # In additiona a flag if parallel calculations are possible is included time_now = pd.Timestamp.utcnow().replace(tzinfo=None).floor("h") - + if isinstance(module, list): - logger.info(f"Module requirements are set for the first module {module[0]}.") - x, y, time, parallel = datamodules[module[0]]._checkModuleRequirements(x, y, time, time_now) + logger.info( + f"Module requirements are set for the first module {module[0]}." 
+ ) + x, y, time, parallel = datamodules[module[0]]._checkModuleRequirements( + x, y, time, time_now + ) else: logger.info(f"Module requirements for module {module} are set.") - x, y, time, parallel = datamodules[module]._checkModuleRequirements(x, y, time, time_now) - + x, y, time, parallel = datamodules[module]._checkModuleRequirements( + x, y, time, time_now + ) + # In get coords forecast times up to 4 weeks are included coords = get_coords(x, y, time, x.step, y.step, time.step, **cutoutparams) - + # additional attributes interpolation and parallel computing are included attrs = { "module": module, "prepared_features": [], - "tz": f'{time.start.tz}', - "dx": f'{x.step}', - "dy": f'{y.step}', - "dt": f'{time.step}', - "interp_s":f'{interp_s}', - "interp_t":f'{interp_t}', - "parallel":parallel, - "init_time": f'{time_now.strftime("%Y-%m-%d %H:%M:%S")}', + "tz": f"{time.start.tz}", + "dx": f"{x.step}", + "dy": f"{y.step}", + "dt": f"{time.step}", + "interp_s": f"{interp_s}", + "interp_t": f"{interp_t}", + "parallel": parallel, + "init_time": f"{time_now.strftime('%Y-%m-%d %H:%M:%S')}", **storable_chunks, **cutoutparams, } @@ -279,7 +297,6 @@ def __init__(self, path, **cutoutparams): self.path = path self.data = data - @property def name(self): diff --git a/atlite/data.py b/atlite/data.py index 67a3a03e..46b7a30c 100644 --- a/atlite/data.py +++ b/atlite/data.py @@ -41,9 +41,9 @@ def get_features( lock = SerializableLock() datasets = [] get_data = datamodules[module].get_data - - if cutout.data.attrs['parallel']: - for feature in features: + + if cutout.data.attrs["parallel"]: + for feature in features: feature_data = delayed(get_data)( cutout, feature, @@ -54,9 +54,9 @@ def get_features( **parameters, ) datasets.append(feature_data) - + else: - for feature in features: + for feature in features: feature_data = get_data( cutout, feature, @@ -67,7 +67,7 @@ def get_features( **parameters, ) datasets.append(feature_data) - + datasets = compute(*datasets) ds = 
xr.merge(datasets, compat="equals") diff --git a/atlite/datasets/__init__.py b/atlite/datasets/__init__.py index cbd951ba..40797791 100644 --- a/atlite/datasets/__init__.py +++ b/atlite/datasets/__init__.py @@ -6,14 +6,26 @@ atlite datasets. """ -from atlite.datasets import era5, gebco, sarah, meteo_forecast, meteo_historic_forecast, meteo_historic, icon_d2, icon_eu, icon +from atlite.datasets import ( + era5, + gebco, + icon, + icon_d2, + icon_eu, + meteo_forecast, + meteo_historic, + meteo_historic_forecast, + sarah, +) -modules = {"era5": era5, - "sarah": sarah, - "gebco": gebco, - "meteo_forecast": meteo_forecast, - "meteo_historic_forecast": meteo_historic_forecast, - "meteo_historic": meteo_historic, - "icon_d2": icon_d2, - "icon_eu": icon_eu, - "icon": icon} +modules = { + "era5": era5, + "sarah": sarah, + "gebco": gebco, + "meteo_forecast": meteo_forecast, + "meteo_historic_forecast": meteo_historic_forecast, + "meteo_historic": meteo_historic, + "icon_d2": icon_d2, + "icon_eu": icon_eu, + "icon": icon, +} diff --git a/atlite/datasets/era5.py b/atlite/datasets/era5.py index dbdbb02e..f7283f39 100644 --- a/atlite/datasets/era5.py +++ b/atlite/datasets/era5.py @@ -8,12 +8,12 @@ https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation """ +import io import logging import os -import io -import zipfile import warnings import weakref +import zipfile from tempfile import mkstemp import cdsapi @@ -41,9 +41,9 @@ def nullcontext(): logger = logging.getLogger(__name__) -# Set url for data download, this allows to switch to different data +# Set url for data download, this allows to switch to different data # sources more easily. 
-era5_url = 'https://cds.climate.copernicus.eu/api' +era5_url = "https://cds.climate.copernicus.eu/api" # Model and CRS Settings crs = 4326 @@ -65,99 +65,135 @@ def nullcontext(): static_features = {"height"} -requirements = {'x': slice(-90, 90, 0.25), - 'y': slice(-90, 90, 0.25), - 'offset': (pd.Timestamp('1940-01-01')-pd.Timestamp.utcnow().replace(tzinfo=None).floor("h")), - 'forecast': pd.Timedelta(hours=-5*24), - 'dt': pd.Timedelta(hours=1), - 'parallel': True, - } +requirements = { + "x": slice(-90, 90, 0.25), + "y": slice(-90, 90, 0.25), + "offset": ( + pd.Timestamp("1940-01-01") + - pd.Timestamp.utcnow().replace(tzinfo=None).floor("h") + ), + "forecast": pd.Timedelta(hours=-5 * 24), + "dt": pd.Timedelta(hours=1), + "parallel": True, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. 
""" - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise 
ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." 
+ ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." + ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel @@ -332,10 +368,7 @@ def get_data_runoff(retrieval_params): """ Get runoff data for given retrieval parameters. 
""" - ds = retrieve_data( - url=era5_url, - variable=["runoff"], - **retrieval_params) + ds = retrieve_data(url=era5_url, variable=["runoff"], **retrieval_params) ds = _rename_and_clean_coords(ds) ds = ds.rename({"ro": "runoff"}) @@ -355,10 +388,7 @@ def get_data_height(retrieval_params): """ Get height data for given retrieval parameters. """ - ds = retrieve_data( - url=era5_url, - variable=["geopotential"], - **retrieval_params) + ds = retrieve_data(url=era5_url, variable=["geopotential"], **retrieval_params) ds = _rename_and_clean_coords(ds) ds = _add_height(ds) @@ -415,7 +445,9 @@ def retrieval_times(coords, static=False, monthly_requests=False): query = { "year": str(year), "month": [str(month).zfill(2)], - "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "day": list( + t[t.month == month].day.unique().astype(str).str.zfill(2) + ), "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], } times.append(query) @@ -447,21 +479,21 @@ def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): If you want to track the state of your request go to https://cds-beta.climate.copernicus.eu/requests?tab=all - """ - request = {"product_type": ["reanalysis"], - "data_format": "netcdf", - "download_format": "zip"} - + """ + request = { + "product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip", + } + request.update(updates) - assert {"year", "month", "variable"}.issubset( - request - ), "Need to specify at least 'variable', 'year' and 'month'" + assert {"year", "month", "variable"}.issubset(request), ( + "Need to specify at least 'variable', 'year' and 'month'" + ) client = cdsapi.Client( - url = url, - info_callback=logger.debug, - debug=logging.DEBUG >= logging.root.level + url=url, info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level ) result = client.retrieve(product, request) @@ -478,30 +510,38 @@ def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, 
**updates): varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) logger.info(f"CDS: Downloading variables\n\t{varstr}\n") result.download(target_zip) - + # Open the .zip file in memory with zipfile.ZipFile(target_zip, "r") as zf: # Identify .nc files inside the .zip nc_files = [name for name in zf.namelist() if name.endswith(".nc")] - + if not nc_files: - raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") - + raise FileNotFoundError( + "No .nc files found in the downloaded .zip archive." + ) + if len(nc_files) == 1: # If there's only one .nc file, read it into memory with zf.open(nc_files[0]) as nc_file: # Pass the in-memory file-like object to Xarray - ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) - + ds = xr.open_dataset( + io.BytesIO(nc_file.read()), chunks=chunks or {} + ) + else: # If multiple .nc files, combine them using Xarray datasets = [] for nc_file in nc_files: with zf.open(nc_file) as file: - datasets.append(xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {})) + datasets.append( + xr.open_dataset( + io.BytesIO(file.read()), chunks=chunks or {} + ) + ) # Combine datasets along temporal dimension - ds = xr.merge(datasets) - + ds = xr.merge(datasets) + if tmpdir is None: logging.debug(f"Adding finalizer for {target_zip}") weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) diff --git a/atlite/datasets/gebco.py b/atlite/datasets/gebco.py index 90c590bd..b3eac7ec 100755 --- a/atlite/datasets/gebco.py +++ b/atlite/datasets/gebco.py @@ -9,9 +9,9 @@ import logging +import pandas as pd import rasterio as rio import xarray as xr -import pandas as pd from pandas import to_numeric from rasterio.warp import Resampling @@ -21,99 +21,132 @@ features = {"height": ["height"]} -requirements = {'x': slice(-90, 90, 0.15), - 'y': slice(-90, 90, 0.15), - 'offset': pd.Timestamp('1940-01-01'), - 'forecast': pd.Timestamp('2050-01-01'), - 'dt': pd.Timedelta(hours=1), - 'parallel': True, - } 
+requirements = { + "x": slice(-90, 90, 0.15), + "y": slice(-90, 90, 0.15), + "offset": pd.Timestamp("1940-01-01"), + "forecast": pd.Timestamp("2050-01-01"), + "dt": pd.Timedelta(hours=1), + "parallel": True, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. """ - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step 
= time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." 
+ ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." 
+ ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel diff --git a/atlite/datasets/icon.py b/atlite/datasets/icon.py index 9c3b6475..29393d62 100644 --- a/atlite/datasets/icon.py +++ b/atlite/datasets/icon.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,19 +9,19 @@ https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html """ +import logging import os import warnings +from bz2 import decompress +from pathlib import Path + import numpy as np -import xarray as xr import pandas as pd import requests -import logging - +import xarray as xr +from bs4 import BeautifulSoup from cdo import Cdo -from pathlib import Path from retry import retry -from bz2 import decompress -from bs4 import BeautifulSoup from ..gis import maybe_swap_spatial_dims from ..pv.solar_position import SolarPosition @@ -54,13 +52,15 @@ 
def nullcontext(): # URL for accessing Open Data from DWD (Deutscher Wetterdienst) dwd_url = "https://opendata.dwd.de/weather/nwp/icon/grib/" dwd_grid_url = f"https://opendata.dwd.de/weather/lib/cdo/{dwd_icon_grid}.nc.bz2" -dwd_grid_description_url = "https://opendata.dwd.de/weather/lib/cdo/{dwd_icon_grid_description_folder}.tar.bz2" +dwd_grid_description_url = ( + "https://opendata.dwd.de/weather/lib/cdo/{dwd_icon_grid_description_folder}.tar.bz2" +) # ICON model runs are available at fixed intervals: 00, 03, 06, 09, 12, 15, 18, 21 UTC model_run_hours = np.array([0, 6, 12, 18]) # Averaging window of different model runs -averaging_window = 24 #hours +averaging_window = 24 # hours # Coordinate Reference System (CRS) used for geospatial data crs = 4326 @@ -68,7 +68,11 @@ def nullcontext(): # Dictionary defining available meteorological features and their associated data fields features = { "height": ["height"], # Elevation data - "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, direction, and surface roughness + "wind": [ + "wnd100m", + "wnd_azimuth", + "roughness", + ], # Wind speed, direction, and surface roughness "influx": [ "influx_toa", # Top-of-atmosphere solar radiation "influx_direct", # Direct solar radiation @@ -86,137 +90,169 @@ def nullcontext(): # Model requirements specifying spatial and temporal constraints requirements = { - 'x': slice(-180, 180, 0.125), # Longitude range with resolution - 'y': slice(-90, 90, 0.125), # Latitude range with resolution - 'offset': pd.Timedelta(hours=-18), # Time offset for forecast initialization - 'forecast': pd.Timedelta(hours=180), # Maximum forecast range - 'dt': pd.Timedelta(hours=1), # Temporal resolution of data - 'parallel': False, # Flag for enabling parallel processing - } + "x": slice(-180, 180, 0.125), # Longitude range with resolution + "y": slice(-90, 90, 0.125), # Latitude range with resolution + "offset": pd.Timedelta(hours=-18), # Time offset for forecast initialization + "forecast": 
pd.Timedelta(hours=180), # Maximum forecast range + "dt": pd.Timedelta(hours=1), # Temporal resolution of data + "parallel": False, # Flag for enabling parallel processing +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. """ - + # Download reference grid to allow regular lat lon conversion - _download_reference_grid(grid=dwd_icon_grid, - model='ICON', - reference_grid_url=dwd_grid_url) - + _download_reference_grid( + grid=dwd_icon_grid, model="ICON", reference_grid_url=dwd_grid_url + ) + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop 
= requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] - + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. 
Must be < {feasible_end}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." 
+ ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." + ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # # Check if forecast hours exceed limits # forecastHours = (time_stop - time_now) # if forecastHours > requirements['forecast']: # logger.error(f"The end time of the forecast {time_stop} exceedes the model requirements.") # logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") # logger.error(f"The required forecast horizon {forecastHours} exceeds the maximum forecast horzion {requirements['forecast']}.") - # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") + # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") # # forecastHours = requirements['forecast'] # # time_stop = time_now + forecastHours - + # # Check if offset is within required limits - # offset = (time_start - time_now) + # offset = (time_start - time_now) # if offset < requirements['offset']: # logger.error(f"The start time of the forecast {time_start} exceeds model requirements.") # logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") # logger.warning(f"Forecast offset of {offset} hours is below model requirements.") - # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") + # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") # offset = requirements['offset'] # time_start = time_now + offset - + # Ensure time step is within required limits - if (time_step is 
pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _getCurrentRun(time): - ''' + """ Determines the most recent available model run based on the current time. The latest run is fully available approximately 2 hours after initialization. To ensure the model run is successfully uploaded, the download delay time is set to 3 hours. @@ -227,35 +263,34 @@ def _getCurrentRun(time): ---------- time : datetime The current datetime in UTC. - + Returns ------- datetime The timestamp of the most recent available model run, floored to the hour. 
- ''' + """ download_delay = 3 # Delay in hours before the run is fully available - + # Adjust the current time by the delay to ensure availability adjusted_time = time - pd.Timedelta(hours=download_delay) # Find the most recent available run by flooring to the nearest model run hour run_hour = max(hour for hour in model_run_hours if hour <= adjusted_time.hour) - + # Construct the correct model run time run_time = adjusted_time.replace(hour=run_hour, minute=0, second=0, microsecond=0) return run_time - def _createDownloadUrl(url, var, field, run, hours): - ''' + """ Generates a list of download URLs for meteorological data from the DWD server. The function scrapes the available files for a given variable and model run, filtering based on field type, forecast hours, and model levels. - + This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main - + Parameters ---------- url : string @@ -263,130 +298,138 @@ def _createDownloadUrl(url, var, field, run, hours): var : string The variable name, optionally including levels separated by '/'. field : string - The field parameter: 'time-invariant' (static), 'soil-level' (162cm), + The field parameter: 'time-invariant' (static), 'soil-level' (162cm), 'model-level' (62;63), or 'single-level' (2D field). run : string Model run identifier. hours : int Maximum forecast hours to retrieve. - + Returns ------- list List of filtered download URLs. 
- ''' - + """ + # Extract variable name and associated levels - levels = pd.Series(var.split('/')[1:]).astype(int) # Convert levels to integers - var = var.split('/')[0] # Extract variable name - + levels = pd.Series(var.split("/")[1:]).astype(int) # Convert levels to integers + var = var.split("/")[0] # Extract variable name + # Construct the data URL based on provided parameters - data_url = "{url}{run}/{var}/".format(url=url, var=var, run=run) - + data_url = f"{url}{run}/{var}/" + # Send an HTTP GET request to fetch available files response = requests.get(data_url) - + # Raise an error if the request fails - response.raise_for_status() - + response.raise_for_status() + # Parse the HTML content to extract links - soup = BeautifulSoup(response.content, 'html.parser') - + soup = BeautifulSoup(response.content, "html.parser") + # Find all anchor tags ('a') representing file links - link_tags = soup.find_all('a') - + link_tags = soup.find_all("a") + # Initialize an empty list to store the file URLs urls = [] - + # Iterate through all link tags, extract URLs, and store them for tag in link_tags: - link = tag.get('href') # Extract the hyperlink reference - + link = tag.get("href") # Extract the hyperlink reference + if link: # Construct the full URL by appending the relative link to the base URL full_url = data_url + link urls.append(full_url) - + # Convert the list of URLs into a Pandas Series for easy filtering urls = pd.Series(urls) - + # Filter URLs to retain only those containing 'regular-lat-lon' grid format - urls = urls[urls.str.contains('icosahedral')] - + urls = urls[urls.str.contains("icosahedral")] + # Further filter URLs based on the specified model field urls = urls[urls.str.contains(field)] - + # Apply forecast time horizon filter (excluding 'time-invariant' fields) - if field != 'time-invariant': - urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] - + if field != "time-invariant": + urls = 
urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] + # Filter URLs based on model levels, if specified if not levels.empty: url_mask = pd.Series(index=urls.index, data=False) # Initialize boolean mask for level in levels: url_mask += urls.str.contains(f"_{level}_") # Check if URL contains level - urls = urls[url_mask] # Apply filter + urls = urls[url_mask] # Apply filter # Convert filtered URLs back to a list urls = list(urls) - + return urls def _deaverage(da): - ''' + """ Converts a temporally averaged data array into individual time-step values. Each time step's original value is reconstructed by reversing the cumulative averaging process. - + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing cumulative averages. - + Returns ------- xarray.DataArray Data array with de-averaged values. - ''' + """ # Create an integer index for time, matching da's shape - time_index = xr.DataArray(np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time}) + time_index = xr.DataArray( + np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time} + ) # Apply the reverse operation: Ψ_inst(t) = t * Ψ(t) - (t-1) * Ψ(t-1) - da_instantaneous = (time_index * da - (time_index - 1) * da.shift(time=1, fill_value=0)) - + da_instantaneous = time_index * da - (time_index - 1) * da.shift( + time=1, fill_value=0 + ) + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) + + return da_instantaneous - return da_instantaneous - def _deaccumulate(da): - ''' + """ Converts accumulated data into time-step differences. This function takes an accumulated dataset and calculates the incremental values between consecutive time steps. 
- + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing accumulated values. - + Returns ------- xarray.DataArray Data array with de-accumulated values (time-step differences). - ''' - + """ + # Apply the reverse operation: Ψ_inst(t) = Ψ(t) - Ψ(t-1) da_instantaneous = da - da.shift(time=1, fill_value=0) - + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) - return da_instantaneous + return da_instantaneous -def _average_duplicate_times(ds): +def _average_duplicate_times(ds): """ Averages duplicate timestamps in an xarray Dataset. @@ -417,13 +460,13 @@ def _average_duplicate_times(ds): def _mainDataCollector(url, var, field, forecast, offset, coords, area, grid, tmpdir): - ''' + """ Downloads meteorological data for a given variable and processes it accordingly. - + This function retrieves data from the specified URL, processes it to rename and clean coordinates, and applies de-averaging or de-accumulation where necessary based on the GRIB step type. - + Parameters ---------- url : string @@ -441,13 +484,13 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, area, grid, tm The spatial coordinates where data is required. tmpdir : string Path to the temporary directory where downloaded files are stored. - + Returns ------- xarray.Dataset Processed dataset containing the collected meteorological data. 
- ''' - + """ + # Extract the most recent forecast run time latestRun = forecast[0] @@ -455,62 +498,69 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, area, grid, tm previousRuns = offset[offset < latestRun] # Keep only entries that align with ICON model run hours - previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True) - + previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values( + ascending=True + ) + if len(previousRuns) > 0: # Get the hour of the earliest previous run first_prev_hour = previousRuns[0].hour - + # Find the previous index in `model_run_hours` prev_idx = np.where(model_run_hours == first_prev_hour)[0][0] - 1 - + # Compute the adjusted previous run - previousRun = pd.DatetimeIndex([previousRuns[0].replace( - hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 - )]) - + previousRun = pd.DatetimeIndex( + [ + previousRuns[0].replace( + hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 + ) + ] + ) + # Add previousRun to previousRuns, ensuring uniqueness previousRuns = previousRuns.union(previousRun).sort_values(ascending=True) # Create a list of runs including the latest run and previous runs # Use an averaging_window of X hours for all previous runs to average the results - runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [(latestRun.strftime("%H"), len(forecast))] + runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [ + (latestRun.strftime("%H"), len(forecast)) + ] # # Generate download URLs for the specified variable and field # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + for run, hours in runs: - # Generate download URLs for the specified variable and field urls = _createDownloadUrl(url, var, field, run, hours) - + # Download and collect 
the main dataset for the given variable - ds_temps.append(_download(urls, var, coords, area, grid, tmpdir)) + ds_temps.append(_download(urls, var, coords, area, grid, tmpdir)) # Concatenate along the time dimension, keeping also duplicated timestamps ds = xr.concat(ds_temps, dim="time") - + # Average duplicates for smooth forecast transitioning ds = _average_duplicate_times(ds) - + # # Download and collect the main dataset for the given variable - # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) - + # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) + # Rename and clean coordinate labels for consistency # ds_temp = _rename_and_clean_coords(ds_temp) - + # # Iterate through all data variables in the dataset # for ds_var in list(ds_temp.data_vars): # # If the variable is an averaged quantity, apply de-averaging # if ds_temp[ds_var].attrs['GRIB_stepType'] == 'avg': # ds_temp[ds_var] = _deaverage(ds_temp[ds_var]) - + # # If the variable is an accumulated quantity, apply de-accumulation # elif ds_temp[ds_var].attrs['GRIB_stepType'] == 'accum': # ds_temp[ds_var] = _deaccumulate(ds_temp[ds_var]) @@ -520,12 +570,12 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, area, grid, tm def _interpolate(ds, static, coords, grid, interp_s, interp_t): - ''' + """ Interpolates a dataset to match specific latitude and longitude coordinates. - + If the data is not static, it first interpolates temporally. Then, it applies spatial interpolation using binning to adjust the data to the grid resolution. - + Parameters ---------- ds : xarray.Dataset @@ -540,105 +590,117 @@ def _interpolate(ds, static, coords, grid, interp_s, interp_t): Spatial interpolation method (not used in the function but can be applied elsewhere). interp_t : string Temporal interpolation method to be used. - + Returns ------- xarray.Dataset The interpolated dataset adjusted to the target spatial and temporal resolution. 
- ''' - + """ + # Perform temporal interpolation if the data is not static if not static: try: - ds = ds.interp(time=coords['time'].values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}) + ds = ds.interp( + time=coords["time"].values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." + ) logger.info("Interpolation method is set to 'nearest' instead.") - ds = ds.interp(time=coords['time'].values, - method="nearest", - kwargs={"fill_value": "extrapolate"}) - + ds = ds.interp( + time=coords["time"].values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + # Create bin edges and labels for x-coordinates - x_bins = coords['x'].values - x_bins = np.insert(x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0) # Extend bin range + x_bins = coords["x"].values + x_bins = np.insert( + x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0 + ) # Extend bin range x_bins_label = np.round(x_bins[:-1] + grid[0], 8) # Compute bin centers - + # Create bin edges and labels for y-coordinates - y_bins = coords['y'].values - y_bins = np.insert(y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0) # Extend bin range + y_bins = coords["y"].values + y_bins = np.insert( + y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0 + ) # Extend bin range y_bins_label = np.round(y_bins[:-1] + grid[1], 8) # Compute bin centers - + # Store original dataset attributes attrs = ds.attrs - + # Perform spatial binning by grouping data into bins along x and y dimensions and computing the mean ds = ds.groupby_bins("x", x_bins, labels=x_bins_label).mean(dim="x") ds = ds.groupby_bins("y", y_bins, labels=y_bins_label).mean(dim="y") - + # Rename bins to standard coordinate names - ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'}) - + ds = 
ds.rename({"y_bins": "y", "x_bins": "x"}) + # Reassign original dataset attributes ds = ds.assign_attrs(attrs) - + return ds @retry(tries=5, delay=5, backoff=2, logger=logger) -def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', regrid=False, **kwargs): - ''' +def _urlopen_with_retry(data_url, tmpfp, engine="cfgrib", regrid=False, **kwargs): + """ Attempts to download and decompress a dataset file with automatic retry on failure. - + This function fetches data from a given URL, retries up to five times in case of failure, and decompresses the response content before saving it to a temporary file. - + Parameters ---------- data_url : string The URL from which data should be downloaded. tmpfp : string The file path where the downloaded content will be temporarily stored. - + Returns ------- tuple - resp (requests.Response): The HTTP response object from the request. - ds (xarray.Dataset): The dataset extracted from the downloaded file. - ''' - + """ + # Send an HTTP GET request to the data URL with a timeout of 5 seconds resp = requests.get(data_url, timeout=5) # Check if the request was successful (HTTP 200 OK) if resp.status_code == 200: # Open the specified temporary file and write the decompressed response content - with open(tmpfp, 'wb') as f: + with open(tmpfp, "wb") as f: f.write(decompress(resp.content)) else: # Raise an error if the response was unsuccessful - raise ValueError(f"Error in response: {resp.reason}, status code: {resp.status_code}") - + raise ValueError( + f"Error in response: {resp.reason}, status code: {resp.status_code}" + ) + if regrid: - # Regrid the data to regular lon-lat grid + # Regrid the data to regular lon-lat grid ds = _regrid_data(tmpfp, **kwargs) else: # Load the downloaded file as an xarray dataset using the 'cfgrib' engine - ds = xr.open_dataset(tmpfp, engine=engine) - + ds = xr.open_dataset(tmpfp, engine=engine) + # Return both the HTTP response object and the loaded dataset return resp, ds def _download(urls, var, 
coords, area, grid, tmpdir=None): - ''' + """ Collects meteorological data for all timesteps of a given variable. - + This function retrieves data files, processes them, and merges them into a single dataset. It determines the latest available runs, downloads the necessary files, and structures them according to the expected format. - + Parameters ---------- url : string @@ -654,13 +716,13 @@ def _download(urls, var, coords, area, grid, tmpdir=None): Array representing the forecast offsets. tmpdir : string, optional Temporary directory for storing downloaded files. - + Returns ------- xarray.Dataset Merged dataset containing the collected meteorological data. - ''' - + """ + # # Extract the most recent forecast run time # latestRun = forecast[0] @@ -677,89 +739,97 @@ def _download(urls, var, coords, area, grid, tmpdir=None): # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + # Iterate over generated URLs and process each file for data_url in urls: - logger.info("ICON data -> Processing file: {f}".format(f=data_url)) - + logger.info(f"ICON data -> Processing file: {data_url}") + # Extract filename from URL and construct temporary file path - tmpfn = os.path.basename(data_url) - tmpfn = Path(tmpfn).with_suffix('') - tmpfp = "{p}/{tmpfn}".format(tmpfn=tmpfn, p=tmpdir) - + tmpfn = os.path.basename(data_url) + tmpfn = Path(tmpfn).with_suffix("") + tmpfp = f"{tmpdir}/{tmpfn}" + # Attempt to download and extract the dataset try: - resp, ds_temp = _urlopen_with_retry(data_url, - tmpfp, - regrid=True, - engine='netcdf4', - var=var, - coords=coords, - area=area, - grid=grid, - tmpdir=tmpdir) + resp, ds_temp = _urlopen_with_retry( + data_url, + tmpfp, + regrid=True, + engine="netcdf4", + var=var, + coords=coords, + area=area, + grid=grid, + tmpdir=tmpdir, + ) except Exception as err: - logger.info("Could not get {url}: 
{err}".format(err=err, url=data_url)) + logger.info(f"Could not get {data_url}: {err}") continue # Skip to next URL if download fails - + # Check if the dataset contains other coordinate ds_coords = list(ds_temp.coords) - ds_coords_to_keep = ["valid_time", "longitude", "latitude", "generalVerticalLayer"] - ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] - + ds_coords_to_keep = [ + "valid_time", + "longitude", + "latitude", + "generalVerticalLayer", + ] + ds_coords_to_drop = [ + ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep + ] + # Remove unwanted coordinates ds_temp = ds_temp.drop_vars(ds_coords_to_drop) - + ds_temps.append(ds_temp) - + # Merge all collected datasets into a single dataset - ds = xr.merge(ds_temps) - + ds = xr.merge(ds_temps) + # Rename and clean coordinate labels for consistency ds = _rename_and_clean_coords(ds) - + # Iterate through all data variables in the dataset for ds_var in list(ds.data_vars): # If the variable is an averaged quantity, apply de-averaging - if ds[ds_var].attrs['GRIB_stepType'] == 'avg': + if ds[ds_var].attrs["GRIB_stepType"] == "avg": ds[ds_var] = _deaverage(ds[ds_var]) - + # If the variable is an accumulated quantity, apply de-accumulation - elif ds[ds_var].attrs['GRIB_stepType'] == 'accum': + elif ds[ds_var].attrs["GRIB_stepType"] == "accum": ds[ds_var] = _deaccumulate(ds[ds_var]) return ds def _download_reference_grid(grid: str | Path, model: str, reference_grid_url: str): - assert isinstance(grid, str | Path) - + # Ensure the directory exists - os.makedirs(GRID_DIRECTORY, exist_ok=True) - + os.makedirs(GRID_DIRECTORY, exist_ok=True) + if isinstance(grid, str): reference_grid_path = f"{GRID_DIRECTORY}/{Path(grid).with_suffix('.nc')}" else: reference_grid_path = grid - + # Check wheter the reference grid dataset exists in atlite/resources/grids. # If not, download it from the grid data url. 
if not os.path.isfile(reference_grid_path): try: - logger.info(f"{model} Grid Data -> Downloading file: {reference_grid_url}") + logger.info(f"{model} Grid Data -> Downloading file: {reference_grid_url}") # Download the zip file and save it temporarely - resp, reference_grid = _urlopen_with_retry(reference_grid_url, - tmpfp=reference_grid_path, - engine='netcdf4') - + resp, reference_grid = _urlopen_with_retry( + reference_grid_url, tmpfp=reference_grid_path, engine="netcdf4" + ) + except Exception as err: logger.info(f"Could not get {reference_grid_url}: {err}") - + return reference_grid_path @@ -788,81 +858,90 @@ def _regrid_data(tmp_data_filepath, var, coords, area, grid, tmpdir): """ # Data Logging of the ICON Grid - logger.info(f"ICON Grid Data -> Processing file: {dwd_grid_url}") - + logger.info(f"ICON Grid Data -> Processing file: {dwd_grid_url}") + # Load original dataset to later process attributes - ds_original = xr.load_dataset(tmp_data_filepath, engine='cfgrib') - + ds_original = xr.load_dataset(tmp_data_filepath, engine="cfgrib") + # Initialize Cdo constructor cdo = Cdo() # Set temporary direcotry for Cdo operations - cdo = Cdo(tempdir=tmpdir) - + cdo = Cdo(tempdir=tmpdir) + # Set the reference grid as downloaded in requirements reference_grid = f"{GRID_DIRECTORY}/{Path(dwd_icon_grid).with_suffix('.nc')}" - + # Attach the reference grid data to the input dataset - ds = cdo.setgrid(reference_grid, - input=tmp_data_filepath, - returnXDataset=True) - + ds = cdo.setgrid(reference_grid, input=tmp_data_filepath, returnXDataset=True) + # Create a temporary target grid file from target coordinates - target_grid_file = os.path.basename(f"target_grid_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}") - tmp_target_grid_file = os.path.basename(f"{target_grid_file}") + target_grid_file = os.path.basename( + f"target_grid_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}" + ) + tmp_target_grid_file = 
os.path.basename(f"{target_grid_file}") tmp_target_grid_file_name = Path(tmp_target_grid_file + ".txt") tmp_target_grid_file_path = f"{tmpdir}/{tmp_target_grid_file_name}" if not os.path.isfile(tmp_target_grid_file_path): with open(tmp_target_grid_file_path, "w") as f: f.write("gridtype = lonlat\n") - f.write(f"xsize = {len(coords['x'])}\n") # number of longitudes - f.write(f"ysize = {len(coords['y'])}\n") # number of latitudes - f.write(f"xfirst = {area[1]}\n") # first longitude - f.write(f"xinc = {grid[0]}\n") # longitude increment - f.write(f"yfirst = {area[2]}\n") # first latitude - f.write(f"yinc = {grid[1]}\n") # latitude increment - + f.write(f"xsize = {len(coords['x'])}\n") # number of longitudes + f.write(f"ysize = {len(coords['y'])}\n") # number of latitudes + f.write(f"xfirst = {area[1]}\n") # first longitude + f.write(f"xinc = {grid[0]}\n") # longitude increment + f.write(f"yfirst = {area[2]}\n") # first latitude + f.write(f"yinc = {grid[1]}\n") # latitude increment + # Create a temporary transformation weight file from the reference subgrid to the target grid. 
- tmp_weight_file = os.path.basename(f"weight_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}") + tmp_weight_file = os.path.basename( + f"weight_file_lonlat_x{area[1]}x{area[3]}_y{area[0]}x{area[2]}_g{grid[0]}x{grid[1]}" + ) tmp_weight_file_name = Path(tmp_weight_file + ".nc") tmp_weight_file_path = f"{tmpdir}/{tmp_weight_file_name}" if not os.path.isfile(tmp_weight_file_path): - cdo.gennn(tmp_target_grid_file_path, - input=ds, - output=tmp_weight_file_path) - + cdo.gennn(tmp_target_grid_file_path, input=ds, output=tmp_weight_file_path) + # Regrid the data to the target grid (triangular to latlon grid) - ds = cdo.remap(f"{tmp_target_grid_file_path},{tmp_weight_file_path}", - input=ds, - returnXDataset=True) + ds = cdo.remap( + f"{tmp_target_grid_file_path},{tmp_weight_file_path}", + input=ds, + returnXDataset=True, + ) # Drop 'bnds' and all associated variables from the dataset - if 'bnds' in ds.sizes: - ds = ds.drop_dims('bnds') - - if 'generalVerticalLayer' in list(ds_original.coords): - ds = ds.rename({'height': 'generalVerticalLayer'}) - - if 'heightAboveGround' in list(ds_original.coords): - ds = ds.sel({'height': 2.0}) - - if 'depthBelowLandLayer' in list(ds_original.coords): - ds = ds.isel({'depth': 0}) - + if "bnds" in ds.sizes: + ds = ds.drop_dims("bnds") + + if "generalVerticalLayer" in list(ds_original.coords): + ds = ds.rename({"height": "generalVerticalLayer"}) + + if "heightAboveGround" in list(ds_original.coords): + ds = ds.sel({"height": 2.0}) + + if "depthBelowLandLayer" in list(ds_original.coords): + ds = ds.isel({"depth": 0}) + # Rename variable dimensions accordingly - ds = ds.rename({'time': "valid_time", "lon": "longitude", "lat": "latitude"}) - + ds = ds.rename({"time": "valid_time", "lon": "longitude", "lat": "latitude"}) + # Create a mapping of variable names - rename_mapping = {old_var: new_var for old_var, new_var in zip(ds.data_vars, ds_original.data_vars)} - + rename_mapping = { + old_var: new_var + for 
old_var, new_var in zip(ds.data_vars, ds_original.data_vars) + } + # Rename all variables in ds based on ds_original ds = ds.rename(rename_mapping) - + for data_var in list(ds_original.data_vars): - ds[data_var].attrs['GRIB_stepType'] = ds_original[data_var].attrs['GRIB_stepType'] - ds[data_var].attrs['GRIB_missingValue'] = ds_original[data_var].attrs['GRIB_missingValue'] - ds[data_var].attrs['GRIB_gridType'] = "latlon" - + ds[data_var].attrs["GRIB_stepType"] = ds_original[data_var].attrs[ + "GRIB_stepType" + ] + ds[data_var].attrs["GRIB_missingValue"] = ds_original[data_var].attrs[ + "GRIB_missingValue" + ] + ds[data_var].attrs["GRIB_gridType"] = "latlon" + return ds @@ -886,48 +965,52 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): return ds + def _interpolate_to_cutout_resolution(ds, retrieval_params, static): - # Interpolate the data spatially and temporally to the wanted cutout resolution ds_temps = [] for idx, var in enumerate(ds.data_vars): - ds_temps.append(_interpolate(ds[var], static, - retrieval_params['coords'], - retrieval_params['grid'], - retrieval_params['interp_s'], - retrieval_params['interp_t']) - ) - + ds_temps.append( + _interpolate( + ds[var], + static, + retrieval_params["coords"], + retrieval_params["grid"], + retrieval_params["interp_s"], + retrieval_params["interp_t"], + ) + ) + ds = xr.merge(ds_temps) ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - - ds = ds.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {}) - + + ds = ds.unify_chunks().chunk(chunks=retrieval_params["chunks"] or {}) + return ds def get_data_wind(retrieval_params): - ''' + """ Retrieves and processes wind data from the DWD server. - + The function collects wind speed and direction data at 100m above ground level, as well as surface roughness data. It then processes and interpolates this data to match the desired spatial and temporal resolution. 
- + Parameters ---------- retrieval_params : dict Dictionary containing parameters for data retrieval, including coordinates, grid resolution, and interpolation methods. - + Returns ------- xarray.Dataset Processed dataset containing wind speed, wind direction, and surface roughness. - ''' - + """ + # Retrieve wind data from model levels 62 and 63 - retrieval_params['field'] = ['model-level', 'model-level'] + retrieval_params["field"] = ["model-level", "model-level"] ds = retrieve_data( url=dwd_url, variable=[ @@ -936,46 +1019,44 @@ def get_data_wind(retrieval_params): ], **retrieval_params, ) - + # Compute the mean wind values across the general vertical layers - ds["u"] = ds["u"].mean('generalVerticalLayer') - ds["v"] = ds["v"].mean('generalVerticalLayer') - ds = ds.drop_dims('generalVerticalLayer') # Remove the dimension after averaging + ds["u"] = ds["u"].mean("generalVerticalLayer") + ds["v"] = ds["v"].mean("generalVerticalLayer") + ds = ds.drop_dims("generalVerticalLayer") # Remove the dimension after averaging ds = ds.rename({"u": "u_100m", "v": "v_100m"}) # Rename variables for clarity - - + # Retrieve surface roughness data from single-level data - retrieval_params['field'] = ['single-level'] + retrieval_params["field"] = ["single-level"] ds2 = retrieve_data( url=dwd_url, variable=["z0"], # Surface roughness length **retrieval_params, ) - + # Merge wind data with roughness data into a single dataset ds = xr.merge([ds, ds2]) - + # Rename roughness variable for clarity ds = ds.rename({"fsr": "roughness"}) ds["roughness"] = ds["roughness"].assign_attrs( - units="m", - long_name="Surface roughness" + units="m", long_name="Surface roughness" ) - + # Compute wind speed at 100m using the Pythagorean theorem ds["wnd100m"] = np.sqrt(ds["u_100m"] ** 2 + ds["v_100m"] ** 2).assign_attrs( units="m/s", long_name="100 metre wind speed" ) - + # Compute wind direction azimuth (0 = North, π/2 = East, π = South, 3π/2 = West) azimuth = np.arctan2(ds["u_100m"], ds["v_100m"]) - 
+ # Ensure wind azimuth is within the 0 to 2π range ds["wnd_azimuth"] = azimuth.where(azimuth >= 0, azimuth + 2 * np.pi) - + # Remove intermediate wind component variables after processing ds = ds.drop_vars(["u_100m", "v_100m"]) - + return ds @@ -988,7 +1069,12 @@ def sanitize_wind(ds): def get_data_influx(retrieval_params): """Get influx data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level', 'single-level', 'single-level', 'single-level'] + retrieval_params["field"] = [ + "single-level", + "single-level", + "single-level", + "single-level", + ] ds = retrieve_data( url=dwd_url, variable=[ @@ -1000,29 +1086,42 @@ def get_data_influx(retrieval_params): **retrieval_params, ) - ds = ds.rename({"avg_tnswrf": "influx_toa", - "ASWDIR_S": "influx_direct", - "ASWDIFD_S": "influx_diffuse", - "al": "albedo"}) - - ds["albedo"] = (ds["albedo"]/100).assign_attrs(units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation") - ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(units="W m**-2", long_name="Surface down solar diffuse radiation") - ds["influx_direct"] = ds["influx_direct"].assign_attrs(units="W m**-2", long_name="Surface down solar direct radiation") - ds["influx_toa"] = ds["influx_toa"].assign_attrs(units="W m**-2", long_name="Net short-wave radiation flux at top of atmosphere (TOA)") - + ds = ds.rename( + { + "avg_tnswrf": "influx_toa", + "ASWDIR_S": "influx_direct", + "ASWDIFD_S": "influx_diffuse", + "al": "albedo", + } + ) + + ds["albedo"] = (ds["albedo"] / 100).assign_attrs( + units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation" + ) + ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs( + units="W m**-2", long_name="Surface down solar diffuse radiation" + ) + ds["influx_direct"] = ds["influx_direct"].assign_attrs( + units="W m**-2", long_name="Surface down solar direct radiation" + ) + ds["influx_toa"] = ds["influx_toa"].assign_attrs( + units="W 
m**-2", + long_name="Net short-wave radiation flux at top of atmosphere (TOA)", + ) + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ICON-EU variables are mean values for previous hour, i.e. 13:01 to 14:00 are labelled as "14:00" # account by calculating the SolarPosition for the center of the interval for aggregation happens # see https://github.com/PyPSA/atlite/issues/158 @@ -1041,31 +1140,29 @@ def get_data_influx(retrieval_params): ) ) sp = SolarPosition(ds, time_shift=time_shift) - + sp = sp.rename({v: f"solar_{v}" for v in sp.data_vars}) ds = xr.merge([ds, sp]) - + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ds = ds.unify_chunks.chunk(chunks=retrieval_params['chunks'] or {}) - - - + # ds = ds.drop_vars(['lon','lat']) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + return ds @@ -1079,36 +1176,36 @@ def sanitize_influx(ds): def get_data_temperature(retrieval_params): """Get wind temperature for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','soil-level'] - ds = retrieve_data( - url=dwd_url, 
- variable=["t_2m", - "t_so/162"], - **retrieval_params + retrieval_params["field"] = ["single-level", "soil-level"] + ds = retrieve_data(url=dwd_url, variable=["t_2m", "t_so/162"], **retrieval_params) + + ds = ds.rename({"t2m": "temperature", "T_SO": "soil temperature"}) + + ds["temperature"] = ds["temperature"].assign_attrs( + units="K", long_name="Temperature at 2m above ground" + ) + ds["soil temperature"] = ds["soil temperature"].assign_attrs( + units="K", long_name="Soil temperature in 162 cm depth " ) - ds = ds.rename({"t2m": "temperature", - "T_SO": "soil temperature"}) - - ds["temperature"] = ds["temperature"].assign_attrs(units="K", long_name="Temperature at 2m above ground") - ds["soil temperature"] = ds["soil temperature"].assign_attrs(units="K", long_name="Soil temperature in 162 cm depth ") - return ds def get_data_runoff(retrieval_params): """Get runoff data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','single-level'] - ds = retrieve_data(url=dwd_url, - variable=["runoff_s", - "runoff_g"], - **retrieval_params) - - ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(units="kg m**-2", long_name="Surface and Soil water runoff (accumulated since model start)") + retrieval_params["field"] = ["single-level", "single-level"] + ds = retrieve_data( + url=dwd_url, variable=["runoff_s", "runoff_g"], **retrieval_params + ) + + ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs( + units="kg m**-2", + long_name="Surface and Soil water runoff (accumulated since model start)", + ) ds = ds.drop_vars(["RUNOFF_S", "RUNOFF_G"]) - + return ds @@ -1121,16 +1218,14 @@ def sanitize_runoff(ds): def get_data_height(retrieval_params): """Get height data for given retrieval parameters.""" # Retrieve time-invariant data - retrieval_params['field'] = ['time-invariant'] - ds = retrieve_data(url=dwd_url, - variable=["hsurf"], - **retrieval_params) - + retrieval_params["field"] = 
["time-invariant"] + ds = retrieve_data(url=dwd_url, variable=["hsurf"], **retrieval_params) + ds = ds.rename({"HSURF": "height"}) ds["height"] = ds["height"].assign_attrs( units="m", - long_name="Geometric Height of the earths surface above sea level (2D field)" - ) + long_name="Geometric Height of the earths surface above sea level (2D field)", + ) return ds @@ -1194,7 +1289,7 @@ def retrieval_times(coords, tz, static=False): return { "forecast": forecast_times, "offset": offset_times, - } + } def noisy_unlink(path): @@ -1205,44 +1300,50 @@ def noisy_unlink(path): except PermissionError: logger.error(f"Unable to delete file {path}, as it is still in use.") - -def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): +def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): """ Download data from the ICON-EU Model from the Open Data Server (ODS) of DWD. - + If you want to manually downolad the data go to: https://opendata.dwd.de/weather/nwp/icon-eu/grib/ """ - + request = {"product_type": "icon_eu", "format": "direct-download"} request.update(updates) ds_temps = [] - #Download data for each variable individually and then merge all in one xarray + # Download data for each variable individually and then merge all in one xarray logger.info(f"open-dwd: Downloading variables\n\t{request['variable']}\n") - for idx, var in enumerate(request['variable']): - ds_temps.append(_mainDataCollector(url, - var, - request['field'][idx], - request['forecast'], - request['offset'], - request['coords'], - request['area'], - request['grid'], - tmpdir) - ) - + for idx, var in enumerate(request["variable"]): + ds_temps.append( + _mainDataCollector( + url, + var, + request["field"][idx], + request["forecast"], + request["offset"], + request["coords"], + request["area"], + request["grid"], + tmpdir, + ) + ) + ds = xr.merge(ds_temps).chunk(chunks=chunks) - + return ds -def get_data(cutout, feature, tmpdir, - lock=None, - 
monthly_requests=False, - concurrent_requests=False, - **creation_parameters): +def get_data( + cutout, + feature, + tmpdir, + lock=None, + monthly_requests=False, + concurrent_requests=False, + **creation_parameters, +): """ Retrieve data from DWDs ICON-EU Model dataset (via ODS). @@ -1274,7 +1375,7 @@ def get_data(cutout, feature, tmpdir, coords = cutout.coords sanitize = creation_parameters.get("sanitize", True) - + retrieval_params = { "product": "dwd_icon_eu", "area": _area(coords), @@ -1302,8 +1403,10 @@ def retrieve_once(time, static=False): return ds if feature in static_features: - return retrieve_once(retrieval_times(coords, cutout.data.tz, True), True).squeeze() - + return retrieve_once( + retrieval_times(coords, cutout.data.tz, True), True + ).squeeze() + dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False) return dataset.sel(time=coords["time"]) diff --git a/atlite/datasets/icon_d2.py b/atlite/datasets/icon_d2.py index a3929e9a..eaae775b 100644 --- a/atlite/datasets/icon_d2.py +++ b/atlite/datasets/icon_d2.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,18 +9,18 @@ https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html """ +import logging import os import warnings +from bz2 import decompress +from pathlib import Path + import numpy as np -import xarray as xr import pandas as pd import requests -import logging - -from pathlib import Path -from retry import retry -from bz2 import decompress +import xarray as xr from bs4 import BeautifulSoup +from retry import retry from ..gis import maybe_swap_spatial_dims from ..pv.solar_position import SolarPosition @@ -48,7 +46,7 @@ def nullcontext(): model_run_hours = np.array([0, 3, 6, 9, 12, 15, 18, 21]) # Averaging window of different model runs -averaging_window = 24 #hours +averaging_window = 24 # hours # Coordinate Reference System 
(CRS) used for geospatial data crs = 4326 @@ -56,7 +54,11 @@ def nullcontext(): # Dictionary defining available meteorological features and their associated data fields features = { "height": ["height"], # Elevation data - "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, direction, and surface roughness + "wind": [ + "wnd100m", + "wnd_azimuth", + "roughness", + ], # Wind speed, direction, and surface roughness "influx": [ "influx_toa", # Top-of-atmosphere solar radiation "influx_direct", # Direct solar radiation @@ -74,132 +76,164 @@ def nullcontext(): # Model requirements specifying spatial and temporal constraints requirements = { - 'x': slice(-3.84, 20.21, 0.02), # Longitude range with resolution - 'y': slice(43.19, 57.63, 0.02), # Latitude range with resolution - 'offset': pd.Timedelta(hours=-18), # Time offset for forecast initialization - 'forecast': pd.Timedelta(hours=48), # Maximum forecast range - 'dt': pd.Timedelta(hours=1), # Temporal resolution of data - 'parallel': True, # Flag for enabling parallel processing - } + "x": slice(-3.84, 20.21, 0.02), # Longitude range with resolution + "y": slice(43.19, 57.63, 0.02), # Latitude range with resolution + "offset": pd.Timedelta(hours=-18), # Time offset for forecast initialization + "forecast": pd.Timedelta(hours=48), # Maximum forecast range + "dt": pd.Timedelta(hours=1), # Temporal resolution of data + "parallel": True, # Flag for enabling parallel processing +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. 
""" - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise 
ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] - + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. 
Must be > {time_start}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." 
+ ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # # Check if forecast hours exceed limits # forecastHours = (time_stop - time_now) # if forecastHours > requirements['forecast']: # logger.error(f"The end time of the forecast {time_stop} exceedes the model requirements.") # logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") # logger.error(f"The required forecast horizon {forecastHours} exceeds the maximum forecast horzion {requirements['forecast']}.") - # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") + # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") # # forecastHours = requirements['forecast'] # # time_stop = time_now + forecastHours - + # # Check if offset is within required limits - # offset = (time_start - time_now) + # offset = (time_start - time_now) # if offset < requirements['offset']: # logger.error(f"The start time of the forecast {time_start} exceeds model requirements.") # logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") # logger.warning(f"Forecast offset of {offset} hours is below model requirements.") - # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") + # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") # offset = requirements['offset'] # time_start = time_now + offset - + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is 
pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _getCurrentRun(time): - ''' + """ Determines the most recent available model run based on the current time. The latest run is fully available approximately 2 hours after initialization. To ensure the model run is successfully uploaded, the download delay time is set to 3 hours. @@ -210,35 +244,34 @@ def _getCurrentRun(time): ---------- time : datetime The current datetime in UTC. - + Returns ------- datetime The timestamp of the most recent available model run, floored to the hour. - ''' + """ download_delay = 3 # Delay in hours before the run is fully available - + # Adjust the current time by the delay to ensure availability adjusted_time = time - pd.Timedelta(hours=download_delay) # Find the most recent available run by flooring to the nearest model run hour run_hour = max(hour for hour in model_run_hours if hour <= adjusted_time.hour) - + # Construct the correct model run time run_time = adjusted_time.replace(hour=run_hour, minute=0, second=0, microsecond=0) return run_time - def _createDownloadUrl(url, var, field, run, hours): - ''' + """ Generates a list of download URLs for meteorological data from the DWD server. The function scrapes the available files for a given variable and model run, filtering based on field type, forecast hours, and model levels. 
- + This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main - + Parameters ---------- url : string @@ -246,130 +279,138 @@ def _createDownloadUrl(url, var, field, run, hours): var : string The variable name, optionally including levels separated by '/'. field : string - The field parameter: 'time-invariant' (static), 'soil-level' (162cm), + The field parameter: 'time-invariant' (static), 'soil-level' (162cm), 'model-level' (62;63), or 'single-level' (2D field). run : string Model run identifier. hours : int Maximum forecast hours to retrieve. - + Returns ------- list List of filtered download URLs. - ''' - + """ + # Extract variable name and associated levels - levels = pd.Series(var.split('/')[1:]).astype(int) # Convert levels to integers - var = var.split('/')[0] # Extract variable name - + levels = pd.Series(var.split("/")[1:]).astype(int) # Convert levels to integers + var = var.split("/")[0] # Extract variable name + # Construct the data URL based on provided parameters - data_url = "{url}{run}/{var}/".format(url=url, var=var, run=run) - + data_url = f"{url}{run}/{var}/" + # Send an HTTP GET request to fetch available files response = requests.get(data_url) - + # Raise an error if the request fails - response.raise_for_status() - + response.raise_for_status() + # Parse the HTML content to extract links - soup = BeautifulSoup(response.content, 'html.parser') - + soup = BeautifulSoup(response.content, "html.parser") + # Find all anchor tags ('a') representing file links - link_tags = soup.find_all('a') - + link_tags = soup.find_all("a") + # Initialize an empty list to store the file URLs urls = [] - + # Iterate through all link tags, extract URLs, and store them for tag in link_tags: - link = tag.get('href') # Extract the hyperlink reference - + link = tag.get("href") # Extract the hyperlink reference + if link: # Construct the full URL by appending the relative link to the base URL full_url = data_url + link urls.append(full_url) - + 
# Convert the list of URLs into a Pandas Series for easy filtering urls = pd.Series(urls) - + # Filter URLs to retain only those containing 'regular-lat-lon' grid format - urls = urls[urls.str.contains('regular-lat-lon')] - + urls = urls[urls.str.contains("regular-lat-lon")] + # Further filter URLs based on the specified model field urls = urls[urls.str.contains(field)] - + # Apply forecast time horizon filter (excluding 'time-invariant' fields) - if field != 'time-invariant': - urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] - + if field != "time-invariant": + urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] + # Filter URLs based on model levels, if specified if not levels.empty: url_mask = pd.Series(index=urls.index, data=False) # Initialize boolean mask for level in levels: url_mask += urls.str.contains(f"_{level}_") # Check if URL contains level - urls = urls[url_mask] # Apply filter + urls = urls[url_mask] # Apply filter # Convert filtered URLs back to a list urls = list(urls) - + return urls def _deaverage(da): - ''' + """ Converts a temporally averaged data array into individual time-step values. Each time step's original value is reconstructed by reversing the cumulative averaging process. - + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing cumulative averages. - + Returns ------- xarray.DataArray Data array with de-averaged values. 
- ''' + """ # Create an integer index for time, matching da's shape - time_index = xr.DataArray(np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time}) + time_index = xr.DataArray( + np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time} + ) # Apply the reverse operation: Ψ_inst(t) = t * Ψ(t) - (t-1) * Ψ(t-1) - da_instantaneous = (time_index * da - (time_index - 1) * da.shift(time=1, fill_value=0)) - + da_instantaneous = time_index * da - (time_index - 1) * da.shift( + time=1, fill_value=0 + ) + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) + + return da_instantaneous - return da_instantaneous - def _deaccumulate(da): - ''' + """ Converts accumulated data into time-step differences. This function takes an accumulated dataset and calculates the incremental values between consecutive time steps. - + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing accumulated values. - + Returns ------- xarray.DataArray Data array with de-accumulated values (time-step differences). - ''' - + """ + # Apply the reverse operation: Ψ_inst(t) = Ψ(t) - Ψ(t-1) da_instantaneous = da - da.shift(time=1, fill_value=0) - + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) - return da_instantaneous + return da_instantaneous -def _average_duplicate_times(ds): +def _average_duplicate_times(ds): """ Averages duplicate timestamps in an xarray Dataset. 
@@ -400,13 +441,13 @@ def _average_duplicate_times(ds): def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): - ''' + """ Downloads meteorological data for a given variable and processes it accordingly. - + This function retrieves data from the specified URL, processes it to rename and clean coordinates, and applies de-averaging or de-accumulation where necessary based on the GRIB step type. - + Parameters ---------- url : string @@ -424,13 +465,13 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): The spatial coordinates where data is required. tmpdir : string Path to the temporary directory where downloaded files are stored. - + Returns ------- xarray.Dataset Processed dataset containing the collected meteorological data. - ''' - + """ + # Extract the most recent forecast run time latestRun = forecast[0] @@ -438,63 +479,69 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): previousRuns = offset[offset < latestRun] # Keep only entries that align with ICON model run hours - previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True) - + previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values( + ascending=True + ) + if len(previousRuns) > 0: # Get the hour of the earliest previous run first_prev_hour = previousRuns[0].hour - + # Find the previous index in `model_run_hours` prev_idx = np.where(model_run_hours == first_prev_hour)[0][0] - 1 - + # Compute the adjusted previous run - previousRun = pd.DatetimeIndex([previousRuns[0].replace( - hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 - )]) - + previousRun = pd.DatetimeIndex( + [ + previousRuns[0].replace( + hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 + ) + ] + ) + # Add previousRun to previousRuns, ensuring uniqueness previousRuns = previousRuns.union(previousRun).sort_values(ascending=True) # Create a list of runs including the latest run 
and previous runs # Use an averaging_window of X hours for all previous runs to average the results - runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [(latestRun.strftime("%H"), len(forecast))] + runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [ + (latestRun.strftime("%H"), len(forecast)) + ] # # Generate download URLs for the specified variable and field # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + for run, hours in runs: - # Generate download URLs for the specified variable and field urls = _createDownloadUrl(url, var, field, run, hours) - + # Download and collect the main dataset for the given variable - ds_temps.append(_download(urls, tmpdir)) - + ds_temps.append(_download(urls, tmpdir)) # Concatenate along the time dimension, keeping also duplicated timestamps ds = xr.concat(ds_temps, dim="time") - + # Average duplicates for smooth forecast transitioning ds = _average_duplicate_times(ds) - + # # Download and collect the main dataset for the given variable - # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) - + # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) + # Rename and clean coordinate labels for consistency # ds_temp = _rename_and_clean_coords(ds_temp) - + # # Iterate through all data variables in the dataset # for ds_var in list(ds_temp.data_vars): # # If the variable is an averaged quantity, apply de-averaging # if ds_temp[ds_var].attrs['GRIB_stepType'] == 'avg': # ds_temp[ds_var] = _deaverage(ds_temp[ds_var]) - + # # If the variable is an accumulated quantity, apply de-accumulation # elif ds_temp[ds_var].attrs['GRIB_stepType'] == 'accum': # ds_temp[ds_var] = _deaccumulate(ds_temp[ds_var]) @@ -504,12 +551,12 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): def _interpolate(ds, 
static, coords, grid, interp_s, interp_t): - ''' + """ Interpolates a dataset to match specific latitude and longitude coordinates. - + If the data is not static, it first interpolates temporally. Then, it applies spatial interpolation using binning to adjust the data to the grid resolution. - + Parameters ---------- ds : xarray.Dataset @@ -524,101 +571,113 @@ def _interpolate(ds, static, coords, grid, interp_s, interp_t): Spatial interpolation method (not used in the function but can be applied elsewhere). interp_t : string Temporal interpolation method to be used. - + Returns ------- xarray.Dataset The interpolated dataset adjusted to the target spatial and temporal resolution. - ''' - + """ + # Perform temporal interpolation if the data is not static if not static: try: - ds = ds.interp(time=coords['time'].values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}) + ds = ds.interp( + time=coords["time"].values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." 
+ ) logger.info("Interpolation method is set to 'nearest' instead.") - ds = ds.interp(time=coords['time'].values, - method="nearest", - kwargs={"fill_value": "extrapolate"}) - + ds = ds.interp( + time=coords["time"].values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + # Create bin edges and labels for x-coordinates - x_bins = coords['x'].values - x_bins = np.insert(x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0) # Extend bin range + x_bins = coords["x"].values + x_bins = np.insert( + x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0 + ) # Extend bin range x_bins_label = np.round(x_bins[:-1] + grid[0], 8) # Compute bin centers - + # Create bin edges and labels for y-coordinates - y_bins = coords['y'].values - y_bins = np.insert(y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0) # Extend bin range + y_bins = coords["y"].values + y_bins = np.insert( + y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0 + ) # Extend bin range y_bins_label = np.round(y_bins[:-1] + grid[1], 8) # Compute bin centers - + # Store original dataset attributes attrs = ds.attrs - + # Perform spatial binning by grouping data into bins along x and y dimensions and computing the mean ds = ds.groupby_bins("x", x_bins, labels=x_bins_label).mean(dim="x") ds = ds.groupby_bins("y", y_bins, labels=y_bins_label).mean(dim="y") - + # Rename bins to standard coordinate names - ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'}) - + ds = ds.rename({"y_bins": "y", "x_bins": "x"}) + # Reassign original dataset attributes ds = ds.assign_attrs(attrs) - + return ds @retry(tries=5, delay=5, backoff=2, logger=logger) -def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', **kwargs): - ''' +def _urlopen_with_retry(data_url, tmpfp, engine="cfgrib", **kwargs): + """ Attempts to download and decompress a dataset file with automatic retry on failure. 
- + This function fetches data from a given URL, retries up to five times in case of failure, and decompresses the response content before saving it to a temporary file. - + Parameters ---------- data_url : string The URL from which data should be downloaded. tmpfp : string The file path where the downloaded content will be temporarily stored. - + Returns ------- tuple - resp (requests.Response): The HTTP response object from the request. - ds (xarray.Dataset): The dataset extracted from the downloaded file. - ''' - + """ + # Send an HTTP GET request to the data URL with a timeout of 5 seconds resp = requests.get(data_url, timeout=5) # Check if the request was successful (HTTP 200 OK) if resp.status_code == 200: # Open the specified temporary file and write the decompressed response content - with open(tmpfp, 'wb') as f: + with open(tmpfp, "wb") as f: f.write(decompress(resp.content)) else: # Raise an error if the response was unsuccessful - raise ValueError(f"Error in response: {resp.reason}, status code: {resp.status_code}") - + raise ValueError( + f"Error in response: {resp.reason}, status code: {resp.status_code}" + ) + # Load the downloaded file as an xarray dataset using the 'cfgrib' engine - ds = xr.open_dataset(tmpfp, engine=engine) - + ds = xr.open_dataset(tmpfp, engine=engine) + # Return both the HTTP response object and the loaded dataset return resp, ds def _download(urls, tmpdir=None): - ''' + """ Collects meteorological data for all timesteps of a given variable. - + This function retrieves data files, processes them, and merges them into a single dataset. It determines the latest available runs, downloads the necessary files, and structures them according to the expected format. - + Parameters ---------- url : string @@ -634,13 +693,13 @@ def _download(urls, tmpdir=None): Array representing the forecast offsets. tmpdir : string, optional Temporary directory for storing downloaded files. 
- + Returns ------- xarray.Dataset Merged dataset containing the collected meteorological data. - ''' - + """ + # # Extract the most recent forecast run time # latestRun = forecast[0] @@ -657,74 +716,97 @@ def _download(urls, tmpdir=None): # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + # Iterate over generated URLs and process each file for data_url in urls: - logger.info("ICON-D2 data -> Processing file: {f}".format(f=data_url)) - + logger.info(f"ICON-D2 data -> Processing file: {data_url}") + # Extract filename from URL and construct temporary file path - tmpfn = os.path.basename(data_url) - tmpfn = Path(tmpfn).with_suffix('') - tmpfp = "{p}/{tmpfn}".format(tmpfn=tmpfn, p=tmpdir) - + tmpfn = os.path.basename(data_url) + tmpfn = Path(tmpfn).with_suffix("") + tmpfp = f"{tmpdir}/{tmpfn}" + # Attempt to download and extract the dataset try: resp, ds_temp = _urlopen_with_retry(data_url, tmpfp) except Exception as err: - logger.info("Could not get {url}: {err}".format(err=err, url=data_url)) + logger.info(f"Could not get {data_url}: {err}") continue # Skip to next URL if download fails - + # Check if the dataset contains a 'generalVerticalLayer' coordinate - if 'generalVerticalLayer' in ds_temp.coords: + if "generalVerticalLayer" in ds_temp.coords: ds_coords = list(ds_temp.coords) - ds_coords_to_keep = ["valid_time", "longitude", "latitude", "generalVerticalLayer"] - ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] - + ds_coords_to_keep = [ + "valid_time", + "longitude", + "latitude", + "generalVerticalLayer", + ] + ds_coords_to_drop = [ + ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep + ] + # Expand dataset dimensions and remove unwanted coordinates - ds_temp = ds_temp.expand_dims(dim=["valid_time", "generalVerticalLayer"]).drop_vars(ds_coords_to_drop) 
- + ds_temp = ds_temp.expand_dims( + dim=["valid_time", "generalVerticalLayer"] + ).drop_vars(ds_coords_to_drop) + # Assign coordinate values back to dataset - ds_temp = ds_temp.assign_coords({"valid_time": ds_temp.valid_time, - "latitude": ds_temp.latitude, - "longitude": ds_temp.longitude, - "generalVerticalLayer": ds_temp.generalVerticalLayer}) + ds_temp = ds_temp.assign_coords( + { + "valid_time": ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude, + "generalVerticalLayer": ds_temp.generalVerticalLayer, + } + ) ds_temps.append(ds_temp) - + else: ds_coords = list(ds_temp.coords) - ds_coords_to_keep = ["valid_time", "longitude", "latitude"] - ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] - + ds_coords_to_keep = ["valid_time", "longitude", "latitude"] + ds_coords_to_drop = [ + ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep + ] + # Swap 'step' dimension with 'valid_time' if applicable if "step" in ds_temp.dims: - ds_temp = ds_temp.swap_dims({"step": "valid_time"}).drop_vars(ds_coords_to_drop) + ds_temp = ds_temp.swap_dims({"step": "valid_time"}).drop_vars( + ds_coords_to_drop + ) else: - ds_temp = ds_temp.expand_dims(dim="valid_time").drop_vars(ds_coords_to_drop) - + ds_temp = ds_temp.expand_dims(dim="valid_time").drop_vars( + ds_coords_to_drop + ) + # Assign coordinate values back to dataset - ds_temp = ds_temp.assign_coords({"valid_time": ds_temp.valid_time, - "latitude": ds_temp.latitude, - "longitude": ds_temp.longitude}) + ds_temp = ds_temp.assign_coords( + { + "valid_time": ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude, + } + ) ds_temps.append(ds_temp) - + # Merge all collected datasets into a single dataset - ds = xr.merge(ds_temps) - + ds = xr.merge(ds_temps) + # Rename and clean coordinate labels for consistency ds = _rename_and_clean_coords(ds) - + # Iterate through all data variables in the dataset for ds_var 
in list(ds.data_vars): # If the variable is an averaged quantity, apply de-averaging - if ds[ds_var].attrs['GRIB_stepType'] == 'avg': + if ds[ds_var].attrs["GRIB_stepType"] == "avg": ds[ds_var] = _deaverage(ds[ds_var]) - + # If the variable is an accumulated quantity, apply de-accumulation - elif ds[ds_var].attrs['GRIB_stepType'] == 'accum': + elif ds[ds_var].attrs["GRIB_stepType"] == "accum": ds[ds_var] = _deaccumulate(ds[ds_var]) return ds @@ -750,48 +832,52 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): return ds + def _interpolate_to_cutout_resolution(ds, retrieval_params, static): - # Interpolate the data spatially and temporally to the wanted cutout resolution ds_temps = [] for idx, var in enumerate(ds.data_vars): - ds_temps.append(_interpolate(ds[var], static, - retrieval_params['coords'], - retrieval_params['grid'], - retrieval_params['interp_s'], - retrieval_params['interp_t']) - ) - + ds_temps.append( + _interpolate( + ds[var], + static, + retrieval_params["coords"], + retrieval_params["grid"], + retrieval_params["interp_s"], + retrieval_params["interp_t"], + ) + ) + ds = xr.merge(ds_temps) ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - - ds = ds.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {}) - + + ds = ds.unify_chunks().chunk(chunks=retrieval_params["chunks"] or {}) + return ds def get_data_wind(retrieval_params): - ''' + """ Retrieves and processes wind data from the DWD server. - + The function collects wind speed and direction data at 100m above ground level, as well as surface roughness data. It then processes and interpolates this data to match the desired spatial and temporal resolution. - + Parameters ---------- retrieval_params : dict Dictionary containing parameters for data retrieval, including coordinates, grid resolution, and interpolation methods. - + Returns ------- xarray.Dataset Processed dataset containing wind speed, wind direction, and surface roughness. 
- ''' - + """ + # Retrieve wind data from model levels 62 and 63 - retrieval_params['field'] = ['model-level', 'model-level'] + retrieval_params["field"] = ["model-level", "model-level"] ds = retrieve_data( url=dwd_url, variable=[ @@ -800,46 +886,44 @@ def get_data_wind(retrieval_params): ], **retrieval_params, ) - + # Compute the mean wind values across the general vertical layers - ds["u"] = ds["u"].mean('generalVerticalLayer') - ds["v"] = ds["v"].mean('generalVerticalLayer') - ds = ds.drop_dims('generalVerticalLayer') # Remove the dimension after averaging + ds["u"] = ds["u"].mean("generalVerticalLayer") + ds["v"] = ds["v"].mean("generalVerticalLayer") + ds = ds.drop_dims("generalVerticalLayer") # Remove the dimension after averaging ds = ds.rename({"u": "u_100m", "v": "v_100m"}) # Rename variables for clarity - - + # Retrieve surface roughness data from single-level data - retrieval_params['field'] = ['single-level'] + retrieval_params["field"] = ["single-level"] ds2 = retrieve_data( url=dwd_url, variable=["z0"], # Surface roughness length **retrieval_params, ) - + # Merge wind data with roughness data into a single dataset ds = xr.merge([ds, ds2]) - + # Rename roughness variable for clarity ds = ds.rename({"fsr": "roughness"}) ds["roughness"] = ds["roughness"].assign_attrs( - units="m", - long_name="Surface roughness" + units="m", long_name="Surface roughness" ) - + # Compute wind speed at 100m using the Pythagorean theorem ds["wnd100m"] = np.sqrt(ds["u_100m"] ** 2 + ds["v_100m"] ** 2).assign_attrs( units="m/s", long_name="100 metre wind speed" ) - + # Compute wind direction azimuth (0 = North, π/2 = East, π = South, 3π/2 = West) azimuth = np.arctan2(ds["u_100m"], ds["v_100m"]) - + # Ensure wind azimuth is within the 0 to 2π range ds["wnd_azimuth"] = azimuth.where(azimuth >= 0, azimuth + 2 * np.pi) - + # Remove intermediate wind component variables after processing ds = ds.drop_vars(["u_100m", "v_100m"]) - + return ds @@ -852,7 +936,12 @@ def 
sanitize_wind(ds): def get_data_influx(retrieval_params): """Get influx data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level', 'single-level', 'single-level', 'single-level'] + retrieval_params["field"] = [ + "single-level", + "single-level", + "single-level", + "single-level", + ] ds = retrieve_data( url=dwd_url, variable=[ @@ -864,29 +953,42 @@ def get_data_influx(retrieval_params): **retrieval_params, ) - ds = ds.rename({"avg_tnswrf": "influx_toa", - "ASWDIR_S": "influx_direct", - "ASWDIFD_S": "influx_diffuse", - "al": "albedo"}) - - ds["albedo"] = (ds["albedo"]/100).assign_attrs(units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation") - ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(units="W m**-2", long_name="Surface down solar diffuse radiation") - ds["influx_direct"] = ds["influx_direct"].assign_attrs(units="W m**-2", long_name="Surface down solar direct radiation") - ds["influx_toa"] = ds["influx_toa"].assign_attrs(units="W m**-2", long_name="Net short-wave radiation flux at top of atmosphere (TOA)") - + ds = ds.rename( + { + "avg_tnswrf": "influx_toa", + "ASWDIR_S": "influx_direct", + "ASWDIFD_S": "influx_diffuse", + "al": "albedo", + } + ) + + ds["albedo"] = (ds["albedo"] / 100).assign_attrs( + units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation" + ) + ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs( + units="W m**-2", long_name="Surface down solar diffuse radiation" + ) + ds["influx_direct"] = ds["influx_direct"].assign_attrs( + units="W m**-2", long_name="Surface down solar direct radiation" + ) + ds["influx_toa"] = ds["influx_toa"].assign_attrs( + units="W m**-2", + long_name="Net short-wave radiation flux at top of atmosphere (TOA)", + ) + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # 
retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ICON-D2 variables are mean values for previous hour, i.e. 13:01 to 14:00 are labelled as "14:00" # account by calculating the SolarPosition for the center of the interval for aggregation happens # see https://github.com/PyPSA/atlite/issues/158 @@ -905,31 +1007,29 @@ def get_data_influx(retrieval_params): ) ) sp = SolarPosition(ds, time_shift=time_shift) - + sp = sp.rename({v: f"solar_{v}" for v in sp.data_vars}) ds = xr.merge([ds, sp]) - + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ds = ds.unify_chunks.chunk(chunks=retrieval_params['chunks'] or {}) - - - + # ds = ds.drop_vars(['lon','lat']) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + return ds @@ -943,36 +1043,36 @@ def sanitize_influx(ds): def get_data_temperature(retrieval_params): """Get wind temperature for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','soil-level'] - ds = retrieve_data( - url=dwd_url, - variable=["t_2m", - "t_so/162"], - **retrieval_params + retrieval_params["field"] = ["single-level", "soil-level"] + ds = retrieve_data(url=dwd_url, variable=["t_2m", "t_so/162"], **retrieval_params) + + ds = ds.rename({"t2m": "temperature", "T_SO": "soil 
temperature"}) + + ds["temperature"] = ds["temperature"].assign_attrs( + units="K", long_name="Temperature at 2m above ground" + ) + ds["soil temperature"] = ds["soil temperature"].assign_attrs( + units="K", long_name="Soil temperature in 162 cm depth " ) - ds = ds.rename({"t2m": "temperature", - "T_SO": "soil temperature"}) - - ds["temperature"] = ds["temperature"].assign_attrs(units="K", long_name="Temperature at 2m above ground") - ds["soil temperature"] = ds["soil temperature"].assign_attrs(units="K", long_name="Soil temperature in 162 cm depth ") - return ds def get_data_runoff(retrieval_params): """Get runoff data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','single-level'] - ds = retrieve_data(url=dwd_url, - variable=["runoff_s", - "runoff_g"], - **retrieval_params) - - ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(units="kg m**-2", long_name="Surface and Soil water runoff (accumulated since model start)") + retrieval_params["field"] = ["single-level", "single-level"] + ds = retrieve_data( + url=dwd_url, variable=["runoff_s", "runoff_g"], **retrieval_params + ) + + ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs( + units="kg m**-2", + long_name="Surface and Soil water runoff (accumulated since model start)", + ) ds = ds.drop_vars(["RUNOFF_S", "RUNOFF_G"]) - + return ds @@ -985,16 +1085,14 @@ def sanitize_runoff(ds): def get_data_height(retrieval_params): """Get height data for given retrieval parameters.""" # Retrieve time-invariant data - retrieval_params['field'] = ['time-invariant'] - ds = retrieve_data(url=dwd_url, - variable=["hsurf"], - **retrieval_params) - + retrieval_params["field"] = ["time-invariant"] + ds = retrieve_data(url=dwd_url, variable=["hsurf"], **retrieval_params) + ds = ds.rename({"HSURF": "height"}) ds["height"] = ds["height"].assign_attrs( units="m", - long_name="Geometric Height of the earths surface above sea level (2D field)" - ) + 
long_name="Geometric Height of the earths surface above sea level (2D field)", + ) return ds @@ -1058,7 +1156,7 @@ def retrieval_times(coords, tz, static=False): return { "forecast": forecast_times, "offset": offset_times, - } + } def noisy_unlink(path): @@ -1069,42 +1167,48 @@ def noisy_unlink(path): except PermissionError: logger.error(f"Unable to delete file {path}, as it is still in use.") - -def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): +def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): """ Download data from the ICON-D2 Model from the Open Data Server (ODS) of DWD. - + If you want to manually downolad the data go to: https://opendata.dwd.de/weather/nwp/icon-d2/grib/ """ - + request = {"product_type": "icon_d2", "format": "direct-download"} request.update(updates) ds_temps = [] - #Download data for each variable individually and then merge all in one xarray + # Download data for each variable individually and then merge all in one xarray logger.info(f"open-dwd: Downloading variables\n\t{request['variable']}\n") - for idx, var in enumerate(request['variable']): - ds_temps.append(_mainDataCollector(url, - var, - request['field'][idx], - request['forecast'], - request['offset'], - request['coords'], - tmpdir) - ) - + for idx, var in enumerate(request["variable"]): + ds_temps.append( + _mainDataCollector( + url, + var, + request["field"][idx], + request["forecast"], + request["offset"], + request["coords"], + tmpdir, + ) + ) + ds = xr.merge(ds_temps).chunk(chunks=chunks) - + return ds -def get_data(cutout, feature, tmpdir, - lock=None, - monthly_requests=False, - concurrent_requests=False, - **creation_parameters): +def get_data( + cutout, + feature, + tmpdir, + lock=None, + monthly_requests=False, + concurrent_requests=False, + **creation_parameters, +): """ Retrieve data from DWDs ICON-D2 Model dataset (via ODS). 
@@ -1136,7 +1240,7 @@ def get_data(cutout, feature, tmpdir, coords = cutout.coords sanitize = creation_parameters.get("sanitize", True) - + retrieval_params = { "product": "dwd_icon_d2", "area": _area(coords), @@ -1164,8 +1268,10 @@ def retrieve_once(time, static=False): return ds if feature in static_features: - return retrieve_once(retrieval_times(coords, cutout.data.tz, True), True).squeeze() - + return retrieve_once( + retrieval_times(coords, cutout.data.tz, True), True + ).squeeze() + dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False) return dataset.sel(time=coords["time"]) diff --git a/atlite/datasets/icon_eu.py b/atlite/datasets/icon_eu.py index 090886ef..6beda945 100644 --- a/atlite/datasets/icon_eu.py +++ b/atlite/datasets/icon_eu.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,18 +9,18 @@ https://www.dwd.de/DE/leistungen/nwv_icon_d2_modelldokumentation/nwv_icon_d2_modelldokumentation.html """ +import logging import os import warnings +from bz2 import decompress +from pathlib import Path + import numpy as np -import xarray as xr import pandas as pd import requests -import logging - -from pathlib import Path -from retry import retry -from bz2 import decompress +import xarray as xr from bs4 import BeautifulSoup +from retry import retry from ..gis import maybe_swap_spatial_dims from ..pv.solar_position import SolarPosition @@ -48,7 +46,7 @@ def nullcontext(): model_run_hours = np.array([0, 3, 6, 9, 12, 15, 18, 21]) # Averaging window of different model runs -averaging_window = 24 #hours +averaging_window = 24 # hours # Coordinate Reference System (CRS) used for geospatial data crs = 4326 @@ -56,7 +54,11 @@ def nullcontext(): # Dictionary defining available meteorological features and their associated data fields features = { "height": ["height"], # Elevation data - "wind": ["wnd100m", "wnd_azimuth", "roughness"], # Wind speed, 
direction, and surface roughness + "wind": [ + "wnd100m", + "wnd_azimuth", + "roughness", + ], # Wind speed, direction, and surface roughness "influx": [ "influx_toa", # Top-of-atmosphere solar radiation "influx_direct", # Direct solar radiation @@ -74,132 +76,164 @@ def nullcontext(): # Model requirements specifying spatial and temporal constraints requirements = { - 'x': slice(-23.5, 62.5, 0.0625), # Longitude range with resolution - 'y': slice(29.5, 70.5, 0.0625), # Latitude range with resolution - 'offset': pd.Timedelta(hours=-18), # Time offset for forecast initialization - 'forecast': pd.Timedelta(hours=120), # Maximum forecast range - 'dt': pd.Timedelta(hours=1), # Temporal resolution of data - 'parallel': True, # Flag for enabling parallel processing - } + "x": slice(-23.5, 62.5, 0.0625), # Longitude range with resolution + "y": slice(29.5, 70.5, 0.0625), # Latitude range with resolution + "offset": pd.Timedelta(hours=-18), # Time offset for forecast initialization + "forecast": pd.Timedelta(hours=120), # Maximum forecast range + "dt": pd.Timedelta(hours=1), # Temporal resolution of data + "parallel": True, # Flag for enabling parallel processing +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. 
""" - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise 
ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] - + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") - # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) + # logger.info(f"Set the start time to the minimum start time {feasible_start} and proceed.") # time_start = time_now + requirements['offset'] # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. 
Must be > {time_start}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") - # time_stop = time_now + requirements['forecast'] - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." 
+ ) + # logger.info(f"Set the end time to the maximum end time {feasible_end} and proceed.") + # time_stop = time_now + requirements['forecast'] + # # Check if forecast hours exceed limits # forecastHours = (time_stop - time_now) # if forecastHours > requirements['forecast']: # logger.error(f"The end time of the forecast {time_stop} exceedes the model requirements.") # logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") # logger.error(f"The required forecast horizon {forecastHours} exceeds the maximum forecast horzion {requirements['forecast']}.") - # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") + # # logger.info(f"Set it to maximum forecast hours of {requirements['forecast']} hours.") # # forecastHours = requirements['forecast'] # # time_stop = time_now + forecastHours - + # # Check if offset is within required limits - # offset = (time_start - time_now) + # offset = (time_start - time_now) # if offset < requirements['offset']: # logger.error(f"The start time of the forecast {time_start} exceeds model requirements.") # logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") # logger.warning(f"Forecast offset of {offset} hours is below model requirements.") - # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") + # logger.info(f"Set it to minimum offset of {requirements['offset']} hours.") # offset = requirements['offset'] # time_start = time_now + offset - + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is 
pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _getCurrentRun(time): - ''' + """ Determines the most recent available model run based on the current time. The latest run is fully available approximately 2 hours after initialization. To ensure the model run is successfully uploaded, the download delay time is set to 3 hours. @@ -210,35 +244,34 @@ def _getCurrentRun(time): ---------- time : datetime The current datetime in UTC. - + Returns ------- datetime The timestamp of the most recent available model run, floored to the hour. - ''' + """ download_delay = 3 # Delay in hours before the run is fully available - + # Adjust the current time by the delay to ensure availability adjusted_time = time - pd.Timedelta(hours=download_delay) # Find the most recent available run by flooring to the nearest model run hour run_hour = max(hour for hour in model_run_hours if hour <= adjusted_time.hour) - + # Construct the correct model run time run_time = adjusted_time.replace(hour=run_hour, minute=0, second=0, microsecond=0) return run_time - def _createDownloadUrl(url, var, field, run, hours): - ''' + """ Generates a list of download URLs for meteorological data from the DWD server. The function scrapes the available files for a given variable and model run, filtering based on field type, forecast hours, and model levels. 
- + This code was adopted from: https://github.com/prayer007/dwdGribExtractor/tree/main - + Parameters ---------- url : string @@ -246,130 +279,138 @@ def _createDownloadUrl(url, var, field, run, hours): var : string The variable name, optionally including levels separated by '/'. field : string - The field parameter: 'time-invariant' (static), 'soil-level' (162cm), + The field parameter: 'time-invariant' (static), 'soil-level' (162cm), 'model-level' (62;63), or 'single-level' (2D field). run : string Model run identifier. hours : int Maximum forecast hours to retrieve. - + Returns ------- list List of filtered download URLs. - ''' - + """ + # Extract variable name and associated levels - levels = pd.Series(var.split('/')[1:]).astype(int) # Convert levels to integers - var = var.split('/')[0] # Extract variable name - + levels = pd.Series(var.split("/")[1:]).astype(int) # Convert levels to integers + var = var.split("/")[0] # Extract variable name + # Construct the data URL based on provided parameters - data_url = "{url}{run}/{var}/".format(url=url, var=var, run=run) - + data_url = f"{url}{run}/{var}/" + # Send an HTTP GET request to fetch available files response = requests.get(data_url) - + # Raise an error if the request fails - response.raise_for_status() - + response.raise_for_status() + # Parse the HTML content to extract links - soup = BeautifulSoup(response.content, 'html.parser') - + soup = BeautifulSoup(response.content, "html.parser") + # Find all anchor tags ('a') representing file links - link_tags = soup.find_all('a') - + link_tags = soup.find_all("a") + # Initialize an empty list to store the file URLs urls = [] - + # Iterate through all link tags, extract URLs, and store them for tag in link_tags: - link = tag.get('href') # Extract the hyperlink reference - + link = tag.get("href") # Extract the hyperlink reference + if link: # Construct the full URL by appending the relative link to the base URL full_url = data_url + link urls.append(full_url) - + 
# Convert the list of URLs into a Pandas Series for easy filtering urls = pd.Series(urls) - + # Filter URLs to retain only those containing 'regular-lat-lon' grid format - urls = urls[urls.str.contains('regular-lat-lon')] - + urls = urls[urls.str.contains("regular-lat-lon")] + # Further filter URLs based on the specified model field urls = urls[urls.str.contains(field)] - + # Apply forecast time horizon filter (excluding 'time-invariant' fields) - if field != 'time-invariant': - urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] - + if field != "time-invariant": + urls = urls[urls.str.findall(r"\_(\d{3})\_").str[0].astype(int) <= hours] + # Filter URLs based on model levels, if specified if not levels.empty: url_mask = pd.Series(index=urls.index, data=False) # Initialize boolean mask for level in levels: url_mask += urls.str.contains(f"_{level}_") # Check if URL contains level - urls = urls[url_mask] # Apply filter + urls = urls[url_mask] # Apply filter # Convert filtered URLs back to a list urls = list(urls) - + return urls def _deaverage(da): - ''' + """ Converts a temporally averaged data array into individual time-step values. Each time step's original value is reconstructed by reversing the cumulative averaging process. - + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing cumulative averages. - + Returns ------- xarray.DataArray Data array with de-averaged values. 
- ''' + """ # Create an integer index for time, matching da's shape - time_index = xr.DataArray(np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time}) + time_index = xr.DataArray( + np.arange(1, da.sizes["time"] + 1), dims="time", coords={"time": da.time} + ) # Apply the reverse operation: Ψ_inst(t) = t * Ψ(t) - (t-1) * Ψ(t-1) - da_instantaneous = (time_index * da - (time_index - 1) * da.shift(time=1, fill_value=0)) - + da_instantaneous = time_index * da - (time_index - 1) * da.shift( + time=1, fill_value=0 + ) + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) + + return da_instantaneous - return da_instantaneous - def _deaccumulate(da): - ''' + """ Converts accumulated data into time-step differences. This function takes an accumulated dataset and calculates the incremental values between consecutive time steps. - + Parameters ---------- da : xarray.DataArray Input data array with a time dimension containing accumulated values. - + Returns ------- xarray.DataArray Data array with de-accumulated values (time-step differences). - ''' - + """ + # Apply the reverse operation: Ψ_inst(t) = Ψ(t) - Ψ(t-1) da_instantaneous = da - da.shift(time=1, fill_value=0) - + # Fill the first timestep with NaNs, since it is always zero - da_instantaneous = da_instantaneous.where(da_instantaneous.time != da_instantaneous.time[0], np.nan) + da_instantaneous = da_instantaneous.where( + da_instantaneous.time != da_instantaneous.time[0], np.nan + ) - return da_instantaneous + return da_instantaneous -def _average_duplicate_times(ds): +def _average_duplicate_times(ds): """ Averages duplicate timestamps in an xarray Dataset. 
@@ -400,13 +441,13 @@ def _average_duplicate_times(ds): def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): - ''' + """ Downloads meteorological data for a given variable and processes it accordingly. - + This function retrieves data from the specified URL, processes it to rename and clean coordinates, and applies de-averaging or de-accumulation where necessary based on the GRIB step type. - + Parameters ---------- url : string @@ -424,13 +465,13 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): The spatial coordinates where data is required. tmpdir : string Path to the temporary directory where downloaded files are stored. - + Returns ------- xarray.Dataset Processed dataset containing the collected meteorological data. - ''' - + """ + # Extract the most recent forecast run time latestRun = forecast[0] @@ -438,63 +479,69 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): previousRuns = offset[offset < latestRun] # Keep only entries that align with ICON model run hours - previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values(ascending=True) - + previousRuns = previousRuns[previousRuns.hour.isin(model_run_hours)].sort_values( + ascending=True + ) + if len(previousRuns) > 0: # Get the hour of the earliest previous run first_prev_hour = previousRuns[0].hour - + # Find the previous index in `model_run_hours` prev_idx = np.where(model_run_hours == first_prev_hour)[0][0] - 1 - + # Compute the adjusted previous run - previousRun = pd.DatetimeIndex([previousRuns[0].replace( - hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 - )]) - + previousRun = pd.DatetimeIndex( + [ + previousRuns[0].replace( + hour=model_run_hours[prev_idx], minute=0, second=0, microsecond=0 + ) + ] + ) + # Add previousRun to previousRuns, ensuring uniqueness previousRuns = previousRuns.union(previousRun).sort_values(ascending=True) # Create a list of runs including the latest run 
and previous runs # Use an averaging_window of X hours for all previous runs to average the results - runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [(latestRun.strftime("%H"), len(forecast))] + runs = [(run.strftime("%H"), averaging_window) for run in previousRuns] + [ + (latestRun.strftime("%H"), len(forecast)) + ] # # Generate download URLs for the specified variable and field # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + for run, hours in runs: - # Generate download URLs for the specified variable and field urls = _createDownloadUrl(url, var, field, run, hours) - + # Download and collect the main dataset for the given variable - ds_temps.append(_download(urls, tmpdir)) - + ds_temps.append(_download(urls, tmpdir)) # Concatenate along the time dimension, keeping also duplicated timestamps ds = xr.concat(ds_temps, dim="time") - + # Average duplicates for smooth forecast transitioning ds = _average_duplicate_times(ds) - + # # Download and collect the main dataset for the given variable - # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) - + # ds_temp = _mainDataCollector(url, var, field, forecast, offset, tmpdir) + # Rename and clean coordinate labels for consistency # ds_temp = _rename_and_clean_coords(ds_temp) - + # # Iterate through all data variables in the dataset # for ds_var in list(ds_temp.data_vars): # # If the variable is an averaged quantity, apply de-averaging # if ds_temp[ds_var].attrs['GRIB_stepType'] == 'avg': # ds_temp[ds_var] = _deaverage(ds_temp[ds_var]) - + # # If the variable is an accumulated quantity, apply de-accumulation # elif ds_temp[ds_var].attrs['GRIB_stepType'] == 'accum': # ds_temp[ds_var] = _deaccumulate(ds_temp[ds_var]) @@ -504,12 +551,12 @@ def _mainDataCollector(url, var, field, forecast, offset, coords, tmpdir): def _interpolate(ds, 
static, coords, grid, interp_s, interp_t): - ''' + """ Interpolates a dataset to match specific latitude and longitude coordinates. - + If the data is not static, it first interpolates temporally. Then, it applies spatial interpolation using binning to adjust the data to the grid resolution. - + Parameters ---------- ds : xarray.Dataset @@ -524,101 +571,113 @@ def _interpolate(ds, static, coords, grid, interp_s, interp_t): Spatial interpolation method (not used in the function but can be applied elsewhere). interp_t : string Temporal interpolation method to be used. - + Returns ------- xarray.Dataset The interpolated dataset adjusted to the target spatial and temporal resolution. - ''' - + """ + # Perform temporal interpolation if the data is not static if not static: try: - ds = ds.interp(time=coords['time'].values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}) + ds = ds.interp( + time=coords["time"].values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." 
+ ) logger.info("Interpolation method is set to 'nearest' instead.") - ds = ds.interp(time=coords['time'].values, - method="nearest", - kwargs={"fill_value": "extrapolate"}) - + ds = ds.interp( + time=coords["time"].values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + # Create bin edges and labels for x-coordinates - x_bins = coords['x'].values - x_bins = np.insert(x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0) # Extend bin range + x_bins = coords["x"].values + x_bins = np.insert( + x_bins, 0, np.round(x_bins[0] - grid[0], 8), axis=0 + ) # Extend bin range x_bins_label = np.round(x_bins[:-1] + grid[0], 8) # Compute bin centers - + # Create bin edges and labels for y-coordinates - y_bins = coords['y'].values - y_bins = np.insert(y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0) # Extend bin range + y_bins = coords["y"].values + y_bins = np.insert( + y_bins, 0, np.round(y_bins[0] - grid[1], 8), axis=0 + ) # Extend bin range y_bins_label = np.round(y_bins[:-1] + grid[1], 8) # Compute bin centers - + # Store original dataset attributes attrs = ds.attrs - + # Perform spatial binning by grouping data into bins along x and y dimensions and computing the mean ds = ds.groupby_bins("x", x_bins, labels=x_bins_label).mean(dim="x") ds = ds.groupby_bins("y", y_bins, labels=y_bins_label).mean(dim="y") - + # Rename bins to standard coordinate names - ds = ds.rename({'y_bins': 'y', 'x_bins': 'x'}) - + ds = ds.rename({"y_bins": "y", "x_bins": "x"}) + # Reassign original dataset attributes ds = ds.assign_attrs(attrs) - + return ds @retry(tries=5, delay=5, backoff=2, logger=logger) -def _urlopen_with_retry(data_url, tmpfp, engine='cfgrib', **kwargs): - ''' +def _urlopen_with_retry(data_url, tmpfp, engine="cfgrib", **kwargs): + """ Attempts to download and decompress a dataset file with automatic retry on failure. 
- + This function fetches data from a given URL, retries up to five times in case of failure, and decompresses the response content before saving it to a temporary file. - + Parameters ---------- data_url : string The URL from which data should be downloaded. tmpfp : string The file path where the downloaded content will be temporarily stored. - + Returns ------- tuple - resp (requests.Response): The HTTP response object from the request. - ds (xarray.Dataset): The dataset extracted from the downloaded file. - ''' - + """ + # Send an HTTP GET request to the data URL with a timeout of 5 seconds resp = requests.get(data_url, timeout=5) # Check if the request was successful (HTTP 200 OK) if resp.status_code == 200: # Open the specified temporary file and write the decompressed response content - with open(tmpfp, 'wb') as f: + with open(tmpfp, "wb") as f: f.write(decompress(resp.content)) else: # Raise an error if the response was unsuccessful - raise ValueError(f"Error in response: {resp.reason}, status code: {resp.status_code}") - + raise ValueError( + f"Error in response: {resp.reason}, status code: {resp.status_code}" + ) + # Load the downloaded file as an xarray dataset using the 'cfgrib' engine - ds = xr.open_dataset(tmpfp, engine=engine) - + ds = xr.open_dataset(tmpfp, engine=engine) + # Return both the HTTP response object and the loaded dataset return resp, ds def _download(urls, tmpdir=None): - ''' + """ Collects meteorological data for all timesteps of a given variable. - + This function retrieves data files, processes them, and merges them into a single dataset. It determines the latest available runs, downloads the necessary files, and structures them according to the expected format. - + Parameters ---------- url : string @@ -634,13 +693,13 @@ def _download(urls, tmpdir=None): Array representing the forecast offsets. tmpdir : string, optional Temporary directory for storing downloaded files. 
- + Returns ------- xarray.Dataset Merged dataset containing the collected meteorological data. - ''' - + """ + # # Extract the most recent forecast run time # latestRun = forecast[0] @@ -657,74 +716,97 @@ def _download(urls, tmpdir=None): # urls = [] # for run, hours in runs: # urls = urls + _createDownloadUrl(url, var, field, run, hours) - + # urls = pd.Series(urls).unique() - + ds_temps = [] # List to store temporary datasets - + # Iterate over generated URLs and process each file for data_url in urls: - logger.info("ICON-D2 data -> Processing file: {f}".format(f=data_url)) - + logger.info(f"ICON-D2 data -> Processing file: {data_url}") + # Extract filename from URL and construct temporary file path - tmpfn = os.path.basename(data_url) - tmpfn = Path(tmpfn).with_suffix('') - tmpfp = "{p}/{tmpfn}".format(tmpfn=tmpfn, p=tmpdir) - + tmpfn = os.path.basename(data_url) + tmpfn = Path(tmpfn).with_suffix("") + tmpfp = f"{tmpdir}/{tmpfn}" + # Attempt to download and extract the dataset try: resp, ds_temp = _urlopen_with_retry(data_url, tmpfp) except Exception as err: - logger.info("Could not get {url}: {err}".format(err=err, url=data_url)) + logger.info(f"Could not get {data_url}: {err}") continue # Skip to next URL if download fails - + # Check if the dataset contains a 'generalVerticalLayer' coordinate - if 'generalVerticalLayer' in ds_temp.coords: + if "generalVerticalLayer" in ds_temp.coords: ds_coords = list(ds_temp.coords) - ds_coords_to_keep = ["valid_time", "longitude", "latitude", "generalVerticalLayer"] - ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] - + ds_coords_to_keep = [ + "valid_time", + "longitude", + "latitude", + "generalVerticalLayer", + ] + ds_coords_to_drop = [ + ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep + ] + # Expand dataset dimensions and remove unwanted coordinates - ds_temp = ds_temp.expand_dims(dim=["valid_time", "generalVerticalLayer"]).drop_vars(ds_coords_to_drop) 
- + ds_temp = ds_temp.expand_dims( + dim=["valid_time", "generalVerticalLayer"] + ).drop_vars(ds_coords_to_drop) + # Assign coordinate values back to dataset - ds_temp = ds_temp.assign_coords({"valid_time": ds_temp.valid_time, - "latitude": ds_temp.latitude, - "longitude": ds_temp.longitude, - "generalVerticalLayer": ds_temp.generalVerticalLayer}) + ds_temp = ds_temp.assign_coords( + { + "valid_time": ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude, + "generalVerticalLayer": ds_temp.generalVerticalLayer, + } + ) ds_temps.append(ds_temp) - + else: ds_coords = list(ds_temp.coords) - ds_coords_to_keep = ["valid_time", "longitude", "latitude"] - ds_coords_to_drop = [ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep] - + ds_coords_to_keep = ["valid_time", "longitude", "latitude"] + ds_coords_to_drop = [ + ds_coord for ds_coord in ds_coords if ds_coord not in ds_coords_to_keep + ] + # Swap 'step' dimension with 'valid_time' if applicable if "step" in ds_temp.dims: - ds_temp = ds_temp.swap_dims({"step": "valid_time"}).drop_vars(ds_coords_to_drop) + ds_temp = ds_temp.swap_dims({"step": "valid_time"}).drop_vars( + ds_coords_to_drop + ) else: - ds_temp = ds_temp.expand_dims(dim="valid_time").drop_vars(ds_coords_to_drop) - + ds_temp = ds_temp.expand_dims(dim="valid_time").drop_vars( + ds_coords_to_drop + ) + # Assign coordinate values back to dataset - ds_temp = ds_temp.assign_coords({"valid_time": ds_temp.valid_time, - "latitude": ds_temp.latitude, - "longitude": ds_temp.longitude}) + ds_temp = ds_temp.assign_coords( + { + "valid_time": ds_temp.valid_time, + "latitude": ds_temp.latitude, + "longitude": ds_temp.longitude, + } + ) ds_temps.append(ds_temp) - + # Merge all collected datasets into a single dataset - ds = xr.merge(ds_temps) - + ds = xr.merge(ds_temps) + # Rename and clean coordinate labels for consistency ds = _rename_and_clean_coords(ds) - + # Iterate through all data variables in the dataset for ds_var 
in list(ds.data_vars): # If the variable is an averaged quantity, apply de-averaging - if ds[ds_var].attrs['GRIB_stepType'] == 'avg': + if ds[ds_var].attrs["GRIB_stepType"] == "avg": ds[ds_var] = _deaverage(ds[ds_var]) - + # If the variable is an accumulated quantity, apply de-accumulation - elif ds[ds_var].attrs['GRIB_stepType'] == 'accum': + elif ds[ds_var].attrs["GRIB_stepType"] == "accum": ds[ds_var] = _deaccumulate(ds[ds_var]) return ds @@ -750,48 +832,52 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): return ds + def _interpolate_to_cutout_resolution(ds, retrieval_params, static): - # Interpolate the data spatially and temporally to the wanted cutout resolution ds_temps = [] for idx, var in enumerate(ds.data_vars): - ds_temps.append(_interpolate(ds[var], static, - retrieval_params['coords'], - retrieval_params['grid'], - retrieval_params['interp_s'], - retrieval_params['interp_t']) - ) - + ds_temps.append( + _interpolate( + ds[var], + static, + retrieval_params["coords"], + retrieval_params["grid"], + retrieval_params["interp_s"], + retrieval_params["interp_t"], + ) + ) + ds = xr.merge(ds_temps) ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - - ds = ds.unify_chunks().chunk(chunks=retrieval_params['chunks'] or {}) - + + ds = ds.unify_chunks().chunk(chunks=retrieval_params["chunks"] or {}) + return ds def get_data_wind(retrieval_params): - ''' + """ Retrieves and processes wind data from the DWD server. - + The function collects wind speed and direction data at 100m above ground level, as well as surface roughness data. It then processes and interpolates this data to match the desired spatial and temporal resolution. - + Parameters ---------- retrieval_params : dict Dictionary containing parameters for data retrieval, including coordinates, grid resolution, and interpolation methods. - + Returns ------- xarray.Dataset Processed dataset containing wind speed, wind direction, and surface roughness. 
- ''' - + """ + # Retrieve wind data from model levels 62 and 63 - retrieval_params['field'] = ['model-level', 'model-level'] + retrieval_params["field"] = ["model-level", "model-level"] ds = retrieve_data( url=dwd_url, variable=[ @@ -800,46 +886,44 @@ def get_data_wind(retrieval_params): ], **retrieval_params, ) - + # Compute the mean wind values across the general vertical layers - ds["u"] = ds["u"].mean('generalVerticalLayer') - ds["v"] = ds["v"].mean('generalVerticalLayer') - ds = ds.drop_dims('generalVerticalLayer') # Remove the dimension after averaging + ds["u"] = ds["u"].mean("generalVerticalLayer") + ds["v"] = ds["v"].mean("generalVerticalLayer") + ds = ds.drop_dims("generalVerticalLayer") # Remove the dimension after averaging ds = ds.rename({"u": "u_100m", "v": "v_100m"}) # Rename variables for clarity - - + # Retrieve surface roughness data from single-level data - retrieval_params['field'] = ['single-level'] + retrieval_params["field"] = ["single-level"] ds2 = retrieve_data( url=dwd_url, variable=["z0"], # Surface roughness length **retrieval_params, ) - + # Merge wind data with roughness data into a single dataset ds = xr.merge([ds, ds2]) - + # Rename roughness variable for clarity ds = ds.rename({"fsr": "roughness"}) ds["roughness"] = ds["roughness"].assign_attrs( - units="m", - long_name="Surface roughness" + units="m", long_name="Surface roughness" ) - + # Compute wind speed at 100m using the Pythagorean theorem ds["wnd100m"] = np.sqrt(ds["u_100m"] ** 2 + ds["v_100m"] ** 2).assign_attrs( units="m/s", long_name="100 metre wind speed" ) - + # Compute wind direction azimuth (0 = North, π/2 = East, π = South, 3π/2 = West) azimuth = np.arctan2(ds["u_100m"], ds["v_100m"]) - + # Ensure wind azimuth is within the 0 to 2π range ds["wnd_azimuth"] = azimuth.where(azimuth >= 0, azimuth + 2 * np.pi) - + # Remove intermediate wind component variables after processing ds = ds.drop_vars(["u_100m", "v_100m"]) - + return ds @@ -852,7 +936,12 @@ def 
sanitize_wind(ds): def get_data_influx(retrieval_params): """Get influx data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level', 'single-level', 'single-level', 'single-level'] + retrieval_params["field"] = [ + "single-level", + "single-level", + "single-level", + "single-level", + ] ds = retrieve_data( url=dwd_url, variable=[ @@ -864,29 +953,42 @@ def get_data_influx(retrieval_params): **retrieval_params, ) - ds = ds.rename({"avg_tnswrf": "influx_toa", - "ASWDIR_S": "influx_direct", - "ASWDIFD_S": "influx_diffuse", - "al": "albedo"}) - - ds["albedo"] = (ds["albedo"]/100).assign_attrs(units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation") - ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs(units="W m**-2", long_name="Surface down solar diffuse radiation") - ds["influx_direct"] = ds["influx_direct"].assign_attrs(units="W m**-2", long_name="Surface down solar direct radiation") - ds["influx_toa"] = ds["influx_toa"].assign_attrs(units="W m**-2", long_name="Net short-wave radiation flux at top of atmosphere (TOA)") - + ds = ds.rename( + { + "avg_tnswrf": "influx_toa", + "ASWDIR_S": "influx_direct", + "ASWDIFD_S": "influx_diffuse", + "al": "albedo", + } + ) + + ds["albedo"] = (ds["albedo"] / 100).assign_attrs( + units="(0 - 1)", long_name="Shortwave broadband albedo for diffuse radiation" + ) + ds["influx_diffuse"] = ds["influx_diffuse"].assign_attrs( + units="W m**-2", long_name="Surface down solar diffuse radiation" + ) + ds["influx_direct"] = ds["influx_direct"].assign_attrs( + units="W m**-2", long_name="Surface down solar direct radiation" + ) + ds["influx_toa"] = ds["influx_toa"].assign_attrs( + units="W m**-2", + long_name="Net short-wave radiation flux at top of atmosphere (TOA)", + ) + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # 
retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ICON-EU variables are mean values for previous hour, i.e. 13:01 to 14:00 are labelled as "14:00" # account by calculating the SolarPosition for the center of the interval for aggregation happens # see https://github.com/PyPSA/atlite/issues/158 @@ -905,31 +1007,29 @@ def get_data_influx(retrieval_params): ) ) sp = SolarPosition(ds, time_shift=time_shift) - + sp = sp.rename({v: f"solar_{v}" for v in sp.data_vars}) ds = xr.merge([ds, sp]) - + # # Interpolate the data spatially and temporally to the wanted cutout resolution # ds_temps = [] # for idx, var in enumerate(ds): - # ds_temps.append(_interpolate(ds[var], False, - # retrieval_params['coords'], + # ds_temps.append(_interpolate(ds[var], False, + # retrieval_params['coords'], # retrieval_params['grid'], # retrieval_params['interp_s'], # retrieval_params['interp_t']) # ) - + # ds = xr.merge(ds_temps) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + # ds = ds.unify_chunks.chunk(chunks=retrieval_params['chunks'] or {}) - - - + # ds = ds.drop_vars(['lon','lat']) # ds = ds.assign_coords(lon=("x", ds.x.values), lat=("y", ds.y.values)) - + return ds @@ -943,36 +1043,36 @@ def sanitize_influx(ds): def get_data_temperature(retrieval_params): """Get wind temperature for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','soil-level'] - ds = retrieve_data( - url=dwd_url, - variable=["t_2m", - "t_so/162"], - **retrieval_params + retrieval_params["field"] = ["single-level", "soil-level"] + ds = retrieve_data(url=dwd_url, variable=["t_2m", "t_so/162"], **retrieval_params) + + ds = ds.rename({"t2m": "temperature", "T_SO": "soil 
temperature"}) + + ds["temperature"] = ds["temperature"].assign_attrs( + units="K", long_name="Temperature at 2m above ground" + ) + ds["soil temperature"] = ds["soil temperature"].assign_attrs( + units="K", long_name="Soil temperature in 162 cm depth " ) - ds = ds.rename({"t2m": "temperature", - "T_SO": "soil temperature"}) - - ds["temperature"] = ds["temperature"].assign_attrs(units="K", long_name="Temperature at 2m above ground") - ds["soil temperature"] = ds["soil temperature"].assign_attrs(units="K", long_name="Soil temperature in 162 cm depth ") - return ds def get_data_runoff(retrieval_params): """Get runoff data for given retrieval parameters.""" # Retrieve single-level data - retrieval_params['field'] = ['single-level','single-level'] - ds = retrieve_data(url=dwd_url, - variable=["runoff_s", - "runoff_g"], - **retrieval_params) - - ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs(units="kg m**-2", long_name="Surface and Soil water runoff (accumulated since model start)") + retrieval_params["field"] = ["single-level", "single-level"] + ds = retrieve_data( + url=dwd_url, variable=["runoff_s", "runoff_g"], **retrieval_params + ) + + ds["runoff"] = (ds["RUNOFF_S"] + ds["RUNOFF_G"]).assign_attrs( + units="kg m**-2", + long_name="Surface and Soil water runoff (accumulated since model start)", + ) ds = ds.drop_vars(["RUNOFF_S", "RUNOFF_G"]) - + return ds @@ -985,16 +1085,14 @@ def sanitize_runoff(ds): def get_data_height(retrieval_params): """Get height data for given retrieval parameters.""" # Retrieve time-invariant data - retrieval_params['field'] = ['time-invariant'] - ds = retrieve_data(url=dwd_url, - variable=["hsurf"], - **retrieval_params) - + retrieval_params["field"] = ["time-invariant"] + ds = retrieve_data(url=dwd_url, variable=["hsurf"], **retrieval_params) + ds = ds.rename({"HSURF": "height"}) ds["height"] = ds["height"].assign_attrs( units="m", - long_name="Geometric Height of the earths surface above sea level (2D field)" - ) + 
long_name="Geometric Height of the earths surface above sea level (2D field)", + ) return ds @@ -1058,7 +1156,7 @@ def retrieval_times(coords, tz, static=False): return { "forecast": forecast_times, "offset": offset_times, - } + } def noisy_unlink(path): @@ -1069,42 +1167,48 @@ def noisy_unlink(path): except PermissionError: logger.error(f"Unable to delete file {path}, as it is still in use.") - -def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): +def retrieve_data(url, product, chunks=None, tmpdir=None, lock=None, **updates): """ Download data from the ICON-EU Model from the Open Data Server (ODS) of DWD. - + If you want to manually downolad the data go to: https://opendata.dwd.de/weather/nwp/icon-eu/grib/ """ - + request = {"product_type": "icon_eu", "format": "direct-download"} request.update(updates) ds_temps = [] - #Download data for each variable individually and then merge all in one xarray + # Download data for each variable individually and then merge all in one xarray logger.info(f"open-dwd: Downloading variables\n\t{request['variable']}\n") - for idx, var in enumerate(request['variable']): - ds_temps.append(_mainDataCollector(url, - var, - request['field'][idx], - request['forecast'], - request['offset'], - request['coords'], - tmpdir) - ) - + for idx, var in enumerate(request["variable"]): + ds_temps.append( + _mainDataCollector( + url, + var, + request["field"][idx], + request["forecast"], + request["offset"], + request["coords"], + tmpdir, + ) + ) + ds = xr.merge(ds_temps).chunk(chunks=chunks) - + return ds -def get_data(cutout, feature, tmpdir, - lock=None, - monthly_requests=False, - concurrent_requests=False, - **creation_parameters): +def get_data( + cutout, + feature, + tmpdir, + lock=None, + monthly_requests=False, + concurrent_requests=False, + **creation_parameters, +): """ Retrieve data from DWDs ICON-EU Model dataset (via ODS). 
@@ -1136,7 +1240,7 @@ def get_data(cutout, feature, tmpdir, coords = cutout.coords sanitize = creation_parameters.get("sanitize", True) - + retrieval_params = { "product": "dwd_icon_eu", "area": _area(coords), @@ -1164,8 +1268,10 @@ def retrieve_once(time, static=False): return ds if feature in static_features: - return retrieve_once(retrieval_times(coords, cutout.data.tz, True), True).squeeze() - + return retrieve_once( + retrieval_times(coords, cutout.data.tz, True), True + ).squeeze() + dataset = retrieve_once(retrieval_times(coords, cutout.data.tz, False), False) return dataset.sel(time=coords["time"]) diff --git a/atlite/datasets/meteo_forecast.py b/atlite/datasets/meteo_forecast.py index 1da22a94..38b189e9 100644 --- a/atlite/datasets/meteo_forecast.py +++ b/atlite/datasets/meteo_forecast.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,25 +9,25 @@ https://open-meteo.com/en/docs """ +import datetime +import io import logging import os -import io import re -import zipfile +import time import warnings import weakref -import time -import xarray as xr -import datetime -import openmeteo_requests -import requests_cache +import zipfile +from tempfile import mkstemp + import cdsapi import numpy as np +import openmeteo_requests import pandas as pd - -from numpy import atleast_1d +import requests_cache +import xarray as xr from dask import compute, delayed -from tempfile import mkstemp +from numpy import atleast_1d from retry_requests import retry from ..gis import maybe_swap_spatial_dims @@ -45,20 +43,22 @@ @contextlib.contextmanager def nullcontext(): yield - + logger = logging.getLogger(__name__) # Setup Open-Meteo client with 7-day cache and retry on failure # Cache duration: -1 = never expire, 0 = no caching, timedelta = expire after set time cache_window = datetime.timedelta(days=7) -cache_session = requests_cache.CachedSession('.meteo.cache', 
backend='sqlite', expire_after=cache_window) +cache_session = requests_cache.CachedSession( + ".meteo.cache", backend="sqlite", expire_after=cache_window +) retry_session = retry(cache_session, retries=5, backoff_factor=3) openmeteo = openmeteo_requests.Client(session=retry_session) -# Set url for data download, this allows to switch to different data +# Set url for data download, this allows to switch to different data # sources more easily. -era5_url = 'https://cds.climate.copernicus.eu/api' +era5_url = "https://cds.climate.copernicus.eu/api" meteo_url1 = "https://api.open-meteo.com/v1/forecast" meteo_url2 = "https://api.open-meteo.com/v1/ecmwf" @@ -70,7 +70,7 @@ def nullcontext(): # Delay of ERA5 data upload # For Open-Meteo slow changing variables from ERA5 are always interpolated # Starting from data of at least 7 days before nowtime -ERA5_DELAY = pd.Timedelta(hours=-6.0*24) +ERA5_DELAY = pd.Timedelta(hours=-6.0 * 24) # Model and CRS Settings crs = 4326 @@ -87,109 +87,143 @@ def nullcontext(): "solar_azimuth", ], "temperature": ["temperature", "soil temperature"], - "runoff": ["runoff"] + "runoff": ["runoff"], } static_features = {"height"} -requirements = {'x': slice(-90, 90, 0.1), - 'y': slice(-90, 90, 0.1), - 'offset': pd.Timedelta(hours=-92.0*24), - 'forecast': pd.Timedelta(hours=16.0*24), - 'dt': pd.Timedelta(hours=1), - 'parallel': False, - } +requirements = { + "x": slice(-90, 90, 0.1), + "y": slice(-90, 90, 0.1), + "offset": pd.Timedelta(hours=-92.0 * 24), + "forecast": pd.Timedelta(hours=16.0 * 24), + "dt": pd.Timedelta(hours=1), + "parallel": False, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. 
**kwargs: Additional optional parameters. """ - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is 
{requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." 
+ ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." + ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _add_height(ds): - """Convert geopotential 'z' to geopotential height following [1]. + """ + Convert geopotential 'z' to geopotential height following [1]. 
References ---------- @@ -249,44 +283,48 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): def _interpolate(ds, ds_ref, static, interp_s, interp_t): - - #Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) - + # Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) + if not static: try: ds = ds.interp( - time=ds_ref.time.values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." + ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - time=ds_ref.time.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) try: ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method=interp_s, - kwargs={"fill_value": "extrapolate"}, - ) + x=ds_ref.x.values, + y=ds_ref.y.values, + method=interp_s, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_s}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_s}." 
+ ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) - + x=ds_ref.x.values, + y=ds_ref.y.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + return ds + def get_data_meteo_wind(retrieval_params): """Get all data from meteo API for given retrieval parameters at once to save requests and runtime.""" @@ -298,7 +336,7 @@ def get_data_meteo_wind(retrieval_params): ], **retrieval_params, ) - + return ds @@ -331,7 +369,7 @@ def get_data_meteo_temperature(retrieval_params): ], **retrieval_params, ) - + return ds @@ -359,7 +397,7 @@ def sanitize_runoff(ds): def get_data_era5_wind(retrieval_params): """Get wind data for given retrieval parameters.""" - + ds = retrieve_era5_data( url=era5_url, variable=["forecast_surface_roughness"], @@ -372,14 +410,14 @@ def get_data_era5_wind(retrieval_params): def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): - - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - + ds_era5 = _interpolate( + ds=ds_era5, + ds_ref=ds_meteo, + static=False, + interp_s=interp_s, + interp_t=interp_t, + ) + ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -394,7 +432,7 @@ def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): ds.wnd_azimuth.attrs.update( units="degree", long_name="Wind direction at 80m above ground" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -412,26 +450,24 @@ def get_data_era5_influx(retrieval_params): ds = retrieve_era5_data( url=era5_url, - variable=["forecast_albedo", - "toa_incident_solar_radiation"], + variable=["forecast_albedo", "toa_incident_solar_radiation"], **retrieval_params, ) - + ds = _rename_and_clean_coords(ds) return ds - - + + def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): - + ds_era5 = _interpolate( + ds=ds_era5, + 
ds_ref=ds_meteo, + static=False, + interp_s=interp_s, + interp_t=interp_t, + ) - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -455,7 +491,7 @@ def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): ds.influx_toa.attrs.update( units="W m**-2", long_name="TOA incident solar radiation" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -484,10 +520,9 @@ def sanitize_influx(ds): def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): """Get wind temperature for given retrieval parameters.""" ds = xr.merge([ds_meteo, ds_era5]) - + ds = ds.rename( - {"temperature_2m": "temperature", - "soil_temperature_54cm": "soil temperature"} + {"temperature_2m": "temperature", "soil_temperature_54cm": "soil temperature"} ) # Convert from Celsius to Kelvin C -> K, by adding 273.15 @@ -504,10 +539,7 @@ def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): def get_data_era5_height(retrieval_params): """Get height data for given retrieval parameters.""" - ds = retrieve_era5_data( - url=era5_url, - variable=["geopotential"], - **retrieval_params) + ds = retrieve_era5_data(url=era5_url, variable=["geopotential"], **retrieval_params) ds = _rename_and_clean_coords(ds) ds = _add_height(ds) @@ -561,18 +593,22 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd request.update(updates) # Generate list of (lon, lat) coordinate pairs - grid = np.meshgrid(request['coords']['x'], request['coords']['y']) - coords = pd.DataFrame(zip(grid[0].flatten(), grid[1].flatten()), columns=['longitude', 'latitude']) + grid = np.meshgrid(request["coords"]["x"], request["coords"]["y"]) + coords = pd.DataFrame( + zip(grid[0].flatten(), grid[1].flatten()), columns=["longitude", "latitude"] + ) # Calculate time and variable counts - start_date = request['start'].strftime('%Y-%m-%d') - 
end_date = request['end'].strftime('%Y-%m-%d') + start_date = request["start"].strftime("%Y-%m-%d") + end_date = request["end"].strftime("%Y-%m-%d") nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days) - nr_variables = len(request['variable']) + nr_variables = len(request["variable"]) nr_locations = len(coords) # Estimate request weight based on Open-Meteo's internal model - weight_of_full_api_request = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + weight_of_full_api_request = ( + max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + ) # Dynamically determine chunk size based on rate limit thresholds if weight_of_full_api_request <= MINUTE_LIMIT: @@ -585,21 +621,23 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd chunk_size = int(max(1, chunk_size)) # Ensure chunk_size ≥ 1 and integer logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n") - logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}") + logger.info( + f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}" + ) # Loop through coordinate grid in blocks and request data data = [] for i in range(0, len(coords), chunk_size): - coord_chunk = coords.iloc[i:i + chunk_size] + coord_chunk = coords.iloc[i : i + chunk_size] # Prepare API parameters for the current chunk params = { - "longitude": coord_chunk['longitude'].tolist(), - "latitude": coord_chunk['latitude'].tolist(), - "hourly": request['variable'], + "longitude": coord_chunk["longitude"].tolist(), + "latitude": coord_chunk["latitude"].tolist(), + "hourly": request["variable"], "wind_speed_unit": "ms", "start_date": start_date, - "end_date": end_date + "end_date": end_date, } try: @@ -608,8 +646,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd logger.info(f"{e}") try: # Extract and classify rate limiting error - 
rate_limiting_error = list(e.args)[0]['reason'] - match = re.search(r"(Minutely|Hourly|Daily) API request limit", rate_limiting_error) + rate_limiting_error = list(e.args)[0]["reason"] + match = re.search( + r"(Minutely|Hourly|Daily) API request limit", rate_limiting_error + ) if match: apply_rate_limiting(error=match[0]) else: @@ -618,8 +658,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd responses = openmeteo.weather_api(url, params=params) except Exception as e: # Skip this chunk on repeated failure - logger.error(f"Meteo-API: Failed to fetch data for block starting at " - f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}") + logger.error( + f"Meteo-API: Failed to fetch data for block starting at " + f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}" + ) continue # Parse chunked response and append results @@ -629,7 +671,7 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd ds = pd.concat(data).to_xarray() ds = _rename_and_clean_coords(ds) ds = ds.chunk(chunks=chunks) - + return ds @@ -655,9 +697,12 @@ def parse_meteo_responses(responses, params): # Reconstruct time index based on interval range_start = pd.to_datetime(response.Hourly().Time(), unit="s") range_end = pd.to_datetime(response.Hourly().TimeEnd(), unit="s") - date_range = pd.date_range(start=range_start, end=range_end, - freq=pd.Timedelta(seconds=response.Hourly().Interval()), - inclusive="left") + date_range = pd.date_range( + start=range_start, + end=range_end, + freq=pd.Timedelta(seconds=response.Hourly().Interval()), + inclusive="left", + ) # Initialize empty DataFrame for the current location response_df = pd.DataFrame(columns=params["hourly"]) @@ -694,14 +739,16 @@ def apply_rate_limiting(error=None): """ now = datetime.datetime.now() - midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta(days=1, minutes=5) + midnight = 
datetime.datetime(now.year, now.month, now.day) + datetime.timedelta( + days=1, minutes=5 + ) time_until_midnight = (midnight - now).total_seconds() sleep_times = { None: 120, # Fallback for unknown errors "Minutely API request limit": 60, "Hourly API request limit": 60 * 60, - "Daily API request limit": time_until_midnight + "Daily API request limit": time_until_midnight, } sleep_time = sleep_times[error] @@ -715,21 +762,21 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda If you want to track the state of your request go to https://cds-beta.climate.copernicus.eu/requests?tab=all - """ - request = {"product_type": ["reanalysis"], - "data_format": "netcdf", - "download_format": "zip"} - + """ + request = { + "product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip", + } + request.update(updates) - assert {"year", "month", "variable"}.issubset( - request - ), "Need to specify at least 'variable', 'year' and 'month'" + assert {"year", "month", "variable"}.issubset(request), ( + "Need to specify at least 'variable', 'year' and 'month'" + ) client = cdsapi.Client( - url = url, - info_callback=logger.debug, - debug=logging.DEBUG >= logging.root.level + url=url, info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level ) result = client.retrieve(product, request) @@ -746,35 +793,43 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) logger.info(f"CDS: Downloading variables\n\t{varstr}\n") result.download(target_zip) - + # Open the .zip file in memory with zipfile.ZipFile(target_zip, "r") as zf: # Identify .nc files inside the .zip nc_files = [name for name in zf.namelist() if name.endswith(".nc")] - + if not nc_files: - raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") - + raise FileNotFoundError( + "No .nc files found in the downloaded .zip archive." 
+ ) + if len(nc_files) == 1: # If there's only one .nc file, read it into memory with zf.open(nc_files[0]) as nc_file: # Pass the in-memory file-like object to Xarray - ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) - + ds = xr.open_dataset( + io.BytesIO(nc_file.read()), chunks=chunks or {} + ) + else: # If multiple .nc files, combine them using Xarray datasets = [] for nc_file in nc_files: with zf.open(nc_file) as file: - dataset = xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {}) - - if 'expver' in dataset.variables: - dataset = dataset.drop_vars(["expver", "number"], errors="ignore") + dataset = xr.open_dataset( + io.BytesIO(file.read()), chunks=chunks or {} + ) + + if "expver" in dataset.variables: + dataset = dataset.drop_vars( + ["expver", "number"], errors="ignore" + ) datasets.append(dataset) - - ds = xr.merge(datasets) - + + ds = xr.merge(datasets) + if tmpdir is None: logging.debug(f"Adding finalizer for {target_zip}") weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) @@ -782,7 +837,9 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda return ds -def retrieval_times_era5_forecast(coords, initialization_time, static=False, monthly_requests=False): +def retrieval_times_era5_forecast( + coords, initialization_time, static=False, monthly_requests=False +): """ Get list of retrieval cdsapi arguments for time dimension in coordinates. 
@@ -806,33 +863,36 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon list of dicts witht retrieval arguments """ - + # Convert time coordinates to a pandas Index time = coords["time"].to_index() frequency = time.freq - + # Determine the latest available ERA5 data time based on initialization time and required delay latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY - + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency - latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) - + latest_era5_time = latest_era5_time.ceil("D") - pd.Timedelta(hours=1) + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) minimum_era5_time_horizon = pd.date_range( - start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + start=latest_era5_time + - pd.Timedelta(days=1), # One day before the latest available time end=latest_era5_time, # Up to the latest available time - freq=frequency # Maintain original time frequency + freq=frequency, # Maintain original time frequency ) - + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates time = time.union(minimum_era5_time_horizon) - + # Ensure a continuous time index by filling missing values based on the determined frequency - complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) - + complete_time_range = pd.date_range( + start=time.min(), end=time.max(), freq=frequency + ) + # Keep only timestamps up to the latest available ERA5 time time = complete_time_range[complete_time_range <= latest_era5_time] - + if static: return { "year": [str(time[0].year)], @@ -850,7 +910,9 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon query = { "year": str(year), "month": [str(month).zfill(2)], - "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "day": list( + t[t.month == 
month].day.unique().astype(str).str.zfill(2) + ), "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], } times.append(query) @@ -906,7 +968,7 @@ def get_data( """ coords = cutout.coords - initialization_time = creation_parameters['init_time'] + initialization_time = creation_parameters["init_time"] sanitize = creation_parameters.get("sanitize", True) @@ -918,7 +980,7 @@ def get_data( "tmpdir": tmpdir, "lock": lock, } - + retrieval_params_era5 = { "product": "reanalysis-era5-single-levels", "area": _area(coords), @@ -928,63 +990,68 @@ def get_data( "lock": lock, } - # Get fast changing variabels from meteo forecast - func_meteo = globals().get(f"get_data_meteo_{feature}") + func_meteo = globals().get(f"get_data_meteo_{feature}") logger.info(f"Requesting data for feature {feature} from meteo...") - - + if func_meteo is not None: - datasets_meteo = func_meteo({**retrieval_params_meteo, - **{"start": coords["time"].to_index()[0], - "end": coords["time"].to_index()[-1], - "coords": coords}}) + datasets_meteo = func_meteo( + { + **retrieval_params_meteo, + **{ + "start": coords["time"].to_index()[0], + "end": coords["time"].to_index()[-1], + "coords": coords, + }, + } + ) else: datasets_meteo = xr.Dataset() - def retrieve_once(time): ds = func_era5({**retrieval_params_era5, **time}) return ds - # Get missing and slow changing variabels from era5 data and interpolation func_era5 = globals().get(f"get_data_era5_{feature}") if func_era5 is not None: logger.info(f"Requesting addtional data for feature {feature} from era5...") - + if feature in static_features: - return retrieve_once(retrieval_times_era5_forecast(coords, initialization_time, static=True)).squeeze() - - time_chunks = retrieval_times_era5_forecast(coords, initialization_time, monthly_requests=monthly_requests) + return retrieve_once( + retrieval_times_era5_forecast(coords, initialization_time, static=True) + ).squeeze() + + time_chunks = retrieval_times_era5_forecast( + coords, 
initialization_time, monthly_requests=monthly_requests + ) if concurrent_requests: delayed_datasets = [delayed(retrieve_once)(chunk) for chunk in time_chunks] datasets_era5 = compute(*delayed_datasets) else: datasets_era5 = map(retrieve_once, time_chunks) - + datasets_era5 = xr.concat(datasets_era5, dim="time") - + else: datasets_era5 = xr.Dataset() - + # Combine datasets and calculate the required variables combine_func = globals().get(f"combine_data_{feature}") - + logger.info(f"Combine meteo and era5 datasets for feature {feature}...") - + if combine_func is not None: - datasets = combine_func(datasets_meteo, - datasets_era5, - cutout.data.interp_s, - cutout.data.interp_t) + datasets = combine_func( + datasets_meteo, datasets_era5, cutout.data.interp_s, cutout.data.interp_t + ) else: datasets = xr.merge([datasets_meteo, datasets_era5]) - + sanitize_func = globals().get(f"sanitize_{feature}") if sanitize and sanitize_func is not None: # Sanitize the data after interpolation to remove residuals datasets = sanitize_func(datasets) - + return datasets.sel(time=coords["time"]) diff --git a/atlite/datasets/meteo_historic.py b/atlite/datasets/meteo_historic.py index 6d0fef3e..c6739b56 100644 --- a/atlite/datasets/meteo_historic.py +++ b/atlite/datasets/meteo_historic.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,25 +9,25 @@ https://open-meteo.com/en/docs/historical-weather-api """ +import datetime +import io import logging import os -import io import re -import zipfile +import time import warnings import weakref -import time -import xarray as xr -import datetime -import openmeteo_requests -import requests_cache +import zipfile +from tempfile import mkstemp + import cdsapi import numpy as np +import openmeteo_requests import pandas as pd - -from numpy import atleast_1d +import requests_cache +import xarray as xr from dask import compute, delayed -from tempfile 
import mkstemp +from numpy import atleast_1d from retry_requests import retry from ..gis import maybe_swap_spatial_dims @@ -45,20 +43,22 @@ @contextlib.contextmanager def nullcontext(): yield - + logger = logging.getLogger(__name__) # Setup Open-Meteo client with 7-day cache and retry on failure # Cache duration: -1 = never expire, 0 = no caching, timedelta = expire after set time cache_window = datetime.timedelta(days=7) -cache_session = requests_cache.CachedSession('.meteo.cache', backend='sqlite', expire_after=cache_window) +cache_session = requests_cache.CachedSession( + ".meteo.cache", backend="sqlite", expire_after=cache_window +) retry_session = retry(cache_session, retries=5, backoff_factor=3) openmeteo = openmeteo_requests.Client(session=retry_session) -# Set url for data download, this allows to switch to different data +# Set url for data download, this allows to switch to different data # sources more easily. -era5_url = 'https://cds.climate.copernicus.eu/api' +era5_url = "https://cds.climate.copernicus.eu/api" meteo_url = "https://archive-api.open-meteo.com/v1/archive" # Open-Meteo request limits @@ -69,7 +69,7 @@ def nullcontext(): # Delay of ERA5 data upload # For Open-Meteo slow changing variables from ERA5 are always interpolated # Starting from data of at least 7 days before nowtime -ERA5_DELAY = pd.Timedelta(hours=-6.0*24) +ERA5_DELAY = pd.Timedelta(hours=-6.0 * 24) # Model and CRS Settings crs = 4326 @@ -90,104 +90,147 @@ def nullcontext(): static_features = {"height"} -requirements = {'x': slice(-90, 90, 0.1), - 'y': slice(-90, 90, 0.1), - 'offset': (pd.Timestamp('1940-01-01')-pd.Timestamp.utcnow().replace(tzinfo=None).floor("h")), - 'forecast': (pd.Timedelta(hours=-1.0*24)+(pd.Timestamp.utcnow().replace(tzinfo=None).ceil("d")-pd.Timestamp.utcnow().replace(tzinfo=None))).floor("h"), - 'dt': pd.Timedelta(hours=1), - 'parallel': False, - } +requirements = { + "x": slice(-90, 90, 0.1), + "y": slice(-90, 90, 0.1), + "offset": ( + 
pd.Timestamp("1940-01-01") + - pd.Timestamp.utcnow().replace(tzinfo=None).floor("h") + ), + "forecast": ( + pd.Timedelta(hours=-1.0 * 24) + + ( + pd.Timestamp.utcnow().replace(tzinfo=None).ceil("d") + - pd.Timestamp.utcnow().replace(tzinfo=None) + ) + ).floor("h"), + "dt": pd.Timedelta(hours=1), + "parallel": False, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. """ - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, 
y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." 
+ ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." 
+ ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _add_height(ds): - """Convert geopotential 'z' to geopotential height following [1]. + """ + Convert geopotential 'z' to geopotential height following [1]. 
References ---------- @@ -247,42 +290,45 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): def _interpolate(ds, ds_ref, static, interp_s, interp_t): - - #Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) - + # Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) + if not static: try: ds = ds.interp( - time=ds_ref.time.values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." + ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - time=ds_ref.time.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) try: ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method=interp_s, - kwargs={"fill_value": "extrapolate"}, - ) + x=ds_ref.x.values, + y=ds_ref.y.values, + method=interp_s, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_s}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_s}." 
+ ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) - + x=ds_ref.x.values, + y=ds_ref.y.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + return ds @@ -336,7 +382,7 @@ def get_data_meteo_temperature(retrieval_params): def get_data_era5_wind(retrieval_params): """Get wind data for given retrieval parameters.""" - + ds = retrieve_era5_data( url=era5_url, variable=["forecast_surface_roughness"], @@ -349,14 +395,14 @@ def get_data_era5_wind(retrieval_params): def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): - - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - + ds_era5 = _interpolate( + ds=ds_era5, + ds_ref=ds_meteo, + static=False, + interp_s=interp_s, + interp_t=interp_t, + ) + ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -367,11 +413,13 @@ def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): } ) - ds.wnd100m.attrs.update(units="m s**-1", long_name="Wind speed at 100m above ground") + ds.wnd100m.attrs.update( + units="m s**-1", long_name="Wind speed at 100m above ground" + ) ds.wnd_azimuth.attrs.update( units="degree", long_name="Wind direction at 100m above ground" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -389,25 +437,24 @@ def get_data_era5_influx(retrieval_params): ds = retrieve_era5_data( url=era5_url, - variable=["forecast_albedo", - "toa_incident_solar_radiation"], + variable=["forecast_albedo", "toa_incident_solar_radiation"], **retrieval_params, ) - + ds = _rename_and_clean_coords(ds) return ds - - + + def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): - - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - + ds_era5 = _interpolate( + ds=ds_era5, + ds_ref=ds_meteo, + 
static=False, + interp_s=interp_s, + interp_t=interp_t, + ) + ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -431,7 +478,7 @@ def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): ds.influx_toa.attrs.update( units="W m**-2", long_name="TOA incident solar radiation" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -460,10 +507,9 @@ def sanitize_influx(ds): def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): """Get wind temperature for given retrieval parameters.""" ds = xr.merge([ds_meteo, ds_era5]) - + ds = ds.rename( - {"temperature_2m": "temperature", - "soil_temperature_54cm": "soil temperature"} + {"temperature_2m": "temperature", "soil_temperature_54cm": "soil temperature"} ) # Convert from Celsius to Kelvin C -> K, by adding 273.15 @@ -480,10 +526,7 @@ def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): def get_data_era5_height(retrieval_params): """Get height data for given retrieval parameters.""" - ds = retrieve_era5_data( - url=era5_url, - variable=["geopotential"], - **retrieval_params) + ds = retrieve_era5_data(url=era5_url, variable=["geopotential"], **retrieval_params) ds = _rename_and_clean_coords(ds) ds = _add_height(ds) @@ -537,18 +580,22 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd request.update(updates) # Generate list of (lon, lat) coordinate pairs - grid = np.meshgrid(request['coords']['x'], request['coords']['y']) - coords = pd.DataFrame(zip(grid[0].flatten(), grid[1].flatten()), columns=['longitude', 'latitude']) + grid = np.meshgrid(request["coords"]["x"], request["coords"]["y"]) + coords = pd.DataFrame( + zip(grid[0].flatten(), grid[1].flatten()), columns=["longitude", "latitude"] + ) # Calculate time and variable counts - start_date = request['start'].strftime('%Y-%m-%d') - end_date = request['end'].strftime('%Y-%m-%d') + start_date = request["start"].strftime("%Y-%m-%d") + end_date = 
request["end"].strftime("%Y-%m-%d") nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days) - nr_variables = len(request['variable']) + nr_variables = len(request["variable"]) nr_locations = len(coords) # Estimate request weight based on Open-Meteo's internal model - weight_of_full_api_request = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + weight_of_full_api_request = ( + max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + ) # Dynamically determine chunk size based on rate limit thresholds if weight_of_full_api_request <= MINUTE_LIMIT: @@ -561,21 +608,23 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd chunk_size = int(max(1, chunk_size)) # Ensure chunk_size ≥ 1 and integer logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n") - logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}") + logger.info( + f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}" + ) # Loop through coordinate grid in blocks and request data data = [] for i in range(0, len(coords), chunk_size): - coord_chunk = coords.iloc[i:i + chunk_size] + coord_chunk = coords.iloc[i : i + chunk_size] # Prepare API parameters for the current chunk params = { - "longitude": coord_chunk['longitude'].tolist(), - "latitude": coord_chunk['latitude'].tolist(), - "hourly": request['variable'], + "longitude": coord_chunk["longitude"].tolist(), + "latitude": coord_chunk["latitude"].tolist(), + "hourly": request["variable"], "wind_speed_unit": "ms", "start_date": start_date, - "end_date": end_date + "end_date": end_date, } try: @@ -584,8 +633,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd logger.info(f"{e}") try: # Extract and classify rate limiting error - rate_limiting_error = list(e.args)[0]['reason'] - match = re.search(r"(Minutely|Hourly|Daily) API request limit", 
rate_limiting_error) + rate_limiting_error = list(e.args)[0]["reason"] + match = re.search( + r"(Minutely|Hourly|Daily) API request limit", rate_limiting_error + ) if match: apply_rate_limiting(error=match[0]) else: @@ -594,8 +645,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd responses = openmeteo.weather_api(url, params=params) except Exception as e: # Skip this chunk on repeated failure - logger.error(f"Meteo-API: Failed to fetch data for block starting at " - f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}") + logger.error( + f"Meteo-API: Failed to fetch data for block starting at " + f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}" + ) continue # Parse chunked response and append results @@ -605,7 +658,7 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd ds = pd.concat(data).to_xarray() ds = _rename_and_clean_coords(ds) ds = ds.chunk(chunks=chunks) - + return ds @@ -631,9 +684,12 @@ def parse_meteo_responses(responses, params): # Reconstruct time index based on interval range_start = pd.to_datetime(response.Hourly().Time(), unit="s") range_end = pd.to_datetime(response.Hourly().TimeEnd(), unit="s") - date_range = pd.date_range(start=range_start, end=range_end, - freq=pd.Timedelta(seconds=response.Hourly().Interval()), - inclusive="left") + date_range = pd.date_range( + start=range_start, + end=range_end, + freq=pd.Timedelta(seconds=response.Hourly().Interval()), + inclusive="left", + ) # Initialize empty DataFrame for the current location response_df = pd.DataFrame(columns=params["hourly"]) @@ -670,14 +726,16 @@ def apply_rate_limiting(error=None): """ now = datetime.datetime.now() - midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta(days=1, minutes=5) + midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta( + days=1, minutes=5 + ) time_until_midnight = (midnight - 
now).total_seconds() sleep_times = { None: 120, # Fallback for unknown errors "Minutely API request limit": 60, "Hourly API request limit": 60 * 60, - "Daily API request limit": time_until_midnight + "Daily API request limit": time_until_midnight, } sleep_time = sleep_times[error] @@ -691,21 +749,21 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda If you want to track the state of your request go to https://cds-beta.climate.copernicus.eu/requests?tab=all - """ - request = {"product_type": ["reanalysis"], - "data_format": "netcdf", - "download_format": "zip"} - + """ + request = { + "product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip", + } + request.update(updates) - assert {"year", "month", "variable"}.issubset( - request - ), "Need to specify at least 'variable', 'year' and 'month'" + assert {"year", "month", "variable"}.issubset(request), ( + "Need to specify at least 'variable', 'year' and 'month'" + ) client = cdsapi.Client( - url = url, - info_callback=logger.debug, - debug=logging.DEBUG >= logging.root.level + url=url, info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level ) result = client.retrieve(product, request) @@ -722,35 +780,43 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) logger.info(f"CDS: Downloading variables\n\t{varstr}\n") result.download(target_zip) - + # Open the .zip file in memory with zipfile.ZipFile(target_zip, "r") as zf: # Identify .nc files inside the .zip nc_files = [name for name in zf.namelist() if name.endswith(".nc")] - + if not nc_files: - raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") - + raise FileNotFoundError( + "No .nc files found in the downloaded .zip archive." 
+ ) + if len(nc_files) == 1: # If there's only one .nc file, read it into memory with zf.open(nc_files[0]) as nc_file: # Pass the in-memory file-like object to Xarray - ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) - + ds = xr.open_dataset( + io.BytesIO(nc_file.read()), chunks=chunks or {} + ) + else: # If multiple .nc files, combine them using Xarray datasets = [] for nc_file in nc_files: with zf.open(nc_file) as file: - dataset = xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {}) - - if 'expver' in dataset.variables: - dataset = dataset.drop_vars(["expver", "number"], errors="ignore") + dataset = xr.open_dataset( + io.BytesIO(file.read()), chunks=chunks or {} + ) + + if "expver" in dataset.variables: + dataset = dataset.drop_vars( + ["expver", "number"], errors="ignore" + ) datasets.append(dataset) - - ds = xr.merge(datasets) - + + ds = xr.merge(datasets) + if tmpdir is None: logging.debug(f"Adding finalizer for {target_zip}") weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) @@ -758,7 +824,9 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda return ds -def retrieval_times_era5_forecast(coords, initialization_time, static=False, monthly_requests=False): +def retrieval_times_era5_forecast( + coords, initialization_time, static=False, monthly_requests=False +): """ Get list of retrieval cdsapi arguments for time dimension in coordinates. 
@@ -782,33 +850,36 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon list of dicts witht retrieval arguments """ - + # Convert time coordinates to a pandas Index time = coords["time"].to_index() frequency = time.freq - + # Determine the latest available ERA5 data time based on initialization time and required delay latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY - + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency - latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) - + latest_era5_time = latest_era5_time.ceil("D") - pd.Timedelta(hours=1) + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) minimum_era5_time_horizon = pd.date_range( - start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + start=latest_era5_time + - pd.Timedelta(days=1), # One day before the latest available time end=latest_era5_time, # Up to the latest available time - freq=frequency # Maintain original time frequency + freq=frequency, # Maintain original time frequency ) - + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates time = time.union(minimum_era5_time_horizon) - + # Ensure a continuous time index by filling missing values based on the determined frequency - complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) - + complete_time_range = pd.date_range( + start=time.min(), end=time.max(), freq=frequency + ) + # Keep only timestamps up to the latest available ERA5 time time = complete_time_range[complete_time_range <= latest_era5_time] - + if static: return { "year": [str(time[0].year)], @@ -826,7 +897,9 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon query = { "year": str(year), "month": [str(month).zfill(2)], - "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "day": list( + t[t.month == 
month].day.unique().astype(str).str.zfill(2) + ), "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], } times.append(query) @@ -882,7 +955,7 @@ def get_data( """ coords = cutout.coords - initialization_time = creation_parameters['init_time'] + initialization_time = creation_parameters["init_time"] sanitize = creation_parameters.get("sanitize", True) @@ -894,7 +967,7 @@ def get_data( "tmpdir": tmpdir, "lock": lock, } - + retrieval_params_era5 = { "product": "reanalysis-era5-single-levels", "area": _area(coords), @@ -904,63 +977,68 @@ def get_data( "lock": lock, } - # Get fast changing variabels from meteo forecast - func_meteo = globals().get(f"get_data_meteo_{feature}") + func_meteo = globals().get(f"get_data_meteo_{feature}") logger.info(f"Requesting data for feature {feature} from meteo...") - - + if func_meteo is not None: - datasets_meteo = func_meteo({**retrieval_params_meteo, - **{"start": coords["time"].to_index()[0], - "end": coords["time"].to_index()[-1], - "coords": coords}}) + datasets_meteo = func_meteo( + { + **retrieval_params_meteo, + **{ + "start": coords["time"].to_index()[0], + "end": coords["time"].to_index()[-1], + "coords": coords, + }, + } + ) else: datasets_meteo = xr.Dataset() - def retrieve_once(time): ds = func_era5({**retrieval_params_era5, **time}) return ds - # Get missing and slow changing variabels from era5 data and interpolation func_era5 = globals().get(f"get_data_era5_{feature}") if func_era5 is not None: logger.info(f"Requesting addtional data for feature {feature} from era5...") - + if feature in static_features: - return retrieve_once(retrieval_times_era5_forecast(coords, initialization_time, static=True)).squeeze() - - time_chunks = retrieval_times_era5_forecast(coords, initialization_time, monthly_requests=monthly_requests) + return retrieve_once( + retrieval_times_era5_forecast(coords, initialization_time, static=True) + ).squeeze() + + time_chunks = retrieval_times_era5_forecast( + coords, 
initialization_time, monthly_requests=monthly_requests + ) if concurrent_requests: delayed_datasets = [delayed(retrieve_once)(chunk) for chunk in time_chunks] datasets_era5 = compute(*delayed_datasets) else: datasets_era5 = map(retrieve_once, time_chunks) - + datasets_era5 = xr.concat(datasets_era5, dim="time") - + else: datasets_era5 = xr.Dataset() - + # Combine datasets and calculate the required variables combine_func = globals().get(f"combine_data_{feature}") - + logger.info(f"Combine meteo and era5 datasets for feature {feature}...") - + if combine_func is not None: - datasets = combine_func(datasets_meteo, - datasets_era5, - cutout.data.interp_s, - cutout.data.interp_t) + datasets = combine_func( + datasets_meteo, datasets_era5, cutout.data.interp_s, cutout.data.interp_t + ) else: datasets = xr.merge([datasets_meteo, datasets_era5]) - + sanitize_func = globals().get(f"sanitize_{feature}") if sanitize and sanitize_func is not None: # Sanitize the data after interpolation to remove residuals datasets = sanitize_func(datasets) - + return datasets.sel(time=coords["time"]) diff --git a/atlite/datasets/meteo_historic_forecast.py b/atlite/datasets/meteo_historic_forecast.py index 6e306dd7..df2bbea7 100644 --- a/atlite/datasets/meteo_historic_forecast.py +++ b/atlite/datasets/meteo_historic_forecast.py @@ -1,5 +1,3 @@ -# -*- coding: utf-8 -*- - # SPDX-FileCopyrightText: 2016-2021 The Atlite Authors # # SPDX-License-Identifier: GPL-3.0-or-later @@ -11,25 +9,25 @@ https://open-meteo.com/en/docs/historical-forecast-api """ +import datetime +import io import logging import os -import io import re -import zipfile +import time import warnings import weakref -import time -import xarray as xr -import datetime -import openmeteo_requests -import requests_cache +import zipfile +from tempfile import mkstemp + import cdsapi import numpy as np +import openmeteo_requests import pandas as pd - -from numpy import atleast_1d +import requests_cache +import xarray as xr from dask import 
compute, delayed -from tempfile import mkstemp +from numpy import atleast_1d from retry_requests import retry from ..gis import maybe_swap_spatial_dims @@ -45,7 +43,7 @@ @contextlib.contextmanager def nullcontext(): yield - + logger = logging.getLogger(__name__) @@ -53,12 +51,14 @@ def nullcontext(): # Setup Open-Meteo client with 7-day cache and retry on failure # Cache duration: -1 = never expire, 0 = no caching, timedelta = expire after set time cache_window = datetime.timedelta(days=7) -cache_session = requests_cache.CachedSession('.meteo.cache', backend='sqlite', expire_after=cache_window) +cache_session = requests_cache.CachedSession( + ".meteo.cache", backend="sqlite", expire_after=cache_window +) retry_session = retry(cache_session, retries=5, backoff_factor=3) openmeteo = openmeteo_requests.Client(session=retry_session) # Define data source URLs (easy to switch if needed) -era5_url = 'https://cds.climate.copernicus.eu/api' +era5_url = "https://cds.climate.copernicus.eu/api" meteo_url = "https://historical-forecast-api.open-meteo.com/v1/forecast" # Open-Meteo request limits @@ -69,7 +69,7 @@ def nullcontext(): # Delay of ERA5 data upload # For Open-Meteo slow changing variables from ERA5 are always interpolated # Starting from data of at least 7 days before nowtime -ERA5_DELAY = pd.Timedelta(hours=-6.0*24) +ERA5_DELAY = pd.Timedelta(hours=-6.0 * 24) # Model and CRS Settings crs = 4326 @@ -90,104 +90,144 @@ def nullcontext(): static_features = {"height"} -requirements = {'x': slice(-90, 90, 0.1), - 'y': slice(-90, 90, 0.1), - 'offset': (pd.Timestamp('2022-01-01')-pd.Timestamp.utcnow().replace(tzinfo=None).floor("h")), - 'forecast': ((pd.Timestamp.utcnow().replace(tzinfo=None).ceil("d")-pd.Timestamp.utcnow().replace(tzinfo=None))).floor("h"), - 'dt': pd.Timedelta(hours=1), - 'parallel': True, - } +requirements = { + "x": slice(-90, 90, 0.1), + "y": slice(-90, 90, 0.1), + "offset": ( + pd.Timestamp("2022-01-01") + - 
pd.Timestamp.utcnow().replace(tzinfo=None).floor("h") + ), + "forecast": ( + pd.Timestamp.utcnow().replace(tzinfo=None).ceil("d") + - pd.Timestamp.utcnow().replace(tzinfo=None) + ).floor("h"), + "dt": pd.Timedelta(hours=1), + "parallel": True, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. """ - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = 
time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." 
+ ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be > {time_start}." + ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." 
+ ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." + ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel def _add_height(ds): - """Convert geopotential 'z' to geopotential height following [1]. + """ + Convert geopotential 'z' to geopotential height following [1]. 
References ---------- @@ -247,42 +287,45 @@ def _rename_and_clean_coords(ds, add_lon_lat=True): def _interpolate(ds, ds_ref, static, interp_s, interp_t): - - #Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) - + # Interpolate data to specific latitude and longitude values given as input (due to specific model resolution) + if not static: try: ds = ds.interp( - time=ds_ref.time.values, - method=interp_t, - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method=interp_t, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_t}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_t}." + ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - time=ds_ref.time.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) + time=ds_ref.time.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) try: ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method=interp_s, - kwargs={"fill_value": "extrapolate"}, - ) + x=ds_ref.x.values, + y=ds_ref.y.values, + method=interp_s, + kwargs={"fill_value": "extrapolate"}, + ) except ValueError: - logger.info(f"Interpolation: Not enough supporting points for used interpolation method {interp_s}.") + logger.info( + f"Interpolation: Not enough supporting points for used interpolation method {interp_s}." 
+ ) logger.info("Interpolation method is set to 'nearest' instead.") ds = ds.interp( - x=ds_ref.x.values, - y=ds_ref.y.values, - method='nearest', - kwargs={"fill_value": "extrapolate"}, - ) - + x=ds_ref.x.values, + y=ds_ref.y.values, + method="nearest", + kwargs={"fill_value": "extrapolate"}, + ) + return ds @@ -297,7 +340,7 @@ def get_data_meteo_wind(retrieval_params): ], **retrieval_params, ) - + return ds @@ -315,7 +358,7 @@ def get_data_meteo_influx(retrieval_params): ], **retrieval_params, ) - + return ds @@ -336,7 +379,7 @@ def get_data_meteo_temperature(retrieval_params): def get_data_era5_wind(retrieval_params): """Get wind data for given retrieval parameters.""" - + ds = retrieve_era5_data( url=era5_url, variable=["forecast_surface_roughness"], @@ -349,14 +392,14 @@ def get_data_era5_wind(retrieval_params): def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): - - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - + ds_era5 = _interpolate( + ds=ds_era5, + ds_ref=ds_meteo, + static=False, + interp_s=interp_s, + interp_t=interp_t, + ) + ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -371,7 +414,7 @@ def combine_data_wind(ds_meteo, ds_era5, interp_s, interp_t): ds.wnd_azimuth.attrs.update( units="degree", long_name="Wind direction at 80m above ground" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -389,25 +432,24 @@ def get_data_era5_influx(retrieval_params): ds = retrieve_era5_data( url=era5_url, - variable=["forecast_albedo", - "toa_incident_solar_radiation"], + variable=["forecast_albedo", "toa_incident_solar_radiation"], **retrieval_params, ) - + ds = _rename_and_clean_coords(ds) return ds - - + + def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): - - ds_era5 = _interpolate(ds=ds_era5, - ds_ref=ds_meteo, - static=False, - interp_s=interp_s, - interp_t=interp_t, - ) - + ds_era5 = _interpolate( + ds=ds_era5, + 
ds_ref=ds_meteo, + static=False, + interp_s=interp_s, + interp_t=interp_t, + ) + ds = xr.merge([ds_meteo, ds_era5]) ds = ds.rename( @@ -431,7 +473,7 @@ def combine_data_influx(ds_meteo, ds_era5, interp_s, interp_t): ds.influx_toa.attrs.update( units="W m**-2", long_name="TOA incident solar radiation" ) - + # unify_chunks() is necessary to avoid a bug in xarray ds = ds.unify_chunks() @@ -460,10 +502,9 @@ def sanitize_influx(ds): def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): """Get wind temperature for given retrieval parameters.""" ds = xr.merge([ds_meteo, ds_era5]) - + ds = ds.rename( - {"temperature_2m": "temperature", - "soil_temperature_54cm": "soil temperature"} + {"temperature_2m": "temperature", "soil_temperature_54cm": "soil temperature"} ) # Convert from Celsius to Kelvin C -> K, by adding 273.15 @@ -480,10 +521,7 @@ def combine_data_temperature(ds_meteo, ds_era5, interp_s, interp_t): def get_data_era5_height(retrieval_params): """Get height data for given retrieval parameters.""" - ds = retrieve_era5_data( - url=era5_url, - variable=["geopotential"], - **retrieval_params) + ds = retrieve_era5_data(url=era5_url, variable=["geopotential"], **retrieval_params) ds = _rename_and_clean_coords(ds) ds = _add_height(ds) @@ -537,18 +575,22 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd request.update(updates) # Generate list of (lon, lat) coordinate pairs - grid = np.meshgrid(request['coords']['x'], request['coords']['y']) - coords = pd.DataFrame(zip(grid[0].flatten(), grid[1].flatten()), columns=['longitude', 'latitude']) + grid = np.meshgrid(request["coords"]["x"], request["coords"]["y"]) + coords = pd.DataFrame( + zip(grid[0].flatten(), grid[1].flatten()), columns=["longitude", "latitude"] + ) # Calculate time and variable counts - start_date = request['start'].strftime('%Y-%m-%d') - end_date = request['end'].strftime('%Y-%m-%d') + start_date = request["start"].strftime("%Y-%m-%d") + end_date = 
request["end"].strftime("%Y-%m-%d") nr_days = abs((pd.to_datetime(start_date) - pd.to_datetime(end_date)).days) - nr_variables = len(request['variable']) + nr_variables = len(request["variable"]) nr_locations = len(coords) # Estimate request weight based on Open-Meteo's internal model - weight_of_full_api_request = max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + weight_of_full_api_request = ( + max(nr_variables / 10, (nr_variables / 10) * (nr_days / 14)) * nr_locations + ) # Dynamically determine chunk size based on rate limit thresholds if weight_of_full_api_request <= MINUTE_LIMIT: @@ -561,21 +603,23 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd chunk_size = int(max(1, chunk_size)) # Ensure chunk_size ≥ 1 and integer logger.info(f"Meteo-API: Downloading variables\n\t{request['variable']}\n") - logger.info(f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}") + logger.info( + f"Meteo-API: Expected request weight of full request: {weight_of_full_api_request}" + ) # Loop through coordinate grid in blocks and request data data = [] for i in range(0, len(coords), chunk_size): - coord_chunk = coords.iloc[i:i + chunk_size] + coord_chunk = coords.iloc[i : i + chunk_size] # Prepare API parameters for the current chunk params = { - "longitude": coord_chunk['longitude'].tolist(), - "latitude": coord_chunk['latitude'].tolist(), - "hourly": request['variable'], + "longitude": coord_chunk["longitude"].tolist(), + "latitude": coord_chunk["latitude"].tolist(), + "hourly": request["variable"], "wind_speed_unit": "ms", "start_date": start_date, - "end_date": end_date + "end_date": end_date, } try: @@ -584,8 +628,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd logger.info(f"{e}") try: # Extract and classify rate limiting error - rate_limiting_error = list(e.args)[0]['reason'] - match = re.search(r"(Minutely|Hourly|Daily) API request limit", 
rate_limiting_error) + rate_limiting_error = list(e.args)[0]["reason"] + match = re.search( + r"(Minutely|Hourly|Daily) API request limit", rate_limiting_error + ) if match: apply_rate_limiting(error=match[0]) else: @@ -594,8 +640,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd responses = openmeteo.weather_api(url, params=params) except Exception as e: # Skip this chunk on repeated failure - logger.error(f"Meteo-API: Failed to fetch data for block starting at " - f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}") + logger.error( + f"Meteo-API: Failed to fetch data for block starting at " + f"({coord_chunk.loc[0, 'longitude']}, {coord_chunk.loc[0, 'latitude']}): {e}" + ) continue # Parse chunked response and append results @@ -605,9 +653,10 @@ def retrieve_meteo_data(url, product, chunks=None, tmpdir=None, lock=None, **upd ds = pd.concat(data).to_xarray() ds = _rename_and_clean_coords(ds) ds = ds.chunk(chunks=chunks) - + return ds + def parse_meteo_responses(responses, params): """ Parse raw Open-Meteo API responses into a list of DataFrames. @@ -630,9 +679,12 @@ def parse_meteo_responses(responses, params): # Reconstruct time index based on interval range_start = pd.to_datetime(response.Hourly().Time(), unit="s") range_end = pd.to_datetime(response.Hourly().TimeEnd(), unit="s") - date_range = pd.date_range(start=range_start, end=range_end, - freq=pd.Timedelta(seconds=response.Hourly().Interval()), - inclusive="left") + date_range = pd.date_range( + start=range_start, + end=range_end, + freq=pd.Timedelta(seconds=response.Hourly().Interval()), + inclusive="left", + ) # Initialize empty DataFrame for the current location response_df = pd.DataFrame(columns=params["hourly"]) @@ -649,6 +701,7 @@ def parse_meteo_responses(responses, params): return data + def apply_rate_limiting(error=None): """ Apply appropriate sleep duration based on API rate-limiting error. 
@@ -668,14 +721,16 @@ def apply_rate_limiting(error=None): """ now = datetime.datetime.now() - midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta(days=1, minutes=5) + midnight = datetime.datetime(now.year, now.month, now.day) + datetime.timedelta( + days=1, minutes=5 + ) time_until_midnight = (midnight - now).total_seconds() sleep_times = { None: 120, # Fallback for unknown errors "Minutely API request limit": 60, "Hourly API request limit": 60 * 60, - "Daily API request limit": time_until_midnight + "Daily API request limit": time_until_midnight, } sleep_time = sleep_times[error] @@ -689,21 +744,21 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda If you want to track the state of your request go to https://cds-beta.climate.copernicus.eu/requests?tab=all - """ - request = {"product_type": ["reanalysis"], - "data_format": "netcdf", - "download_format": "zip"} - + """ + request = { + "product_type": ["reanalysis"], + "data_format": "netcdf", + "download_format": "zip", + } + request.update(updates) - assert {"year", "month", "variable"}.issubset( - request - ), "Need to specify at least 'variable', 'year' and 'month'" + assert {"year", "month", "variable"}.issubset(request), ( + "Need to specify at least 'variable', 'year' and 'month'" + ) client = cdsapi.Client( - url = url, - info_callback=logger.debug, - debug=logging.DEBUG >= logging.root.level + url=url, info_callback=logger.debug, debug=logging.DEBUG >= logging.root.level ) result = client.retrieve(product, request) @@ -720,35 +775,43 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda varstr = "\n\t".join([f"{v} ({timestr})" for v in variables]) logger.info(f"CDS: Downloading variables\n\t{varstr}\n") result.download(target_zip) - + # Open the .zip file in memory with zipfile.ZipFile(target_zip, "r") as zf: # Identify .nc files inside the .zip nc_files = [name for name in zf.namelist() if name.endswith(".nc")] - + 
if not nc_files: - raise FileNotFoundError("No .nc files found in the downloaded .zip archive.") - + raise FileNotFoundError( + "No .nc files found in the downloaded .zip archive." + ) + if len(nc_files) == 1: # If there's only one .nc file, read it into memory with zf.open(nc_files[0]) as nc_file: # Pass the in-memory file-like object to Xarray - ds = xr.open_dataset(io.BytesIO(nc_file.read()), chunks=chunks or {}) - + ds = xr.open_dataset( + io.BytesIO(nc_file.read()), chunks=chunks or {} + ) + else: # If multiple .nc files, combine them using Xarray datasets = [] for nc_file in nc_files: with zf.open(nc_file) as file: - dataset = xr.open_dataset(io.BytesIO(file.read()), chunks=chunks or {}) - - if 'expver' in dataset.variables: - dataset = dataset.drop_vars(["expver", "number"], errors="ignore") + dataset = xr.open_dataset( + io.BytesIO(file.read()), chunks=chunks or {} + ) + + if "expver" in dataset.variables: + dataset = dataset.drop_vars( + ["expver", "number"], errors="ignore" + ) datasets.append(dataset) - - ds = xr.merge(datasets) - + + ds = xr.merge(datasets) + if tmpdir is None: logging.debug(f"Adding finalizer for {target_zip}") weakref.finalize(ds._file_obj._manager, noisy_unlink, target_zip) @@ -756,7 +819,9 @@ def retrieve_era5_data(url, product, chunks=None, tmpdir=None, lock=None, **upda return ds -def retrieval_times_era5_forecast(coords, initialization_time, static=False, monthly_requests=False): +def retrieval_times_era5_forecast( + coords, initialization_time, static=False, monthly_requests=False +): """ Get list of retrieval cdsapi arguments for time dimension in coordinates. 
@@ -780,33 +845,36 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon list of dicts witht retrieval arguments """ - + # Convert time coordinates to a pandas Index time = coords["time"].to_index() frequency = time.freq - + # Determine the latest available ERA5 data time based on initialization time and required delay latest_era5_time = pd.Timestamp(initialization_time) + ERA5_DELAY - + # Round up to the next full day and subtract 1 hour to align with ERA5 update frequency - latest_era5_time = latest_era5_time.ceil('D') - pd.Timedelta(hours=1) - + latest_era5_time = latest_era5_time.ceil("D") - pd.Timedelta(hours=1) + # Define the minimum required time horizon for ERA5 downloads (last 24 hours) minimum_era5_time_horizon = pd.date_range( - start=latest_era5_time - pd.Timedelta(days=1), # One day before the latest available time + start=latest_era5_time + - pd.Timedelta(days=1), # One day before the latest available time end=latest_era5_time, # Up to the latest available time - freq=frequency # Maintain original time frequency + freq=frequency, # Maintain original time frequency ) - + # Merge the existing time index with the minimum ERA5 time horizon, avoiding duplicates time = time.union(minimum_era5_time_horizon) - + # Ensure a continuous time index by filling missing values based on the determined frequency - complete_time_range = pd.date_range(start=time.min(), end=time.max(), freq=frequency) - + complete_time_range = pd.date_range( + start=time.min(), end=time.max(), freq=frequency + ) + # Keep only timestamps up to the latest available ERA5 time time = complete_time_range[complete_time_range <= latest_era5_time] - + if static: return { "year": [str(time[0].year)], @@ -824,7 +892,9 @@ def retrieval_times_era5_forecast(coords, initialization_time, static=False, mon query = { "year": str(year), "month": [str(month).zfill(2)], - "day": list(t[t.month == month].day.unique().astype(str).str.zfill(2)), + "day": list( + t[t.month == 
month].day.unique().astype(str).str.zfill(2) + ), "time": ["%02d:00" % h for h in t[t.month == month].hour.unique()], } times.append(query) @@ -880,7 +950,7 @@ def get_data( """ coords = cutout.coords - initialization_time = creation_parameters['init_time'] + initialization_time = creation_parameters["init_time"] sanitize = creation_parameters.get("sanitize", True) @@ -892,7 +962,7 @@ def get_data( "tmpdir": tmpdir, "lock": lock, } - + retrieval_params_era5 = { "product": "reanalysis-era5-single-levels", "area": _area(coords), @@ -902,63 +972,68 @@ def get_data( "lock": lock, } - # Get fast changing variabels from meteo forecast - func_meteo = globals().get(f"get_data_meteo_{feature}") + func_meteo = globals().get(f"get_data_meteo_{feature}") logger.info(f"Requesting data for feature {feature} from meteo...") - - + if func_meteo is not None: - datasets_meteo = func_meteo({**retrieval_params_meteo, - **{"start": coords["time"].to_index()[0], - "end": coords["time"].to_index()[-1], - "coords": coords}}) + datasets_meteo = func_meteo( + { + **retrieval_params_meteo, + **{ + "start": coords["time"].to_index()[0], + "end": coords["time"].to_index()[-1], + "coords": coords, + }, + } + ) else: datasets_meteo = xr.Dataset() - def retrieve_once(time): ds = func_era5({**retrieval_params_era5, **time}) return ds - # Get missing and slow changing variabels from era5 data and interpolation func_era5 = globals().get(f"get_data_era5_{feature}") if func_era5 is not None: logger.info(f"Requesting addtional data for feature {feature} from era5...") - + if feature in static_features: - return retrieve_once(retrieval_times_era5_forecast(coords, initialization_time, static=True)).squeeze() - - time_chunks = retrieval_times_era5_forecast(coords, initialization_time, monthly_requests=monthly_requests) + return retrieve_once( + retrieval_times_era5_forecast(coords, initialization_time, static=True) + ).squeeze() + + time_chunks = retrieval_times_era5_forecast( + coords, 
initialization_time, monthly_requests=monthly_requests + ) if concurrent_requests: delayed_datasets = [delayed(retrieve_once)(chunk) for chunk in time_chunks] datasets_era5 = compute(*delayed_datasets) else: datasets_era5 = map(retrieve_once, time_chunks) - + datasets_era5 = xr.concat(datasets_era5, dim="time") - + else: datasets_era5 = xr.Dataset() - + # Combine datasets and calculate the required variables combine_func = globals().get(f"combine_data_{feature}") - + logger.info(f"Combine meteo and era5 datasets for feature {feature}...") - + if combine_func is not None: - datasets = combine_func(datasets_meteo, - datasets_era5, - cutout.data.interp_s, - cutout.data.interp_t) + datasets = combine_func( + datasets_meteo, datasets_era5, cutout.data.interp_s, cutout.data.interp_t + ) else: datasets = xr.merge([datasets_meteo, datasets_era5]) - + sanitize_func = globals().get(f"sanitize_{feature}") if sanitize and sanitize_func is not None: # Sanitize the data after interpolation to remove residuals datasets = sanitize_func(datasets) - + return datasets.sel(time=coords["time"]) diff --git a/atlite/datasets/sarah.py b/atlite/datasets/sarah.py index c4f1ee12..8ed5fce8 100644 --- a/atlite/datasets/sarah.py +++ b/atlite/datasets/sarah.py @@ -39,99 +39,132 @@ static_features = {} -requirements = {'x': slice(-65.0, 65.0, 0.05), - 'y': slice(-65.0, 65.0, 0.05), - 'offset': pd.Timestamp('1983-01-01'), - 'forecast': pd.Timestamp('2017-12-31 23:00'), - 'dt': pd.Timedelta(hours=0.5), - 'parallel': True, - } +requirements = { + "x": slice(-65.0, 65.0, 0.05), + "y": slice(-65.0, 65.0, 0.05), + "offset": pd.Timestamp("1983-01-01"), + "forecast": pd.Timestamp("2017-12-31 23:00"), + "dt": pd.Timedelta(hours=0.5), + "parallel": True, +} def _checkModuleRequirements(x, y, time, time_now, **kwargs): """ Load and check the data requirements for a given module. - - Parameters: + + Parameters + ---------- x (slice): Defines the start, stop, and step values for the x-dimension. 
y (slice): Defines the start, stop, and step values for the y-dimension. time (slice): Defines the start, stop, and step values for the time dimension. **kwargs: Additional optional parameters. """ - + # Extract start, stop, and step values for x x_start, x_stop, x_step = x.start, x.stop, x.step - + # Adjust x range based on module requirements - if requirements['x'].start > x.start: - x_start = requirements['x'].start - if requirements['x'].stop < x.stop: - x_stop = requirements['x'].stop - if requirements['x'].step > x.step: - x_step = requirements['x'].step - + if requirements["x"].start > x.start: + x_start = requirements["x"].start + if requirements["x"].stop < x.stop: + x_stop = requirements["x"].stop + if requirements["x"].step > x.step: + x_step = requirements["x"].step + x = slice(x_start, x_stop, x_step) - + # Extract start, stop, and step values for y y_start, y_stop, y_step = y.start, y.stop, y.step - + # Adjust y range based on module requirements - if requirements['y'].start > y.start: - y_start = requirements['y'].start - if requirements['y'].stop < y.stop: - y_stop = requirements['y'].stop - if requirements['y'].step > y.step: - y_step = requirements['y'].step - + if requirements["y"].start > y.start: + y_start = requirements["y"].start + if requirements["y"].stop < y.stop: + y_stop = requirements["y"].stop + if requirements["y"].step > y.step: + y_step = requirements["y"].step + y = slice(y_start, y_stop, y_step) - - + # Extract time range parameters time_start = time.start time_stop = time.stop time_step = time.step - + # Check forecast feasibility - feasible_start = time_now + requirements['offset'] - feasible_end = time_now + requirements['forecast'] - + feasible_start = time_now + requirements["offset"] + feasible_end = time_now + requirements["forecast"] + # Ensure time_start is within feasible bounds if time_start < feasible_start: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - 
logger.error(f"The minimum start time of the forecast for {time_now} is {feasible_start}.") - logger.error(f"The maximum historical offset of the forecast is {requirements['offset']}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}.") - + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The minimum start time of the forecast for {time_now} is {feasible_start}." + ) + logger.error( + f"The maximum historical offset of the forecast is {requirements['offset']}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be >= {feasible_start}." + ) + if time_start >= feasible_end: - logger.error(f"The required forecast start time {time_start} exceeds the model requirements.") - logger.error(f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}.") - raise ValueError(f"Invalid forecast start time: {time_start}. Must be < {feasible_end}.") + logger.error( + f"The required forecast start time {time_start} exceeds the model requirements." + ) + logger.error( + f"The maximum start time of the forecast for {time_now} needs to be smaller than {feasible_end}." + ) + raise ValueError( + f"Invalid forecast start time: {time_start}. Must be < {feasible_end}." + ) # Ensure time_stop is greater than time_start if time_stop <= time_start: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be > {time_start}.") - + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The minimum end time of the forecast for {time_now} needs to be larger than {time_start}." + ) + raise ValueError( + f"Invalid forecast end time: {time_stop}. 
Must be > {time_start}." + ) + # Ensure time_stop is greater than time_start if time_stop > feasible_end: - logger.error(f"The required forecast end time {time_stop} exceeds the model requirements.") - logger.error(f"The maximum end time of the forecast for {time_now} is {feasible_end}.") + logger.error( + f"The required forecast end time {time_stop} exceeds the model requirements." + ) + logger.error( + f"The maximum end time of the forecast for {time_now} is {feasible_end}." + ) logger.error(f"The maximum forecast horizon is {requirements['forecast']}.") - raise ValueError(f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}.") - + raise ValueError( + f"Invalid forecast end time: {time_stop}. Must be <= {feasible_end}." + ) + # Ensure time step is within required limits - if (time_step is pd.Timedelta(None)) or (time.step < requirements['dt']): - logger.warning(f"The required temporal forecast resolution {time_step} exceeds the model requirements.") - logger.warning(f"The minimum temporal resolution of the forecast is {requirements['dt']}.") - logger.info(f"Set the temporal forecast resolution to the minimum: {requirements['dt']}.") - time_step = requirements['dt'] - + if (time_step is pd.Timedelta(None)) or (time.step < requirements["dt"]): + logger.warning( + f"The required temporal forecast resolution {time_step} exceeds the model requirements." + ) + logger.warning( + f"The minimum temporal resolution of the forecast is {requirements['dt']}." + ) + logger.info( + f"Set the temporal forecast resolution to the minimum: {requirements['dt']}." 
+ ) + time_step = requirements["dt"] + time = slice(time_start, time_stop, time_step) - + # Retrieve parallel processing setting from requirements - parallel = requirements['parallel'] - + parallel = requirements["parallel"] + return x, y, time, parallel From a14a3f505f9400ed6d689553185cd48355b4ad83 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:28:26 +0200 Subject: [PATCH 03/11] Create environment.yaml add cdo to the environment --- environment.yaml | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 environment.yaml diff --git a/environment.yaml b/environment.yaml new file mode 100644 index 00000000..ad8046d4 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +# SPDX-FileCopyrightText: 2016 - 2023 The Atlite Authors +# +# SPDX-License-Identifier: MIT + +name: atlite-private +channels: +- conda-forge +- defaults +dependencies: +- python>=3.6 +- numpy<2 +- scipy +- pandas>=0.25 +- xarray>=0.16.2 +- netcdf4 +- dask>=2021.10.0 +- toolz +- yaml +- requests +- geopandas +- cdsapi>=0.7,<0.7.3 +- pyproj>=2.0 +- rasterio!=1.2.10 +- shapely +- progressbar2 +- tqdm +- cdo + + # dev tools +- black +- pre-commit + + # Recommended for pandas and xarray +- bottleneck +- numexpr + + # For testing +- pip +- pytest From 8e93368ebde944f931a7dada20238c3471f48a86 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:33:35 +0200 Subject: [PATCH 04/11] Update cutout.py --- atlite/cutout.py | 1 + 1 file changed, 1 insertion(+) diff --git a/atlite/cutout.py b/atlite/cutout.py index 77d5c9c4..0c7e54bc 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -35,6 +35,7 @@ csp, dewpoint_temperature, heat_demand, + cooling_demand, hydro, irradiation, line_rating, From 1b783176724aec8215b98c331425659744ea1632 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 13:34:06 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- atlite/cutout.py | 1 - 1 file changed, 1 deletion(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 0c7e54bc..77d5c9c4 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -35,7 +35,6 @@ csp, dewpoint_temperature, heat_demand, - cooling_demand, hydro, irradiation, line_rating, From a4872cca19f1f7b01534feb11c700002d7977ed2 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:40:08 +0200 Subject: [PATCH 06/11] Update convert.py --- atlite/convert.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/atlite/convert.py b/atlite/convert.py index cad6f62f..96a2974e 100644 --- a/atlite/convert.py +++ b/atlite/convert.py @@ -817,7 +817,7 @@ def pv(cutout, panel, orientation, tracking=None, clearsky_model=None, **params) Eurosun (ISES Europe Solar Congress). 
""" - if isinstance(panel, (str, Path)): + if isinstance(panel, (str | Path)): panel = get_solarpanelconfig(panel) if not callable(orientation): orientation = get_orientation(orientation) @@ -906,7 +906,7 @@ def csp(cutout, installation, technology=None, **params): URL: https://www.dlr.de/sf/en/desktopdefault.aspx/tabid-11126/19467_read-48251/ """ - if isinstance(installation, (str, Path)): + if isinstance(installation, (str | Path)): installation = get_cspinstallationconfig(installation) # Overwrite technology @@ -1038,9 +1038,7 @@ def hydro( # The hydrological parameters are in units of "m of water per day" and so # they should be multiplied by 1000 and the basin area to convert to m3 # d-1 = m3 h-1 / 24 - runoff *= (1000.0 / 24.0) * xr.DataArray( - basins.shapes.to_crs(dict(proj="cea")).area - ) + runoff *= xr.DataArray(basins.shapes.to_crs(dict(proj="cea")).area) return hydrom.shift_and_aggregate_runoff_for_plants( basins, runoff, flowspeed, show_progress From d4349a8f0a043de9a00999e933b6156937f1a173 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:43:36 +0200 Subject: [PATCH 07/11] Update cutout.py --- atlite/cutout.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 77d5c9c4..82806efa 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -32,6 +32,7 @@ from atlite.convert import ( coefficient_of_performance, convert_and_aggregate, + cooling_demand, csp, dewpoint_temperature, heat_demand, @@ -68,7 +69,7 @@ class Cutout: def __init__(self, path, **cutoutparams): """ - Provide an Atlite cutout object. + Provide an atlite cutout object. Create a cutout object to use atlite operations on it. Based on the provided parameters, atlite first checks whether this cutout already @@ -139,7 +140,7 @@ def __init__(self, path, **cutoutparams): sarah data which has missing data for areas where dawn and nightfall happens (ca. 
30 min gap). gebco_path: str - Path to find the gebco netcdf file. Only necessary when including + Path to find the gebco NetCDF file. Only necessary when including the gebco module. parallel : bool, default False Whether to open dataset in parallel mode. Take effect for all @@ -542,7 +543,7 @@ def merge(self, other, path=None, **kwargs): def to_file(self, fn=None): """ - Save cutout to a netcdf file. + Save cutout to a NetCDF file. Parameters ---------- @@ -579,6 +580,15 @@ def __repr__(self): ) ) + def equals(self, other): + """ + It overrides xarray.Dataset.equals and ignores the path attribute in the comparison + """ + if not isinstance(other, Cutout): + return NotImplemented + # Compare cutouts data attributes + return self.data.equals(other.data) + def indicatormatrix(self, shapes, shapes_crs=4326): """ Compute the indicatormatrix. @@ -736,6 +746,8 @@ def layout_from_capacity_list(self, data, col="Capacity"): # Conversion and aggregation functions convert_and_aggregate = convert_and_aggregate + + cooling_demand = cooling_demand heat_demand = heat_demand From 6ad71e09df3e975ba52c3f9ee1b4265f197e4d25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 13:44:39 +0000 Subject: [PATCH 08/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- atlite/cutout.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 82806efa..21e3beb6 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -581,13 +581,13 @@ def __repr__(self): ) def equals(self, other): - """ - It overrides xarray.Dataset.equals and ignores the path attribute in the comparison - """ - if not isinstance(other, Cutout): - return NotImplemented - # Compare cutouts data attributes - return self.data.equals(other.data) + """ + It overrides xarray.Dataset.equals and ignores the path attribute in the 
comparison + """ + if not isinstance(other, Cutout): + return NotImplemented + # Compare cutouts data attributes + return self.data.equals(other.data) def indicatormatrix(self, shapes, shapes_crs=4326): """ @@ -746,7 +746,7 @@ def layout_from_capacity_list(self, data, col="Capacity"): # Conversion and aggregation functions convert_and_aggregate = convert_and_aggregate - + cooling_demand = cooling_demand heat_demand = heat_demand From c1e805eed8ac0a43a78c7a94fcf0101c4eb09bba Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:47:05 +0200 Subject: [PATCH 09/11] update cutout and environment --- atlite/cutout.py | 17 +++++++++-------- environment.yaml | 4 ++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 82806efa..90174987 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -580,14 +580,6 @@ def __repr__(self): ) ) - def equals(self, other): - """ - It overrides xarray.Dataset.equals and ignores the path attribute in the comparison - """ - if not isinstance(other, Cutout): - return NotImplemented - # Compare cutouts data attributes - return self.data.equals(other.data) def indicatormatrix(self, shapes, shapes_crs=4326): """ @@ -688,6 +680,15 @@ def uniform_density_layout(self, capacity_density, crs=None): """ return capacity_density * self.area(crs) + def equals(self, other): + """ + It overrides xarray.Dataset.equals and ignores the path attribute in the comparison + """ + if not isinstance(other, Cutout): + return NotImplemented + # Compare cutouts data attributes + return self.data.equals(other.data) + def layout_from_capacity_list(self, data, col="Capacity"): """ Get a capacity layout aligned to the cutout based on a capacity list. 
diff --git a/environment.yaml b/environment.yaml index ad8046d4..b94ad9f9 100644 --- a/environment.yaml +++ b/environment.yaml @@ -27,6 +27,10 @@ dependencies: - progressbar2 - tqdm - cdo +- openmeteo_requests +- requests_cache +- zipfile +- re # dev tools - black From 2b0370490641b8932f79ba45162144cfcba6fb11 Mon Sep 17 00:00:00 2001 From: TimFuermann <83589894+TimFuermann@users.noreply.github.com> Date: Wed, 9 Apr 2025 15:49:26 +0200 Subject: [PATCH 10/11] Update cutout.py --- atlite/cutout.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 21db8b1d..80282f87 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -580,15 +580,6 @@ def __repr__(self): ) ) - def equals(self, other): - """ - It overrides xarray.Dataset.equals and ignores the path attribute in the comparison - """ - if not isinstance(other, Cutout): - return NotImplemented - # Compare cutouts data attributes - return self.data.equals(other.data) - def indicatormatrix(self, shapes, shapes_crs=4326): """ Compute the indicatormatrix. 
From 0714029ece2ebc1173b54d3eebc62381a9e06fc2 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 13:51:12 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- atlite/cutout.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/atlite/cutout.py b/atlite/cutout.py index 80282f87..4cb28a1f 100644 --- a/atlite/cutout.py +++ b/atlite/cutout.py @@ -680,13 +680,13 @@ def uniform_density_layout(self, capacity_density, crs=None): return capacity_density * self.area(crs) def equals(self, other): - """ - It overrides xarray.Dataset.equals and ignores the path attribute in the comparison - """ - if not isinstance(other, Cutout): - return NotImplemented - # Compare cutouts data attributes - return self.data.equals(other.data) + """ + It overrides xarray.Dataset.equals and ignores the path attribute in the comparison + """ + if not isinstance(other, Cutout): + return NotImplemented + # Compare cutouts data attributes + return self.data.equals(other.data) def layout_from_capacity_list(self, data, col="Capacity"): """