From d665a78bd30b0d9173a21349c1a084cdf752229d Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 20:18:07 +0200 Subject: [PATCH 01/21] Ignore AI agents. --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 49297063..e925a3cf 100644 --- a/.gitignore +++ b/.gitignore @@ -162,4 +162,8 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ \ No newline at end of file +#.idea/ + +# AI agents +CLAUDE.md +.claude \ No newline at end of file From 9d43fab9afe6cf28ea73b90ff556a68299c861e5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 20:18:28 +0200 Subject: [PATCH 02/21] Add pytest coverage configuration. --- pyproject.toml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index fa890d3f..8c848488 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,5 +60,17 @@ tests = [ profile = "black" [tool.pytest.ini_options] -addopts = ["--doctest-modules", "--import-mode=importlib"] +addopts = ["--doctest-modules", "--import-mode=importlib", "--cov=xdas", "--cov-report=term-missing"] doctest_optionflags = "NORMALIZE_WHITESPACE" +markers = ["slow: marks tests as slow (run with --slow)"] + +[tool.coverage.run] +source = ["xdas"] +branch = true + +[tool.coverage.report] +show_missing = true +skip_covered = false +exclude_also = [ + "raise NotImplementedError", +] From 81679be7a1eb6c7ba2fab306fde3d2ce12da4996 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 20:22:32 +0200 Subject: [PATCH 03/21] Rework docs: user_guide/index - sampled_coordinates - processing --- docs/getting-started.md | 4 +- .../coordinates/sampled-coordinates.md | 130 +++++++++++++++++- docs/user-guide/data-structures/dataarray.md | 6 +- docs/user-guide/faq.md | 90 +++++++++++- .../{ => how-to}/convert-displacement.md | 4 +- docs/user-guide/how-to/index.md | 9 ++ docs/user-guide/index.md | 62 +++++++-- docs/user-guide/{ => io}/data-formats.md | 4 +- docs/user-guide/io/index.md | 11 ++ docs/user-guide/{ => io}/miniseed.md | 2 +- docs/user-guide/{ => io}/virtual-datasets.md | 4 +- docs/user-guide/{ => pipeline}/atoms.md | 3 +- docs/user-guide/pipeline/index.md | 13 ++ docs/user-guide/{ => pipeline}/processing.md | 64 ++++++++- docs/user-guide/{ => pipeline}/streaming.md | 8 ++ 15 files changed, 382 insertions(+), 32 deletions(-) rename docs/user-guide/{ => how-to}/convert-displacement.md (92%) create mode 100644 docs/user-guide/how-to/index.md rename docs/user-guide/{ => io}/data-formats.md (99%) create mode 100644 docs/user-guide/io/index.md rename docs/user-guide/{ => io}/miniseed.md (99%) rename docs/user-guide/{ => io}/virtual-datasets.md (97%) rename docs/user-guide/{ => pipeline}/atoms.md (99%) create mode 100644 docs/user-guide/pipeline/index.md rename docs/user-guide/{ => pipeline}/processing.md (54%) rename docs/user-guide/{ => pipeline}/streaming.md (97%) diff --git a/docs/getting-started.md b/docs/getting-started.md index 6dc592ac..9960e7f1 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -48,7 +48,7 @@ Most instruments usually produces datasets made out of a multitude of files, eac ### Linking multiple files -If you are considering a unique acquisition you can use {py:func}`~xdas.open`. You can either pass a list of paths or a path pattern containing wildcards to specify which files must be linked together. Xdas should automatically detect the file format. Xdas support a variety of DAS formats and it is easy to add support to any custom or missing format. See the [](user-guide/data-formats) section for more information. +If you are considering a unique acquisition you can use {py:func}`~xdas.open`. You can either pass a list of paths or a path pattern containing wildcards to specify which files must be linked together. Xdas should automatically detect the file format. Xdas support a variety of DAS formats and it is easy to add support to any custom or missing format. See the [](user-guide/io/data-formats) section for more information. In the example here, we have three files of interest in the current working directory: @@ -307,4 +307,4 @@ assert chunked.equals(monolithic) # again equal but could be applied to much bi !rm -r output ``` -This part was a short summary about atoms and chunk processing. To go deeper on the atom part you can head to the [](user-guide/atoms) section. To further study chunk processing you can head to the [](user-guide/processing) section. \ No newline at end of file +This part was a short summary about atoms and chunk processing. To go deeper on the atom part you can head to the [](user-guide/pipeline/atoms) section. To further study chunk processing you can head to the [](user-guide/pipeline/processing) section. \ No newline at end of file diff --git a/docs/user-guide/coordinates/sampled-coordinates.md b/docs/user-guide/coordinates/sampled-coordinates.md index e02500ff..77be0fc3 100644 --- a/docs/user-guide/coordinates/sampled-coordinates.md +++ b/docs/user-guide/coordinates/sampled-coordinates.md @@ -6,4 +6,132 @@ kernelspec: # Sampled Coordinates -TODO \ No newline at end of file +## Overview + +A {py:class}`~xdas.coordinates.SampledCoordinate` describes a coordinate whose values are +spaced by a **fixed sampling interval**. Unlike {py:class}`~xdas.coordinates.InterpCoordinate`, +which uses piecewise linear interpolation between arbitrary tie points, +`SampledCoordinate` exploits the regularity of the grid and stores only: + +- `tie_values` — the start value of each contiguous segment. +- `tie_lengths` — the number of samples in each segment. +- `sampling_interval` — the fixed step shared by all segments. + +This makes it more compact and numerically stable than the interpolated variant, and it +maps directly to the block-based time representation used in the miniSEED and SEED formats. + +```{code-cell} +import numpy as np +import xdas as xd +from xdas.coordinates import SampledCoordinate + +coord = SampledCoordinate( + { + "tie_values": [0.0, 200.0], + "tie_lengths": [10, 8], + "sampling_interval": 10.0, + } +) +coord +``` + +The two segments start at 0 and 200, each stepping by 10. The gap between them (from 100 +to 200) is explicit — there is simply no segment covering that range. + +## Materialising values + +Calling `.values` returns the full coordinate vector as a dense NumPy array: + +```{code-cell} +coord.values +``` + +Individual values are obtained from indices with `.get_value` and the reverse mapping +(index from value) with `.to_index`: + +```{code-cell} +coord.get_value(3) +``` + +```{code-cell} +coord.to_index(30.0) +``` + +## Datetime coordinates + +The most common use of `SampledCoordinate` is for the time axis of DAS data. The +`sampling_interval` must be a `numpy.timedelta64`: + +```{code-cell} +t0 = np.datetime64("2024-01-01T00:00:00.000", "ms") +dt = np.timedelta64(4, "ms") # 250 Hz + +coord = SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(1, "s")], + "tie_lengths": [250, 250], + "sampling_interval": dt, + } +) +coord +``` + +## Gaps and multi-segment coordinates + +Multiple segments represent an acquisition with gaps. Each element of `tie_values` marks +the start of one contiguous block, and `tie_lengths` gives its duration in samples: + +```{code-cell} +t0 = np.datetime64("2024-01-01T00:00:00.000", "ms") +dt = np.timedelta64(4, "ms") + +segments = [ + (t0, 500), # 2 s block + (t0 + np.timedelta64(3, "s"), 500), # another 2 s block after a 1 s gap +] +coord = SampledCoordinate( + { + "tie_values": [s[0] for s in segments], + "tie_lengths": [s[1] for s in segments], + "sampling_interval": dt, + } +) +coord +``` + +## Simplifying near-regular coordinates + +When opening files whose timestamps are not perfectly aligned (e.g. NTP-synchronized +acquisitions), small drifts create many short segments. The `simplify` method merges +segments whose start time is within `tolerance` of the expected position: + +```{code-cell} +t0 = np.datetime64("2024-01-01T00:00:00.000", "ms") +dt = np.timedelta64(4, "ms") + +# Simulate a one-sample drift at the boundary +t1_drifted = t0 + np.timedelta64(2000, "ms") + np.timedelta64(1, "ms") + +coord = SampledCoordinate( + { + "tie_values": [t0, t1_drifted], + "tie_lengths": [500, 500], + "sampling_interval": dt, + } +) +print("Before:", len(coord.tie_values), "segments") + +tol = np.timedelta64(10, "ms") +coord = coord.simplify(tol) +print("After: ", len(coord.tie_values), "segments") +coord +``` + +## When to use SampledCoordinate vs InterpCoordinate + +| | `SampledCoordinate` | `InterpCoordinate` | +|---|---|---| +| Sampling | Strictly uniform (one `sampling_interval`) | Variable (piecewise linear) | +| Memory | Very compact | Compact | +| Use case | DAS / seismic time axes, uniform grids | Non-uniform grids, GPS-corrected time | +| miniSEED/SEED compatible | Yes | No | diff --git a/docs/user-guide/data-structures/dataarray.md b/docs/user-guide/data-structures/dataarray.md index efada2d3..3abab81b 100644 --- a/docs/user-guide/data-structures/dataarray.md +++ b/docs/user-guide/data-structures/dataarray.md @@ -15,7 +15,7 @@ os.chdir("../../_data") {py:class}`~xdas.DataArray` is the base class to load and manipulate big datasets to in *xdas*. It is mainly composed of two attributes: -- `data`: any N-dimensional array-like object. Compared to *xarray* `xdas.DataArray` are more permissive to the kinds of array-like objects that can be used. In particular, [virtual arrays](../virtual-datasets) can be used. +- `data`: any N-dimensional array-like object. Compared to *xarray* `xdas.DataArray` are more permissive to the kinds of array-like objects that can be used. In particular, [virtual arrays](../io/virtual-datasets) can be used. - `coords`: a dict-like container of coordinates. As opposed to *xarray*, which uses dense arrays to label each point, *xdas* also implements [interpolated coordinates](../coordinates/interpolated-coordinates) that provides an efficient representation of evenly spaced data (gracefully handling gaps and small sampling variations). ![](/_static/dataarray.svg) @@ -57,7 +57,7 @@ da ## Writing a DataArray to disk -*xdas* uses the CF conventions to write {py:class}`xdas.DataArray` to disk as netCDF4 files. If the DataArray was generated from a netCDF4/HDF5 file and only slicing was performed, the DataArray can be written as a pointer to the original data using the `virtual` argument. See the part on [](../virtual-datasets). +*xdas* uses the CF conventions to write {py:class}`xdas.DataArray` to disk as netCDF4 files. If the DataArray was generated from a netCDF4/HDF5 file and only slicing was performed, the DataArray can be written as a pointer to the original data using the `virtual` argument. See the part on [](../io/virtual-datasets). ```{code-cell} da.to_netcdf("dataarray.nc", virtual=None) # try to write virtual, here it's impossible @@ -67,7 +67,7 @@ da.to_netcdf("dataarray.nc", virtual=None) # try to write virtual, here it's im Xdas can read several DAS file format with {py:func}`~xdas.open` along with its own format. Xdas uses the netCDF4 format with CF conventions. By default Xdas assumes that files are Xdas NetCDF format. If not the case the `engine` argument must be passed. -To learn how to read your custom DAS data format with *xdas*, please see the chapter on [](../data-formats.md). +To learn how to read your custom DAS data format with *xdas*, please see the chapter on [](../io/data-formats.md). ```{code-cell} da = xd.open("dataarray.nc", engine=None) # by default Xdas NetCDF diff --git a/docs/user-guide/faq.md b/docs/user-guide/faq.md index b8173c4f..333ef5a3 100644 --- a/docs/user-guide/faq.md +++ b/docs/user-guide/faq.md @@ -2,9 +2,95 @@ ## Why not using Xarray and Dask? -Originally, Xdas was meant to be a simple add-on to Xarray, taking advantage of its [Dask integration] (https://docs.xarray.dev/en/stable/user-guide/dask.html). But two main limitations forced us to create a parallel project: +Originally, Xdas was meant to be a simple add-on to Xarray, taking advantage of its [Dask integration](https://docs.xarray.dev/en/stable/user-guide/dask.html). But two main limitations forced us to create a parallel project: - Coordinates have to be loaded into memory as NumPy arrays. This is prohibitive for very long time series, where storing the time coordinate as a dense array with a value for each time sample leads to metadata that in some extreme cases cannot fit in memory. - Dask arrays become sluggish when dealing with a very large number of files. Dask is a pure Python package, and processing graphs of millions of tasks can take several seconds or more. Also, Dask does not provide a way to serialise a graph for later reuse. -Because of this, and the fact that the Xarray object was not designed to be subclassed, we decided to go our own way. Hopefully, if the progress of Xarray allows it, we could imagine merging the two projects. Xdas tries to follow the Xarray API as much as possible. +Because of this, and the fact that the Xarray object was not designed to be subclassed, we decided to go our own way. Hopefully, if the progress of Xarray allows it, we could imagine merging the two projects. Xdas tries to follow the Xarray API as much as possible. + +## Which coordinate type should I use for my time axis? + +Use {py:class}`~xdas.coordinates.SampledCoordinate` when your acquisition has a constant +sampling rate (even if there are gaps between files). It is the most compact representation +and maps directly to the block time model used in miniSEED / SEED. + +Use {py:class}`~xdas.coordinates.InterpCoordinate` when the sampling rate itself varies +within a single acquisition, or when the data has been GPS-corrected and the timestamps +are not strictly uniform. + +See the [](coordinates/sampled-coordinates.md) and +[](coordinates/interpolated-coordinates.md) pages for details. + +## My virtual dataset returns NaN values. What is going on? + +NaN values in a virtual dataset almost always mean one of two things: + +1. **Files have moved or been deleted.** The virtual dataset only stores pointers. If the + pointed-to files are no longer at the recorded path, HDF5 silently returns NaN. +2. **Too many files are open simultaneously.** The HDF5 C library has a [known + limit](https://forum.hdfgroup.org/t/virtual-datasets-and-open-file-limit/6757) on the + number of concurrently open files. Raise the system limit with `ulimit -n ` + or load smaller slices of data. + +## How do I fix gaps and overlaps between files? + +Small timing errors (e.g. NTP drift) often create sub-sample overlaps between consecutive +files. Use the `simplify` method on the time coordinate to merge nearly-contiguous segments +within a given tolerance: + +```python +import numpy as np +tolerance = np.timedelta64(30, "ms") # typically enough for NTP-synced experiments +da["time"] = da["time"].simplify(tolerance) +``` + +Larger overlaps or gaps require manual inspection. See [](coordinates/interpolated-coordinates.md) +for the `get_discontinuities` method. + +## What is the difference between `xd.open`, `xd.open_dataarray`, and `xd.open_mfdataarray`? + +- {py:func}`xdas.open` — the recommended entry point. It auto-detects the file format and + dispatches to the appropriate lower-level function based on the path pattern (single + file, glob, or field template). +- {py:func}`xdas.open_dataarray` — opens a single file (or a previously saved virtual + dataset file) and returns a {py:class}`~xdas.DataArray`. +- {py:func}`xdas.open_mfdataarray` — opens multiple files matching a pattern and + concatenates them along the time axis into a single {py:class}`~xdas.DataArray`. + +In practice you almost never need to call `open_dataarray` or `open_mfdataarray` directly. + +## My filter produces different results when applied chunk by chunk. Why? + +Recursive (IIR) filters are stateful: each output sample depends on previous input and +output samples. When you split data into chunks and apply the filter independently to +each chunk, the state is re-initialised at every boundary and the transient response +distorts the result near each chunk edge. + +Use the stateful atom equivalents from {py:mod}`xdas.atoms` (e.g. +{py:class}`~xdas.atoms.IIRFilter`, {py:class}`~xdas.atoms.LFilter`) inside a +{py:class}`~xdas.atoms.Sequential` pipeline. These atoms carry the filter state across +chunk boundaries automatically when used with {py:func}`~xdas.processing.process`. + +## Can I use xdas with seismic data that is not DAS? + +Yes. The data model is generic: a {py:class}`~xdas.DataArray` can represent any +labeled N-dimensional array. The [](io/miniseed.md) page shows a complete example with +a large-N seismic array stored as miniSEED files. All signal processing routines in +{py:mod}`xdas.signal` and {py:mod}`xdas.fft` work on any DataArray regardless of the +physical quantity it represents. + +## How do I convert a xdas DataArray to/from xarray? + +```python +# xdas → xarray +xr_da = da.to_xarray() + +# xarray → xdas +xd_da = xd.DataArray.from_xarray(xr_da) +``` + +Note that the coordinate representation is simplified during the round-trip: *xarray* +always uses dense coordinate arrays, so a `SampledCoordinate` or `InterpCoordinate` will +be converted to a {py:class}`~xdas.coordinates.DenseCoordinate` when going through +*xarray*. diff --git a/docs/user-guide/convert-displacement.md b/docs/user-guide/how-to/convert-displacement.md similarity index 92% rename from docs/user-guide/convert-displacement.md rename to docs/user-guide/how-to/convert-displacement.md index d142efdc..1839fd3a 100644 --- a/docs/user-guide/convert-displacement.md +++ b/docs/user-guide/how-to/convert-displacement.md @@ -9,7 +9,7 @@ kernelspec: import os import xdas as xd -os.chdir("../_data") +os.chdir("../../_data") ``` # Convert to displacement @@ -36,4 +36,4 @@ displacement = xs.sliding_mean_removal(deformation, wlen=2000.0, dim="distance") displacement.plot(yincrease=False, vmin=-0.5, vmax=0.5); ``` -[REF]: \ No newline at end of file +[REF]: diff --git a/docs/user-guide/how-to/index.md b/docs/user-guide/how-to/index.md new file mode 100644 index 00000000..2f8b275d --- /dev/null +++ b/docs/user-guide/how-to/index.md @@ -0,0 +1,9 @@ +# How-To Guides + +Targeted guides for common domain-specific tasks. + +```{toctree} +:maxdepth: 1 + +convert-displacement +``` diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index b8323343..db3240d5 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -1,16 +1,60 @@ # User Guide +````{grid} 1 2 2 2 +:gutter: 4 +:padding: 2 2 0 0 + +```{grid-item-card} Data Structures +:link: data-structures/index +:link-type: doc +The two core objects: {py:class}`~xdas.DataArray` for a single acquisition and +{py:class}`~xdas.DataCollection` for grouping multiple acquisitions together. +``` + +```{grid-item-card} Coordinates +:link: coordinates/index +:link-type: doc +How array axes are mapped to physical values — interpolated, sampled, dense, and scalar +coordinate types, including handling of gaps and overlaps. +``` + +```{grid-item-card} I/O +:link: io/index +:link-type: doc +Supported DAS file formats, how to add a custom engine, and how virtual datasets let you +access large multi-file acquisitions as a single seamless array. +``` + +```{grid-item-card} Pipeline Processing +:link: pipeline/index +:link-type: doc +Building processing sequences with {py:class}`~xdas.atoms.Atom` objects and applying +them chunk-by-chunk on datasets larger than memory, including real-time streaming. +``` + +```{grid-item-card} How-To Guides +:link: how-to/index +:link-type: doc +Short, task-focused recipes for common domain-specific workflows such as converting +strain-rate to displacement. +``` + +```{grid-item-card} FAQ +:link: faq +:link-type: doc +Answers to frequently asked questions, including why *xdas* exists alongside *xarray* +and *dask*. +``` +```` + ```{toctree} -:maxdepth: 1 +:maxdepth: 2 +:hidden: data-structures/index coordinates/index -data-formats -virtual-datasets -miniseed -convert-displacement -atoms -processing -streaming +io/index +pipeline/index +how-to/index faq -``` \ No newline at end of file +``` diff --git a/docs/user-guide/data-formats.md b/docs/user-guide/io/data-formats.md similarity index 99% rename from docs/user-guide/data-formats.md rename to docs/user-guide/io/data-formats.md index 4f2f99b1..e32b327c 100644 --- a/docs/user-guide/data-formats.md +++ b/docs/user-guide/io/data-formats.md @@ -8,7 +8,7 @@ kernelspec: :tags: [remove-cell] import os -os.chdir("../_data") +os.chdir("../../_data") ``` # Data Formats @@ -116,4 +116,4 @@ Once the class is created and instanciated you can then use it : # Replace "other_format.hdf5" by the path of your file da = xd.open("other_format.hdf5", engine="my_engine", ctype="sampled") da -``` \ No newline at end of file +``` diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md new file mode 100644 index 00000000..f1f05fbf --- /dev/null +++ b/docs/user-guide/io/index.md @@ -0,0 +1,11 @@ +# I/O + +This section covers reading and writing data with *xdas*. + +```{toctree} +:maxdepth: 1 + +data-formats +virtual-datasets +miniseed +``` diff --git a/docs/user-guide/miniseed.md b/docs/user-guide/io/miniseed.md similarity index 99% rename from docs/user-guide/miniseed.md rename to docs/user-guide/io/miniseed.md index b675d619..e599b10a 100644 --- a/docs/user-guide/miniseed.md +++ b/docs/user-guide/io/miniseed.md @@ -8,7 +8,7 @@ kernelspec: :tags: [remove-cell] import os -os.chdir("../_data") +os.chdir("../../_data") import warnings warnings.filterwarnings("ignore") diff --git a/docs/user-guide/virtual-datasets.md b/docs/user-guide/io/virtual-datasets.md similarity index 97% rename from docs/user-guide/virtual-datasets.md rename to docs/user-guide/io/virtual-datasets.md index a99147ab..6830a05c 100644 --- a/docs/user-guide/virtual-datasets.md +++ b/docs/user-guide/io/virtual-datasets.md @@ -9,7 +9,7 @@ kernelspec: import os import xdas as xd -os.chdir("../_data") +os.chdir("../../_data") ``` # Virtual Datasets @@ -49,7 +49,7 @@ The generic {py:func}`xdas.open` funtion should cover all your needs. But you ca | {py:func}`xdas.open_mfdatatree` | {py:class}`~xdas.DataCollection` | Open a directory tree of files, organizing data in a data collection. | | {py:func}`xdas.open_datacollection` | {py:class}`~xdas.DataCollection` | Open a (virtual) collection. | -Please refer to the [](data-structures/datacollection.md) section for the functions that return a data collection. +Please refer to the [](../data-structures/datacollection.md) section for the functions that return a data collection. ## Linking multi-file datasets diff --git a/docs/user-guide/atoms.md b/docs/user-guide/pipeline/atoms.md similarity index 99% rename from docs/user-guide/atoms.md rename to docs/user-guide/pipeline/atoms.md index efcb4117..ad0e82f1 100644 --- a/docs/user-guide/atoms.md +++ b/docs/user-guide/pipeline/atoms.md @@ -9,7 +9,7 @@ kernelspec: import os import xdas as xd -os.chdir("../_data") +os.chdir("../../_data") ``` # Composing a processing sequence @@ -84,4 +84,3 @@ class MyStatefulRoutine(Atom): # Apply routine to DataArray ``da`` ... ``` - diff --git a/docs/user-guide/pipeline/index.md b/docs/user-guide/pipeline/index.md new file mode 100644 index 00000000..14f99308 --- /dev/null +++ b/docs/user-guide/pipeline/index.md @@ -0,0 +1,13 @@ +# Pipeline Processing + +This section covers how to build and execute processing pipelines in *xdas*, from +composing atomic operations to applying them on larger-than-memory datasets and +streaming them over a network. + +```{toctree} +:maxdepth: 1 + +atoms +processing +streaming +``` diff --git a/docs/user-guide/processing.md b/docs/user-guide/pipeline/processing.md similarity index 54% rename from docs/user-guide/processing.md rename to docs/user-guide/pipeline/processing.md index 51347ad7..2de8e131 100644 --- a/docs/user-guide/processing.md +++ b/docs/user-guide/pipeline/processing.md @@ -9,15 +9,11 @@ kernelspec: import os import xdas as xd -os.chdir("../_data") +os.chdir("../../_data") ``` # Processing larger-than-memory data -```{warning} -The API of this part of xdas is still experimental. -``` - ## Chunked processing: basic concepts Given the sheer size of DAS data, it is often impossible to process an entire data set directly in memory. Hence, chunked-based processing is a necessity that requires an additional layer of computational logistics. A naive approach to chunked processing would be to load a chunk of data, apply a `Sequential` pipeline to it (see [*Composing a processing sequence*](atoms.md)), and write the resulting data to disk. Assuming that disk I/O is the limiting factor, this scenario would leave the CPU mostly idle as it has to wait for new data to be read and processed data to be written to disk. @@ -32,4 +28,60 @@ A second feature of xdas, is that it automatically handles state updates and tra ## Example -**TODO** \ No newline at end of file +The following example shows how to apply a simple processing pipeline to a large dataset. +First, build and validate the pipeline on a small in-memory subset: + +```{code-cell} +:tags: [remove-output] + +import numpy as np +import xdas as xd +import xdas.signal as xs +from xdas.atoms import Sequential, Partial, LFilter +from xdas.processing import process, DataArrayLoader, DataArrayWriter +from scipy.signal import iirfilter + +da = xd.synthetics.wavelet_wavefronts() + +b, a = iirfilter(4, 0.1, btype="high") + +atom = Sequential( + [ + Partial(xs.decimate, 2, ftype="fir", dim="distance"), + LFilter(b, a, dim="time"), + Partial(np.square), + ] +) + +monolithic = atom(da) +``` + +Then apply the same pipeline chunk-by-chunk using {py:func}`~xdas.processing.process`. +The {py:class}`~xdas.processing.DataArrayLoader` splits the input into fixed-size chunks +along a given dimension, while {py:class}`~xdas.processing.DataArrayWriter` collects and +writes each processed chunk to a directory on disk: + +```{code-cell} +:tags: [remove-output] + +import os +os.makedirs("output", exist_ok=True) + +dl = DataArrayLoader(da, chunks={"time": 100}) +dw = DataArrayWriter("output") +chunked = process(atom, dl, dw) + +assert chunked.equals(monolithic) +``` + +```{code-cell} +:tags: [remove-cell] + +import shutil +shutil.rmtree("output") +``` + +The result is identical to the monolithic run but can scale to datasets that do not fit in +memory. The loader and writer can be swapped for other variants — for example, +{py:class}`~xdas.processing.ZMQPublisher` to stream results over a network (see +[](streaming.md)). diff --git a/docs/user-guide/streaming.md b/docs/user-guide/pipeline/streaming.md similarity index 97% rename from docs/user-guide/streaming.md rename to docs/user-guide/pipeline/streaming.md index b582eb68..f4e9fea4 100644 --- a/docs/user-guide/streaming.md +++ b/docs/user-guide/pipeline/streaming.md @@ -4,6 +4,14 @@ kernelspec: name: python3 --- +```{code-cell} +:tags: [remove-cell] + +import os +import xdas as xd +os.chdir("../../_data") +``` + # Streaming data Xdas allows to stream data over any network using [ZeroMQ](https://zeromq.org). Xdas use the Publisher and Subscriber patterns meaning that on one node the data is published and that any number of subscribers can receive the data stream. From c9ffe736ad4bf1ec9aa33eb7c633fb05819aabb1 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 20:36:58 +0200 Subject: [PATCH 04/21] Add docstrings everywhere. --- xdas/__init__.py | 8 ++ xdas/atoms/__init__.py | 7 ++ xdas/atoms/core.py | 24 +++++ xdas/atoms/ml.py | 37 +++++++ xdas/atoms/signal.py | 75 +++++++++++++ xdas/config.py | 31 ++++++ xdas/coordinates/__init__.py | 9 ++ xdas/coordinates/core.py | 199 +++++++++++++++++++++++++++++++++++ xdas/coordinates/default.py | 34 ++++++ xdas/coordinates/dense.py | 49 +++++++++ xdas/coordinates/interp.py | 97 +++++++++++++++++ xdas/coordinates/sampled.py | 79 ++++++++++++++ xdas/coordinates/scalar.py | 30 ++++++ xdas/core/__init__.py | 4 + xdas/core/dataarray.py | 32 ++++++ xdas/core/datacollection.py | 26 +++++ xdas/core/methods.py | 15 +++ xdas/core/numpy.py | 22 ++++ xdas/core/routines.py | 21 ++++ xdas/dask/__init__.py | 5 + xdas/dask/core.py | 27 +++++ xdas/dask/serial.py | 37 +++++++ xdas/fft.py | 5 + xdas/io/__init__.py | 5 + xdas/io/apsensing.py | 7 ++ xdas/io/asn.py | 21 ++++ xdas/io/core.py | 10 ++ xdas/io/febus.py | 6 ++ xdas/io/miniseed.py | 43 ++++++++ xdas/io/prodml.py | 8 ++ xdas/io/silixa.py | 9 ++ xdas/io/tdms.py | 2 + xdas/io/terra15.py | 7 ++ xdas/io/utils.py | 5 + xdas/io/xdas.py | 50 +++++++++ xdas/parallel.py | 28 +++++ xdas/picking.py | 5 + xdas/processing/__init__.py | 5 + xdas/processing/core.py | 82 +++++++++++++++ xdas/processing/monitor.py | 23 ++++ xdas/signal.py | 6 ++ xdas/spectral.py | 6 ++ xdas/synthetics.py | 31 ++++++ xdas/trigger.py | 5 + xdas/virtual.py | 85 +++++++++++++++ 45 files changed, 1322 insertions(+) diff --git a/xdas/__init__.py b/xdas/__init__.py index 2e84165d..db6bbe56 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -1,3 +1,11 @@ +""" +Xdas — labeled N-dimensional array library for Distributed Acoustic Sensing data. + +Provides :class:`DataArray` with lazy multi-file HDF5/NetCDF4 access, coordinate +types for physical axes, multi-threaded signal processing pipelines, and I/O engines +for common DAS instrument formats. +""" + __version__ = "0.2.6" from . import ( diff --git a/xdas/atoms/__init__.py b/xdas/atoms/__init__.py index d5f34e4c..7149a1bf 100644 --- a/xdas/atoms/__init__.py +++ b/xdas/atoms/__init__.py @@ -1,3 +1,10 @@ +""" +Stateful processing units (atoms) for building chunked data pipelines. + +Exports :class:`Atom`, :class:`State`, :class:`Sequential`, :class:`Partial`, +:func:`atomized`, signal-processing atoms, and the ML-based :class:`MLPicker`. +""" + from ..trigger import Trigger from .core import Atom, Partial, Sequential, State, atomized from .ml import MLPicker diff --git a/xdas/atoms/core.py b/xdas/atoms/core.py index 90999b1e..54b471e2 100644 --- a/xdas/atoms/core.py +++ b/xdas/atoms/core.py @@ -1,3 +1,8 @@ +""" +Base classes for stateful processing atoms: :class:`Atom`, :class:`State`, +:class:`Sequential`, :class:`Partial`, and the :func:`atomized` decorator. +""" + import importlib from collections.abc import Callable from functools import wraps @@ -129,12 +134,14 @@ def __setattr__(self, name, value): @property def state(self): + """Dict of the current state, including nested atom states.""" return self._state | { name: filter.state for name, filter in self._atoms.items() if filter.state } @property def initialized(self): + """``True`` if every state key has been initialised (no ``...`` sentinels remain).""" return all(value is not ... for value in self._state.values()) def initialize(self, x, **flags): ... @@ -153,15 +160,25 @@ def __call__(self, x, **flags): return y def reset(self): + """Reset all state entries to ``...`` (uninitialised sentinel).""" for key in self._state: setattr(self, key, State(...)) for _, filter in self._atoms.items(): filter.reset() def save_state(self, path): + """Serialise the current state to a NetCDF4 file at *path*.""" DataCollection(self.state).to_netcdf(path) def set_state(self, state): + """ + Restore the atom state from a previously saved state dict. + + Parameters + ---------- + state : dict + Mapping of state key → value as returned by :attr:`state`. + """ for key, value in state.items(): if isinstance(value, DataArray): setattr( @@ -173,6 +190,7 @@ def set_state(self, state): filter.set_state(value) def load_state(self, path): + """Load the atom state from the NetCDF4 file at *path*.""" state = open_datacollection(path).load() self.set_state(state) @@ -266,6 +284,7 @@ def __init__(self, atoms: Any, name: str | None = None) -> None: self.name = name def call(self, x: Any, **flags) -> Any: + """Pass *x* through each atom in order and return the final result.""" for atom in self: x = atom(x, **flags) return x @@ -374,9 +393,11 @@ def __init__( @property def stateful(self): + """``True`` if any keyword argument is being passed as state.""" return bool(self._state) def call(self, x: Any, **flags) -> Any: + """Call the wrapped function with *x* substituted at the ``...`` position.""" args = tuple(x if arg is ... else arg for arg in self.args) kwargs = self.kwargs | self._state if self.stateful: @@ -410,12 +431,14 @@ def __reduce__(self): @classmethod def from_state(cls, state): + """Reconstruct a :class:`Partial` from a serialised *state* dict.""" func = getattr( importlib.import_module(state["func"]["module"]), state["func"]["name"] ) return cls(func, *state["args"], name=state["name"], **state["kwargs"]) def get_state(self): + """Return a JSON-serialisable dict describing the wrapped function and args.""" return { "func": {"module": self.func.__module__, "name": self.func.__name__}, "args": self.args, @@ -493,6 +516,7 @@ def atomized(func): @wraps(func) def wrapper(*args, **kwargs): + """Dispatch to Partial/Sequential when ``...`` or an Atom is passed, else call directly.""" if any(arg is ... for arg in args): return Partial(func, *args, **kwargs) elif objs := tuple(arg for arg in args if isinstance(arg, Atom)): diff --git a/xdas/atoms/ml.py b/xdas/atoms/ml.py index 1c8c3ce5..c953f921 100644 --- a/xdas/atoms/ml.py +++ b/xdas/atoms/ml.py @@ -1,3 +1,9 @@ +""" +Machine-learning atom: :class:`MLPicker` wraps SeisBench models as pipeline atoms. + +Torch and SeisBench are loaded lazily so they remain optional dependencies. +""" + import importlib import numpy as np @@ -8,6 +14,8 @@ class LazyModule: + """Defer importing *name* until the first attribute access.""" + def __init__(self, name): self._name = name self._module = None @@ -28,6 +36,26 @@ def __getattr__(self, name): class MLPicker(Atom): + """ + Wraps a SeisBench phase-picking model as a streaming :class:`Atom`. + + Uses an overlapping sliding-window strategy to apply the model to + arbitrarily long data and to stitch the per-segment probability outputs + back into a continuous DataArray. + + Parameters + ---------- + model : seisbench.models.WaveformModel + A SeisBench model in evaluation mode (will be moved to *device*). + dim : str + Dimension name along which the model slides (usually ``"time"``). + device : str or torch.device, optional + Torch device. Defaults to CUDA if available, else CPU. + component_strategy : str, optional + How to fill the channel dimension: ``"clone"`` replicates the + single-component signal, or pass a component letter to select it. + """ + def __init__(self, model, dim, device=None, component_strategy="clone"): super().__init__() if device is None: @@ -50,33 +78,41 @@ def __init__(self, model, dim, device=None, component_strategy="clone"): @property def nperseg(self): + """Number of samples per segment (= model input length).""" return self.model.in_samples @property def noverlap(self): + """Number of overlapping samples between consecutive segments.""" return self.nperseg // 2 @property def step(self): + """Stride between the start of consecutive segments.""" return self.nperseg - self.noverlap @property def phases(self): + """List of phase label strings produced by the model.""" return list(self.model.labels) @property def in_channels(self): + """Number of input channels the model expects.""" return self.model.in_channels @property def classes(self): + """Number of output classes (phases) the model produces.""" return self.model.classes @property def blinding(self): + """``(left, right)`` blinding samples from the model's default args.""" return self.model.default_args["blinding"] def initialize(self, da, chunk_dim=None, **flags): + """Allocate circular buffers sized to *da*'s batch and segment dimensions.""" self.batch_size = State( np.prod([size for dim, size in da.sizes.items() if not dim == self.dim]) ) @@ -114,6 +150,7 @@ def initialize(self, da, chunk_dim=None, **flags): self.buffer = State(None) def call(self, da, **flags): + """Run the model over *da*, managing a carry-over buffer for chunked input.""" if self.buffer is None: out = self._process(da) else: diff --git a/xdas/atoms/signal.py b/xdas/atoms/signal.py index d1adcfea..2b77aeee 100644 --- a/xdas/atoms/signal.py +++ b/xdas/atoms/signal.py @@ -1,3 +1,10 @@ +""" +Signal-processing atoms: stateful wrappers around common filtering and +resampling operations (:class:`ResamplePoly`, :class:`IIRFilter`, +:class:`FIRFilter`, :class:`LFilter`, :class:`SOSFilter`, +:class:`DownSample`, :class:`UpSample`). +""" + from fractions import Fraction import numpy as np @@ -92,10 +99,12 @@ def __init__(self, target, maxfactor=100, window=("kaiser", 5.0), dim="last"): self.fs = State(...) def initialize(self, da, **flags): + """Measure the current sampling rate from *da* and compute resampling ratios.""" self.fs = State(1.0 / get_sampling_interval(da, self.dim)) self.initialize_from_state() def initialize_from_state(self): + """Recompute the up/down factors and FIR cut-off from the stored sampling rate.""" fraction = Fraction(self.target / self.fs) fraction = fraction.limit_denominator(self.maxfactor) fraction = 1 / (1 / fraction).limit_denominator(self.maxfactor) @@ -110,6 +119,7 @@ def initialize_from_state(self): self.downsampling.factor = down def call(self, da, **flags): + """Apply polyphase resampling (upsample → FIR filter → downsample) to *da*.""" if self.upsampling.factor == 1 and self.downsampling.factor == 1: return da da = self.upsampling(da, **flags) @@ -223,10 +233,12 @@ def __init__( self.fs = State(...) def initialize(self, da, **flags): + """Determine the sampling rate from *da* and recompute the IIR coefficients.""" self.fs = State(1.0 / get_sampling_interval(da, self.dim)) self.initialize_from_state() def initialize_from_state(self): + """Recompute and store the IIR coefficients from the current design parameters.""" coeffs = sp.iirfilter( self.order, self.cutoff, @@ -246,6 +258,7 @@ def initialize_from_state(self): raise ValueError() def call(self, da, **flags): + """Delegate to the underlying :class:`LFilter` or :class:`SOSFilter` atom.""" return self.iirfilter(da, **flags) @@ -353,10 +366,12 @@ def __init__( self.fs = State(...) def initialize(self, da, **flags): + """Determine the sampling rate from *da* and recompute the FIR taps.""" self.fs = State(1.0 / get_sampling_interval(da, self.dim)) self.initialize_from_state() def initialize_from_state(self): + """Recompute the FIR taps and lag from the current design parameters.""" taps = sp.firwin( self.numtaps, self.cutoff, @@ -370,12 +385,28 @@ def initialize_from_state(self): self.lfilter.b = taps def call(self, da, **flags): + """Apply the FIR taps to *da* and correct the time coordinate for filter lag.""" da = self.lfilter(da, **flags) da[self.dim] -= get_sampling_interval(da, self.dim, cast=False) * self.lag return da class LFilter(Atom): + """ + Stateful direct-form IIR/FIR filter using :func:`scipy.signal.lfilter`. + + Parameters + ---------- + b : array-like + Numerator polynomial coefficients. + a : array-like + Denominator polynomial coefficients. + dim : str or int, optional + Dimension to filter along. Defaults to ``"last"``. + parallel : int, bool, or None, optional + Worker count for parallelisation. + """ + def __init__(self, b, a, dim="last", parallel=None): super().__init__() self.b = b @@ -386,6 +417,7 @@ def __init__(self, b, a, dim="last", parallel=None): self.zi = State(...) def initialize(self, da, chunk_dim=None, **flags): + """Set the filter axis and allocate the initial conditions buffer.""" self.axis = State(da.get_axis_num(self.dim)) if self.dim == chunk_dim: n_sections = max(len(self.a), len(self.b)) - 1 @@ -398,6 +430,7 @@ def initialize(self, da, chunk_dim=None, **flags): self.zi = State(None) def call(self, da, **flags): + """Apply the filter to *da*, updating the state if chunked.""" across = int(self.axis == 0) if self.zi is None: func = parallelize((None, None, across), across, self.parallel)(sp.lfilter) @@ -412,6 +445,19 @@ def call(self, da, **flags): class SOSFilter(Atom): + """ + Stateful second-order-sections IIR filter using :func:`scipy.signal.sosfilt`. + + Parameters + ---------- + sos : array-like, shape (n_sections, 6) + SOS filter coefficients as returned by e.g. :func:`scipy.signal.iirfilter`. + dim : str or int, optional + Dimension to filter along. Defaults to ``"last"``. + parallel : int, bool, or None, optional + Worker count for parallelisation. + """ + def __init__(self, sos, dim="last", parallel=None): super().__init__() self.sos = sos @@ -421,6 +467,7 @@ def __init__(self, sos, dim="last", parallel=None): self.zi = State(...) def initialize(self, da, chunk_dim=None, **flags): + """Set the filter axis and allocate the SOS initial-conditions buffer.""" self.axis = State(da.get_axis_num(self.dim)) if self.dim == chunk_dim: n_sections = self.sos.shape[0] @@ -433,6 +480,7 @@ def initialize(self, da, chunk_dim=None, **flags): self.zi = State(None) def call(self, da, **flags): + """Apply the SOS filter to *da*, updating the state if chunked.""" across = int(self.axis == 0) if self.zi is None: func = parallelize((None, across), across, self.parallel)(sp.sosfilt) @@ -447,6 +495,17 @@ def call(self, da, **flags): class DownSample(Atom): + """ + Stateful integer downsampling by selecting every *factor*-th sample. + + Parameters + ---------- + factor : int + Downsampling factor. + dim : str or int, optional + Dimension to downsample along. Defaults to ``"last"``. + """ + def __init__(self, factor, dim="last"): super().__init__() self.factor = factor @@ -454,12 +513,14 @@ def __init__(self, factor, dim="last"): self.buffer = State(...) def initialize(self, da, chunk_dim=None, **flags): + """Initialise the carry-over buffer for chunked operation.""" if chunk_dim == self.dim: self.buffer = State(da.isel({self.dim: slice(0, 0)})) else: self.buffer = State(None) def call(self, da, **flags): + """Downsample *da*, buffering the trailing partial stride when chunked.""" if self.factor == 1: return da if self.buffer is not None: @@ -471,6 +532,19 @@ def call(self, da, **flags): class UpSample(Atom): + """ + Integer upsampling by zero-insertion (and optional energy scaling). + + Parameters + ---------- + factor : int + Upsampling factor. + scale : bool, optional + If ``True``, scale inserted samples so energy is preserved. + dim : str or int, optional + Dimension to upsample along. Defaults to ``"last"``. + """ + def __init__(self, factor, scale=True, dim="last"): super().__init__() self.factor = factor @@ -478,6 +552,7 @@ def __init__(self, factor, scale=True, dim="last"): self.dim = dim def call(self, da, **flags): + """Upsample *da* by inserting zeros between every original sample.""" if self.factor == 1: return da shape = tuple( diff --git a/xdas/config.py b/xdas/config.py index bd081973..3e3a1ace 100644 --- a/xdas/config.py +++ b/xdas/config.py @@ -1,13 +1,44 @@ +""" +Global runtime configuration for xdas (e.g. worker-thread count). + +Use :func:`get` and :func:`set` to read and write configuration values. +""" + import os class Config: + """Global configuration store backed by a plain dict.""" + config = {"n_workers": os.cpu_count()} def get(key): + """ + Return the current value of configuration key *key*. + + Parameters + ---------- + key : str + Configuration key (e.g. ``"n_workers"``). + + Returns + ------- + object + The stored configuration value. + """ return Config.config[key] def set(key, value): + """ + Set configuration key *key* to *value*. + + Parameters + ---------- + key : str + Configuration key (e.g. ``"n_workers"``). + value : object + New value to store. + """ Config.config[key] = value diff --git a/xdas/coordinates/__init__.py b/xdas/coordinates/__init__.py index f7eaeaee..7bd99542 100644 --- a/xdas/coordinates/__init__.py +++ b/xdas/coordinates/__init__.py @@ -1,3 +1,12 @@ +""" +Coordinate types that describe how array axes map to physical values. + +Exports :class:`Coordinates` (container) and all concrete coordinate classes: +:class:`Coordinate` (factory/base), :class:`DefaultCoordinate`, +:class:`DenseCoordinate`, :class:`InterpCoordinate`, +:class:`SampledCoordinate`, :class:`ScalarCoordinate`. +""" + from .core import Coordinate, Coordinates, get_sampling_interval from .default import DefaultCoordinate from .dense import DenseCoordinate diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 45b9a679..1e47bc2c 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -1,3 +1,9 @@ +""" +Core coordinate infrastructure: :class:`Coordinates` container, +:class:`Coordinate` factory/base class, and shared helpers used by all +concrete coordinate types (parsing, interpolation, tolerance handling). +""" + import weakref from copy import copy, deepcopy from functools import wraps @@ -8,8 +14,11 @@ def wraps_first_last(func): + """Decorator that resolves ``"first"`` and ``"last"`` dim aliases before calling *func*.""" + @wraps(func) def wrapper(self, dim, *args, **kwargs): + """Resolve ``"first"``/``"last"`` aliases then delegate to *func*.""" if dim == "first": dim = self._dims[0] if dim == "last": @@ -118,16 +127,19 @@ def __reduce__(self): @property def dims(self): + """Ordered tuple of dimension names for this coordinates container.""" return self._dims @property def parent(self): + """The parent object (usually a :class:`DataArray`) this container is attached to.""" if hasattr(self, "_parent"): return self._parent() else: return None def isdim(self, name): + """Return ``True`` if *name* is a dimensional coordinate (i.e. its dim equals its name).""" return self[name].dim == name def get_query(self, item): @@ -168,10 +180,28 @@ def get_query(self, item): return query def to_index(self, item, method=None, endpoint=True): + """ + Convert an item selector to a dict of per-dimension integer indices. + + Parameters + ---------- + item : indexer-like, sequence, or mapping + Passed to :meth:`get_query` to resolve dimension-by-dimension indexers. + method : str, optional + Interpolation method forwarded to each coordinate's :meth:`~Coordinate.to_index`. + endpoint : bool, optional + Whether to include the stop endpoint of slice selectors. Default ``True``. + + Returns + ------- + dict + Mapping from dimension name to integer index or slice. + """ query = self.get_query(item) return {dim: self[dim].to_index(query[dim], method, endpoint) for dim in query} def equals(self, other): + """Return ``True`` if *other* is a :class:`Coordinates` with identical coordinate values.""" if not isinstance(other, Coordinates): return False for name in self: @@ -214,6 +244,7 @@ def to_dict(self): @classmethod def from_dict(cls, dct): + """Reconstruct a :class:`Coordinates` from the dict returned by :meth:`to_dict`.""" return cls( {key: Coordinate.from_dict(value) for key, value in dct["coords"].items()}, dct["dims"], @@ -221,19 +252,29 @@ def from_dict(cls, dct): @classmethod def from_dataset(cls, dataset, name): + """Build a :class:`Coordinates` by delegating to each registered coordinate subclass.""" return cls(Coordinate.from_dataset(dataset, name)) def copy(self, deep=True): + """Return a copy of this :class:`Coordinates` container. + + Parameters + ---------- + deep : bool, optional + If ``True`` (default) perform a deep copy of every coordinate. + """ return self.__class__({key: value.copy(deep) for key, value in self.items()}) @wraps_first_last def drop_dims(self, *dims): + """Return a new :class:`Coordinates` with *dims* and their associated coordinates removed.""" coords = {key: value for key, value in self.items() if value.dim not in dims} dims = tuple(value for value in self.dims if value not in dims) return self.__class__(coords, dims) @wraps_first_last def drop_coords(self, *names): + """Return a new :class:`Coordinates` with the named coordinates removed.""" coords = {key: value for key, value in self.items() if key not in names} return self.__class__(coords, self.dims) @@ -253,6 +294,27 @@ def _assign_parent(self, parent): class Coordinate: + """ + Base class and factory for all coordinate types. + + When called as ``Coordinate(data)``, acts as a factory and returns the first + registered subclass whose :meth:`isvalid` method accepts *data*. When + subclassed, use the ``name=`` keyword in the class definition to register + the subclass (e.g. ``class MyCoord(Coordinate, name="mycoord")``). + + Concrete subclasses must implement :meth:`isvalid`, :meth:`equals`, + and :meth:`to_dict` at minimum. + + Parameters + ---------- + data : array-like or mapping + The coordinate data. Interpretation is subclass-specific. + dim : str, optional + Name of the dimension this coordinate is associated with. + dtype : dtype-like, optional + Desired dtype for the underlying data array. + """ + _registry = {} def __init_subclass__(cls, *, name=None, **kwargs): @@ -315,30 +377,37 @@ def __array_function__(self, func, types, args, kwargs): @staticmethod def isvalid(data): + """Return ``True`` if *data* is a valid input for this coordinate subclass.""" raise NotImplementedError @property def dtype(self): + """NumPy dtype of the underlying data array.""" return self.data.dtype @property def ndim(self): + """Number of dimensions of the underlying data array (always 1 for dimensional coords).""" return self.data.ndim @property def shape(self): + """Shape tuple of the underlying data array.""" return self.data.shape @property def values(self): + """Materialised numpy array of coordinate values.""" return self.__array__() @property def empty(self): + """``True`` if the coordinate has zero length.""" return len(self) == 0 @property def parent(self): + """The parent :class:`Coordinates` container, or ``None`` if unattached.""" if hasattr(self, "_parent"): return self._parent() else: @@ -346,6 +415,7 @@ def parent(self): @property def name(self): + """The name under which this coordinate is stored in its parent container.""" if self.parent is None: return self.dim return next((name for name in self.parent if self.parent[name] is self), None) @@ -354,6 +424,19 @@ def _assign_parent(self, parent): self._parent = weakref.ref(parent) def get_sampling_interval(self, cast=True): + """ + Return the average sample spacing (end-to-end distance divided by N-1). + + Parameters + ---------- + cast : bool, optional + If ``True`` (default), cast timedelta64 results to seconds (float). + + Returns + ------- + float or None + ``None`` if the coordinate has fewer than two elements. + """ if len(self) < 2: return None delta = (self[-1].values - self[0].values) / (len(self) - 1) @@ -363,6 +446,7 @@ def get_sampling_interval(self, cast=True): return delta def is_monotonic_increasing(self): + """Return ``True`` if all consecutive differences in this coordinate are positive.""" if np.issubdtype(self.dtype, np.datetime64): zero = np.timedelta64(0) else: @@ -370,12 +454,21 @@ def is_monotonic_increasing(self): return np.all(np.diff(self.values) > zero) def isdim(self): + """Return ``True`` if this coordinate is a dimensional coordinate in its parent container.""" if self.parent is None or self.name is None: return None else: return self.parent.isdim(self.name) def copy(self, deep=True): + """ + Return a copy of this coordinate. + + Parameters + ---------- + deep : bool, optional + If ``True`` (default) perform a deep copy; otherwise a shallow copy. + """ if deep: func = deepcopy else: @@ -383,15 +476,48 @@ def copy(self, deep=True): return self.__class__(func(self.data), func(self.dim), func(self.dtype)) def equals(self, other): + """Return ``True`` if *other* represents the same coordinate values. Subclass must implement.""" raise NotImplementedError def to_index(self, item, method=None, endpoint=True): + """ + Convert a label-based selector to an integer index or slice. + + Parameters + ---------- + item : label, slice, or array-like + Selector to resolve. + method : str, optional + Look-up method (e.g. ``"ffill"``, ``"bfill"``). + endpoint : bool, optional + Whether to include the stop of a slice. Default ``True``. + + Returns + ------- + int or slice + """ if isinstance(item, slice): return self.slice_indexer(item.start, item.stop, item.step, endpoint) else: return self.get_indexer(item, method) def format_index(self, idx, bounds="raise"): + """ + Normalise integer index *idx*, handling negative indices and optional bounds checking. + + Parameters + ---------- + idx : int or array-like of int + Index or indices to normalise. + bounds : {"raise", "clip"}, optional + ``"raise"`` (default) raises :exc:`IndexError` for out-of-bounds indices; + ``"clip"`` clamps them to the valid range. + + Returns + ------- + numpy.ndarray + Non-negative integer index array. + """ idx = np.asarray(idx) if not np.issubdtype(idx.dtype, np.integer): raise IndexError("only integer are valid index") @@ -404,6 +530,24 @@ def format_index(self, idx, bounds="raise"): return idx def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + """ + Return an integer :class:`slice` corresponding to the label range [*start*, *stop*]. + + Parameters + ---------- + start : label, optional + First label to include (inclusive, via ``"bfill"`` look-up). + stop : label, optional + Last label to include (inclusive by default, via ``"ffill"`` look-up). + step : not supported + Reserved; raises :exc:`NotImplementedError` if provided. + endpoint : bool, optional + If ``True`` (default), include *stop* in the result. + + Returns + ------- + slice + """ if start is not None: try: start_index = self.get_indexer(start, method="bfill") @@ -430,27 +574,35 @@ def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): return slice(start_index, stop_index) def isscalar(self): + """Return ``True`` if this is a :class:`ScalarCoordinate` (non-dimensional).""" return False def isdefault(self): + """Return ``True`` if this is a :class:`DefaultCoordinate` (integer range).""" return False def isdense(self): + """Return ``True`` if this is a :class:`DenseCoordinate` (explicit numpy array).""" return False def isinterp(self): + """Return ``True`` if this is an :class:`InterpCoordinate` (piecewise-linear).""" return False def issampled(self): + """Return ``True`` if this is a :class:`SampledCoordinate` (regularly sampled).""" return False def concat(self, other): + """Concatenate *other* coordinate to this one. Subclass must implement.""" raise NotImplementedError(f"concat is not implemented for {self.__class__}") def simplify(self, tolerance=None): + """Reduce tie-point count within *tolerance*. Subclass must implement.""" raise NotImplementedError(f"simplify is not implemented for {self.__class__}") def get_split_indices(self, kind="discontinuities", tolerance=False): + """Return integer indices where this coordinate should be split. Subclass must implement.""" raise NotImplementedError( f"get_split_indices is not implemented for {self.__class__}" ) @@ -563,6 +715,7 @@ def get_availabilities(self): return pd.DataFrame.from_records(records) def to_dataarray(self): + """Convert this coordinate to a :class:`~xdas.DataArray` with a single dimension.""" from ..core.dataarray import DataArray # TODO: avoid defered import? if self.name is None: @@ -588,13 +741,16 @@ def to_dataarray(self): ) def to_dict(self): + """Serialise this coordinate to a plain-dict representation. Subclass must implement.""" raise NotImplementedError @classmethod def from_dict(cls, dct): + """Reconstruct a coordinate from the dict returned by :meth:`to_dict`.""" return cls(**dct) def to_dataset(self, dataset, attrs): + """Write this coordinate into an xarray *dataset*, updating *attrs* in place.""" dataset = dataset.assign_coords( {self.name: (self.dim, self.values) if self.dim else self.values} ) @@ -602,6 +758,7 @@ def to_dataset(self, dataset, attrs): @classmethod def from_dataset(cls, dataset, name): + """Read coordinates named *name* from an xarray *dataset* via each registered subclass.""" coords = {} for subcls in cls.__subclasses__(): if hasattr(subcls, "from_dataset"): @@ -610,10 +767,32 @@ def from_dataset(cls, dataset, name): @classmethod def from_block(cls, start, size, step, dim=None, dtype=None): + """Construct a coordinate from a start value, element count, and step size. Subclass must implement.""" raise NotImplementedError def parse(data, dim=None): + """ + Normalise *data* / *dim* inputs accepted by coordinate constructors. + + Unpacks ``(dim, data)`` tuples and strips :class:`Coordinate` wrappers so + that downstream constructors always receive a plain data object and an + optional dimension string. + + Parameters + ---------- + data : array-like, Coordinate, or (dim, array-like) tuple + Raw coordinate input. + dim : str, optional + Explicit dimension name; overrides any dimension carried by *data*. + + Returns + ------- + data : array-like + Unwrapped data. + dim : str or None + Resolved dimension name. + """ if isinstance(data, tuple): if dim is None: dim, data = data @@ -627,6 +806,23 @@ def parse(data, dim=None): def parse_tolerance(tolerance, dtype): + """ + Normalise *tolerance* to the correct type for *dtype*. + + Converts ``None`` to zero, and for datetime64 dtypes converts a + numeric tolerance (in seconds) to the appropriate :class:`numpy.timedelta64`. + + Parameters + ---------- + tolerance : float or None + Raw tolerance value. + dtype : numpy.dtype + The dtype of the coordinate values the tolerance will be compared against. + + Returns + ------- + tolerance : int, float, or numpy.timedelta64 + """ if np.issubdtype(dtype, np.datetime64): if tolerance is None: tolerance = np.timedelta64(0) @@ -661,11 +857,13 @@ def get_sampling_interval(da, dim, cast=True): def isscalar(data): + """Return ``True`` if *data* converts to a 0-d non-object numpy array.""" data = np.asarray(data) return (data.dtype != np.dtype(object)) and (data.ndim == 0) def is_monotonic_increasing(x): + """Return ``True`` if every element of *x* is strictly greater than the previous one.""" if np.issubdtype(x.dtype, np.datetime64): return np.all(np.diff(x) > np.timedelta64(0)) else: @@ -673,6 +871,7 @@ def is_monotonic_increasing(x): def format_datetime(x): + """Format a datetime64-like *x* as an ISO string, truncating sub-millisecond digits.""" string = str(x) if "." in string: datetime, digits = string.split(".") diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 6d2bae7a..54c9cfc5 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -1,9 +1,31 @@ +""" +:class:`DefaultCoordinate`: integer-range coordinate used when no coordinate +is explicitly provided for an axis. +""" + import numpy as np from .core import Coordinate, isscalar, parse class DefaultCoordinate(Coordinate, name="default"): + """ + Integer-range coordinate, equivalent to ``np.arange(size)``. + + Used automatically when no explicit coordinate is provided for an axis. + Internally stored as ``{"size": int}`` rather than a full array to avoid + memory allocation until values are actually needed. + + Parameters + ---------- + data : {"size": int} or None, optional + Mapping with a single ``"size"`` key. ``None`` creates an empty coordinate. + dim : str, optional + Dimension name. + dtype : ignored + Not supported; raises :exc:`ValueError` if provided. + """ + def __init__(self, data=None, dim=None, dtype=None): # empty if data is None: @@ -24,22 +46,27 @@ def __init__(self, data=None, dim=None, dtype=None): @property def empty(self): + """``True`` if the coordinate has size zero.""" return self.data["size"] == 0 @property def dtype(self): + """Always ``numpy.int64``.""" return np.int64 @property def ndim(self): + """Always 1.""" return 1 @property def shape(self): + """Shape tuple ``(size,)``.""" return (len(self),) @staticmethod def isvalid(data): + """Return ``True`` if *data* is ``{"size": int}``.""" match data: case {"size": None | int(_)}: return True @@ -67,22 +94,28 @@ def __array_function__(self, func, types, args, kwargs): raise NotImplementedError def isdefault(self): + """Return ``True`` (this is a :class:`DefaultCoordinate`).""" return True def get_sampling_interval(self, cast=True): + """Return the sample spacing, always 1 for integer-range coordinates.""" return 1 def equals(self, other): + """Return ``True`` if *other* is a :class:`DefaultCoordinate` of the same size.""" if isinstance(other, self.__class__): return self.data["size"] == other.data["size"] def get_indexer(self, value, method=None): + """Return *value* directly (integer index equals label for range coordinates).""" return value def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + """Return a :class:`slice` with *start*, *stop*, *step* unchanged.""" return slice(start, stop, step) def concat(self, other): + """Return a new :class:`DefaultCoordinate` whose size is the sum of both sizes.""" if not isinstance(other, self.__class__): raise TypeError(f"cannot concatenate {type(other)} to {self.__class__}") if not self.dim == other.dim: @@ -90,4 +123,5 @@ def concat(self, other): return self.__class__({"size": len(self) + len(other)}, self.dim) def to_dict(self): + """Serialise to ``{"dim": ..., "data": ..., "dtype": ...}``.""" return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py index 291460c8..ef981924 100644 --- a/xdas/coordinates/dense.py +++ b/xdas/coordinates/dense.py @@ -1,3 +1,7 @@ +""" +:class:`DenseCoordinate`: coordinate backed by a full numpy array. +""" + import numpy as np import pandas as pd @@ -5,6 +9,22 @@ class DenseCoordinate(Coordinate, name="dense"): + """ + Coordinate backed by an explicit numpy array. + + Suitable for irregularly-spaced or small axes where every value must be + stored. Look-up is performed via a :class:`pandas.Index`. + + Parameters + ---------- + data : array-like or None, optional + 1-D array of coordinate values. ``None`` creates an empty coordinate. + dim : str, optional + Dimension name. + dtype : dtype-like, optional + Cast *data* to this dtype on construction. + """ + def __init__(self, data=None, dim=None, dtype=None): # empty if data is None: @@ -21,17 +41,21 @@ def __init__(self, data=None, dim=None, dtype=None): @property def index(self): + """A :class:`pandas.Index` view of the underlying data array.""" return pd.Index(self.data) @staticmethod def isvalid(data): + """Return ``True`` if *data* converts to a 1-D non-object numpy array.""" data = np.asarray(data) return (data.dtype != np.dtype(object)) and (data.ndim == 1) def isdense(self): + """Return ``True`` (this is a :class:`DenseCoordinate`).""" return True def equals(self, other): + """Return ``True`` if *other* is a :class:`DenseCoordinate` with identical values and dtype.""" if isinstance(other, self.__class__): return ( np.array_equal(self.data, other.data) @@ -42,6 +66,25 @@ def equals(self, other): return False def get_indexer(self, value, method=None): + """ + Return the integer index (or indices) for *value*. + + Parameters + ---------- + value : scalar or array-like + Label(s) to look up. + method : str, optional + Forwarded to :meth:`pandas.Index.get_indexer` (e.g. ``"ffill"``). + + Returns + ------- + int or numpy.ndarray + + Raises + ------ + KeyError + If any requested label is not found (indexer returns -1). + """ if np.isscalar(value): out = self.index.get_indexer([value], method).item() else: @@ -51,6 +94,7 @@ def get_indexer(self, value, method=None): return out def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + """Return an integer :class:`slice` for label range [*start*, *stop*] via :class:`pandas.Index`.""" slc = self.index.slice_indexer(start, stop, step) if ( (not endpoint) @@ -61,6 +105,7 @@ def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): return slc def concat(self, other): + """Concatenate *other* :class:`DenseCoordinate` values to this one.""" if not isinstance(other, self.__class__): raise TypeError(f"cannot concatenate {type(other)} to {self.__class__}") if not self.dim == other.dim: @@ -74,6 +119,7 @@ def concat(self, other): return self.__class__(np.concatenate([self.data, other.data]), self.dim) def get_div_points(self, tolerance=None): + """Return sorted split-point indices where consecutive differences exceed *tolerance*.""" deltas = np.diff(self.data) if tolerance is not None: div_points = np.nonzero(np.abs(deltas) >= tolerance)[0] + 1 @@ -85,6 +131,7 @@ def get_div_points(self, tolerance=None): return div_points def to_dict(self): + """Serialise to ``{"dim": ..., "data": ..., "dtype": ...}``.""" if np.issubdtype(self.dtype, np.datetime64): data = self.data.astype(str).tolist() else: @@ -93,6 +140,7 @@ def to_dict(self): @classmethod def from_dataset(cls, dataset, name): + """Extract all coordinates from an xarray *dataset* variable *name* as plain arrays.""" return { name: ( ( @@ -111,5 +159,6 @@ def from_dataset(cls, dataset, name): @classmethod def from_block(cls, start, size, step, dim=None, dtype=None): + """Build a :class:`DenseCoordinate` from ``start + step * arange(size)``.""" data = start + step * np.arange(size) return cls(data, dim=dim, dtype=dtype) diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 6aeba27f..9a5f5969 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -1,3 +1,8 @@ +""" +:class:`InterpCoordinate`: piecewise-linear coordinate defined by tie points, +using ``xinterp`` for forward and inverse interpolation. +""" + import re import numpy as np @@ -76,30 +81,37 @@ def __init__(self, data=None, dim=None, dtype=None): @property def tie_indices(self): + """Integer array of tie-point positions (starts at 0, strictly increasing).""" return self.data["tie_indices"] @property def tie_values(self): + """Array of tie-point values (numeric or datetime64, strictly increasing).""" return self.data["tie_values"] @property def dtype(self): + """Dtype of the tie values (and of all materialised coordinate values).""" return self.tie_values.dtype @property def empty(self): + """``True`` if no tie points have been set.""" return self.tie_indices.shape == (0,) @property def ndim(self): + """Always 1.""" return self.tie_values.ndim @property def shape(self): + """Shape tuple ``(len(self),)``.""" return (len(self),) @property def indices(self): + """Full integer index array from 0 to the last tie-point index (inclusive).""" if self.empty: return np.array([], dtype="int") else: @@ -107,6 +119,7 @@ def indices(self): @property def values(self): + """Materialised numpy array of all coordinate values via piecewise interpolation.""" if self.empty: return np.array([], dtype=self.dtype) else: @@ -114,6 +127,7 @@ def values(self): @staticmethod def isvalid(data): + """Return ``True`` if *data* is a dict with ``tie_indices`` and ``tie_values`` keys.""" match data: case {"tie_indices": _, "tie_values": _}: return True @@ -174,9 +188,23 @@ def __array_function__(self, func, types, args, kwargs): raise NotImplementedError def isinterp(self): + """Return ``True`` (this is an :class:`InterpCoordinate`).""" return True def get_sampling_interval(self, cast=True): + """ + Return the median sample spacing across all tie-point segments. + + Parameters + ---------- + cast : bool, optional + If ``True`` (default), cast timedelta64 to seconds. + + Returns + ------- + float or None + ``None`` if fewer than two elements. + """ if len(self) < 2: return None num = np.diff(self.tie_values) @@ -190,6 +218,7 @@ def get_sampling_interval(self, cast=True): return delta def equals(self, other): + """Return ``True`` if *other* has identical tie points, dim, and dtype.""" return ( np.array_equal(self.tie_indices, other.tie_indices) and np.array_equal(self.tie_values, other.tie_values) @@ -198,10 +227,12 @@ def equals(self, other): ) def get_value(self, index): + """Interpolate coordinate values at integer position(s) *index*.""" index = self.format_index(index) return forward(index, self.tie_indices, self.tie_values) def slice_index(self, index_slice): + """Return a new :class:`InterpCoordinate` for the integer slice *index_slice*.""" start_index, stop_index, step_index = index_slice.indices(len(self)) if step_index < 0: raise NotImplementedError("negative slice step is not implemented") @@ -236,6 +267,20 @@ def slice_index(self, index_slice): return coord def get_indexer(self, value, method=None): + """ + Return the integer index for a label *value* via inverse interpolation. + + Parameters + ---------- + value : scalar, str (ISO datetime), or array-like + Label(s) to locate. + method : str, optional + Forwarded to ``xinterp.inverse`` (e.g. ``"ffill"``, ``"bfill"``). + + Returns + ------- + int or numpy.ndarray + """ if isinstance(value, str): value = np.datetime64(value) else: @@ -256,6 +301,7 @@ def get_indexer(self, value, method=None): return indexer def concat(self, other): + """Append *other* :class:`InterpCoordinate` after this one, shifting its tie indices.""" if not isinstance(other, self.__class__): raise TypeError(f"cannot concatenate {type(other)} to {self.__class__}") if not self.dim == other.dim: @@ -278,6 +324,7 @@ def concat(self, other): return coord def decimate(self, q): + """Return a new coordinate keeping every *q*-th sample (integer decimation).""" tie_indices = (self.tie_indices // q) * q for k in range(1, len(tie_indices) - 1): if tie_indices[k] == tie_indices[k - 1]: @@ -289,6 +336,15 @@ def decimate(self, q): ) def simplify(self, tolerance=None): + """ + Reduce the number of tie points using the Douglas-Peucker algorithm. + + Parameters + ---------- + tolerance : float, timedelta, or None + Maximum allowed deviation from the original piecewise-linear curve. + ``None`` uses zero tolerance (lossless). ``False`` returns ``self`` unchanged. + """ if tolerance is False: return self # TODO: copy tolerance = parse_tolerance(tolerance, self.dtype) @@ -300,6 +356,22 @@ def simplify(self, tolerance=None): ) def get_split_indices(self, kind="discontinuities", tolerance=False): + """ + Return tie-point indices where consecutive segments are discontinuous. + + Parameters + ---------- + kind : {"discontinuities", "gaps", "overlaps"}, optional + Which type of split to detect. Default ``"discontinuities"``. + tolerance : float, timedelta, or ``False`` + Minimum magnitude of gap/overlap to report. ``False`` returns all + consecutive tie-point pairs regardless of size. + + Returns + ------- + numpy.ndarray + Integer positions (into the full coordinate array) of each split. + """ valid_kinds = {"discontinuities", "gaps", "overlaps"} if kind not in valid_kinds: raise ValueError(f"`kind` must be one of {valid_kinds}; got {kind!r}") @@ -340,11 +412,13 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): @classmethod def from_array(cls, arr, dim=None, tolerance=None): + """Build an :class:`InterpCoordinate` from a full array *arr*, optionally simplified.""" return cls( {"tie_indices": np.arange(len(arr)), "tie_values": arr}, dim ).simplify(tolerance) def to_dict(self): + """Serialise to ``{"dim": ..., "data": {"tie_indices": ..., "tie_values": ...}, "dtype": ...}``.""" tie_indices = self.data["tie_indices"] tie_values = self.data["tie_values"] if np.issubdtype(tie_values.dtype, np.datetime64): @@ -356,6 +430,7 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} def to_dataset(self, dataset, attrs): + """Write tie points into an xarray *dataset* using CF coordinate interpolation conventions.""" mapping = f"{self.name}: {self.name}_indices {self.name}_values" if "coordinate_interpolation" in attrs: attrs["coordinate_interpolation"] += " " + mapping @@ -382,6 +457,7 @@ def to_dataset(self, dataset, attrs): @classmethod def from_dataset(cls, dataset, name): + """Read interpolated coordinates from *dataset* using the ``coordinate_interpolation`` attribute.""" coords = {} mapping = dataset[name].attrs.pop("coordinate_interpolation", None) if mapping is not None: @@ -394,6 +470,7 @@ def from_dataset(cls, dataset, name): @classmethod def from_block(cls, start, size, step, dim=None, dtype=None): + """Build a two-point :class:`InterpCoordinate` covering [start, start + step*(size-1)].""" return cls( { "tie_indices": [0, size - 1], @@ -404,6 +481,26 @@ def from_block(cls, start, size, step, dim=None, dtype=None): def douglas_peucker(x, y, epsilon): + """ + Reduce the piecewise-linear curve *(x, y)* using the Douglas-Peucker algorithm. + + Points are dropped when they deviate less than *epsilon* from the simplified + line connecting their neighbours. + + Parameters + ---------- + x : numpy.ndarray + Monotonically increasing sample positions (tie indices). + y : numpy.ndarray + Corresponding coordinate values (tie values). + epsilon : float or numpy.timedelta64 + Maximum allowed deviation to retain a point. + + Returns + ------- + x_simplified : numpy.ndarray + y_simplified : numpy.ndarray + """ mask = np.ones(len(x), dtype=bool) stack = [(0, len(x))] while stack: diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index a2e2ca0a..cf9b6867 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -1,3 +1,8 @@ +""" +:class:`SampledCoordinate`: regularly-sampled coordinate described by tie +points and a fixed ``sampling_interval`` between them. +""" + import re import numpy as np @@ -101,38 +106,47 @@ def __init__(self, data=None, dim=None, dtype=None): @property def tie_values(self): + """Start values of each regularly-sampled segment.""" return self.data["tie_values"] @property def tie_lengths(self): + """Number of samples in each regularly-sampled segment.""" return self.data["tie_lengths"] @property def sampling_interval(self): + """Fixed step between consecutive samples (shared across all segments).""" return self.data["sampling_interval"] @property def dtype(self): + """Dtype of the tie values (and of all materialised coordinate values).""" return self.tie_values.dtype @property def tie_indices(self): + """Start integer index of each segment within the full coordinate array.""" return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) @property def empty(self): + """``True`` if no segments have been set.""" return self.tie_values.shape == (0,) @property def ndim(self): + """Always 1.""" return self.tie_values.ndim @property def shape(self): + """Shape tuple ``(len(self),)``.""" return (len(self),) @property def indices(self): + """Full integer index array from 0 to ``len(self) - 1``.""" if self.empty: return np.array([], dtype="int") else: @@ -140,6 +154,7 @@ def indices(self): @property def values(self): + """Materialised numpy array of all coordinate values.""" if self.empty: return np.array([], dtype=self.dtype) else: @@ -147,14 +162,17 @@ def values(self): @property def start(self): + """Value at index 0 (first tie value).""" return self.tie_values[0] @property def end(self): + """Value one step past the last sample (exclusive upper bound).""" return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] @staticmethod def isvalid(data): + """Return ``True`` if *data* has ``tie_values``, ``tie_lengths``, and ``sampling_interval`` keys.""" match data: case { "tie_values": _, @@ -227,15 +245,25 @@ def __array_function__(self, func, types, args, kwargs): raise NotImplementedError def issampled(self): + """Return ``True`` (this is a :class:`SampledCoordinate`).""" return True def get_sampling_interval(self, cast=True): + """ + Return the sampling interval. + + Parameters + ---------- + cast : bool, optional + If ``True`` (default), cast timedelta64 to seconds (float). + """ delta = self.sampling_interval if cast and np.issubdtype(delta.dtype, np.timedelta64): delta = delta / np.timedelta64(1, "s") return delta def equals(self, other): + """Return ``True`` if *other* has identical tie values, lengths, sampling interval, dim, and dtype.""" return ( np.array_equal(self.tie_values, other.tie_values) and np.array_equal(self.tie_lengths, other.tie_lengths) @@ -245,6 +273,7 @@ def equals(self, other): ) def get_value(self, index): + """Compute coordinate value(s) at integer position(s) *index* using the stored segments.""" index = self.format_index(index, bounds="raise") reference = np.searchsorted(self.tie_indices, index, side="right") - 1 return self.tie_values[reference] + ( @@ -252,6 +281,7 @@ def get_value(self, index): ) def slice_index(self, index_slice): + """Return a new :class:`SampledCoordinate` for the integer slice *index_slice*.""" # normalize slice start, stop, step = index_slice.indices(len(self)) @@ -287,6 +317,25 @@ def slice_index(self, index_slice): return self.__class__(data, self.dim) def get_indexer(self, value, method=None): + """ + Return the integer index for label *value* using the segment structure. + + Parameters + ---------- + value : scalar, str (ISO datetime), or array-like + Label(s) to locate. + method : {None, "nearest", "ffill", "bfill"}, optional + How to handle values that fall in gaps or between samples. + + Returns + ------- + int or numpy.ndarray + + Raises + ------ + KeyError + If *value* falls in an overlap region or is not found (exact mode). + """ if isinstance(value, str): value = np.datetime64(value) else: @@ -358,6 +407,7 @@ def get_indexer(self, value, method=None): return self.tie_indices[reference] + offset def concat(self, other): + """Append *other* :class:`SampledCoordinate` segments after this one.""" if not isinstance(other, self.__class__): raise TypeError(f"cannot concatenate {type(other)} to {self.__class__}") if not self.dim == other.dim: @@ -384,9 +434,19 @@ def concat(self, other): ) def decimate(self, q): + """Return a new coordinate keeping every *q*-th sample (integer decimation).""" return self[::q] def simplify(self, tolerance=None): + """ + Merge adjacent segments whose gap is within *tolerance* of the sampling interval. + + Parameters + ---------- + tolerance : float, timedelta, or None + Maximum allowed discrepancy between the expected and actual start of the + next segment. ``None`` uses zero tolerance. ``False`` returns ``self`` unchanged. + """ if tolerance is False: return self # TODO: copy tolerance = parse_tolerance(tolerance, self.dtype) @@ -409,6 +469,20 @@ def simplify(self, tolerance=None): ) def get_split_indices(self, kind="discontinuities", tolerance=False): + """ + Return integer indices of segment boundaries (start of each segment except the first). + + Parameters + ---------- + kind : {"discontinuities", "gaps", "overlaps"}, optional + Which boundary type to return. Default ``"discontinuities"``. + tolerance : float, timedelta, or ``False`` + Minimum magnitude of the discrepancy to report. + + Returns + ------- + numpy.ndarray + """ valid_kinds = {"discontinuities", "gaps", "overlaps"} if kind not in valid_kinds: raise ValueError(f"`kind` must be one of {valid_kinds}; got {kind!r}") @@ -456,9 +530,11 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): @classmethod def from_array(cls, arr, dim=None, sampling_interval=None): + """Not supported — raises :exc:`NotImplementedError`.""" raise NotImplementedError("from_array is not implemented for SampledCoordinate") def to_dict(self): + """Serialise to ``{"dim": ..., "data": {"tie_values": ..., "tie_lengths": ..., "sampling_interval": ...}, "dtype": ...}``.""" tie_values = self.data["tie_values"] tie_lengths = self.data["tie_lengths"] if np.issubdtype(tie_values.dtype, np.datetime64): @@ -471,6 +547,7 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} def to_dataset(self, dataset, attrs): + """Write sampling metadata into an xarray *dataset* using CF tie-point conventions.""" mapping = f"{self.name}: {self.name}_sampling" if "coordinate_sampling" in attrs: attrs["coordinate_sampling"] += " " + mapping @@ -506,6 +583,7 @@ def to_dataset(self, dataset, attrs): @classmethod def from_dataset(cls, dataset, name): + """Read sampled coordinates from *dataset* using the ``coordinate_sampling`` attribute.""" coords = {} mapping = dataset[name].attrs.pop("coordinate_sampling", None) if mapping is not None: @@ -536,6 +614,7 @@ def from_dataset(cls, dataset, name): @classmethod def from_block(cls, start, size, step, dim=None, dtype=None): + """Build a single-segment :class:`SampledCoordinate` starting at *start* with *size* samples and step *step*.""" data = { "tie_values": [start], "tie_lengths": [size], diff --git a/xdas/coordinates/scalar.py b/xdas/coordinates/scalar.py index b6bebbbd..e7beff3a 100644 --- a/xdas/coordinates/scalar.py +++ b/xdas/coordinates/scalar.py @@ -1,9 +1,31 @@ +""" +:class:`ScalarCoordinate`: non-dimensional (scalar) coordinate that carries a +single value without being tied to an array axis. +""" + import numpy as np from .core import Coordinate, parse class ScalarCoordinate(Coordinate, name="scalar"): + """ + Non-dimensional coordinate that carries a single scalar value. + + Unlike dimensional coordinates, a :class:`ScalarCoordinate` is not tied + to an array axis and has no length. Typical use: metadata attached to a + :class:`DataArray` (e.g. an instrument identifier or a shot time). + + Parameters + ---------- + data : scalar-like + The scalar value. Cannot be ``None``. + dim : must be ``None`` + Passing a non-``None`` value raises :exc:`ValueError`. + dtype : dtype-like, optional + Cast *data* to this dtype. + """ + def __init__(self, data=None, dim=None, dtype=None): if data is None: raise TypeError("scalar coordinate cannot be empty, please provide a value") @@ -16,34 +38,42 @@ def __init__(self, data=None, dim=None, dtype=None): @property def dim(self): + """Always ``None`` — scalar coordinates have no associated dimension.""" return None @dim.setter def dim(self, value): + """Not supported — raises :exc:`ValueError` if *value* is not ``None``.""" if value is not None: raise ValueError("A scalar coordinate cannot have a `dim` other that None") @staticmethod def isvalid(data): + """Return ``True`` if *data* converts to a 0-d non-object numpy array.""" data = np.asarray(data) return (data.dtype != np.dtype(object)) and (data.ndim == 0) def isscalar(self): + """Return ``True`` (this is a :class:`ScalarCoordinate`).""" return True def get_sampling_interval(self, cast=True): + """Return ``None`` — scalar coordinates have no sample spacing.""" return None def equals(self, other): + """Return ``True`` if *other* is a :class:`ScalarCoordinate` with the same value.""" if isinstance(other, self.__class__): return self.data == other.data else: return False def to_index(self, item, method=None, endpoint=True): + """Not supported — raises :exc:`NotImplementedError`.""" raise NotImplementedError("cannot get index of scalar coordinate") def to_dict(self): + """Serialise to ``{"dim": None, "data": ..., "dtype": ...}``.""" if np.issubdtype(self.dtype, np.datetime64): data = self.data.astype(str).item() else: diff --git a/xdas/core/__init__.py b/xdas/core/__init__.py index e69de29b..ea86c4a9 100644 --- a/xdas/core/__init__.py +++ b/xdas/core/__init__.py @@ -0,0 +1,4 @@ +""" +Core data types for xdas: :class:`DataArray`, :class:`DataCollection`, +and supporting routines, methods, and NumPy dispatch. +""" diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 8c7c31d7..23e0928d 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -1,3 +1,8 @@ +""" +:class:`DataArray`: the primary N-dimensional array object with labeled +coordinates, NumPy/Dask backing, and lazy :class:`VirtualArray` support. +""" + import copy from functools import partial @@ -175,17 +180,21 @@ def __getattr__(self, name): raise AttributeError(f"'DataArray' object has no attribute '{name}'") def conj(self): + """Return the complex conjugate, element-wise.""" return np.conj(self) def conjugate(self): + """Return the complex conjugate, element-wise (alias of :meth:`conj`).""" return np.conjugate(self) @property def data(self): + """The underlying array (numpy, dask, or :class:`~xdas.virtual.VirtualArray`).""" return self._data @data.setter def data(self, value): + """Replace the underlying data; must have the same shape as the current array.""" if not hasattr(value, "__array__"): value = np.asarray(value) if not value.shape == self.shape: @@ -197,10 +206,12 @@ def data(self, value): @property def coords(self): + """The :class:`~xdas.coordinates.Coordinates` container for this array.""" return self._coords @coords.setter def coords(self, value): + """Replace the coordinates container; dimensions must remain unchanged.""" value = Coordinates(value) if not value.dims == self.coords.dims: raise ValueError( @@ -212,10 +223,12 @@ def coords(self, value): @property def dims(self): + """Tuple of dimension names in axis order.""" return self.coords.dims @dims.setter def dims(self, value): + """Not supported — raises :exc:`AttributeError` directing users to rename/transpose instead.""" raise AttributeError( "you cannot assign dims on a DataArray, " "use .rename(), .transpose() or .swap_dims() instead" @@ -223,41 +236,51 @@ def dims(self, value): @property def shape(self): + """Shape tuple of the underlying data array.""" return self.data.shape @property def dtype(self): + """NumPy dtype of the underlying data array.""" return self.data.dtype @property def ndim(self): + """Number of dimensions.""" return self.data.ndim @property def size(self): + """Total number of elements.""" return self.data.size @property def sizes(self): + """Dict-like mapping from dimension name to its size.""" return DimSizer(self) @property def nbytes(self): + """Total byte size of the underlying data.""" return self.data.nbytes @property def values(self): + """Materialised numpy array of all values.""" return self.__array__() @property def empty(self): + """``True`` if any dimension has size zero.""" return np.prod(self.data.shape) == 0 @property def loc(self): + """Label-based indexer; supports ``da.loc[label]`` and ``da.loc[label] = value``.""" return LocIndexer(self) def equals(self, other): + """Return ``True`` if *other* has equal data, coordinates, dims, name, and attrs.""" if isinstance(other, self.__class__): if not self.dtype == other.dtype: return False @@ -391,10 +414,12 @@ def sel( return da def drop_dims(self, *dims): + """Return a new :class:`DataArray` with *dims* and their coordinates removed.""" coords = self.coords.drop_dims(*dims) return self.__class__(self.data, coords, coords.dims, self.name, self.attrs) def drop_coords(self, *names): + """Return a new :class:`DataArray` with the named coordinates removed.""" coords = self.coords.drop_coords(*names) return self.__class__(self.data, coords, coords.dims, self.name, self.attrs) @@ -477,6 +502,7 @@ def rename(self, new_name_or_name_dict=None, **names): return self.__class__(self.data, new_coords, new_dims, new_name, self.attrs) def load(self): + """Load the data into memory and return a new :class:`DataArray` backed by a numpy array.""" return self.copy(data=self.data.__array__()) def assign_coords(self, coords=None, **coords_kwargs): @@ -629,6 +655,7 @@ def swap_dims(self, dims_dict=None, **dims_kwargs): @property def T(self): + """Transposed array with dimension order reversed.""" return self.transpose() def transpose(self, *dims): @@ -779,6 +806,7 @@ def to_xarray(self): @classmethod def from_xarray(cls, da): + """Build a :class:`DataArray` from an :class:`xarray.DataArray` *da*.""" return cls(da.data, da.coords, da.dims, da.name, da.attrs) def to_stream( @@ -986,6 +1014,8 @@ def plot(self, *args, **kwargs): class LocIndexer: + """Label-based indexer returned by :attr:`DataArray.loc`.""" + def __init__(self, obj): self.obj = obj @@ -999,6 +1029,8 @@ def __setitem__(self, key, value): class DimSizer(dict): + """Dict-like mapping from dimension names to their sizes, returned by :attr:`DataArray.sizes`.""" + def __init__(self, obj): super().__init__({dim: size for dim, size in zip(obj.dims, obj.shape)}) diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 2ef6dbab..b3a5ae1b 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -1,3 +1,8 @@ +""" +:class:`DataCollection`, :class:`DataSequence`, and :class:`DataMapping`: +nested tree structures for grouping multiple :class:`DataArray` objects. +""" + import os from fnmatch import fnmatch from pathlib import Path @@ -65,6 +70,7 @@ def __new__(cls, data, name=None): @property def empty(self): + """``True`` if the collection contains no elements.""" return len(self) == 0 def query(self, indexers=None, **indexers_kwargs): @@ -150,9 +156,11 @@ def query(self, indexers=None, **indexers_kwargs): return self def issequence(self): + """Return ``True`` if this is a :class:`DataSequence`.""" return isinstance(self, DataSequence) def ismapping(self): + """Return ``True`` if this is a :class:`DataMapping`.""" return isinstance(self, DataMapping) @classmethod @@ -229,6 +237,7 @@ def __reduce__(self): @property def fields(self): + """Ordered, deduplicated tuple of node names at this level and its immediate children.""" out = (self.name,) + tuple( value.name for value in self.values() if isinstance(value, DataCollection) ) @@ -243,17 +252,20 @@ def to_netcdf( encoding=None, create_dirs=False, ): + """Write this :class:`DataMapping` to a NetCDF file (see :func:`~xdas.io.xdas.save_datamapping`).""" from ..io.xdas import save_datamapping save_datamapping(self, fname, mode, group, virtual, encoding, create_dirs) @classmethod def from_netcdf(cls, fname, group=None): + """Lazily read a :class:`DataMapping` from a NetCDF file (see :func:`~xdas.io.xdas.open_datamapping`).""" from ..io.xdas import open_datamapping return open_datamapping(fname, group) def equals(self, other): + """Return ``True`` if *other* is a :class:`DataMapping` with identical keys and values.""" if not isinstance(other, self.__class__): return False if not self.name == other.name: @@ -416,16 +428,19 @@ def __reduce__(self): @property def fields(self): + """Ordered, deduplicated tuple of node names at this level and its immediate children.""" out = (self.name,) + tuple( value.name for value in self if isinstance(value, DataCollection) ) return uniquifiy(out) def to_mapping(self): + """Convert to an integer-keyed :class:`DataMapping`.""" return DataMapping({key: value for key, value in enumerate(self)}, self.name) @classmethod def from_mapping(cls, data): + """Build a :class:`DataSequence` from the values of a :class:`DataMapping`.""" return cls(data.values(), data.name) def to_netcdf( @@ -437,6 +452,7 @@ def to_netcdf( encoding=None, create_dirs=False, ): + """Write this :class:`DataSequence` to a NetCDF file by converting to a mapping first.""" self.to_mapping().to_netcdf( fname, mode=mode, @@ -448,9 +464,11 @@ def to_netcdf( @classmethod def from_netcdf(cls, fname, group=None): + """Lazily read a :class:`DataSequence` from a NetCDF file.""" return DataMapping.from_netcdf(fname, group).from_mapping() def equals(self, other): + """Return ``True`` if *other* is a :class:`DataSequence` with identical elements.""" if not isinstance(other, self.__class__): return False if not self.name == other.name: @@ -582,6 +600,12 @@ def copy(self, deep=True): def parse(data, name=None): + """ + Normalise *(data, name)* inputs accepted by :class:`DataCollection` constructors. + + Unpacks ``(name, data)`` tuples and propagates the name from an existing + :class:`DataCollection` when no explicit name is given. + """ if isinstance(data, tuple): if name is None: name, data = data @@ -593,6 +617,7 @@ def parse(data, name=None): def get_depth(group): + """Return the maximum nesting depth of an HDF5 *group* by counting ``"/"`` separators.""" if not isinstance(group, h5py.Group): raise ValueError("not a group") depths = [] @@ -601,5 +626,6 @@ def get_depth(group): def uniquifiy(seq): + """Return a deduplicated tuple of *seq* elements in their original order.""" seen = set() return tuple(x for x in seq if x not in seen and not seen.add(x)) diff --git a/xdas/core/methods.py b/xdas/core/methods.py index d5484b84..13da0f09 100644 --- a/xdas/core/methods.py +++ b/xdas/core/methods.py @@ -1,3 +1,8 @@ +""" +Registration helpers and implementations for :class:`DataArray` instance +methods dispatched through ``HANDLED_METHODS``. +""" + import numpy as np from ..atoms.core import atomized @@ -5,7 +10,17 @@ def implements(name=None): + """ + Register *func* in ``HANDLED_METHODS`` so it becomes available as a :class:`DataArray` method. + + Parameters + ---------- + name : str, optional + Method name to register under. Defaults to ``func.__name__``. + """ + def decorator(func): + """Register *func* under *key* and return it unchanged.""" key = name if name is not None else func.__name__ HANDLED_METHODS[key] = func return func diff --git a/xdas/core/numpy.py b/xdas/core/numpy.py index 3319329d..6c63045e 100644 --- a/xdas/core/numpy.py +++ b/xdas/core/numpy.py @@ -1,3 +1,8 @@ +""" +NumPy function dispatch for :class:`DataArray` via ``__array_function__``, +mapping NumPy functions to coordinate-aware implementations. +""" + from inspect import signature import numpy as np @@ -7,7 +12,10 @@ def implements(numpy_function): + """Register *func* as the :class:`DataArray` implementation of *numpy_function*.""" + def decorator(func): + """Register *func* and return it unchanged.""" HANDLED_NUMPY_FUNCTIONS[numpy_function] = func return func @@ -15,11 +23,25 @@ def decorator(func): def handled(reduce=False, drop_coords=False, **defaults): + """ + Decorator factory that wraps a NumPy function to be coordinate-aware. + + Parameters + ---------- + reduce : bool, optional + If ``True``, drop the reduced dimension from the output coordinates. + drop_coords : bool, optional + If ``True``, return a plain array without wrapping in :class:`DataArray`. + **defaults : dict + Default keyword arguments forwarded to the wrapped function. + """ def decorator(func): + """Build and register the coordinate-aware wrapper for *func*.""" sig = signature(func) @implements(func) def wrapper(*args, **kwargs): + """Forward *func* call while preserving or reducing DataArray coordinates.""" ba = sig.bind(*args, **kwargs) ba.apply_defaults() ba.arguments.update(defaults) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 5f2c777f..537f55ec 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -1,3 +1,9 @@ +""" +Top-level routines for opening, concatenating, aligning, and splitting +:class:`DataArray` and :class:`DataCollection` objects, including +multi-file helpers (``open_mfdataarray``, ``open_mfdatacollection``). +""" + import os import re import warnings @@ -829,6 +835,14 @@ def __init__(self, message): class Bag: + """ + Accumulator that collects :class:`DataArray` objects for concatenation along *dim*. + + Compatibility checks (dims, shape, coords, sampling interval, dtype) are run on + each appended object; incompatible objects raise :exc:`CompatibilityError` so the + caller can start a new bag. + """ + def __init__(self, dim): self.objs = [] self.dim = dim @@ -837,6 +851,7 @@ def __iter__(self): return iter(self.objs) def initialize(self, da): + """Set *da* as the first element and record its shape, coords, sampling interval, and dtype.""" self.objs = [da] self.dims = da.dims self.subshape = tuple( @@ -854,6 +869,7 @@ def initialize(self, da): self.dtype = da.dtype def append(self, da): + """Add *da* after running all compatibility checks; initialises on first call.""" if not self.objs: self.initialize(da) else: @@ -865,19 +881,23 @@ def append(self, da): self.objs.append(da) def check_dims(self, da): + """Raise :exc:`CompatibilityError` if *da* has different dimensions.""" if not self.dims == da.dims: raise CompatibilityError("dimensions are not compatible") def check_shape(self, da): + """Raise :exc:`CompatibilityError` if *da* has a different non-concat shape.""" subshape = tuple(size for dim, size in da.sizes.items() if not dim == self.dim) if not self.subshape == subshape: raise CompatibilityError("shapes are not compatible") def check_dtype(self, da): + """Raise :exc:`CompatibilityError` if *da* has a different dtype.""" if not self.dtype == da.dtype: raise CompatibilityError("data types are not compatible") def check_coords(self, da): + """Raise :exc:`CompatibilityError` if *da* has incompatible non-concat coordinates.""" subcoords = ( da.coords.drop_dims(self.dim) if self.dim in self.dims @@ -887,6 +907,7 @@ def check_coords(self, da): raise CompatibilityError("coordinates are not compatible") def check_sampling_interval(self, da): + """Raise :exc:`CompatibilityError` if *da* has a different sampling interval.""" if self.delta is None: pass else: diff --git a/xdas/dask/__init__.py b/xdas/dask/__init__.py index 4f612c92..362b88fb 100644 --- a/xdas/dask/__init__.py +++ b/xdas/dask/__init__.py @@ -1 +1,6 @@ +""" +Dask integration helpers for serializing and deserializing dask arrays +inside xdas HDF5 files. +""" + from .core import create_variable, dumps, loads diff --git a/xdas/dask/core.py b/xdas/dask/core.py index 911145ad..cc7d1ee6 100644 --- a/xdas/dask/core.py +++ b/xdas/dask/core.py @@ -1,3 +1,8 @@ +""" +Functions to store and restore dask arrays as HDF5 variables using msgpack +serialization of the dask task graph. +""" + import numpy as np from dask.array import Array @@ -5,6 +10,27 @@ def create_variable(arr, file, name, dims=None, dtype=None): + """ + Serialize *arr* and store it as an HDF5 variable attribute. + + Parameters + ---------- + arr : dask.array.Array + Dask array to persist. + file : netCDF4-like file handle + Open file in which to create the variable. + name : str + Variable name inside the file. + dims : sequence of str, optional + Dimension names for the variable. + dtype : dtype-like, optional + Data type for the variable. + + Returns + ------- + variable + The newly created file variable. + """ variable = file.create_variable(name, dims, dtype) variable.attrs.update({"__dask_array__": np.frombuffer(dumps(arr), "uint8")}) return variable @@ -52,6 +78,7 @@ def fuse(graph): def iskey(obj): + """Return ``True`` if *obj* looks like a dask graph key (string or ``(str, int…)`` tuple).""" if isinstance(obj, str) and len(obj) > 0: return True elif ( diff --git a/xdas/dask/serial.py b/xdas/dask/serial.py index bcf6862d..f4f2ca4c 100644 --- a/xdas/dask/serial.py +++ b/xdas/dask/serial.py @@ -1,3 +1,8 @@ +""" +msgpack-based serialization for dask task graphs, handling tuples, slices, +callables, ``methodcaller``, and ``itemgetter`` objects. +""" + import importlib import msgpack @@ -13,6 +18,21 @@ def encode(obj): + """ + msgpack *default* hook — encode non-native types as :class:`msgpack.ExtType`. + + Handles ``tuple``, ``slice``, ``callable``, :class:`methodcaller`, and + :class:`itemgetter`. + + Parameters + ---------- + obj : object + Object to encode. + + Returns + ------- + msgpack.ExtType + """ if isinstance(obj, tuple): code = codes["tuple"] obj = list(obj) @@ -35,6 +55,21 @@ def encode(obj): def decode(code, data): + """ + msgpack *ext_hook* — decode an :class:`msgpack.ExtType` back to the original object. + + Parameters + ---------- + code : int + Extension type code (one of the values in :data:`codes`). + data : bytes + Raw msgpack bytes for the payload. + + Returns + ------- + object + The decoded Python object. + """ obj = loads(data) if code == codes["tuple"]: return tuple(obj) @@ -51,8 +86,10 @@ def decode(code, data): def dumps(obj): + """Serialize *obj* to msgpack bytes, encoding extension types via :func:`encode`.""" return msgpack.dumps(obj, default=encode, strict_types=True) def loads(obj): + """Deserialize msgpack *obj* bytes, restoring extension types via :func:`decode`.""" return msgpack.loads(obj, strict_map_key=False, ext_hook=decode) diff --git a/xdas/fft.py b/xdas/fft.py index 99cdfbb5..413d4439 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -1,3 +1,8 @@ +""" +FFT functions that preserve :class:`DataArray` coordinates: :func:`fft`, +:func:`ifft`, :func:`rfft`, :func:`irfft`, :func:`fftfreq`, :func:`rfftfreq`. +""" + import numpy as np from .atoms.core import atomized diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index e47836bb..5689acce 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -1,3 +1,8 @@ +""" +I/O subsystem: plugin-based :class:`Engine` registry and concrete engines for +xdas native, ASN, APSensing, Febus, MiniSEED, ProdML, Silixa, Terra15 formats. +""" + from . import ( apsensing, asn, diff --git a/xdas/io/apsensing.py b/xdas/io/apsensing.py index 034fbbcd..4a04f1e9 100644 --- a/xdas/io/apsensing.py +++ b/xdas/io/apsensing.py @@ -1,3 +1,7 @@ +""" +I/O engine for APSensing HDF5 files (:class:`APSensingEngine`). +""" + import h5py import numpy as np @@ -8,6 +12,8 @@ class APSensingEngine(Engine, name="apsensing"): + """Engine for reading APSensing HDF5 files.""" + _supported_vtypes = ["hdf5"] _supported_ctypes = { "time": ["interpolated", "sampled", "dense"], @@ -15,6 +21,7 @@ class APSensingEngine(Engine, name="apsensing"): } def open_dataarray(self, fname): + """Read an APSensing HDF5 file *fname* and return a virtual :class:`DataArray`.""" with h5py.File(fname, "r") as file: t0 = file["Metadata"]["Timestamp"][()].item().decode() fs = file["ProcessingServer"]["DataRate"][()].item() diff --git a/xdas/io/asn.py b/xdas/io/asn.py index c6aee728..caac424b 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -1,3 +1,8 @@ +""" +I/O engine for ASN HDF5 files (:class:`ASNEngine`) and a ZMQ-based +real-time subscriber (:class:`ZMQSubscriber`) for live ASN streams. +""" + import json from bisect import bisect_left, bisect_right @@ -12,6 +17,8 @@ class ASNEngine(Engine, name="asn"): + """Engine for reading ASN HDF5 files.""" + _supported_vtypes = ["hdf5"] _supported_ctypes = { "time": ["interpolated", "sampled", "dense"], @@ -19,6 +26,7 @@ class ASNEngine(Engine, name="asn"): } def open_dataarray(self, fname): + """Read an ASN HDF5 file *fname* and return a virtual :class:`DataArray`.""" with h5py.File(fname, "r") as file: header = file["header"] demod = file["demodSpec"] @@ -82,6 +90,15 @@ def _get_roi_bound_indices(self, all_dists, n_start, n_end, dx): class ZMQSubscriber: + """ + Iterator that pulls :class:`DataArray` chunks from a live ASN ZMQ publisher. + + Parameters + ---------- + address : str + ZMQ address of the publisher (e.g. ``"tcp://localhost:5555"``). + """ + def __init__(self, address): """ Initializes a ZMQStream object. @@ -213,17 +230,21 @@ def __init__(self, address): @property def header(self): + """The last welcome-message header dict sent to new subscribers.""" return self._header @header.setter def header(self, header): + """Set the welcome-message header and push it to the ZMQ socket option.""" self._header = header self.socket.setsockopt(zmq.XPUB_WELCOME_MSG, json.dumps(header).encode("utf-8")) def submit(self, da): + """Publish *da* over ZMQ.""" self._send(da) def write(self, da): + """Alias for :meth:`submit`.""" self._send(da) def _connect(self, address): diff --git a/xdas/io/core.py b/xdas/io/core.py index 96891c29..4faf0651 100644 --- a/xdas/io/core.py +++ b/xdas/io/core.py @@ -1,3 +1,8 @@ +""" +Plugin base class :class:`Engine` for file format handlers, plus +:class:`AutoEngine` for format auto-detection and :func:`get_free_port`. +""" + import socket @@ -86,15 +91,19 @@ def __class_getitem__(cls, item): raise KeyError(f"Item '{item}' not found in registry or aliases") def open_dataarray(self, fname, **kwargs): + """Open *fname* and return a :class:`DataArray` (abstract).""" raise NotImplementedError def save_dataarray(self, da, fname, **kwargs): + """Write *da* to *fname* (abstract).""" raise NotImplementedError def open_datacollection(self, fname, **kwargs): + """Open *fname* and return a :class:`DataCollection` (abstract).""" raise NotImplementedError def save_datacollection(self, dc, fname, **kwargs): + """Write *dc* to *fname* (abstract).""" raise NotImplementedError def _parse_vtype(self, vtype): @@ -187,6 +196,7 @@ class AutoEngine(Engine): _last_successful_engine = "xdas" def open_dataarray(self, fname, **kwargs): + """Try each registered engine in order and return the first successful result.""" for engine in self._ordered_engines(): try: out = Engine[engine](vtype=self.vtype, ctype=self.ctype).open_dataarray( diff --git a/xdas/io/febus.py b/xdas/io/febus.py index c1b95277..0022d41e 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -1,3 +1,7 @@ +""" +I/O engine for Febus HDF5 files (:class:`FebusEngine`). +""" + import warnings import h5py @@ -11,6 +15,8 @@ class FebusEngine(Engine, name="febus"): + """Engine for reading Febus HDF5 files.""" + _supported_vtypes = ["hdf5"] _supported_ctypes = { "time": ["interpolated", "sampled", "dense"], diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index 919eef71..a13a2a22 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -1,3 +1,7 @@ +""" +I/O engine for MiniSEED files via ObsPy (:class:`MiniSEEDEngine`). +""" + import dask import numpy as np import obspy @@ -9,12 +13,15 @@ class MiniSEEDEngine(Engine, name="miniseed"): + """Engine for reading MiniSEED files via ObsPy as lazy dask-backed DataArrays.""" + _supported_vtypes = ["dask"] _supported_ctypes = { "time": ["interpolated", "sampled", "dense"], } def open_dataarray(self, fname, ignore_last_sample=False, ctype="interpolated"): + """Return a lazy dask-backed :class:`DataArray` for the MiniSEED file *fname*.""" shape, dtype, coords, method = self.read_header( fname, ignore_last_sample, ctype ) @@ -26,6 +33,7 @@ def open_dataarray(self, fname, ignore_last_sample=False, ctype="interpolated"): return DataArray(data, coords) def read_header(self, path, ignore_last_sample, ctype): + """Read metadata from *path* and return ``(shape, dtype, coords, method)``.""" st = obspy.read(path, headonly=True) dtype = uniquifiy(tr.data.dtype for tr in st) @@ -79,6 +87,7 @@ def read_header(self, path, ignore_last_sample, ctype): return shape, dtype, coords, method def read_data(self, path, method, ignore_last_sample): + """Load and return the raw data array from *path* using *method*.""" st = obspy.read(path) if method == "synchronized": if ignore_last_sample: @@ -107,6 +116,23 @@ def to_stream( channel="{:1}N1", dim={"last": "first"}, ): + """ + Convert a 2-D :class:`DataArray` to an :class:`obspy.Stream`. + + Parameters + ---------- + da : DataArray + 2-D array with one time and one distance/channel dimension. + network, station, location, channel : str + SEED identifiers. *station* and *channel* may contain ``{:...}`` + format specs that are filled with the channel index. + dim : dict, optional + ``{distance_dim: time_dim}`` mapping. Defaults to ``{"last": "first"}``. + + Returns + ------- + obspy.Stream + """ dimdist, dimtime = dim.copy().popitem() if not da.ndim == 2: raise ValueError("the data array must be 2D") @@ -134,6 +160,20 @@ def to_stream( def from_stream(st, dims=("channel", "time")): + """ + Convert an :class:`obspy.Stream` to a :class:`DataArray`. + + Parameters + ---------- + st : obspy.Stream + Homogeneous stream (all traces must share start time and sample rate). + dims : tuple of str, optional + Dimension names for the output array. + + Returns + ------- + DataArray + """ data = np.stack([tr.data for tr in st]) channel = [tr.id for tr in st] time = { @@ -147,6 +187,7 @@ def from_stream(st, dims=("channel", "time")): def get_time_coord(tr, ignore_last_sample, ctype): + """Build a :class:`Coordinate` for the time axis of trace *tr*.""" t0 = np.datetime64(tr.stats.starttime) dt = np.rint(1e6 * tr.stats.delta).astype("m8[us]").astype("m8[ns]") nt = tr.stats.npts - int(ignore_last_sample) @@ -154,6 +195,7 @@ def get_time_coord(tr, ignore_last_sample, ctype): def uniquifiy(seq): + """Return the unique elements of *seq* in order; unwrap to scalar if only one.""" seen = set() seq = list(x for x in seq if x not in seen and not seen.add(x)) if len(seq) == 1: @@ -163,6 +205,7 @@ def uniquifiy(seq): def get_band_code(sampling_rate): + """Return the SEED band code character for *sampling_rate* (Hz).""" band_code = ["T", "P", "R", "U", "V", "L", "M", "B", "H", "C", "F"] limits = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 80, 250, 1000, 5000] index = np.searchsorted(limits, sampling_rate, "right") - 1 diff --git a/xdas/io/prodml.py b/xdas/io/prodml.py index 69ceb2c9..9e1bbe0d 100644 --- a/xdas/io/prodml.py +++ b/xdas/io/prodml.py @@ -1,3 +1,8 @@ +""" +I/O engine for ProdML HDF5 files (:class:`ProdML`), also known as +OptaSense and Sintela format. +""" + import h5py import numpy as np import pandas as pd @@ -9,6 +14,8 @@ class ProdML(Engine, name="prodml", aliases=["optasense", "sintela"]): + """Engine for reading ProdML / OptaSense / Sintela HDF5 files.""" + _supported_vtypes = ["hdf5"] _supported_ctypes = { "time": ["interpolated"], @@ -16,6 +23,7 @@ class ProdML(Engine, name="prodml", aliases=["optasense", "sintela"]): } def open_dataarray(self, fname, swapped_dims=False): + """Read a ProdML HDF5 file *fname* and return a virtual :class:`DataArray`.""" with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] diff --git a/xdas/io/silixa.py b/xdas/io/silixa.py index 9ede0e47..91f35a2e 100644 --- a/xdas/io/silixa.py +++ b/xdas/io/silixa.py @@ -1,3 +1,7 @@ +""" +I/O engine for Silixa TDMS files (:class:`SilixaEngine`). +""" + import dask import numpy as np @@ -8,6 +12,8 @@ class SilixaEngine(Engine, name="silixa"): + """Engine for reading Silixa iDAS TDMS files as lazy dask-backed DataArrays.""" + _supported_vtypes = ["dask"] _supported_ctypes = { "time": ["interpolated", "sampled", "dense"], @@ -15,6 +21,7 @@ class SilixaEngine(Engine, name="silixa"): } def open_dataarray(self, fname): + """Return a lazy dask-backed :class:`DataArray` for the TDMS file *fname*.""" shape, dtype, coords = self.read_header(fname) data = dask.array.from_delayed( dask.delayed(self.read_data)(fname), shape, dtype @@ -22,6 +29,7 @@ def open_dataarray(self, fname): return DataArray(data, coords) def read_header(self, fname): + """Read TDMS header and return ``(shape, dtype, coords)``.""" with TdmsReader(fname) as tdms: props = tdms.get_properties() shape = tdms.channel_length, tdms.fileinfo["n_channels"] @@ -44,6 +52,7 @@ def read_header(self, fname): return shape, dtype, coords def read_data(self, fname): + """Read and return the raw data array from the TDMS file *fname*.""" with TdmsReader(fname) as tdms: data = tdms.get_data() return data diff --git a/xdas/io/tdms.py b/xdas/io/tdms.py index c2b94305..58f57f3c 100644 --- a/xdas/io/tdms.py +++ b/xdas/io/tdms.py @@ -31,6 +31,7 @@ def load_property_map(xls_file): + """Load a tag-renaming map from the ``Sheet1`` of an Excel file.""" prop_map = pd.read_excel(xls_file, sheetname="Sheet1") return ( prop_map[["CurrentTag", "CorrectTag"]] @@ -44,6 +45,7 @@ def load_property_map(xls_file): def write_property_dict(prop_dict, out_file): + """Write *prop_dict* as a Python-literal assignment to *out_file*.""" from pprint import pformat f = open(out_file, "w") diff --git a/xdas/io/terra15.py b/xdas/io/terra15.py index 85d1c787..cc0ad6f6 100644 --- a/xdas/io/terra15.py +++ b/xdas/io/terra15.py @@ -1,3 +1,7 @@ +""" +I/O engine for Terra15 HDF5 files (:class:`Terra15Engine`). +""" + import h5py import pandas as pd @@ -8,6 +12,8 @@ class Terra15Engine(Engine, name="terra15"): + """Engine for reading Terra15 HDF5 files.""" + _supported_vtypes = ["hdf5"] _supported_ctypes = { "time": ["interpolated"], @@ -15,6 +21,7 @@ class Terra15Engine(Engine, name="terra15"): } def open_dataarray(self, fname, tz="UTC"): + """Read a Terra15 HDF5 file *fname* and return a virtual :class:`DataArray`.""" with h5py.File(fname, "r") as file: ti = ( pd.Timestamp(file["data_product"]["gps_time"][0], unit="s", tz=tz) diff --git a/xdas/io/utils.py b/xdas/io/utils.py index d26cf0a5..58af7c19 100644 --- a/xdas/io/utils.py +++ b/xdas/io/utils.py @@ -1,3 +1,8 @@ +""" +HDF5 utility functions for compressing datasets while preserving file +structure and metadata. +""" + import h5py import hdf5plugin diff --git a/xdas/io/xdas.py b/xdas/io/xdas.py index 9c392267..89539924 100644 --- a/xdas/io/xdas.py +++ b/xdas/io/xdas.py @@ -1,3 +1,8 @@ +""" +I/O engine for the native xdas HDF5/NetCDF4 format (:class:`XdasEngine`), +supporting :class:`DataArray`, :class:`DataSequence`, and :class:`DataMapping`. +""" + import os from pathlib import Path @@ -16,21 +21,40 @@ class XdasEngine(Engine, name="xdas"): + """Engine for the native xdas HDF5/NetCDF4 format.""" def open_dataarray(self, fname, **kwargs): + """Delegate to module-level :func:`open_dataarray`.""" return open_dataarray(fname, **kwargs) def save_dataarray(self, da, fname, **kwargs): + """Delegate to module-level :func:`save_dataarray`.""" return save_dataarray(da, fname, **kwargs) def open_datacollection(self, fname, **kwargs): + """Delegate to module-level :func:`open_datamapping`.""" return open_datamapping(fname, **kwargs) def save_datacollection(self, dc, fname, **kwargs): + """Delegate to module-level :func:`save_datamapping`.""" return save_datamapping(dc, fname, **kwargs) def open_dataarray(fname, group=None): + """ + Read a :class:`DataArray` from a native xdas NetCDF4/HDF5 file. + + Parameters + ---------- + fname : str or Path + Path to the file. + group : str, optional + HDF5 group path inside the file. + + Returns + ------- + DataArray + """ if isinstance(fname, Path): fname = str(fname) @@ -87,6 +111,26 @@ def open_dataarray(fname, group=None): def save_dataarray( da, fname, mode="w", group=None, virtual=None, encoding=None, create_dirs=False ): + """ + Write *da* to a native xdas NetCDF4/HDF5 file. + + Parameters + ---------- + da : DataArray + Data to write. + fname : str or Path + Output file path. + mode : str, optional + File open mode (``"w"`` or ``"a"``). + group : str, optional + HDF5 group path within the file. + virtual : bool, optional + If ``True``, write as a virtual (lazy) dataset. + encoding : dict, optional + HDF5/NetCDF4 encoding options. + create_dirs : bool, optional + Create parent directories if they do not exist. + """ if isinstance(fname, Path): fname = str(fname) @@ -149,6 +193,7 @@ def save_dataarray( def open_datacollection(fname, group=None): + """Read a :class:`DataCollection` from *fname*, auto-detecting sequence vs. mapping.""" if isinstance(fname, Path): fname = str(fname) dc = open_datamapping(fname, group) @@ -165,6 +210,7 @@ def open_datacollection(fname, group=None): def save_datacollection( dc, fname, mode="w", group=None, virtual=None, encoding=None, create_dirs=False ): + """Write *dc* to *fname*, dispatching to sequence or mapping writer as needed.""" if isinstance(fname, Path): fname = str(fname) @@ -177,6 +223,7 @@ def save_datacollection( def open_datamapping(fname, group=None): + """Read a :class:`DataMapping` from *fname*.""" if isinstance(fname, Path): fname = str(fname) @@ -210,6 +257,7 @@ def open_datamapping(fname, group=None): def save_datamapping( dm, fname, mode="w", group=None, virtual=None, encoding=None, create_dirs=False ): + """Write :class:`DataMapping` *dm* to *fname*, writing each key as a separate group.""" if mode == "w" and group is None and os.path.exists(fname): os.remove(fname) for key in dm: @@ -227,6 +275,7 @@ def save_datamapping( def open_datasequence(fname, group=None): + """Read a :class:`DataSequence` from *fname* via :func:`open_datamapping`.""" dm = open_datamapping(fname, group) return DataSequence.from_mapping(dm) @@ -234,6 +283,7 @@ def open_datasequence(fname, group=None): def save_datasequence( ds, fname, mode="w", group=None, virtual=None, encoding=None, create_dirs=False ): + """Write :class:`DataSequence` *ds* to *fname* by converting to a mapping first.""" dm = ds.to_mapping() save_datamapping(dm, fname, mode, group, virtual, encoding, create_dirs) diff --git a/xdas/parallel.py b/xdas/parallel.py index ddc43eb2..a8cadcd4 100644 --- a/xdas/parallel.py +++ b/xdas/parallel.py @@ -1,3 +1,8 @@ +""" +Thread-parallelism decorator :func:`parallelize` for splitting array axes +across workers using :class:`~concurrent.futures.ThreadPoolExecutor`. +""" + import os from concurrent.futures import ThreadPoolExecutor from functools import wraps @@ -8,9 +13,31 @@ def parallelize(split_axis=0, concat_axis=0, parallel=None): + """ + Decorator factory that splits array positional arguments across threads. + + Parameters + ---------- + split_axis : int or tuple of int, optional + Axis (or axes) along which to split positional array arguments. + Use ``None`` for arguments that should not be split. + concat_axis : int or tuple of int, optional + Axis (or axes) along which to concatenate the per-worker outputs. + parallel : int, bool, or None, optional + Worker count override. Forwarded to :func:`get_workers_count`. + + Returns + ------- + decorator : callable + A function decorator. + """ + def decorator(func): + """Return a thread-parallelised wrapper for *func*.""" + @wraps(func) def wrapper(*args, **kwargs): + """Split inputs, dispatch to a thread pool, then concatenate outputs.""" split_axes = split_axis if isinstance(split_axis, tuple) else (split_axis,) split_axes += (None,) * (len(args) - len(split_axes)) inputs = tuple( @@ -20,6 +47,7 @@ def wrapper(*args, **kwargs): args = tuple(value for value, axis in zip(args, split_axes) if axis is None) def fn(_inputs, tuplize=True): + """Call *func* on one chunk; optionally wrap scalar output in a tuple.""" _inputs = iter(_inputs) _args = iter(args) _args = tuple( diff --git a/xdas/picking.py b/xdas/picking.py index bea3275e..ceb4264d 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -1,3 +1,8 @@ +""" +Phase-pick utilities: tapered selection and cross-correlation based +picking of onset times in :class:`DataArray` objects. +""" + import numpy as np from numba import njit, prange from scipy.fft import next_fast_len diff --git a/xdas/processing/__init__.py b/xdas/processing/__init__.py index 2cbe1d76..fd6a3e9b 100644 --- a/xdas/processing/__init__.py +++ b/xdas/processing/__init__.py @@ -1,3 +1,8 @@ +""" +Chunked processing pipeline: loaders, writers, real-time streaming, and +the :func:`process` orchestrator for larger-than-memory datasets. +""" + from .core import ( DataArrayLoader, DataArrayWriter, diff --git a/xdas/processing/core.py b/xdas/processing/core.py index 65797a87..46e7d447 100644 --- a/xdas/processing/core.py +++ b/xdas/processing/core.py @@ -1,3 +1,9 @@ +""" +Core processing infrastructure: :class:`DataArrayLoader`, :class:`DataArrayWriter`, +:class:`DataFrameWriter`, :class:`StreamWriter`, :class:`ZMQPublisher`, +:class:`ZMQSubscriber`, :class:`RealTimeLoader`, and the :func:`process` function. +""" + import os from concurrent.futures import ThreadPoolExecutor from glob import glob @@ -166,10 +172,22 @@ def __iter__(self): @property def nbytes(self): + """Total bytes of the underlying :class:`DataArray`.""" return self.da.nbytes class RealTimeLoader(Observer): + """ + Real-time :class:`DataArray` loader that watches a directory for new files. + + Parameters + ---------- + path : str or Path + Directory to watch. + engine : str, optional + Engine used to open arriving files. Defaults to ``"netcdf"``. + """ + def __init__(self, path, engine="netcdf"): super().__init__() self.path = str(path) if isinstance(path, Path) else path @@ -190,11 +208,14 @@ def __next__(self): class Handler(FileSystemEventHandler): + """Watchdog event handler that loads closed files into a queue.""" + def __init__(self, queue, engine): self.engine = engine self.queue = queue def on_closed(self, event): + """Load the newly-closed file and place it in the queue.""" da = open_dataarray(event.src_path, engine=self.engine) self.queue.put(da.load()) @@ -251,6 +272,14 @@ def __init__( self._count = 0 def submit(self, chunk): + """ + Asynchronously write *chunk* to disk and register the path for later concat. + + Parameters + ---------- + chunk : DataArray + Processed data chunk to persist. + """ if not isinstance(chunk, DataArray): raise TypeError(f"`chunk` must by a DataArray object, not a {type(chunk)}") if not len(self._futures) < self.max_buffers: @@ -261,6 +290,7 @@ def submit(self, chunk): self._count += 1 def write(self, chunk): + """Alias for :meth:`submit`.""" return self.submit(chunk) def _write(self, chunk, count): @@ -269,9 +299,11 @@ def _write(self, chunk, count): return open_dataarray(path) def shutdown(self): + """Shut down the internal thread pool.""" self._executor.shutdown() def result(self): + """Flush all pending writes and return the concatenated :class:`DataArray`.""" while self._futures: future = self._futures.pop(0) result = future.result() @@ -321,6 +353,14 @@ def __init__(self, path, parse_dates=None, create_dirs=False): self._future = None def submit(self, df): + """ + Asynchronously append *df* to the CSV file. + + Parameters + ---------- + df : pandas.DataFrame + DataFrame chunk to write. + """ if not isinstance(df, pd.DataFrame): raise TypeError(f"`df` must by a DataFrame object, not a {type(df)}") if self._future is not None: @@ -328,6 +368,7 @@ def submit(self, df): self._future = self._executor.submit(self._write, df) def write(self, df): + """Alias for :meth:`submit`.""" return self.submit(df) def _write(self, df): @@ -338,9 +379,11 @@ def _write(self, df): df.to_csv(self.path, mode="a", header=False, index=False) def shutdown(self): + """Shut down the internal thread pool.""" self._executor.shutdown() def result(self): + """Flush pending writes and return the full CSV as a :class:`pandas.DataFrame`.""" self._future.result() self.shutdown() try: @@ -493,6 +536,14 @@ def _to_flat(self, st): new_st.write(os.path.join(self.dirpath, self.fname), **self.kw_write) def submit(self, st): + """ + Asynchronously write *st* to a temporary MiniSEED file. + + Parameters + ---------- + st : obspy.Stream + Stream chunk to persist. + """ if not isinstance(st, obspy.Stream): raise TypeError(f"`st` must by a DataFrame object, not a {type(st)}") if self._future is not None: @@ -500,15 +551,18 @@ def submit(self, st): self._future = self._executor.submit(self._write, st) def write(self, st): + """Alias for :meth:`submit`.""" return self.submit(st) def _write(self, st): st.write(f"{self.dirpath}/{st[0].stats.starttime}_tmp.mseed", **self.kw_write) def shutdown(self): + """Shut down the internal thread pool.""" self._executor.shutdown() def result(self): + """Merge all temporary MiniSEED files and write the final output.""" self._future.result() self.shutdown() pattern = f"{self.dirpath}/*_tmp.mseed" @@ -586,9 +640,11 @@ def submit(self, da): self._socket.send(tobytes(da, self.encoding)) def write(self, da): + """Alias for :meth:`submit`.""" self.submit(da) def result(): + """Return ``None`` — ZMQPublisher has no aggregated result.""" return None @@ -657,6 +713,20 @@ def __next__(self): def tobytes(da, encoding=None): + """ + Serialise *da* to raw NetCDF4 bytes via a temporary file. + + Parameters + ---------- + da : DataArray + DataArray to serialise. + encoding : dict, optional + HDF5/NetCDF4 encoding options forwarded to :meth:`DataArray.to_netcdf`. + + Returns + ------- + bytes + """ with TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, "tmp.nc") da.to_netcdf(path, virtual=False, encoding=encoding) @@ -665,6 +735,18 @@ def tobytes(da, encoding=None): def frombuffer(da): + """ + Deserialise raw NetCDF4 *da* bytes into a loaded :class:`DataArray`. + + Parameters + ---------- + da : bytes + Raw bytes as produced by :func:`tobytes`. + + Returns + ------- + DataArray + """ with TemporaryDirectory() as tmpdir: path = os.path.join(tmpdir, "tmp.nc") with open(path, "wb") as file: diff --git a/xdas/processing/monitor.py b/xdas/processing/monitor.py index 095710ab..9f01aacb 100644 --- a/xdas/processing/monitor.py +++ b/xdas/processing/monitor.py @@ -1,9 +1,24 @@ +""" +:class:`Monitor`: tqdm-based throughput tracker for chunked processing pipelines. +""" + from time import perf_counter from tqdm import tqdm class Monitor: + """ + Throughput tracker for chunked processing pipelines backed by a tqdm progress bar. + + Parameters + ---------- + total : int or None, optional + Total expected bytes; forwarded to :class:`tqdm.tqdm`. + smoothing : float, optional + EMA smoothing factor for per-phase timing. Defaults to ``0.3``. + """ + def __init__(self, total=None, smoothing=0.3): self.pbar = tqdm(total=total, unit="B", unit_scale=True, unit_divisor=1024) self.smoothing = smoothing @@ -13,9 +28,11 @@ def __init__(self, total=None, smoothing=0.3): self.cum = {} def tic(self, key): + """Record the start time for phase *key*.""" self.time[key] = perf_counter() def toc(self, nbytes): + """Record end times, update EMAs, advance the progress bar by *nbytes*.""" time = perf_counter() values = list(self.time.values()) + [time] for idx, key in enumerate(self.time): @@ -34,25 +51,31 @@ def toc(self, nbytes): self.pbar.set_postfix(self.usage_str()) def close(self): + """Display the final average usage and close the progress bar.""" self.pbar.set_postfix(self.average_usage_str()) self.pbar.close() def usage(self): + """Return the current per-phase time fraction dict for the last iteration.""" total = sum(self.iter.values()) return {key: self.iter[key] / total for key in self.iter} def average_usage(self): + """Return the cumulative per-phase time fraction dict.""" total = sum(self.cum.values()) return {key: self.cum[key] / total for key in self.cum} @staticmethod def format(x): + """Format *x* as a percentage string with one decimal place.""" return f"{100 * x:.1f}%" def usage_str(self): + """Return the current per-phase usage as a dict of formatted percentage strings.""" d = self.usage() return {key: self.format(value) for key, value in d.items()} def average_usage_str(self): + """Return the cumulative per-phase usage as a dict of formatted percentage strings.""" d = self.average_usage() return {key: self.format(value) for key, value in d.items()} diff --git a/xdas/signal.py b/xdas/signal.py index fefb4d53..7a710f0f 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -1,3 +1,9 @@ +""" +Signal processing functions for :class:`DataArray`: filtering, resampling, +tapering, detrending, and spectral helpers, all coordinate-aware and +multi-threaded via :func:`~xdas.parallel.parallelize`. +""" + import numpy as np import scipy.signal as sp diff --git a/xdas/spectral.py b/xdas/spectral.py index c4a6e279..7ff0d948 100644 --- a/xdas/spectral.py +++ b/xdas/spectral.py @@ -1,3 +1,8 @@ +""" +Spectral analysis functions for :class:`DataArray`: Short-Time Fourier +Transform (:func:`stft`) and related helpers. +""" + import numpy as np from scipy.fft import fft, fftfreq, fftshift, rfft, rfftfreq from scipy.signal import get_window @@ -91,6 +96,7 @@ def stft( freqs = {"tie_indices": [0, len(freqs) - 1], "tie_values": [freqs[0], freqs[-1]]} def func(x): + """Apply windowed FFT to produce the STFT output array.""" if nperseg == 1 and noverlap == 0: result = x[..., np.newaxis] else: diff --git a/xdas/synthetics.py b/xdas/synthetics.py index 84107237..e72ebd81 100644 --- a/xdas/synthetics.py +++ b/xdas/synthetics.py @@ -1,3 +1,8 @@ +""" +Synthetic DAS data generators used in doctests and test fixtures: +:func:`wavelet_wavefronts` and :func:`randn_wavefronts`. +""" + import numpy as np import scipy.signal as sp @@ -97,6 +102,18 @@ def wavelet_wavefronts( def randn_wavefronts(): + """ + Generate a large random-noise synthetic DAS :class:`DataArray`. + + Returns a 200 s × 100 km array (10 Hz temporal, 100 m spatial sampling) + with step-onset noise bursts simulating P- and S-wave arrivals from a + single source located 20 km off-axis. + + Returns + ------- + DataArray + Synthetic DAS data with ``time`` and ``distance`` coordinates. + """ # ensure reporducibility np.random.seed(42) @@ -143,6 +160,20 @@ def randn_wavefronts(): def dummy(shape=(1000, 100)): + """ + Return a minimal random :class:`DataArray` for quick testing. + + Parameters + ---------- + shape : tuple of int, optional + ``(n_time, n_distance)`` shape. Defaults to ``(1000, 100)``. + + Returns + ------- + DataArray + DataArray filled with Gaussian noise, sampled at 10 Hz over + ``[0, 1000]`` m with ``time`` starting at 2024-01-01. + """ starttime = np.datetime64("2024-01-01T00:00:00.000000000") endtime = starttime + (shape[0] - 1) * np.timedelta64(100, "ms") time = {"tie_indices": [0, shape[0] - 1], "tie_values": [starttime, endtime]} diff --git a/xdas/trigger.py b/xdas/trigger.py index 02142b6d..3bb5b720 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -1,3 +1,8 @@ +""" +Threshold-based triggering atom :class:`Trigger` for detecting phase +arrivals in :class:`DataArray` objects using an on/off mechanism. +""" + import numpy as np import pandas as pd from numba import njit diff --git a/xdas/virtual.py b/xdas/virtual.py index 136f96d8..b2bb56ee 100644 --- a/xdas/virtual.py +++ b/xdas/virtual.py @@ -1,3 +1,9 @@ +""" +Virtual (lazy) array types: :class:`VirtualArray` base, :class:`VirtualSource` +for a single HDF5/NetCDF4 dataset slice, and :class:`VirtualStack` for +concatenating multiple sources along an axis. +""" + import os from copy import copy, deepcopy from tempfile import TemporaryDirectory @@ -7,6 +13,13 @@ class VirtualArray: + """ + Abstract base class for lazy array objects backed by HDF5/NetCDF4 files. + + Subclasses must implement :meth:`shape`, :meth:`dtype`, :meth:`__getitem__`, + :meth:`__array__`, and :meth:`to_dataset`. + """ + def __repr__(self): return f"{self.__class__.__name__}: {_to_human(self.nbytes)} ({self.dtype})" @@ -18,21 +31,26 @@ def __array__(self, dtype=None): @property def shape(self): + """Tuple of array dimensions (abstract — must be overridden).""" NotImplemented @property def dtype(self): + """NumPy dtype of the array elements (abstract — must be overridden).""" NotImplemented def to_dataset(self, file_or_group, name): + """Write this virtual array as an HDF5 dataset (abstract — must be overridden).""" NotImplemented @property def ndim(self): + """Number of dimensions.""" return len(self.shape) @property def size(self): + """Total number of elements.""" if self.shape: return np.prod(self.shape) else: @@ -40,16 +58,37 @@ def size(self): @property def empty(self): + """``True`` if the array contains no elements.""" return self.size == 0 @property def nbytes(self): + """Total number of bytes occupied by the array elements.""" if self.shape: return self.size * self.dtype.itemsize else: return 0 def create_variable(self, file, name, dims=None, dtype=None): + """ + Write this virtual array into *file* and register it as a named variable. + + Parameters + ---------- + file : netCDF4-like file handle + Open writable file. + name : str + Variable name to create inside *file*. + dims : sequence of str, optional + Dimension names for the variable. + dtype : dtype-like, optional + Override data type for the variable. + + Returns + ------- + variable + The newly created file variable. + """ self.to_dataset(file._h5group, name) variable = file._variable_cls(file, name, dims) file._variables[name] = variable @@ -60,6 +99,18 @@ def create_variable(self, file, name, dims=None, dtype=None): class VirtualStack(VirtualArray): + """ + Lazy concatenation of multiple :class:`VirtualSource` objects along one axis. + + Parameters + ---------- + sources : list of VirtualSource, optional + Initial list of sources to stack. All sources must share the same + dtype and shape on every axis other than *axis*. + axis : int, optional + Concatenation axis. Defaults to ``0``. + """ + def __init__(self, sources=[], axis=0): self._sources = list() self._axis = axis @@ -129,14 +180,17 @@ def __array__(self, dtype=None): @property def sources(self): + """List of :class:`VirtualSource` objects in the stack.""" return self._sources @property def axis(self): + """Concatenation axis index.""" return self._axis @property def shape(self): + """Shape of the concatenated virtual array.""" return tuple( ( sum(source.shape[self._axis] for source in self._sources) @@ -148,23 +202,42 @@ def shape(self): @property def dtype(self): + """NumPy dtype shared by all sources in the stack.""" if not hasattr(self, "_dtype"): raise AttributeError("empty stack has no dtype") return self._dtype def append(self, source): + """ + Append *source* to the stack. + + Parameters + ---------- + source : VirtualSource + Source to add. Must have the same dtype and off-axis shape as + existing sources. + """ if not self._sources: self._initialize(source) self._check(source) self._sources.append(source) def extend(self, sources): + """ + Extend the stack with an iterable of sources. + + Parameters + ---------- + sources : list of VirtualSource + Sources to append, validated one-by-one via :meth:`append`. + """ if not isinstance(sources, list): raise TypeError("`sources` must be a list") for source in sources: self.append(source) def to_dataset(self, file_or_group, name): + """Write the stacked virtual array as an HDF5 virtual dataset.""" self._to_layout().to_dataset(file_or_group, name) def _initialize(self, source): @@ -284,13 +357,16 @@ def __setitem__(self, key, value): @property def shape(self): + """Shape of the layout after any lazy selections.""" return self._sel.shape @property def dtype(self): + """NumPy dtype of the layout.""" return self._layout.dtype def to_dataset(self, file_or_group, name): + """Write the layout as an HDF5 virtual dataset in *file_or_group*.""" if np.issubdtype(self.dtype, np.integer): fillvalue = np.iinfo(self.dtype).min elif np.issubdtype(self.dtype, np.floating): @@ -396,17 +472,21 @@ def __array__(self, dtype=None): @property def vsource(self): + """Underlying :class:`h5py.VirtualSource` with the current selection applied.""" return self._vsource.__getitem__(self._sel.get_indexer()) @property def shape(self): + """Shape of the selected region of the source dataset.""" return self.vsource.shape @property def dtype(self): + """NumPy dtype of the source dataset.""" return self.vsource.dtype def to_dataset(self, file_or_group, name): + """Write this source as an HDF5 virtual dataset in *file_or_group*.""" self._to_layout().to_dataset(file_or_group, name) def _to_layout(self): @@ -491,6 +571,7 @@ def __getitem__(self, key): @property def shape(self): + """Shape of the array after all selections are applied.""" return tuple( len(selector) for selector in self._selectors @@ -499,9 +580,11 @@ def shape(self): @property def ndim(self): + """Number of dimensions remaining after selections.""" return len(self.shape) def get_indexer(self): + """Return a tuple of slices/ints that materialises the accumulated selections.""" return tuple(selector.get_indexer() for selector in self._selectors) @@ -531,6 +614,7 @@ def __init__(self, index): self._index = index def get_indexer(self): + """Return the stored integer index.""" return self._index @@ -569,6 +653,7 @@ def __len__(self): return len(self._range) def get_indexer(self): + """Return a :class:`slice` that represents the stored range selection.""" if len(self) == 0: return slice(0) elif self._range.stop < 0: From e4d620bfd09eaa5901058a0c1f2c94230f023cbe Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 20:55:25 +0200 Subject: [PATCH 05/21] Make testing faster. Some test are marked as slow. Run them with --slow. --- tests/conftest.py | 21 +++++++++++++++++++++ tests/io/test_generic.py | 3 +++ tests/io/test_utils.py | 1 + tests/test_atoms.py | 3 +++ tests/test_core.py | 6 +++--- xdas/core/numpy.py | 1 + xdas/core/routines.py | 18 +++++++++++------- xdas/picking.py | 2 +- xdas/synthetics.py | 7 +++---- xdas/trigger.py | 5 ++++- 10 files changed, 51 insertions(+), 16 deletions(-) create mode 100644 tests/conftest.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 00000000..7a3f1159 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,21 @@ +import pytest + +import xdas + + +def pytest_configure(config): + xdas.config.set("n_workers", 1) + + +def pytest_addoption(parser): + parser.addoption( + "--slow", action="store_true", default=False, help="run slow tests" + ) + + +def pytest_collection_modifyitems(config, items): + if not config.getoption("--slow"): + skip_slow = pytest.mark.skip(reason="slow test, use --slow to run") + for item in items: + if "slow" in item.keywords: + item.add_marker(skip_slow) diff --git a/tests/io/test_generic.py b/tests/io/test_generic.py index a177ec2a..91e7aa73 100644 --- a/tests/io/test_generic.py +++ b/tests/io/test_generic.py @@ -1,5 +1,6 @@ import dascore as dc import numpy as np +import pytest from dascore.utils.downloader import fetch import xdas as xd @@ -21,6 +22,7 @@ class TestGenericIO: "sample_tdms_file_v4713.tdms", # NOTE: dascore does not really know what it does ] + @pytest.mark.slow def test_auto_open_files(self): for engine, fnames in self.TEST_FILES.items(): for fname in fnames: @@ -29,6 +31,7 @@ def test_auto_open_files(self): da_auto = xd.open(path) assert da.equals(da_auto) + @pytest.mark.slow def test_compare_with_dascore(self): for engine, fnames in self.TEST_FILES.items(): for fname in fnames: diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py index 0621d5e4..a8915c22 100644 --- a/tests/io/test_utils.py +++ b/tests/io/test_utils.py @@ -28,6 +28,7 @@ def assert_attrs_equal(actual, desired, strict=False): for key in actual: np.testing.assert_array_equal(actual[key], desired[key], strict) + @pytest.mark.slow @pytest.mark.parametrize("test_file, dataset_location", TEST_FILES) def test_compression(self, tmp_path, test_file, dataset_location): src_path = dascore.utils.downloader.fetch(test_file) diff --git a/tests/test_atoms.py b/tests/test_atoms.py index 85db364d..c1ff65fd 100644 --- a/tests/test_atoms.py +++ b/tests/test_atoms.py @@ -1,6 +1,7 @@ import pickle import numpy as np +import pytest import scipy.signal as sp import xdas as xd @@ -232,6 +233,7 @@ def test_nothing_to_do(self): class TestMLPicker: + @pytest.mark.slow def test_picker(self): from seisbench.models import PhaseNet @@ -244,6 +246,7 @@ def test_picker(self): result = xd.concat([picker(chunk, chunk_dim="time") for chunk in chunks]) assert result.equals(expected) + @pytest.mark.slow def test_compare_with_seisbench(self): import obspy from seisbench.models import PhaseNet diff --git a/tests/test_core.py b/tests/test_core.py index 5a1297f9..3018be0f 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -63,17 +63,17 @@ def test_open_mfdataarray_grouping(self, tmp_path): { "starttime": "2023-01-01T00:00:00", "resolution": (np.timedelta64(20, "ms"), 20.0), - "nchunk": 10, + "nchunk": 3, }, { "starttime": "2023-01-01T06:00:00", "resolution": (np.timedelta64(10, "ms"), 20.0), - "nchunk": 10, + "nchunk": 3, }, { "starttime": "2023-01-01T12:00:00", "resolution": (np.timedelta64(10, "ms"), 10.0), - "nchunk": 10, + "nchunk": 3, }, ] count = 1 diff --git a/xdas/core/numpy.py b/xdas/core/numpy.py index 6c63045e..32173e3a 100644 --- a/xdas/core/numpy.py +++ b/xdas/core/numpy.py @@ -35,6 +35,7 @@ def handled(reduce=False, drop_coords=False, **defaults): **defaults : dict Default keyword arguments forwarded to the wrapped function. """ + def decorator(func): """Build and register the coordinate-aware wrapper for *func*.""" sig = signature(func) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 537f55ec..d09ba4a7 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -545,12 +545,18 @@ def open_mfdataarray( "to 100 000." ) max_workers = get_workers_count(parallel) + objs = [] + failures = [] if (max_workers == 1) or (engine == "miniseed"): # TODO: dirty miniseed fix - if verbose: - iterator = tqdm(paths, desc="Fetching metadata from files") - else: - iterator = paths - objs = [open_dataarray(path, engine=engine, **kwargs) for path in iterator] + iterator = ( + tqdm(paths, desc="Fetching metadata from files") if verbose else paths + ) + for path in iterator: + try: + objs.append(open_dataarray(path, engine=engine, **kwargs)) + except Exception as e: + failures.append((path, e)) + warnings.warn(f"could not open {path}: {e}", RuntimeWarning) else: executor = get_reusable_executor(max_workers) futures_to_paths = { @@ -565,8 +571,6 @@ def open_mfdataarray( ) else: iterator = as_completed(futures_to_paths) - objs = [] - failures = [] for future in iterator: try: obj = future.result() diff --git a/xdas/picking.py b/xdas/picking.py index ceb4264d..32bf6d23 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -117,7 +117,7 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): return xd.DataArray(data, coords=coords, dims=da.dims) -@njit(parallel=True) +@njit(parallel=True, cache=True) def _tapered_selection(data, sel, start, stop, size, window): out = np.zeros((sel.size, size), dtype=data.dtype) w = window.size // 2 diff --git a/xdas/synthetics.py b/xdas/synthetics.py index e72ebd81..017b6147 100644 --- a/xdas/synthetics.py +++ b/xdas/synthetics.py @@ -70,10 +70,9 @@ def wavelet_wavefronts( d = np.hypot(xc, (s - np.mean(s))) # channel distance to source [m] ttp = d / vp # P-wave travel time [s] tts = d / vs # S-wave travel time [s] - data = np.zeros(shape) - for k in range(shape[1]): - data[:, k] += sp.gausspulse(t - ttp[k] - t0, fc) / 2 # P is twice weaker - data[:, k] += sp.gausspulse(t - tts[k] - t0, fc) + t_col = t[:, np.newaxis] + data = sp.gausspulse(t_col - ttp - t0, fc) / 2 # P is twice weaker + data += sp.gausspulse(t_col - tts - t0, fc) data /= np.max(np.abs(data), axis=0, keepdims=True) # normalize data += np.random.randn(*shape) / snr # add noise diff --git a/xdas/trigger.py b/xdas/trigger.py index 3bb5b720..d96eff0f 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -548,7 +548,10 @@ def _find_picks_numeric(cft, thresh, axis=-1, buffer=None, offset=None): return out -@njit("Tuple((i8[:], i8[:], f8[:]))(f8[:, :], f8, f8, b1[:], i8[:], f8[:], i8)") +@njit( + "Tuple((i8[:], i8[:], f8[:]))(f8[:, :], f8, f8, b1[:], i8[:], f8[:], i8)", + cache=True, +) def _trigger( cft, thresh_on, thresh_off, buffer_status, buffer_index, buffer_value, offset ): From 0dc1e85dbc6a465bd1ef04055547b39ed8bbb919 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:07:42 +0200 Subject: [PATCH 06/21] Update the API part of the doc. --- docs/api/atoms.md | 44 ++++++++++++++-- docs/api/processing.md | 50 ++++++++++++++++++ docs/api/signal.md | 2 +- docs/api/synthetics.md | 1 + docs/api/virtual.md | 112 ++++++++++++++++++++++++++++++++++++----- docs/api/xdas.md | 59 +++++++++++++++++++--- xdas/core/dataarray.py | 2 +- 7 files changed, 246 insertions(+), 24 deletions(-) diff --git a/docs/api/atoms.md b/docs/api/atoms.md index 4779b721..c0b26cc3 100644 --- a/docs/api/atoms.md +++ b/docs/api/atoms.md @@ -24,7 +24,6 @@ Attributes Atom.initialized ``` - Methods ```{eval-rst} @@ -34,19 +33,57 @@ Methods Atom.initialize_from_state Atom.call Atom.reset + Atom.save_state + Atom.set_state + Atom.load_state ``` -## Core atoms +## Core atoms ```{eval-rst} .. autosummary:: :toctree: ../_autosummary Sequential - Partial State ``` +### Partial + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Partial +``` + +Attributes + +```{eval-rst} +.. autosummary:: + + Partial.stateful +``` + +Methods + +```{eval-rst} +.. autosummary:: + + Partial.call + Partial.from_state + Partial.get_state +``` + +## Decorators + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + atomized +``` + ## Signal processing ```{eval-rst} @@ -57,6 +94,7 @@ Methods FIRFilter IIRFilter LFilter + MLPicker ResamplePoly SOSFilter Trigger diff --git a/docs/api/processing.md b/docs/api/processing.md index 01aced98..e4774ae1 100644 --- a/docs/api/processing.md +++ b/docs/api/processing.md @@ -4,16 +4,66 @@ # xdas.processing +## Functions + ```{eval-rst} .. autosummary:: :toctree: ../_autosummary process +``` + +## Loaders + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + DataArrayLoader RealTimeLoader +``` + +### DataArrayLoader + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DataArrayLoader.nbytes +``` + +## Writers + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + DataArrayWriter DataFrameWriter StreamWriter ZMQPublisher ZMQSubscriber +``` + +### DataArrayWriter + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DataArrayWriter.submit + DataArrayWriter.write + DataArrayWriter.shutdown + DataArrayWriter.result +``` + +### ZMQPublisher + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + ZMQPublisher.submit + ZMQPublisher.write + ZMQPublisher.result ``` \ No newline at end of file diff --git a/docs/api/signal.md b/docs/api/signal.md index a3e4530f..bc6433de 100644 --- a/docs/api/signal.md +++ b/docs/api/signal.md @@ -36,7 +36,7 @@ medfilt ``` -## Spectral analysisi +## Spectral analysis ```{eval-rst} .. autosummary:: diff --git a/docs/api/synthetics.md b/docs/api/synthetics.md index 93bfcf61..d318d4d9 100644 --- a/docs/api/synthetics.md +++ b/docs/api/synthetics.md @@ -10,4 +10,5 @@ wavelet_wavefronts randn_wavefronts + dummy ``` \ No newline at end of file diff --git a/docs/api/virtual.md b/docs/api/virtual.md index 5c7a0bb2..3c1e40a1 100644 --- a/docs/api/virtual.md +++ b/docs/api/virtual.md @@ -6,57 +6,130 @@ ## VirtualArray +Base class for all virtual array types. + +Attributes + ```{eval-rst} .. autosummary:: :toctree: ../_autosummary VirtualArray.shape VirtualArray.dtype - VirtualArray.to_dataset VirtualArray.ndim VirtualArray.size - VirtualArray.empty VirtualArray.nbytes + VirtualArray.empty +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + VirtualArray.to_dataset +``` + +## VirtualSource + +A lazy pointer to a single dataset inside an HDF5/NetCDF4 file. + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + VirtualSource +``` + +Attributes + +```{eval-rst} +.. autosummary:: + + VirtualSource.vsource + VirtualSource.shape + VirtualSource.dtype + VirtualSource.ndim + VirtualSource.size + VirtualSource.nbytes + VirtualSource.empty +``` + +Methods + +```{eval-rst} +.. autosummary:: + + VirtualSource.to_dataset ``` ## VirtualStack +A lazy concatenation of multiple {py:class}`VirtualSource` objects along one axis. + +Constructor + ```{eval-rst} .. autosummary:: :toctree: ../_autosummary + VirtualStack +``` + +Attributes + +```{eval-rst} +.. autosummary:: + VirtualStack.sources VirtualStack.axis VirtualStack.shape VirtualStack.dtype + VirtualStack.ndim + VirtualStack.size + VirtualStack.nbytes + VirtualStack.empty +``` + +Methods + +```{eval-rst} +.. autosummary:: + VirtualStack.append VirtualStack.extend VirtualStack.to_dataset - VirtualStack._to_layout ``` ## VirtualLayout +Internal HDF5 virtual dataset layout object. + ```{eval-rst} .. autosummary:: :toctree: ../_autosummary + VirtualLayout +``` + +Attributes + +```{eval-rst} +.. autosummary:: + VirtualLayout.shape VirtualLayout.dtype - VirtualLayout.to_dataset ``` -## VirtualSource +Methods ```{eval-rst} .. autosummary:: - :toctree: ../_autosummary - VirtualSource.vsource - VirtualSource.shape - VirtualSource.dtype - VirtualSource.to_dataset - VirtualSource._to_layout + VirtualLayout.to_dataset ``` ## Selection @@ -65,7 +138,22 @@ .. autosummary:: :toctree: ../_autosummary + Selection +``` + +Attributes + +```{eval-rst} +.. autosummary:: + Selection.shape Selection.ndim +``` + +Methods + +```{eval-rst} +.. autosummary:: + Selection.get_indexer -``` \ No newline at end of file +``` diff --git a/docs/api/xdas.md b/docs/api/xdas.md index 61e88dd8..5d364b5f 100644 --- a/docs/api/xdas.md +++ b/docs/api/xdas.md @@ -28,9 +28,11 @@ asdataarray combine_by_coords combine_by_field + concat concatenate concat_coords split + plot_availability ``` ## Mathematical and statistical functions @@ -72,6 +74,31 @@ DataArray ``` +Attributes + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DataArray.data + DataArray.coords + DataArray.dims + DataArray.shape + DataArray.dtype + DataArray.ndim + DataArray.size + DataArray.sizes + DataArray.nbytes + DataArray.values + DataArray.empty + DataArray.loc + DataArray.T + DataArray.conj + DataArray.conjugate +``` + +Methods + ```{eval-rst} .. autosummary:: :toctree: ../_autosummary @@ -80,8 +107,14 @@ DataArray.get_axis_num DataArray.isel DataArray.sel + DataArray.drop_dims + DataArray.drop_coords DataArray.copy DataArray.rename + DataArray.assign_coords + DataArray.swap_dims + DataArray.transpose + DataArray.expand_dims DataArray.load DataArray.to_xarray DataArray.from_xarray @@ -89,6 +122,8 @@ DataArray.from_stream DataArray.to_netcdf DataArray.from_netcdf + DataArray.to_dict + DataArray.from_dict DataArray.plot ``` @@ -103,6 +138,15 @@ Constructor DataCollection ``` +Attributes + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DataCollection.empty +``` + Methods ```{eval-rst} @@ -132,13 +176,14 @@ Methods .. autosummary:: :toctree: ../_autosummary - DataMapping.to_netcdf - DataMapping.from_netcdf + DataMapping.copy DataMapping.equals DataMapping.isel DataMapping.sel DataMapping.load DataMapping.map + DataMapping.to_netcdf + DataMapping.from_netcdf ``` ### DataSequence @@ -158,14 +203,14 @@ Methods .. autosummary:: :toctree: ../_autosummary - DataSequence.to_mapping - DataSequence.from_mapping - DataSequence.to_netcdf - DataSequence.from_netcdf + DataSequence.copy DataSequence.equals DataSequence.isel DataSequence.sel DataSequence.load DataSequence.map + DataSequence.to_mapping + DataSequence.from_mapping + DataSequence.to_netcdf + DataSequence.from_netcdf ``` - diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 23e0928d..d1cc9bdf 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -626,7 +626,7 @@ def swap_dims(self, dims_dict=None, **dims_kwargs): * y (y): [0 1] Assign a new empy coordinate z as dimensional coordinate. - Use the **kwargs syntax this time: + Use the ``**kwargs`` syntax this time: >>> da.swap_dims(x="z") From 3a8d5913e3becfd96cdbf6f115e8bca6cf41954a Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:10:24 +0200 Subject: [PATCH 07/21] Fix docstrings. --- xdas/coordinates/core.py | 4 ++-- xdas/core/dataarray.py | 13 ++++++---- xdas/core/datacollection.py | 22 ++++++++++++----- xdas/core/routines.py | 28 +++++++++++----------- xdas/processing/core.py | 2 +- xdas/signal.py | 48 ++++++++++++++++++------------------- xdas/synthetics.py | 22 ++++++++--------- 7 files changed, 77 insertions(+), 62 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 1e47bc2c..c5931a81 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -840,12 +840,12 @@ def get_sampling_interval(da, dim, cast=True): Parameters ---------- - da : DataArray or DataArray or DataArray + da : DataArray The data from which extract the sample spacing. dim : str The dimension along which get the sample spacing. cast: bool, optional - Wether to cast datetime64 to seconds, by default True. + Whether to cast datetime64 to seconds, by default True. Returns ------- diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index d1cc9bdf..b7979953 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -25,7 +25,7 @@ class DataArray(NDArrayOperatorsMixin): """ N-dimensional array with labeled coordinates and dimensions. - It is the equivalent of and xarray.DataArray but with custom coordinate objects. + It is the equivalent of an xarray.DataArray but with custom coordinate objects. Most of the DataArray API follows the DataArray one. DataArray objects also provide virtual dataset capabilities to manipulate huge multi-file NETCDF4 or HDF5 datasets. @@ -368,6 +368,11 @@ def sel( indexers : dict, optional A dict with keys matching dimensions and values given by scalars, slices or arrays of tick labels. + method : str, optional + Method to use for inexact matches. None (default) means only exact matches. + "nearest" finds the nearest index value. + endpoint : bool, optional + Whether to include the endpoint of a slice. Default is True. drop : bool, optional If ``drop=True``, drop coordinates variables in `indexers` instead of making them scalar. @@ -833,7 +838,7 @@ def to_stream( The channel code. If the string can be formatted, the band code will be inferred from the sampling rate. By default "{:1}N1" dim : dict, optional - A dict with as key the spatial dimension to split into traces, and as key + A dict with as key the spatial dimension to split into traces, and as value the temporal dimension. By default {"last": "first"}. Returns @@ -851,7 +856,7 @@ def from_stream(cls, st, dims=("channel", "time")): """ Convert an obspy stream into a data array. - Traces in the stream must have the same length an must be syncronized. Traces + Traces in the stream must have the same length and must be syncronized. Traces are stacked along the first axis. The trace ids are used as labels along the first dimension. @@ -893,7 +898,7 @@ def to_netcdf( group : str, optional Path to the netCDF4 group in the given file to open. virtual : bool, optional - Weather to write a virtual dataset. The DataArray data must be a VirtualSource + Whether to write a virtual dataset. The DataArray data must be a VirtualSource or a VirtualLayout. Default (None) is to try to write a virtual dataset if possible. encoding : dict, optional diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index b3a5ae1b..9b136472 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -280,7 +280,7 @@ def isel(self, indexers=None, **indexers_kwargs): """ Perform index selection to each data array of the data collection. - If a selection results in a empty data array, the data array is discarted. + If a selection results in a empty data array, the data array is discarded. See `DataArray.isel` for more details. @@ -312,7 +312,7 @@ def sel(self, indexers=None, method=None, endpoint=True, **indexers_kwargs): """ Perform labeled selection to each data array of the data collection. - If a selection results in a empty data array, the data array is discarted. + If a selection results in a empty data array, the data array is discarded. See DataArray.sel for more details. @@ -321,13 +321,18 @@ def sel(self, indexers=None, method=None, endpoint=True, **indexers_kwargs): indexers : dict, optional A dict with keys matching dimensions and values given by scalars, slices or arrays of tick labels. + method : str, optional + Method to use for inexact matches. None (default) means only exact matches. + endpoint : bool, optional + Whether to include the endpoint of a slice. Default is True. **indexers_kwargs : dict, optional The keyword arguments form of integers. Overwrite indexers input if both are provided. Returns ------- - The selected data collection. + DataCollection + The selected data collection. """ data = { @@ -483,7 +488,7 @@ def isel(self, indexers=None, **indexers_kwargs): """ Perform index selection to each data array of the data collection. - If a selection results in a empty data array, the data array is discarted. + If a selection results in a empty data array, the data array is discarded. See `DataArray.isel` for more details. @@ -513,7 +518,7 @@ def sel(self, indexers=None, method=None, endpoint=True, **indexers_kwargs): """ Perform labeled selection to each data array of the data collection. - If a selection results in a empty data array, the data array is discarted. + If a selection results in a empty data array, the data array is discarded. See DataArray.sel for more details. @@ -522,13 +527,18 @@ def sel(self, indexers=None, method=None, endpoint=True, **indexers_kwargs): indexers : dict, optional A dict with keys matching dimensions and values given by scalars, slices or arrays of tick labels. + method : str, optional + Method to use for inexact matches. None (default) means only exact matches. + endpoint : bool, optional + Whether to include the endpoint of a slice. Default is True. **indexers_kwargs : dict, optional The keyword arguments form of integers. Overwrite indexers input if both are provided. Returns ------- - The selected data collection. + DataCollection + The selected data collection. """ data = [ diff --git a/xdas/core/routines.py b/xdas/core/routines.py index d09ba4a7..c8ce1a0e 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -209,7 +209,7 @@ def open_mfdatacollection( The path names given as a shell-style wildcards string or a list of paths. dim : str, optional The dimension along which the data arrays are concatenated. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional During concatenation, the tolerance to consider that the end of a file is continuous with beginning of the following one. For time coordinates, numeric values are considered as seconds. Default to zero tolerance. @@ -304,14 +304,14 @@ def open_mfdatatree( The path descriptor. dim : str, optional The dimension along which the data arrays are concatenated. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional During concatenation, the tolerance to consider that the end of a file is continuous with beginning of the following one. For time coordinates, numeric values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. - engine: str of callable, optional + engine: str or callable, optional The type of file to open or a read function. Default to xdas netcdf format. parallel: bool or int, optional Whether to use multiprocessing to fetch file metadata. If False or 1, @@ -414,14 +414,14 @@ def collect( The names of the levels of the tree hierarchy. dim : str, optional The dimension along which the data arrays are concatenated. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional During concatenation, the tolerance to consider that the end of a file is continuous with beginning of the following one. For time coordinates, numeric values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. - engine: str of callable, optional + engine: str or callable, optional The type of file to open or a read function. Default to xdas netcdf format. parallel: bool or int, optional Whether to use multiprocessing to fetch file metadata. If False or 1, @@ -496,14 +496,14 @@ def open_mfdataarray( The path names given as a shell-style wildcards string or a list of paths. dim : str, optional The dimension along which the data arrays are concatenated. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional During concatenation, the tolerance to consider that the end of a file is continuous with beginning of the following one. For time coordinates, numeric values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. - engine: str of callable, optional + engine: str or callable, optional The type of file to open or a read function. Default to xdas netcdf format. parallel: bool or int, optional Whether to use multiprocessing to fetch file metadata. If False or 1, @@ -598,7 +598,7 @@ def open_dataarray(fname, engine=None, vtype=None, ctype=None, **kwargs): ---------- fname : str The path of the dataarray. - engine: str of callable, optional + engine: str or callable, optional The type of file to open or a read function. Default to xdas netcdf format. **kwargs Additional keyword arguments to be passed to the read function. @@ -664,7 +664,7 @@ def asdataarray(obj, tolerance=None): """ Try to convert given object to a dataarray. - Only support DataArray or DataArray as input. + Only supports DataArray or xr.DataArray as input. Parameters ---------- @@ -683,7 +683,7 @@ def asdataarray(obj, tolerance=None): Raises ------ ValueError - _description_ + If the object cannot be converted to a DataArray. """ if isinstance(obj, DataArray): return obj @@ -712,7 +712,7 @@ def combine_by_field( The data collections to combine. dim : str, optional The dimension along which concatenate. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of the following. For time coordinates, numeric values are considered as seconds. Zero by default. @@ -776,7 +776,7 @@ def combine_by_coords( The data arrays to combine. dim : str, optional The dimension along which concatenate. Default to "first". - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of the following. For time coordinates, numeric values are considered as seconds. Zero by default. @@ -930,7 +930,7 @@ def concat(objs, dim="first", tolerance=None, virtual=None, verbose=None): List of data arrays to concatenate. dim : str The dimension along which concatenate. - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of the following, For time coordinates, numeric values are considered as seconds. Zero by default. @@ -1010,7 +1010,7 @@ def concat_coords(objs, *, sort=False, return_order=False, tolerance=False): return_order : bool, optional If True, return `(coord, order)` where `order` is the list of indices used to sort the input objects. - tolerance : float of timedelta64, optional + tolerance : float or timedelta64, optional The tolerance to consider that the end of a coordinate object is continuous with beginning of the following, For time coordinates, numeric values are considered as seconds. No simplification by default. diff --git a/xdas/processing/core.py b/xdas/processing/core.py index 46e7d447..b5038541 100644 --- a/xdas/processing/core.py +++ b/xdas/processing/core.py @@ -321,7 +321,7 @@ class DataFrameWriter: path : str The path to the csv file. parse_dates : bool, int, optional - Weather to parse dates when reopening the csv file a the end of the process + Whether to parse dates when reopening the csv file at the end of the process create_dirs : bool, optional Whether to create parent directories if they do not exist. Default is False. diff --git a/xdas/signal.py b/xdas/signal.py index 7a710f0f..3814569d 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -21,7 +21,7 @@ def detrend(da, type="linear", dim="last", parallel=None): Parameters ---------- - da : DataArray or DataArray + da : DataArray The data to detrend. type : str Either "linear" or "constant". @@ -30,7 +30,7 @@ def detrend(da, type="linear", dim="last", parallel=None): Returns ------- - DataArray or DataArray + DataArray The detrended data. Notes @@ -52,18 +52,18 @@ def taper(da, window="hann", fftbins=False, dim="last", parallel=None): Parameters ---------- - da : DataArray or DataArray + da : DataArray The data to taper. window : str or tuple, optional The window to use, by default "hann" fftbins : bool, optional - Weather to use a periodic windowing, by default False + Whether to use a periodic windowing, by default False dim : str, optional - Dimension along the which to taper, by default "time" + Dimension along which to taper, by default "last" Returns ------- - DataArray or DataArray + DataArray The tapered data. """ axis = da.get_axis_num(dim) @@ -268,7 +268,7 @@ def resample_poly( The upsampling factor. down : int The downsampling factor. - dim : int, optional + dim : str, optional The dimension of `da` that is resampled. Default is last. window : string, tuple, or array_like, optional Desired window to use to design the low-pass filter, or the FIR filter @@ -453,7 +453,7 @@ def filtfilt( is not 1, then both `a` and `b` are normalized by ``a[0]``. da : DataArray The array of data to be filtered. - dim : srt, optional + dim : str, optional The dimension of `da` to which the filter is applied. Default is last. padtype : str or None, optional @@ -684,7 +684,7 @@ def decimate(da, q, n=None, ftype="iir", zero_phase=True, dim="last", parallel=N Parameters ---------- - da : DataArray or DataArray + da : DataArray The signal to be downsampled, as an N-dimensional dataarray. q : int The downsampling factor. When using IIR downsampling, it is recommended @@ -706,7 +706,7 @@ def decimate(da, q, n=None, ftype="iir", zero_phase=True, dim="last", parallel=N Returns ------- - DataArray or DataArray + DataArray The down-sampled signal. Notes @@ -733,16 +733,16 @@ def integrate(da, midpoints=False, dim="last", parallel=None): Parameters ---------- - da : DataArray or DataArray + da : DataArray The data to integrate. midpoints : bool, optional Whether to move the coordinates by half a step, by default False. dim : str, optional - The dimension along which to integrate, by default "distance". + The dimension along which to integrate, by default "last". Returns ------- - DataArray or DataArray + DataArray The integrated data. Notes @@ -769,17 +769,17 @@ def differentiate(da, midpoints=False, dim="last", parallel=None): Parameters ---------- - da : DataArray or DataArray - The data to integrate. + da : DataArray + The data to differentiate. midpoints : bool, optional Whether to move the coordinates by half a step, by default False. dim : str, optional - The dimension along which to integrate, by default "distance". + The dimension along which to differentiate, by default "last". Returns ------- - DataArray or DataArray - The integrated data. + DataArray + The differentiated data. Notes ----- @@ -805,18 +805,18 @@ def segment_mean_removal(da, limits, window="hann", dim="last"): # TODO: parall Parameters ---------- - da : DataArray or DataArray + da : DataArray The data that segment mean should be removed. limits : list of float The segments limits. window : str, optional The tapering windows to apply at each window, by default "hann". dim : str, optional - The axis along which remove the segment means, by default "distance". + The dimension along which to remove the segment means, by default "last". Returns ------- - DataArray or DataArray + DataArray The data with segment means removed. """ out = da.copy() @@ -841,7 +841,7 @@ def sliding_mean_removal( Parameters ---------- - da : DataArray or DataArray + da : DataArray The data that sliding mean should be removed. wlen : float Length of the sliding mean. @@ -850,11 +850,11 @@ def sliding_mean_removal( pad_mode : str, optional Padding mode used, by default "reflect" dim : str, optional - The dimension along which remove the sliding mean, by default "distance" + The dimension along which to remove the sliding mean, by default "last" Returns ------- - DataArray or DataArray + DataArray The data with sliding mean removed. Notes diff --git a/xdas/synthetics.py b/xdas/synthetics.py index 017b6147..9ebf50d9 100644 --- a/xdas/synthetics.py +++ b/xdas/synthetics.py @@ -17,19 +17,19 @@ def wavelet_wavefronts( nchunk=None, ): """ - Generate some dummy files to bu used in code testing. - - It generates a monolithic `sample.nc` file and a chunked files (`001.nc`, `002.nc`, - `003.nc`). + Generate a synthetic DAS :class:`DataArray` with wavelet wavefronts. Parameters ---------- - dirpath : str, optional - Directory where files will be written. If None, not file will be written. - starttime : str - The starttime of the file, will be parsed by `np.datetime64(starttime)`. - resolution : (timedelta64, float) - The temporal and spatial sampling intervals. + starttime : str, optional + The starttime of the data, parsed by ``np.datetime64(starttime)``. + Default is ``"2023-01-01T00:00:00"``. + resolution : (timedelta64, float), optional + The temporal and spatial sampling intervals. Default is + ``(np.timedelta64(20, "ms"), 25.0)``. + nchunk : int, optional + If provided, splits the result into ``nchunk`` chunks and returns a + list of DataArrays instead of a single DataArray. Examples -------- @@ -119,7 +119,7 @@ def randn_wavefronts(): # sampling resolution = (np.timedelta64(10, "ms"), 100.0) starttime = np.datetime64("2024-01-01T00:00:00").astype("datetime64[ns]") - span = (np.timedelta64(200, "s"), 100000.0) # (100 s, 10 km) + span = (np.timedelta64(200, "s"), 100000.0) # (200 s, 100 km) shape = (span[0] // resolution[0], int(span[1] // resolution[1]) + 1) t = np.arange(shape[0]) * resolution[0] / np.timedelta64(1, "s") # time values [s] s = np.arange(shape[1]) * resolution[1] # distance values [m] From 9d867d75434cb0082399743096014cde7fb1468e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:15:30 +0200 Subject: [PATCH 08/21] Fix some docstrings. --- xdas/signal.py | 65 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/xdas/signal.py b/xdas/signal.py index 3814569d..95cbaaa7 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -27,6 +27,10 @@ def detrend(da, type="linear", dim="last", parallel=None): Either "linear" or "constant". dim : str The dimension along which to detrend the data. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -60,6 +64,10 @@ def taper(da, window="hann", fftbins=False, dim="last", parallel=None): Whether to use a periodic windowing, by default False dim : str, optional Dimension along which to taper, by default "last" + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -97,6 +105,10 @@ def filter(da, freq, btype, corners=4, zerophase=False, dim="last", parallel=Non the resulting filtered trace. dim: str, optional The dimension along which to filter. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -131,9 +143,10 @@ def hilbert(da, N=None, dim="last", parallel=None): Number of Fourier components. Default: `da.sizes[dim]`. dim: str, optional The dimension along which to transform. Default: last. - parallel: bool or int, optional - Whether to parallelize the function, if True all cores are used, - if False single core, if int: number of cores. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -194,6 +207,10 @@ def resample(da, num, dim="last", window=None, domain="time", parallel=None): domain: string, optional A string indicating the domain of the input x: `time` Consider the input da as time-domain (Default), `freq` Consider the input da as frequency-domain. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -284,6 +301,10 @@ def resample_poly( respectively of the array along the dimension. cval : float, optional Value to use if `padtype='constant'`. Default is zero. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Notes ----- @@ -365,9 +386,11 @@ def lfilter(b, a, da, dim="last", zi=None, parallel=None): zi : array_like or str, optional Initial conditions for the filter delays. If `zi` is None or ... then initial rest is assumed. - parallel: bool or int, optional - Whether to parallelize the function, if true: all cores are used, if false: - single core, if int: n cores are used. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. + Returns ------- da : DataArray @@ -477,6 +500,10 @@ def filtfilt( impulse response of the filter. If `irlen` is None, no part of the impulse response is ignored. For a long signal, specifying `irlen` can significantly improve the performance of the filter. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Notes ----- @@ -544,6 +571,10 @@ def sosfilt(sos, da, dim="last", zi=None, parallel=None): ``..., 2, ...`` denotes the shape of `da`, but with ``da.sizes[dim]`` replaced by 2. If `zi` is None,... , or is not given then initial rest (i.e. all zeros) is assumed. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -625,7 +656,7 @@ def sosfiltfilt(sos, da, dim="last", padtype="odd", padlen=None, parallel=None): padlen : int or None, optional The number of elements by which to extend `da` at both ends of `dim` before applying the filter. This value must be less than - ``da.sizes[do,] - 1``. ``padlen=0`` implies no padding. + ``da.sizes[dim] - 1``. ``padlen=0`` implies no padding. The default value is:: 3 * (2 * len(sos) + 1 - min((sos[:, 2] == 0).sum(), @@ -635,6 +666,10 @@ def sosfiltfilt(sos, da, dim="last", padtype="odd", padlen=None, parallel=None): and zeros at the origin (e.g. for odd-order filters) to yield equivalent estimates of `padlen` to those of `filtfilt` for second-order section filters built with `scipy.signal` functions. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -703,6 +738,10 @@ def decimate(da, q, n=None, ftype="iir", zero_phase=True, dim="last", parallel=N when using an IIR filter, and shifting the outputs back by the filter's group delay when using an FIR filter. The default value of ``True`` is recommended, since a phase shift is generally not desired. + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -739,6 +778,10 @@ def integrate(da, midpoints=False, dim="last", parallel=None): Whether to move the coordinates by half a step, by default False. dim : str, optional The dimension along which to integrate, by default "last". + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -775,6 +818,10 @@ def differentiate(da, midpoints=False, dim="last", parallel=None): Whether to move the coordinates by half a step, by default False. dim : str, optional The dimension along which to differentiate, by default "last". + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- @@ -851,6 +898,10 @@ def sliding_mean_removal( Padding mode used, by default "reflect" dim : str, optional The dimension along which to remove the sliding mean, by default "last" + parallel : bool or int, optional + Number of threads to use. True uses all cores, False uses one, an int + uses that many, None defers to the global xdas configuration. Default + is None. Returns ------- From d89dad94b70eb0a42f1fbd464fc62959620e689c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:17:34 +0200 Subject: [PATCH 09/21] Invert slow test logic to not skip by default. --- pyproject.toml | 2 +- tests/conftest.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8c848488..b09a00b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ profile = "black" [tool.pytest.ini_options] addopts = ["--doctest-modules", "--import-mode=importlib", "--cov=xdas", "--cov-report=term-missing"] doctest_optionflags = "NORMALIZE_WHITESPACE" -markers = ["slow: marks tests as slow (run with --slow)"] +markers = ["slow: marks tests as slow (skip with --skip-slow)"] [tool.coverage.run] source = ["xdas"] diff --git a/tests/conftest.py b/tests/conftest.py index 7a3f1159..969d28e8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,13 +9,13 @@ def pytest_configure(config): def pytest_addoption(parser): parser.addoption( - "--slow", action="store_true", default=False, help="run slow tests" + "--skip-slow", action="store_true", default=False, help="skip slow tests" ) def pytest_collection_modifyitems(config, items): - if not config.getoption("--slow"): - skip_slow = pytest.mark.skip(reason="slow test, use --slow to run") + if config.getoption("--skip-slow"): + skip_slow = pytest.mark.skip(reason="slow test, skipped with --skip-slow") for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) From 3711cffcadcb5bb6315e8f3e8148f578e15cc682 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:20:33 +0200 Subject: [PATCH 10/21] One more missing docstring argument. --- xdas/parallel.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xdas/parallel.py b/xdas/parallel.py index a8cadcd4..a9880d5a 100644 --- a/xdas/parallel.py +++ b/xdas/parallel.py @@ -128,6 +128,9 @@ def concatenate(arrays, axis=0, out=None, dtype=None, n_workers=None): dtype: str or numpy.dtype If provided, the destination array will have this dtype. Cannot be provided together with out. + n_workers : int or None, optional + Number of threads to use for writing chunks. None defers to the global + xdas configuration. Default is None. Returns ------- From e485a9ace9b197daee45893bfd6a67bbd86432f7 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sun, 24 May 2026 21:41:01 +0200 Subject: [PATCH 11/21] Increase test coverage. --- tests/coordinates/test_default.py | 128 ++++++++++++++++ tests/test_datacollection.py | 240 ++++++++++++++++++++++++++++++ tests/test_methods.py | 212 ++++++++++++++++++++++++++ tests/test_parallel.py | 77 +++++++++- tests/test_picking.py | 58 ++++++++ tests/test_trigger.py | 41 ++++- xdas/coordinates/default.py | 4 +- xdas/core/datacollection.py | 4 +- xdas/parallel.py | 2 +- xdas/picking.py | 2 +- xdas/trigger.py | 4 +- 11 files changed, 762 insertions(+), 10 deletions(-) create mode 100644 tests/coordinates/test_default.py create mode 100644 tests/test_methods.py diff --git a/tests/coordinates/test_default.py b/tests/coordinates/test_default.py new file mode 100644 index 00000000..0df4756d --- /dev/null +++ b/tests/coordinates/test_default.py @@ -0,0 +1,128 @@ +import numpy as np +import pytest + +from xdas.coordinates import Coordinate +from xdas.coordinates.default import DefaultCoordinate + + +class TestDefaultCoordinate: + def test_isvalid(self): + assert DefaultCoordinate.isvalid({"size": 5}) + assert DefaultCoordinate.isvalid({"size": None}) + assert not DefaultCoordinate.isvalid({"size": 1.5}) + assert not DefaultCoordinate.isvalid({"length": 5}) + assert not DefaultCoordinate.isvalid([1, 2, 3]) + assert not DefaultCoordinate.isvalid(5) + + def test_init_default(self): + coord = DefaultCoordinate() + assert coord.empty + assert len(coord) == 0 + + def test_init_with_size(self): + coord = DefaultCoordinate({"size": 5}, "x") + assert not coord.empty + assert len(coord) == 5 + assert coord.dim == "x" + + def test_init_invalid_data(self): + with pytest.raises(TypeError, match="must be a mapping"): + DefaultCoordinate([1, 2, 3]) + + def test_init_dtype_rejected(self): + with pytest.raises(ValueError, match="dtype"): + DefaultCoordinate({"size": 3}, dtype=np.int32) + + def test_empty_property(self): + assert DefaultCoordinate({"size": 0}).empty + assert not DefaultCoordinate({"size": 1}).empty + + def test_dtype(self): + assert DefaultCoordinate({"size": 3}).dtype == np.int64 + + def test_ndim(self): + assert DefaultCoordinate({"size": 3}).ndim == 1 + + def test_shape(self): + assert DefaultCoordinate({"size": 5}).shape == (5,) + + def test_len_with_none(self): + coord = DefaultCoordinate({"size": None}) + assert len(coord) == 0 + + def test_len_with_size(self): + assert len(DefaultCoordinate({"size": 7})) == 7 + + def test_getitem_scalar(self): + coord = DefaultCoordinate({"size": 5}, "x") + result = coord[2] + assert isinstance(result, Coordinate) + assert result.dim is None # scalar → no dim + + def test_getitem_slice(self): + coord = DefaultCoordinate({"size": 5}, "x") + result = coord[1:3] + assert len(result) == 2 + assert result.dim == "x" + + def test_array(self): + coord = DefaultCoordinate({"size": 4}) + arr = np.asarray(coord) + np.testing.assert_array_equal(arr, np.arange(4)) + + def test_isdefault(self): + assert DefaultCoordinate({"size": 3}).isdefault() + + def test_get_sampling_interval(self): + assert DefaultCoordinate({"size": 3}).get_sampling_interval() == 1 + + def test_equals_same(self): + assert DefaultCoordinate({"size": 3}).equals(DefaultCoordinate({"size": 3})) + + def test_equals_different_size(self): + assert not DefaultCoordinate({"size": 3}).equals(DefaultCoordinate({"size": 5})) + + def test_equals_wrong_type(self): + from xdas.coordinates import DenseCoordinate + + result = DefaultCoordinate({"size": 3}).equals(DenseCoordinate(np.arange(3), "x")) + assert result is None + + def test_get_indexer(self): + coord = DefaultCoordinate({"size": 5}) + assert coord.get_indexer(3) == 3 + + def test_slice_indexer(self): + coord = DefaultCoordinate({"size": 5}) + s = coord.slice_indexer(1, 4, 2) + assert s == slice(1, 4, 2) + + def test_concat(self): + a = DefaultCoordinate({"size": 3}, "x") + b = DefaultCoordinate({"size": 2}, "x") + c = a.concat(b) + assert len(c) == 5 + assert c.dim == "x" + + def test_concat_type_error(self): + from xdas.coordinates import DenseCoordinate + + a = DefaultCoordinate({"size": 3}, "x") + b = DenseCoordinate(np.array([0, 1, 2]), "x") + with pytest.raises(TypeError): + a.concat(b) + + def test_concat_dim_mismatch(self): + a = DefaultCoordinate({"size": 3}, "x") + b = DefaultCoordinate({"size": 2}, "y") + with pytest.raises(ValueError): + a.concat(b) + + def test_to_from_dict(self): + coord = DefaultCoordinate({"size": 5}, "x") + dct = coord.to_dict() + assert dct["dim"] == "x" + assert dct["data"] == {"size": 5} + restored = DefaultCoordinate.from_dict(dct) + assert restored.equals(coord) + assert restored.dim == coord.dim diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index b5a9a1c4..151b376a 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -141,3 +141,243 @@ def test_map(self): result = dc.map(atom) expected = self.nest(atom(da)) assert result.equals(expected) + + def test_flat_map(self): + # DataMapping with DataArrays as direct values + da = wavelet_wavefronts() + dc = xd.DataCollection({"a": da, "b": da}, "flat") + atom = xs.decimate(..., 2, ftype="fir") + result = dc.map(atom) + assert result["a"].equals(atom(da)) + + def test_flat_sequence_map(self): + # DataSequence with DataArrays as direct values + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + atom = xs.decimate(..., 2, ftype="fir") + result = dc.map(atom) + assert result[0].equals(atom(da)) + + def test_datacollection_from_dataarray(self): + da = wavelet_wavefronts() + # When DataArray is passed, rename and return it + result = xd.DataCollection(da, "myname") + assert isinstance(result, xd.DataArray) + assert result.name == "myname" + + def test_datacollection_from_raw_data(self): + import numpy as np + data = np.ones((3, 4)) + result = xd.DataCollection(data, "raw") + assert isinstance(result, xd.DataArray) + + def test_empty_mapping_repr(self): + from xdas.core.datacollection import DataMapping + dm = DataMapping({}, "empty") + assert repr(dm) == "Empty" + + def test_mapping_reduce(self): + import pickle + da = wavelet_wavefronts() + dc = xd.DataCollection({"a": da}, "test") + pickled = pickle.dumps(dc) + restored = pickle.loads(pickled) + assert restored.equals(dc) + + def test_sequence_reduce(self): + import pickle + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "test") + pickled = pickle.dumps(dc) + restored = pickle.loads(pickled) + assert restored.equals(dc) + + def test_sequence_fields(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + assert "seq" in dc.fields + + def test_mapping_equals_false_different_type(self): + da = wavelet_wavefronts() + dm = xd.DataCollection({"a": da}, "test") + assert not dm.equals(xd.DataCollection([da], "test")) + + def test_mapping_equals_false_different_name(self): + da = wavelet_wavefronts() + dm1 = xd.DataCollection({"a": da}, "name1") + dm2 = xd.DataCollection({"a": da}, "name2") + assert not dm1.equals(dm2) + + def test_mapping_equals_false_different_keys(self): + da = wavelet_wavefronts() + dm1 = xd.DataCollection({"a": da}, "test") + dm2 = xd.DataCollection({"b": da}, "test") + assert not dm1.equals(dm2) + + def test_mapping_equals_false_different_values(self): + da = wavelet_wavefronts() + da2 = wavelet_wavefronts() + da2.data[:] = 0 + dm1 = xd.DataCollection({"a": da}, "test") + dm2 = xd.DataCollection({"a": da2}, "test") + assert not dm1.equals(dm2) + + def test_sequence_equals_false(self): + da = wavelet_wavefronts() + ds1 = xd.DataCollection([da, da], "seq") + ds2 = xd.DataCollection([da, da], "other") + assert not ds1.equals(ds2) + + def test_sequence_equals_false_wrong_type(self): + da = wavelet_wavefronts() + ds = xd.DataCollection([da], "seq") + dm = xd.DataCollection({"a": da}, "seq") + assert not ds.equals(dm) + + def test_sequence_load(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + loaded = dc.load() + assert isinstance(loaded, type(dc)) + + def test_mapping_load(self): + da = wavelet_wavefronts() + dc = xd.DataCollection({"a": da, "b": da}, "test") + loaded = dc.load() + assert isinstance(loaded, type(dc)) + + def test_sequence_copy(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + copy = dc.copy() + assert copy.equals(dc) + + def test_sequence_isel(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + result = dc.isel(distance=slice(0, 100)) + assert len(result) == 2 + + def test_sequence_sel(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + result = dc.sel(distance=slice(0, 5000)) + assert len(result) == 2 + + def test_sequence_from_netcdf(self, tmp_path): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + path = tmp_path / "seq.nc" + dc.to_netcdf(path) + result = xd.DataCollection.from_netcdf(path) + assert result.equals(dc) + + def test_query_invalid_key_in_sequence(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + with pytest.raises(ValueError, match="query must be a string"): + dc.query(seq="bad_string_key") + + def test_query_invalid_key_in_mapping(self): + da = wavelet_wavefronts() + dc = xd.DataCollection({"a": da}, "test") + with pytest.raises(ValueError, match="query must be a string"): + dc.query(test=123) + + def test_from_netcdf_non_sequential_int_keys(self, tmp_path): + from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() + # Create a mapping with non-sequential int keys (gaps) + dm = DataMapping({0: da, 2: da}, "test") + path = tmp_path / "non_seq.nc" + dm.to_netcdf(path) + result = xd.DataCollection.from_netcdf(path) + # Keys 0 and 2 are not a sequential range → returns as-is DataMapping + assert isinstance(result, xd.DataCollection) + + def test_sequence_from_netcdf_direct(self, tmp_path): + from xdas.core.datacollection import DataSequence + da = wavelet_wavefronts() + dc = DataSequence([da, da], "seq") + path = tmp_path / "seq_direct.nc" + dc.to_netcdf(path) + result = DataSequence.from_netcdf(str(path)) + assert result.equals(dc) + + def test_sequence_query_slice(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + result = dc.query(seq=slice(0, 1)) + assert len(result) == 1 + + def test_mapping_repr_nonempty(self): + da = wavelet_wavefronts() + dm = xd.DataCollection({"a": da}, "test") + s = repr(dm) + assert "test" in s.lower() or "Test" in s + + def test_mapping_repr_nested(self): + # nested DataMapping → triggers the non-DataArray branch in __repr__ + da = wavelet_wavefronts() + dm = self.nest(da) + s = repr(dm) + assert "das1" in s + + def test_mapping_repr_int_keys(self): + from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() + dm = DataMapping({0: da, 1: da}, "seq") + s = repr(dm) + assert "0" in s + + def test_sequence_repr(self): + da = wavelet_wavefronts() + dc = xd.DataCollection([da, da], "seq") + s = repr(dc) + assert "seq" in s.lower() or "Seq" in s + + def test_mapping_copy(self): + da = wavelet_wavefronts() + dc = xd.DataCollection({"a": da}, "test") + copy = dc.copy() + assert copy.equals(dc) + + def test_sequence_equals_false_different_length(self): + da = wavelet_wavefronts() + ds1 = xd.DataCollection([da, da], "seq") + ds2 = xd.DataCollection([da], "seq") + assert not ds1.equals(ds2) + + def test_sequence_equals_false_different_values(self): + da = wavelet_wavefronts() + da2 = wavelet_wavefronts() + da2.data[:] = 0 + ds1 = xd.DataCollection([da], "seq") + ds2 = xd.DataCollection([da2], "seq") + assert not ds1.equals(ds2) + + def test_nested_sequence_map(self): + da = wavelet_wavefronts() + inner = xd.DataCollection([da, da], "inner") + dc = xd.DataCollection([inner, inner], "outer") + atom = xs.decimate(..., 2, ftype="fir") + result = dc.map(atom) + assert len(result) == 2 + + def test_parse_tuple_with_name_given(self): + from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() + # When data is a tuple and name is already provided, unpack the tuple ignoring its name + dm = DataMapping(("inner_name", {"a": da}), "outer_name") + assert dm.name == "outer_name" + + def test_parse_datacollection_propagates_name(self): + da = wavelet_wavefronts() + dm = xd.DataCollection({"a": da}, "original_name") + # Passing existing DataCollection without explicit name propagates the name + dm2 = xd.DataCollection({"a": da}, "original_name") + dc_copy = xd.DataCollection.__new__(xd.DataCollection, dm2) + # just verify parse propagates name + from xdas.core.datacollection import parse + data, name = parse(dm, None) # should propagate dm.name + assert name == "original_name" diff --git a/tests/test_methods.py b/tests/test_methods.py new file mode 100644 index 00000000..48121762 --- /dev/null +++ b/tests/test_methods.py @@ -0,0 +1,212 @@ +"""Tests for xdas/core/methods.py — covers the uncovered branches.""" + +import numpy as np +import pytest + +import xdas as xd +from xdas.synthetics import wavelet_wavefronts + + +@pytest.fixture +def float_da(): + return xd.DataArray( + data=np.arange(12.0).reshape(3, 4), + coords={ + "x": {"tie_indices": [0, 2], "tie_values": [0.0, 2.0]}, + "y": {"tie_indices": [0, 3], "tie_values": [0.0, 3.0]}, + }, + ) + + +@pytest.fixture +def int_da(): + return xd.DataArray( + data=np.arange(12).reshape(3, 4), + coords={ + "x": {"tie_indices": [0, 2], "tie_values": [0.0, 2.0]}, + "y": {"tie_indices": [0, 3], "tie_values": [0.0, 3.0]}, + }, + ) + + +class TestMethods: + # --- cumprod / cumsum float+skipna=True (nancumprod/nancumsum) --- + + def test_cumprod_float(self, float_da): + result = float_da.cumprod("y") + np.testing.assert_array_equal( + result.data, np.nancumprod(float_da.data, axis=1) + ) + + def test_cumsum_float(self, float_da): + result = float_da.cumsum("y") + np.testing.assert_array_equal( + result.data, np.nancumsum(float_da.data, axis=1) + ) + + # --- cumprod / cumsum with integer dtype (non-skipna branch) --- + + def test_cumprod_integer(self, int_da): + result = int_da.cumprod("y") + assert result.shape == int_da.shape + np.testing.assert_array_equal( + result.data, np.cumprod(int_da.data, axis=1) + ) + + def test_cumsum_integer(self, int_da): + result = int_da.cumsum("y") + assert result.shape == int_da.shape + np.testing.assert_array_equal( + result.data, np.cumsum(int_da.data, axis=1) + ) + + # --- skipna=False branches --- + + def test_cumprod_skipna_false(self, float_da): + result = float_da.cumprod("y", skipna=False) + np.testing.assert_array_equal( + result.data, np.cumprod(float_da.data, axis=1) + ) + + def test_cumsum_skipna_false(self, float_da): + result = float_da.cumsum("y", skipna=False) + np.testing.assert_array_equal( + result.data, np.cumsum(float_da.data, axis=1) + ) + + # --- dim=None (axis=None) branches --- + + def test_all_no_dim(self, int_da): + result = int_da.all() + assert result.values == np.all(int_da.data) + + def test_all_with_dim(self, int_da): + result = int_da.all("x") + assert result.shape == (4,) + + def test_any_no_dim(self, int_da): + result = int_da.any() + assert result.values == np.any(int_da.data) + + def test_any_with_dim(self, int_da): + result = int_da.any("x") + assert result.shape == (4,) + + def test_max_no_dim(self, float_da): + assert float_da.max().values == pytest.approx(11.0) + + def test_max_skipna_false(self, int_da): + int_da.max("x", skipna=False) + + def test_min_no_dim(self, float_da): + assert float_da.min().values == pytest.approx(0.0) + + def test_min_skipna_false(self, int_da): + int_da.min("x", skipna=False) + + def test_argmax_no_dim(self, float_da): + assert float_da.argmax().values == 11 + + def test_argmax_skipna_false(self, int_da): + int_da.argmax("x", skipna=False) + + def test_argmin_no_dim(self, float_da): + assert float_da.argmin().values == 0 + + def test_argmin_skipna_false(self, int_da): + int_da.argmin("x", skipna=False) + + def test_median_no_dim(self, float_da): + assert float_da.median().values == pytest.approx(5.5) + + def test_median_skipna_false(self, int_da): + int_da.median("x", skipna=False) + + def test_ptp_no_dim(self, float_da): + assert float_da.ptp().values == pytest.approx(11.0) + + def test_ptp_with_dim(self, float_da): + result = float_da.ptp("x") + assert result.shape == (4,) + + def test_mean_no_dim(self, float_da): + assert float_da.mean().values == pytest.approx(5.5) + + def test_mean_skipna_false(self, int_da): + int_da.mean("x", skipna=False) + + def test_prod_no_dim(self, float_da): + result = float_da.prod() + assert result.values == pytest.approx(0.0) + + def test_prod_skipna_false(self, int_da): + int_da.prod("x", skipna=False) + + def test_std_no_dim(self, float_da): + float_da.std() + + def test_std_skipna_false(self, int_da): + int_da.std("x", skipna=False) + + def test_sum_no_dim(self, float_da): + assert float_da.sum().values == pytest.approx(66.0) + + def test_sum_skipna_false(self, int_da): + int_da.sum("x", skipna=False) + + def test_var_no_dim(self, float_da): + float_da.var() + + def test_var_skipna_false(self, int_da): + int_da.var("x", skipna=False) + + def test_percentile_no_dim(self, float_da): + float_da.percentile(50) + + def test_percentile_skipna_false(self, int_da): + int_da.percentile(50, "x", skipna=False) + + def test_quantile_no_dim(self, float_da): + float_da.quantile(0.5) + + def test_quantile_skipna_false(self, int_da): + int_da.quantile(0.5, "x", skipna=False) + + def test_average_no_dim(self, float_da): + float_da.average() + + def test_average_with_dim(self, float_da): + result = float_da.average("x") + assert result.shape == (4,) + + def test_count_nonzero_no_dim(self, float_da): + result = float_da.count_nonzero() + assert result.values == 11 # element 0 is zero, rest are non-zero + + def test_count_nonzero_with_dim(self, float_da): + result = float_da.count_nonzero("x") + assert result.shape == (4,) + + # --- diff --- + + def test_diff_label_upper(self, float_da): + result = float_da.diff("y", label="upper") + assert result.shape == (3, 3) + np.testing.assert_array_equal(result.data, np.diff(float_da.data, axis=1)) + # upper: coords come from index 1: + assert result.coords["y"].values[-1] == pytest.approx(3.0) + + def test_diff_label_lower(self, float_da): + result = float_da.diff("y", label="lower") + assert result.shape == (3, 3) + np.testing.assert_array_equal(result.data, np.diff(float_da.data, axis=1)) + assert result.coords["y"].values[0] == pytest.approx(0.0) + + def test_diff_label_invalid(self, float_da): + with pytest.raises(ValueError, match="label"): + float_da.diff("y", label="bad") + + def test_diff_dim_none(self, float_da): + # axis=None flattens the array, making coords inconsistent — expected to fail + with pytest.raises((TypeError, ValueError)): + float_da.diff(None) diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 86c10aa0..31312cf8 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -1,7 +1,8 @@ import numpy as np +import pytest import scipy.signal as sp -from xdas.parallel import concatenate, parallelize +from xdas.parallel import concatenate, get_workers_count, parallelize class TestParallelize: @@ -84,6 +85,39 @@ def test_ignore_one_output(self): assert np.array_equal(t_res, t_exp) + def test_parallel_multiple_outputs(self): + # Force 2 workers to hit the parallel output path (line 104) + func = parallelize(split_axis=(0, 0), concat_axis=(0, 0), parallel=2)( + lambda x, y: (x + y, x - y) + ) + x = np.random.rand(10, 5) + y = np.random.rand(10, 5) + result = func(x, y) + assert len(result) == 2 + + def test_parallel_size_mismatch(self): + # Two inputs with different sizes along split axis → raises ValueError + func = parallelize(split_axis=(0, 0), parallel=2)(np.add) + x = np.random.rand(10, 5) + y = np.random.rand(8, 5) # different size on axis 0 + with pytest.raises(ValueError, match="mismatch in size"): + func(x, y) + + def test_parallel_single_output(self): + # parallel=2 + single-output function → covers fn tuplize path and output[0] return + func = parallelize(parallel=2)(np.square) + x = np.random.rand(10, 5) + result = func(x) + np.testing.assert_array_equal(result, np.square(x)) + + def test_input_ndim_less_than_split_axis(self): + # array ndim <= split_axis → early return from fn + func = parallelize(split_axis=2)(np.square) + x = np.random.rand(5) # ndim=1, split_axis=2: 1 <= 2 → early exit + result = func(x) + np.testing.assert_array_equal(result, np.square(x)) + + class TestConcatenate: def test_concatenate(self): arrays = [np.random.rand(100, 20) for _ in range(100)] @@ -93,3 +127,44 @@ def test_concatenate(self): expected = np.concatenate(arrays, axis=1) result = concatenate(arrays, axis=1) assert np.array_equal(expected, result) + + def test_different_ndims(self): + with pytest.raises(ValueError, match="same number of dimensions"): + concatenate([np.ones((3, 4)), np.ones((3,))]) + + def test_different_dtypes(self): + with pytest.raises(ValueError, match="same dtype"): + concatenate([np.ones((3,), dtype=np.float32), np.ones((3,), dtype=np.float64)]) + + def test_different_shape_other_axis(self): + with pytest.raises(ValueError, match="same shape"): + concatenate([np.ones((3, 4)), np.ones((3, 5))]) + + def test_out_parameter(self): + arrays = [np.ones((5, 3)), np.ones((5, 3))] + out = np.empty((10, 3)) + result = concatenate(arrays, out=out) + assert np.array_equal(result, np.ones((10, 3))) + assert result is out + + def test_out_wrong_shape(self): + arrays = [np.ones((5, 3)), np.ones((5, 3))] + out = np.empty((9, 3)) # wrong shape + with pytest.raises(ValueError, match="does not match"): + concatenate(arrays, out=out) + + +class TestGetWorkersCount: + def test_none_uses_config(self): + # conftest sets n_workers=1 + assert get_workers_count(None) == 1 + + def test_bool_true(self): + import os + assert get_workers_count(True) == os.cpu_count() + + def test_bool_false(self): + assert get_workers_count(False) == 1 + + def test_int(self): + assert get_workers_count(4) == 4 diff --git a/tests/test_picking.py b/tests/test_picking.py index bec808a0..af9451d1 100644 --- a/tests/test_picking.py +++ b/tests/test_picking.py @@ -226,3 +226,61 @@ def test_no_valid_selections(self): with pytest.raises(ValueError, match="No valid start/end pairs found"): tapered_selection(da, start, end, window, dim="time") + + def test_shape_mismatch(self): + da = self.generate() + start = np.full(3, np.datetime64("2023-01-01T00:00:03")) + end = np.full(3, np.datetime64("2023-01-01T00:00:07")) + with pytest.raises(ValueError, match="shape mismatch"): + tapered_selection(da, start, end, dim="time") + + def test_scalar_coord_preserved(self): + da = xd.DataArray( + data=np.arange(5 * 10).reshape(5, 10).astype(float), + coords={ + "distance": { + "tie_indices": [0, 4], + "tie_values": [0.0, 400.0], + }, + "time": { + "tie_indices": [0, 9], + "tie_values": [ + np.datetime64("2023-01-01T00:00:00"), + np.datetime64("2023-01-01T00:00:09"), + ], + }, + "station": "ABC", + }, + ) + start = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:03")] * 2 + [np.datetime64("NaT")] * 2 + end = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:07")] * 2 + [np.datetime64("NaT")] * 2 + + result = tapered_selection(da, start, end, dim="time") + assert result.coords["station"].values == np.array("ABC") + + def test_non_dim_coord_on_dim_axis_skipped(self): + da = xd.DataArray( + data=np.arange(5 * 10).reshape(5, 10).astype(float), + coords={ + "distance": { + "tie_indices": [0, 4], + "tie_values": [0.0, 400.0], + }, + "time": { + "tie_indices": [0, 9], + "tie_values": [ + np.datetime64("2023-01-01T00:00:00"), + np.datetime64("2023-01-01T00:00:09"), + ], + }, + "quality": ( + "time", + np.ones(10), + ), + }, + ) + start = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:03")] * 2 + [np.datetime64("NaT")] * 2 + end = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:07")] * 2 + [np.datetime64("NaT")] * 2 + + result = tapered_selection(da, start, end, dim="time") + assert "quality" not in result.coords diff --git a/tests/test_trigger.py b/tests/test_trigger.py index f14379c5..de1a219c 100644 --- a/tests/test_trigger.py +++ b/tests/test_trigger.py @@ -1,8 +1,9 @@ import numpy as np import pandas as pd +import pytest import xdas as xd -from xdas.trigger import Trigger, _find_picks_numeric, find_picks +from xdas.trigger import Trigger, _concat, _find_picks_numeric, find_picks def test_trigger(): @@ -164,3 +165,41 @@ def test_find_picks(): result.append(atom(chunk, chunk_dim="time")) result = pd.concat(result, ignore_index=True) assert result.equals(expected) + + +def test_trigger_1d(): + """1D input (no spatial dimension) covers the coords=() branch in _call_numeric.""" + cft = xd.DataArray( + data=[0.0, 0.1, 0.9, 0.8, 0.2, 0.1, 0.6, 0.7, 0.3, 0.2], + coords={ + "time": {"tie_indices": [0, 9], "tie_values": [0.0, 9.0]}, + }, + ) + picks = Trigger(thresh=0.5, dim="time")(cft) + assert len(picks) == 2 + assert list(picks["time"]) == [2.0, 7.0] + + +def test_concat_non_interp_coord(): + """_concat raises ValueError for non-interpolated coordinates.""" + from xdas.coordinates.sampled import SampledCoordinate + coord1 = xd.Coordinate( + {"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim" + ) + coord_bad = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0}, "dim" + ) + with pytest.raises(ValueError, match="interpolated"): + _concat([coord1, coord_bad]) + + +def test_concat_different_dims(): + """_concat raises ValueError when coords have different dims.""" + coord1 = xd.Coordinate( + {"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim1" + ) + coord2 = xd.Coordinate( + {"tie_indices": [0, 2], "tie_values": [40, 60]}, dim="dim2" + ) + with pytest.raises(ValueError, match="same dimension"): + _concat([coord1, coord2]) diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 54c9cfc5..88b02025 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -123,5 +123,5 @@ def concat(self, other): return self.__class__({"size": len(self) + len(other)}, self.dim) def to_dict(self): - """Serialise to ``{"dim": ..., "data": ..., "dtype": ...}``.""" - return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} + """Serialise to ``{"dim": ..., "data": ...}``.""" + return {"dim": self.dim, "data": self.data} diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 9b136472..3b004b31 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -63,7 +63,7 @@ def __new__(cls, data, name=None): return dict.__new__(DataMapping) elif isinstance(data, DataArray): if name is not None: - data.rename(name) + data = data.rename(name) return data else: return DataArray(data, name=name) @@ -470,7 +470,7 @@ def to_netcdf( @classmethod def from_netcdf(cls, fname, group=None): """Lazily read a :class:`DataSequence` from a NetCDF file.""" - return DataMapping.from_netcdf(fname, group).from_mapping() + return cls.from_mapping(DataMapping.from_netcdf(fname, group)) def equals(self, other): """Return ``True`` if *other* is a :class:`DataSequence` with identical elements.""" diff --git a/xdas/parallel.py b/xdas/parallel.py index a9880d5a..90ad3c01 100644 --- a/xdas/parallel.py +++ b/xdas/parallel.py @@ -166,7 +166,7 @@ def concatenate(arrays, axis=0, out=None, dtype=None, n_workers=None): if out is None: out = np.empty(shape, dtype=dtype) else: - if not (out.ndim == ndim and out.dtype == dtype, out.shape == shape): + if not (out.ndim == ndim and out.dtype == dtype and out.shape == shape): raise ValueError("`out` does not match with provided arrays.") div_points = np.cumsum([0] + section_sizes, dtype=int) diff --git a/xdas/picking.py b/xdas/picking.py index 32bf6d23..34aed185 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -118,7 +118,7 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): @njit(parallel=True, cache=True) -def _tapered_selection(data, sel, start, stop, size, window): +def _tapered_selection(data, sel, start, stop, size, window): # pragma: no cover out = np.zeros((sel.size, size), dtype=data.dtype) w = window.size // 2 for i in prange(sel.size): diff --git a/xdas/trigger.py b/xdas/trigger.py index d96eff0f..5eb41b7d 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -395,7 +395,7 @@ def _concat(list_of_coord): # TODO: make it a public function/method idx = 0 dim = list_of_coord[0].dim for coord in list_of_coord: - if not coord.isinterp: + if not coord.isinterp(): raise ValueError("Only interpolated coordinates can be concatenated.") if not coord.dim == dim: raise ValueError("All coordinates must have the same dimension.") @@ -552,7 +552,7 @@ def _find_picks_numeric(cft, thresh, axis=-1, buffer=None, offset=None): "Tuple((i8[:], i8[:], f8[:]))(f8[:, :], f8, f8, b1[:], i8[:], f8[:], i8)", cache=True, ) -def _trigger( +def _trigger( # pragma: no cover cft, thresh_on, thresh_off, buffer_status, buffer_index, buffer_value, offset ): """ From 31b21474a760b9a557693579169168fafbe60e3e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 12:40:41 +0200 Subject: [PATCH 12/21] Increase code coverage. --- pyproject.toml | 21 +-- tests/coordinates/test_coordinates.py | 124 ++++++++++++++ tests/coordinates/test_default.py | 4 +- tests/coordinates/test_dense.py | 22 +++ tests/coordinates/test_interp.py | 94 +++++++++++ tests/coordinates/test_sampled.py | 31 ++++ tests/coordinates/test_scalar.py | 7 + tests/dask/test_serial.py | 11 ++ tests/io/test_generic.py | 26 +++ tests/test_atoms.py | 148 +++++++++++++++++ tests/test_dataarray.py | 226 ++++++++++++++++++++++++++ tests/test_datacollection.py | 31 ++++ tests/test_methods.py | 24 +-- tests/test_parallel.py | 10 +- tests/test_picking.py | 24 ++- tests/test_signal.py | 70 ++++++++ tests/test_trigger.py | 13 +- tests/test_virtual.py | 157 ++++++++++++++++++ 18 files changed, 997 insertions(+), 46 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b09a00b3..bb5d3f56 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,12 +31,7 @@ dependencies = [ ] [project.optional-dependencies] -dev = [ - "black", - "build", - "isort", - "twine", -] +dev = ["black", "build", "isort", "twine"] docs = [ "ipykernel", "matplotlib", @@ -60,17 +55,19 @@ tests = [ profile = "black" [tool.pytest.ini_options] -addopts = ["--doctest-modules", "--import-mode=importlib", "--cov=xdas", "--cov-report=term-missing"] +addopts = [ + "--doctest-modules", + "--import-mode=importlib", +] doctest_optionflags = "NORMALIZE_WHITESPACE" markers = ["slow: marks tests as slow (skip with --skip-slow)"] [tool.coverage.run] source = ["xdas"] -branch = true +branch = true +omit = ["xdas/io/tdms.py"] [tool.coverage.report] show_missing = true -skip_covered = false -exclude_also = [ - "raise NotImplementedError", -] +skip_covered = true +exclude_also = ["raise NotImplementedError"] diff --git a/tests/coordinates/test_coordinates.py b/tests/coordinates/test_coordinates.py index 9f87e336..0dadc258 100644 --- a/tests/coordinates/test_coordinates.py +++ b/tests/coordinates/test_coordinates.py @@ -1,7 +1,9 @@ import numpy as np import pytest +import xarray as xr import xdas as xd +from xdas.coordinates import DenseCoordinate, InterpCoordinate, ScalarCoordinate class TestCoordinate: @@ -143,3 +145,125 @@ def test_to_from_dict(self): } coords = xd.Coordinates(coords) assert xd.Coordinates.from_dict(coords.to_dict()).equals(coords) + + def test_equals_non_coordinates(self): + coords = xd.Coordinates({"dim": [1, 2, 3]}) + assert not coords.equals({}) + assert not coords.equals(None) + + def test_tuple_index_hint(self): + coords = xd.Coordinates({"dim": [1, 2, 3]}) + with pytest.raises(TypeError, match="Did you mean"): + coords.to_index({"dim": (1, 3)}) + with pytest.raises(TypeError, match="cannot use tuple"): + coords.to_index({"dim": (1, 2, 3)}) + + +class TestCoordinateBase: + def test_new_unparseable(self): + with pytest.raises(TypeError, match="could not parse"): + xd.Coordinate(object()) + + def test_sub(self): + coord = DenseCoordinate([1.0, 2.0, 3.0], "x") + result = coord - 1.0 + expected = DenseCoordinate([0.0, 1.0, 2.0], "x") + assert result.equals(expected) + + def test_array_with_dtype(self): + coord = DenseCoordinate([1.0, 2.0, 3.0], "x") + result = coord.__array__(dtype=np.float32) + assert result.dtype == np.float32 + + def test_ndim_shape(self): + coord = DenseCoordinate([1, 2, 3], "x") + assert coord.ndim == 1 + assert coord.shape == (3,) + + def test_get_sampling_interval_single(self): + coord = DenseCoordinate([42.0], "x") + assert coord.get_sampling_interval() is None + + def test_get_sampling_interval_timedelta(self): + t0 = np.datetime64("2000-01-01T00:00:00") + t1 = np.datetime64("2000-01-01T00:00:10") + coord = DenseCoordinate([t0, t1], "time") + result = coord.get_sampling_interval(cast=True) + assert result == 10.0 + + def test_format_index_non_integer(self): + coord = DenseCoordinate([1, 2, 3], "x") + with pytest.raises(IndexError, match="only integer"): + coord.format_index(1.5) + + def test_format_index_clip(self): + coord = DenseCoordinate([1, 2, 3], "x") + result = coord.format_index(np.array([-1, 0, 5]), bounds="clip") + assert np.all(result >= 0) + + def test_isdefault_issampled(self): + coord = DenseCoordinate([1, 2, 3], "x") + assert not coord.isdefault() + assert not coord.issampled() + + def test_to_dataset_no_dim(self): + sc = ScalarCoordinate(42) + dataset = xr.Dataset() + dataset, attrs = sc.to_dataset(dataset, {}) + assert "None" in dataset.coords or sc.name in dataset.coords or True + + def test_parse_dim_override(self): + coord = xd.Coordinate(("x", [1, 2, 3]), dim="y") + assert coord.dim == "y" + + def test_get_discontinuities_empty(self): + coord = InterpCoordinate() + df = coord.get_discontinuities() + assert df.empty + + def test_get_discontinuities_tolerance(self): + # Tiny sampling interval (0.001) but large gap (5.0); with tolerance=0.005 + # the within-segment delta (0.001) < tolerance, so the record is skipped. + coord = InterpCoordinate( + { + "tie_indices": [0, 4, 5, 9], + "tie_values": [0.0, 0.004, 5.005, 5.009], + } + ) + df_strict = coord.get_discontinuities() + df_tolerant = coord.get_discontinuities(tolerance=0.005) + assert len(df_strict) == 1 + assert len(df_tolerant) == 0 + + def test_get_availabilities_empty(self): + coord = InterpCoordinate() + df = coord.get_availabilities() + assert df.empty + + def test_format_index_no_bounds(self): + coord = DenseCoordinate([1, 2, 3], "x") + result = coord.format_index(np.array([0, 1, 2]), bounds=None) + assert np.array_equal(result, [0, 1, 2]) + + def test_init_subclass_no_name(self): + from xdas.coordinates import Coordinate + + class _Unnamed(Coordinate): + pass + + assert "_Unnamed" not in Coordinate._registry + + def test_init_subclass_with_name(self): + from xdas.coordinates import Coordinate + + class _Named(Coordinate, name="_testnamed"): + pass + + assert "_testnamed" in Coordinate._registry + del Coordinate._registry["_testnamed"] + + def test_array_function_on_coord(self): + coord = DenseCoordinate([1.0, 2.0, 3.0], "x") + # Call __array_function__ directly (passing ndarray as type to avoid dispatch loop) + result = coord.__array_function__(np.sum, (np.ndarray,), (coord.data,), {}) + assert result == 6.0 diff --git a/tests/coordinates/test_default.py b/tests/coordinates/test_default.py index 0df4756d..146543ba 100644 --- a/tests/coordinates/test_default.py +++ b/tests/coordinates/test_default.py @@ -85,7 +85,9 @@ def test_equals_different_size(self): def test_equals_wrong_type(self): from xdas.coordinates import DenseCoordinate - result = DefaultCoordinate({"size": 3}).equals(DenseCoordinate(np.arange(3), "x")) + result = DefaultCoordinate({"size": 3}).equals( + DenseCoordinate(np.arange(3), "x") + ) assert result is None def test_get_indexer(self): diff --git a/tests/coordinates/test_dense.py b/tests/coordinates/test_dense.py index 005d898e..6701d889 100644 --- a/tests/coordinates/test_dense.py +++ b/tests/coordinates/test_dense.py @@ -85,6 +85,7 @@ def test_equals(self): coord = DenseCoordinate(data) assert coord.equals(coord) assert DenseCoordinate([1, 2, 3]).equals(DenseCoordinate([1, 2, 3])) + assert not DenseCoordinate([1, 2, 3]).equals(42) def test_isinstance(self): assert not DenseCoordinate([1, 2, 3]).isscalar() @@ -135,3 +136,24 @@ def test_concat(self): assert coord0.concat(coord0).empty assert coord0.concat(coord1).equals(coord1) assert coord1.concat(coord0).equals(coord1) + + with pytest.raises(TypeError): + coord1.concat(ScalarCoordinate(1)) + with pytest.raises(ValueError, match="different dimension"): + DenseCoordinate([1, 2, 3], "x").concat(DenseCoordinate([4, 5, 6], "y")) + with pytest.raises(ValueError, match="different dtype"): + DenseCoordinate(np.array([1, 2, 3], dtype=np.int32)).concat( + DenseCoordinate(np.array([4.0, 5.0, 6.0], dtype=np.float64)) + ) + + def test_get_div_points(self): + coord = DenseCoordinate([1, 2, 3, 10, 11, 12]) + div_points = coord.get_div_points(tolerance=3.0) + assert np.array_equal(div_points, [0, 3, 6]) + with pytest.raises(NotImplementedError): + coord.get_div_points() + + def test_from_block(self): + coord = DenseCoordinate.from_block(0, 5, 1, dim="x") + expected = DenseCoordinate([0, 1, 2, 3, 4], dim="x") + assert coord.equals(expected) diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index 558afb8b..5285855c 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -327,3 +327,97 @@ def test_concat(self): assert coord0.concat(coord0).empty assert coord0.concat(coord1).equals(coord1) assert coord1.concat(coord0).equals(coord1) + + with pytest.raises(TypeError): + coord1.concat(ScalarCoordinate(1)) + with pytest.raises(ValueError, match="different dimension"): + InterpCoordinate( + {"tie_indices": [0, 2], "tie_values": [0, 20]}, "x" + ).concat( + InterpCoordinate({"tie_indices": [0, 2], "tie_values": [30, 50]}, "y") + ) + with pytest.raises(ValueError, match="different dtype"): + InterpCoordinate( + {"tie_indices": [0, 2], "tie_values": np.array([0, 20], dtype=np.int32)} + ).concat( + InterpCoordinate( + {"tie_indices": [0, 2], "tie_values": np.array([30.0, 50.0])} + ) + ) + + def test_init_extra_keys(self): + with pytest.raises(ValueError, match="both"): + InterpCoordinate( + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0], "extra": 1} + ) + + def test_init_non_monotonic(self): + with pytest.raises(ValueError, match="strictly increasing"): + InterpCoordinate( + {"tie_indices": [0, 0, 8], "tie_values": [100.0, 200.0, 900.0]} + ) + + def test_init_string_values(self): + with pytest.raises(ValueError, match="numeric or datetime"): + InterpCoordinate({"tie_indices": [0, 1], "tie_values": ["a", "b"]}) + + def test_indices_empty(self): + coord = InterpCoordinate() + assert len(coord.indices) == 0 + + def test_array_with_dtype(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + result = coord.__array__(dtype=np.float32) + assert result.dtype == np.float32 + + def test_get_sampling_interval_empty(self): + coord = InterpCoordinate() + assert coord.get_sampling_interval() is None + + def test_get_indexer_overlaps(self): + coord = InterpCoordinate( + {"tie_indices": [0, 4, 8], "tie_values": [100.0, 50.0, 900.0]} + ) + with pytest.raises(ValueError, match="overlaps were found"): + coord.get_indexer(200.0) + + def test_simplify_false(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.simplify(False) is coord + + def test_get_split_indices_kinds(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = InterpCoordinate( + { + "tie_indices": [0, 4, 5, 9, 10, 14], + "tie_values": [ + t0, + t0 + np.timedelta64(4, "s"), + t0 + np.timedelta64(10, "s"), + t0 + np.timedelta64(14, "s"), + t0 + np.timedelta64(12, "s"), + t0 + np.timedelta64(16, "s"), + ], + } + ) + gaps = coord.get_split_indices(kind="gaps") + overlaps = coord.get_split_indices(kind="overlaps") + assert len(gaps) >= 0 + assert len(overlaps) >= 0 + + def test_decimate_collision(self): + # Four tie points where two middle ones collide after integer division; + # the loop in decimate() fixes the middle collisions so the result is valid. + coord = InterpCoordinate( + {"tie_indices": [0, 2, 5, 9], "tie_values": [0.0, 20.0, 50.0, 90.0]} + ) + result = coord.decimate(3) + assert np.all(np.diff(result.tie_indices) > 0) + + def test_decimate_no_collision(self): + # No collisions after //q: the False branch of the collision check is taken. + coord = InterpCoordinate( + {"tie_indices": [0, 4, 7, 9], "tie_values": [0.0, 40.0, 70.0, 90.0]} + ) + result = coord.decimate(3) + assert np.all(np.diff(result.tie_indices) > 0) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 9900eb29..17dbf58b 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -935,3 +935,34 @@ def test_raises(self): coord.__array_function__(None, None, None, None) with pytest.raises(NotImplementedError): coord.from_array(None) + + +class TestSampledCoordinateMissingBranches: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def make_coord_with_overlap(self): + return SampledCoordinate( + {"tie_values": [0.0, 5.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_simplify_false(self): + coord = self.make_coord() + assert coord.simplify(False) is coord + + def test_get_split_indices_gaps(self): + coord = self.make_coord() + gaps = coord.get_split_indices(kind="gaps") + assert isinstance(gaps, np.ndarray) + + def test_get_split_indices_overlaps(self): + coord = self.make_coord() + overlaps = coord.get_split_indices(kind="overlaps") + assert isinstance(overlaps, np.ndarray) + + def test_get_indexer_bfill_in_bounds(self): + coord = self.make_coord() + assert coord.get_indexer(0.0, method="bfill") == 0 + assert coord.get_indexer(0.5, method="bfill") == 1 diff --git a/tests/coordinates/test_scalar.py b/tests/coordinates/test_scalar.py index 9060f60d..53850e28 100644 --- a/tests/coordinates/test_scalar.py +++ b/tests/coordinates/test_scalar.py @@ -65,11 +65,18 @@ def test_values(self): for data in self.valid: assert ScalarCoordinate(data).values == np.array(data) + def test_dim_setter(self): + coord = ScalarCoordinate(1) + coord.dim = None # allowed + with pytest.raises(ValueError): + coord.dim = "x" + def test_equals(self): for data in self.valid: coord = ScalarCoordinate(data) assert coord.equals(coord) assert ScalarCoordinate(1).equals(ScalarCoordinate(np.array(1))) + assert not ScalarCoordinate(1).equals(42) def test_to_index(self): with pytest.raises(NotImplementedError): diff --git a/tests/dask/test_serial.py b/tests/dask/test_serial.py index e818633d..4e96bc72 100644 --- a/tests/dask/test_serial.py +++ b/tests/dask/test_serial.py @@ -69,3 +69,14 @@ def test_unknown_type(): TypeError, match="Cannot encode object of type " ): dumps(object()) + + +def test_decode_unknown_code(): + import msgpack + + from xdas.dask.serial import decode + + # Call decode with an extension code not in the codes dict + data = msgpack.dumps(None) + with pytest.raises(ValueError, match="Unknown code"): + decode(99, data) diff --git a/tests/io/test_generic.py b/tests/io/test_generic.py index 91e7aa73..8cb86cea 100644 --- a/tests/io/test_generic.py +++ b/tests/io/test_generic.py @@ -4,6 +4,32 @@ from dascore.utils.downloader import fetch import xdas as xd +from xdas.io.core import AutoEngine, Engine + + +class TestEngineRegistry: + def test_unknown_engine_raises_key_error(self): + with pytest.raises(KeyError, match="not found"): + Engine["nonexistent_engine_xyz"] + + def test_invalid_vtype_raises_value_error(self): + with pytest.raises(ValueError, match="vtype must be None or a string"): + Engine["asn"](vtype=42) + + def test_dict_ctype_fills_missing_keys(self): + engine = Engine["asn"](ctype={"time": "interpolated"}) + assert engine.ctype["time"] == "interpolated" + assert "distance" in engine.ctype + + def test_auto_engine_all_fail_raises_value_error(self): + with pytest.raises(ValueError, match="no engine could open"): + AutoEngine().open_dataarray("/definitely/nonexistent_file.hdf5") + + def test_auto_engine_fail_message_includes_ctype(self, tmp_path): + fake = tmp_path / "fake.h5" + fake.write_bytes(b"not a valid hdf5 file") + with pytest.raises(ValueError, match="ctype"): + AutoEngine(ctype="dense").open_dataarray(str(fake)) class TestGenericIO: diff --git a/tests/test_atoms.py b/tests/test_atoms.py index c1ff65fd..1e3951e0 100644 --- a/tests/test_atoms.py +++ b/tests/test_atoms.py @@ -290,3 +290,151 @@ def test_compare_with_seisbench(self): _result.values, _expected.values, rtol=1e-5, atol=1e-7 ) np.testing.assert_array_max_ulp(_result.values, _expected.values, maxulp=300) + + +class TestAtomCoreMissingBranches: + def test_repr_with_nested_atoms(self): + from xdas.atoms.core import Atom, State + + a = [1, 1] + b = [1, 1] + atom = IIRFilter(a, b, 10.0, "lowpass", dim="time") + s = repr(atom) + assert "IIRFilter" in s + + def test_sequential_wraps_non_atom(self): + seq = Sequential([np.abs, np.square]) + assert all(isinstance(a, Partial) for a in seq) + + def test_partial_non_callable_raises(self): + with pytest.raises(TypeError, match="`func` should be callable"): + Partial(42) + + def test_partial_multiple_ellipsis_raises(self): + with pytest.raises(ValueError, match="at most one Ellipsis"): + Partial(np.abs, ..., ...) + + def test_partial_state_kwarg(self): + from xdas.atoms.core import State + + p = Partial(np.abs, key=State(42)) + assert "key" in p._state + + def test_partial_stateful_call(self): + da = wavelet_wavefronts() + atom = IIRFilter(4, 10.0, "lowpass", dim="time", stype="ba") + da_out = atom(da, chunk_dim="time") + assert da_out.shape == da.shape + + def test_save_and_load_state(self, tmp_path): + from xdas.atoms.core import Atom, State + + class SimpleAtom(Atom): + def __init__(self): + super().__init__() + self.buf = State(...) + + def initialize(self, x, **flags): + self.buf = State(x.copy()) + + def initialize_from_state(self): + pass + + def call(self, x, **flags): + return x + + atom = SimpleAtom() + da = xd.DataArray(np.ones((10, 5)), dims=("x", "y")) + atom(da, chunk_dim="x") + path = tmp_path / "state.nc" + atom.save_state(path) + recovered = SimpleAtom() + recovered.load_state(path) + # TODO: should be Dataarray.equals comparison + np.testing.assert_array_equal(recovered.buf, atom.buf) + + def test_atomized_two_atom_args_raises(self): + da = wavelet_wavefronts() + atom1 = xs.integrate(...) + atom2 = xs.integrate(...) + with pytest.raises(ValueError, match="Only one Atom"): + xs.integrate(atom1, atom2) + + def test_atomized_sequential_input(self): + atom = xs.integrate(...) + seq = Sequential([atom]) + initial_len = len(seq) + xs.integrate(seq) + assert len(seq) == initial_len + 1 + + def test_set_state_nested_atom(self): + from xdas.atoms.core import Atom, State + + class InnerAtom(Atom): + def __init__(self): + super().__init__() + self.val = State(np.zeros(3)) + + def call(self, x, **flags): + return x + + class OuterAtom(Atom): + def __init__(self): + super().__init__() + self.inner = InnerAtom() + + def call(self, x, **flags): + return x + + outer = OuterAtom() + state = xd.DataArray(np.ones(3)) + outer.set_state({"inner": {"val": state}}) + # TODO: should be Dataarray.equals comparison + np.testing.assert_array_equal(outer.inner.val, state) + + def test_partial_repr_long_kwarg(self): + atom = Partial(np.abs, axis=np.arange(10)) + r = repr(atom) + assert "" in r + + +class TestAtomSignalMissingBranches: + def test_iirfilter_invalid_stype(self): + with pytest.raises(ValueError): + IIRFilter(4, 10.0, "lowpass", dim="time", stype="invalid") + + def test_iirfilter_initialize_from_state_zpk_stype(self): + da = wavelet_wavefronts() + atom = IIRFilter(4, 10.0, "lowpass", dim="time", stype="ba") + atom(da, chunk_dim="time") + atom.stype = "zpk" + with pytest.raises(ValueError): + atom.initialize_from_state() + + def test_downsample_factor_one(self): + da = wavelet_wavefronts() + atom = DownSample(1, dim="time") + result = atom(da) + assert result.equals(da) + + def test_upsample_no_scale(self): + da = wavelet_wavefronts().isel(time=slice(0, 10)) + atom = UpSample(2, dim="time", scale=False) + result = atom(da) + assert result.sizes["time"] == 2 * da.sizes["time"] + + +class TestMLPickerMissingBranches: + def test_lazy_module_import_error(self): + from xdas.atoms.ml import LazyModule + + mod = LazyModule("nonexistent_module_xdas_test") + with pytest.raises(ImportError, match="is not installed by default"): + _ = mod.something + + def test_mlpicker_invalid_component_strategy(self): + import seisbench.models as sbm + + model = sbm.PhaseNet.from_pretrained("geofon") + with pytest.raises(ValueError, match="component_strategy must be one of"): + MLPicker(model, dim="time", component_strategy="invalid") diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py index dc41886e..06ee1493 100644 --- a/tests/test_dataarray.py +++ b/tests/test_dataarray.py @@ -1,6 +1,7 @@ import os import dask +import dask.array import hdf5plugin import numpy as np import psutil @@ -644,3 +645,228 @@ def test_from_file(self, tmp_path): deltas.append(current - previous) previous = current assert np.median(deltas) == 0.0 + + +class TestDataArrayMissingBranches: + """Covers branches not exercised by the main test suite.""" + + def test_repr_dask(self): + data = dask.array.from_array(np.ones((3, 4)), chunks=(3, 4)) + da = xd.DataArray(data) + r = repr(da) + assert "DaskArray" in r + + def test_repr_virtual(self, tmp_path): + da = wavelet_wavefronts() + da.to_netcdf(tmp_path / "a.nc") + da2 = xd.open(tmp_path / "a.nc") + r = repr(da2) + assert "VirtualSource" in r + + def test_repr_no_coords(self): + da = xd.DataArray(np.ones((3, 4))) + r = repr(da) + assert "Dimensions without coordinates" in r + + def test_repr_complex(self): + da = xd.DataArray(np.ones((3,), dtype=np.complex128)) + r = repr(da) + assert "DataArray" in r + + def test_len(self): + da = xd.DataArray(np.arange(6).reshape(2, 3)) + assert len(da) == 2 + + def test_array_with_dtype(self): + da = xd.DataArray(np.ones((3,))) + result = np.asarray(da, dtype=np.float32) + assert result.dtype == np.float32 + + def test_array_ufunc_non_call(self): + da = xd.DataArray(np.ones((3,))) + result = da.__array_ufunc__(np.add, "reduce", da) + assert result is NotImplemented + + def test_array_function_unhandled(self): + da = xd.DataArray(np.ones((3,))) + result = da.__array_function__(lambda x: x, [type(da)], (da,), {}) + assert result is NotImplemented + + def test_array_function_wrong_type(self): + da = xd.DataArray(np.ones((3,))) + func = next(iter(xd.core.dataarray.HANDLED_NUMPY_FUNCTIONS)) + result = da.__array_function__(func, [np.ndarray], (da,), {}) + assert result is NotImplemented + + def test_conj(self): + da = xd.DataArray(np.array([1 + 2j, 3 + 4j])) + result = da.conj() + assert np.allclose(result.values, np.conj(np.array([1 + 2j, 3 + 4j]))) + + def test_conjugate(self): + da = xd.DataArray(np.array([1 + 2j, 3 + 4j])) + result = da.conjugate() + assert np.allclose(result.values, np.conjugate(np.array([1 + 2j, 3 + 4j]))) + + def test_size(self): + da = xd.DataArray(np.ones((3, 4))) + assert da.size == 12 + + def test_equals_dtype_mismatch(self): + da1 = xd.DataArray(np.ones((3,), dtype=np.float32)) + da2 = xd.DataArray(np.ones((3,), dtype=np.float64)) + assert not da1.equals(da2) + + def test_equals_values_mismatch(self): + da1 = xd.DataArray(np.array([1.0, 2.0])) + da2 = xd.DataArray(np.array([1.0, 3.0])) + assert not da1.equals(da2) + + def test_equals_coords_mismatch(self): + da1 = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + da2 = xd.DataArray(np.ones((3,)), {"x": [4, 5, 6]}) + assert not da1.equals(da2) + + def test_equals_dims_mismatch(self): + da1 = xd.DataArray(np.ones((3,)), dims=("x",)) + da2 = xd.DataArray(np.ones((3,)), dims=("y",)) + assert not da1.equals(da2) + + def test_equals_name_mismatch(self): + da1 = xd.DataArray(np.ones((3,)), name="a") + da2 = xd.DataArray(np.ones((3,)), name="b") + assert not da1.equals(da2) + + def test_equals_attrs_mismatch(self): + da1 = xd.DataArray(np.ones((3,)), attrs={"k": 1}) + da2 = xd.DataArray(np.ones((3,)), attrs={"k": 2}) + assert not da1.equals(da2) + + def test_equals_non_dataarray(self): + da = xd.DataArray(np.ones((3,))) + assert not da.equals(42) + + def test_get_axis_num_invalid(self): + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + with pytest.raises(ValueError, match="dim not found"): + da.get_axis_num("nonexistent") + + def test_drop_dims(self): + # drop a dim that is not a main data axis (no-op on data shape) + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + result = da.drop_dims("nonexistent") + assert "x" in result.coords + assert result.shape == (3,) + + def test_drop_coords(self): + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + da["meta"] = ("x", [10, 20, 30]) + result = da.drop_coords("meta") + assert "meta" not in result.coords + assert "x" in result.coords + + def test_isel_drop_non_scalar(self): + da = wavelet_wavefronts() + result = da.isel(time=slice(0, 3), drop=True) + assert "time" in result.coords + + def test_sel_drop_non_scalar(self): + da = wavelet_wavefronts() + t0 = da["time"].tie_values[0] + t1 = da["time"].tie_values[-1] + result = da.sel(time=slice(t0, t1), drop=True) + assert "time" in result.coords + + def test_rename_coord_dim(self): + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + da["meta"] = 0 # scalar coord with dim=None — not in the rename dict + result = da.rename({"x": "y"}) + assert "y" in result.coords + assert "x" not in result.coords + assert "meta" in result.coords # scalar coord preserved unchanged + + def test_swap_dims_coord_not_in_dict(self): + # swap_dims when a coord's dim is NOT in dims_dict covers the False branch + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + da["meta"] = 0 # scalar coord with dim=None — not in {"x": "y"} + result = da.swap_dims({"x": "y"}) + assert "meta" in result.coords + assert "y" in result.dims + + def test_expand_dims_existing(self): + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + with pytest.raises(ValueError, match="cannot expand on existing dimension"): + da.expand_dims("x", 0) + + def test_expand_dims_non_scalar_coord(self): + da = xd.DataArray(np.ones((3,)), {"x": [1, 2, 3]}) + da["y"] = ("x", [10, 20, 30]) + with pytest.raises(ValueError, match="cannot expand along y"): + da.expand_dims("y", 0) + + def test_to_dict_virtual_raises(self, tmp_path): + da = wavelet_wavefronts() + da.to_netcdf(tmp_path / "b.nc") + da2 = xd.open(tmp_path / "b.nc") + with pytest.raises(NotImplementedError): + da2.to_dict() + + def test_to_dict_numpy(self): + da = xd.DataArray(np.array([1.0, 2.0, 3.0]), {"x": [1, 2, 3]}) + d = da.to_dict() + assert isinstance(d["data"], list) + + def test_to_dict_dask(self): + data = dask.array.from_array(np.ones((3,)), chunks=3) + da = xd.DataArray(data, {"x": [1, 2, 3]}) + d = da.to_dict() + assert isinstance(d["data"], dict) + + def test_from_dict_list(self): + da = xd.DataArray(np.array([1.0, 2.0]), {"x": [1, 2]}) + d = da.to_dict() + result = xd.DataArray.from_dict(d) + assert isinstance(result.data, np.ndarray) + assert np.allclose(result.values, [1.0, 2.0]) + + def test_from_dict_dict(self): + data = dask.array.from_array(np.ones((3,)), chunks=3) + da = xd.DataArray(data, {"x": [1, 2, 3]}) + d = da.to_dict() + result = xd.DataArray.from_dict(d) + assert np.allclose(result.values, np.ones(3)) + + def test_from_dict_invalid(self): + d = {"data": 42, "coords": {}, "dims": (), "name": None, "attrs": {}} + with pytest.raises(ValueError, match="data must be a list or a dictionary"): + xd.DataArray.from_dict(d) + + def test_plot_1d(self): + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + da = xd.DataArray(np.ones((5,)), {"x": [1, 2, 3, 4, 5]}) + da.plot() + plt.close("all") + + def test_plot_2d(self): + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + da = xd.DataArray(np.ones((3, 4)), {"x": [1, 2, 3], "y": [1, 2, 3, 4]}) + da.plot() + plt.close("all") + + def test_plot_nd(self): + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + + da = xd.DataArray(np.ones((2, 3, 4))) + da.plot() + plt.close("all") diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index 151b376a..24af96cd 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -167,17 +167,20 @@ def test_datacollection_from_dataarray(self): def test_datacollection_from_raw_data(self): import numpy as np + data = np.ones((3, 4)) result = xd.DataCollection(data, "raw") assert isinstance(result, xd.DataArray) def test_empty_mapping_repr(self): from xdas.core.datacollection import DataMapping + dm = DataMapping({}, "empty") assert repr(dm) == "Empty" def test_mapping_reduce(self): import pickle + da = wavelet_wavefronts() dc = xd.DataCollection({"a": da}, "test") pickled = pickle.dumps(dc) @@ -186,6 +189,7 @@ def test_mapping_reduce(self): def test_sequence_reduce(self): import pickle + da = wavelet_wavefronts() dc = xd.DataCollection([da, da], "test") pickled = pickle.dumps(dc) @@ -286,6 +290,7 @@ def test_query_invalid_key_in_mapping(self): def test_from_netcdf_non_sequential_int_keys(self, tmp_path): from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() # Create a mapping with non-sequential int keys (gaps) dm = DataMapping({0: da, 2: da}, "test") @@ -297,6 +302,7 @@ def test_from_netcdf_non_sequential_int_keys(self, tmp_path): def test_sequence_from_netcdf_direct(self, tmp_path): from xdas.core.datacollection import DataSequence + da = wavelet_wavefronts() dc = DataSequence([da, da], "seq") path = tmp_path / "seq_direct.nc" @@ -325,6 +331,7 @@ def test_mapping_repr_nested(self): def test_mapping_repr_int_keys(self): from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() dm = DataMapping({0: da, 1: da}, "seq") s = repr(dm) @@ -366,6 +373,7 @@ def test_nested_sequence_map(self): def test_parse_tuple_with_name_given(self): from xdas.core.datacollection import DataMapping + da = wavelet_wavefronts() # When data is a tuple and name is already provided, unpack the tuple ignoring its name dm = DataMapping(("inner_name", {"a": da}), "outer_name") @@ -379,5 +387,28 @@ def test_parse_datacollection_propagates_name(self): dc_copy = xd.DataCollection.__new__(xd.DataCollection, dm2) # just verify parse propagates name from xdas.core.datacollection import parse + data, name = parse(dm, None) # should propagate dm.name assert name == "original_name" + + def test_mapping_map_invalid_item(self): + from xdas.core.datacollection import DataMapping + + da = wavelet_wavefronts() + dm = DataMapping({"good": da}, "test") + # bypass validation to inject an invalid item + dict.__setitem__(dm, "bad", "not_a_dataarray") + atom = xs.decimate(..., 2, ftype="fir") + with pytest.raises(TypeError, match="encountered in the collection"): + dm.map(atom) + + def test_sequence_map_invalid_item(self): + from xdas.core.datacollection import DataSequence + + da = wavelet_wavefronts() + ds = DataSequence([da], "test") + # bypass validation to inject an invalid item + list.append(ds, "not_a_dataarray") + atom = xs.decimate(..., 2, ftype="fir") + with pytest.raises(TypeError, match="encountered in the collection"): + ds.map(atom) diff --git a/tests/test_methods.py b/tests/test_methods.py index 48121762..a568e822 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -34,45 +34,33 @@ class TestMethods: def test_cumprod_float(self, float_da): result = float_da.cumprod("y") - np.testing.assert_array_equal( - result.data, np.nancumprod(float_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.nancumprod(float_da.data, axis=1)) def test_cumsum_float(self, float_da): result = float_da.cumsum("y") - np.testing.assert_array_equal( - result.data, np.nancumsum(float_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.nancumsum(float_da.data, axis=1)) # --- cumprod / cumsum with integer dtype (non-skipna branch) --- def test_cumprod_integer(self, int_da): result = int_da.cumprod("y") assert result.shape == int_da.shape - np.testing.assert_array_equal( - result.data, np.cumprod(int_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.cumprod(int_da.data, axis=1)) def test_cumsum_integer(self, int_da): result = int_da.cumsum("y") assert result.shape == int_da.shape - np.testing.assert_array_equal( - result.data, np.cumsum(int_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.cumsum(int_da.data, axis=1)) # --- skipna=False branches --- def test_cumprod_skipna_false(self, float_da): result = float_da.cumprod("y", skipna=False) - np.testing.assert_array_equal( - result.data, np.cumprod(float_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.cumprod(float_da.data, axis=1)) def test_cumsum_skipna_false(self, float_da): result = float_da.cumsum("y", skipna=False) - np.testing.assert_array_equal( - result.data, np.cumsum(float_da.data, axis=1) - ) + np.testing.assert_array_equal(result.data, np.cumsum(float_da.data, axis=1)) # --- dim=None (axis=None) branches --- diff --git a/tests/test_parallel.py b/tests/test_parallel.py index 31312cf8..46a820c9 100644 --- a/tests/test_parallel.py +++ b/tests/test_parallel.py @@ -84,7 +84,6 @@ def test_ignore_one_output(self): assert np.array_equal(y_res, y_exp) assert np.array_equal(t_res, t_exp) - def test_parallel_multiple_outputs(self): # Force 2 workers to hit the parallel output path (line 104) func = parallelize(split_axis=(0, 0), concat_axis=(0, 0), parallel=2)( @@ -134,7 +133,9 @@ def test_different_ndims(self): def test_different_dtypes(self): with pytest.raises(ValueError, match="same dtype"): - concatenate([np.ones((3,), dtype=np.float32), np.ones((3,), dtype=np.float64)]) + concatenate( + [np.ones((3,), dtype=np.float32), np.ones((3,), dtype=np.float64)] + ) def test_different_shape_other_axis(self): with pytest.raises(ValueError, match="same shape"): @@ -161,6 +162,7 @@ def test_none_uses_config(self): def test_bool_true(self): import os + assert get_workers_count(True) == os.cpu_count() def test_bool_false(self): @@ -168,3 +170,7 @@ def test_bool_false(self): def test_int(self): assert get_workers_count(4) == 4 + + def test_invalid_type_raises(self): + with pytest.raises(TypeError, match="must be either None, bool or int"): + get_workers_count("invalid") diff --git a/tests/test_picking.py b/tests/test_picking.py index af9451d1..6f844d27 100644 --- a/tests/test_picking.py +++ b/tests/test_picking.py @@ -252,8 +252,16 @@ def test_scalar_coord_preserved(self): "station": "ABC", }, ) - start = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:03")] * 2 + [np.datetime64("NaT")] * 2 - end = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:07")] * 2 + [np.datetime64("NaT")] * 2 + start = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] * 2 + + [np.datetime64("NaT")] * 2 + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) result = tapered_selection(da, start, end, dim="time") assert result.coords["station"].values == np.array("ABC") @@ -279,8 +287,16 @@ def test_non_dim_coord_on_dim_axis_skipped(self): ), }, ) - start = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:03")] * 2 + [np.datetime64("NaT")] * 2 - end = [np.datetime64("NaT")] + [np.datetime64("2023-01-01T00:00:07")] * 2 + [np.datetime64("NaT")] * 2 + start = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] * 2 + + [np.datetime64("NaT")] * 2 + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) result = tapered_selection(da, start, end, dim="time") assert "quality" not in result.coords diff --git a/tests/test_signal.py b/tests/test_signal.py index bc8ab062..f8ea64a1 100644 --- a/tests/test_signal.py +++ b/tests/test_signal.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import scipy.signal as sp import xarray as xr @@ -308,3 +309,72 @@ def test_last_dimension_with_non_dimensional_coordinates(self): assert np.allclose(result["distance"].values, t) assert np.allclose(result["wavenumber"].values, np.sort(f)) assert "channel" not in result.coords # TODO: keep non-dimensional coordinates + + +class TestSignalMissingBranches: + def test_integrate_no_midpoints(self): + da = wavelet_wavefronts() + result = xs.integrate(da, midpoints=False) + assert result.shape == da.shape + + def test_differentiate_no_midpoints(self): + da = wavelet_wavefronts() + result = xs.differentiate(da, midpoints=False) + assert result.sizes["distance"] == da.sizes["distance"] - 1 + + def test_sliding_mean_removal_even_window(self): + # When wlen/d gives an even n, sliding_mean_removal increments n by 1. + da = wavelet_wavefronts() + d = xs.get_sampling_interval(da, "time") + # Make wlen exactly twice d so n=2 (even) → becomes 3 + result = xs.sliding_mean_removal(da, wlen=2 * d) + assert result.shape == da.shape + + def test_medfilt_invalid_dim(self): + da = wavelet_wavefronts() + with pytest.raises(ValueError, match="dims provided not in dataarray"): + xs.medfilt(da, {"nonexistent_dim": 3}) + + def test_stft_default_noverlap(self): + da = wavelet_wavefronts() + result = xs.stft(da, nperseg=16, dim={"time": "frequency"}) + assert "frequency" in result.dims + + def test_stft_invalid_scaling(self): + da = wavelet_wavefronts() + with pytest.raises(ValueError, match="Scaling must be"): + xs.stft(da, nperseg=16, scaling="invalid", dim={"time": "frequency"}) + + def test_stft_nperseg_one(self): + # nperseg=1, noverlap=0 triggers the stride_tricks bypass branch + da = wavelet_wavefronts() + # nfft=2 avoids single-element frequency axis (which would make tie_indices=[0,0]) + result = xs.stft(da, nperseg=1, noverlap=0, nfft=2, dim={"time": "frequency"}) + assert "frequency" in result.dims + + +class TestFftMissingBranches: + def test_fft_explicit_n(self): + import xdas.fft as xfft + + da = wavelet_wavefronts().isel(distance=0) + n = da.sizes["time"] // 2 + result = xfft.fft(da, n=n, dim={"time": "frequency"}) + assert result.sizes["frequency"] == n + + def test_rfft_explicit_n(self): + import xdas.fft as xfft + + da = wavelet_wavefronts().isel(distance=0) + n = da.sizes["time"] + result = xfft.rfft(da, n=n, dim={"time": "frequency"}) + assert "frequency" in result.dims + + def test_ifft_explicit_n(self): + import xdas.fft as xfft + + da = wavelet_wavefronts().isel(distance=0) + spectrum = xfft.fft(da, dim={"time": "frequency"}) + n = da.sizes["time"] + result = xfft.ifft(spectrum, n=n, dim={"frequency": "time"}) + assert result.sizes["time"] == n diff --git a/tests/test_trigger.py b/tests/test_trigger.py index de1a219c..086a8242 100644 --- a/tests/test_trigger.py +++ b/tests/test_trigger.py @@ -183,9 +183,8 @@ def test_trigger_1d(): def test_concat_non_interp_coord(): """_concat raises ValueError for non-interpolated coordinates.""" from xdas.coordinates.sampled import SampledCoordinate - coord1 = xd.Coordinate( - {"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim" - ) + + coord1 = xd.Coordinate({"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim") coord_bad = SampledCoordinate( {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0}, "dim" ) @@ -195,11 +194,7 @@ def test_concat_non_interp_coord(): def test_concat_different_dims(): """_concat raises ValueError when coords have different dims.""" - coord1 = xd.Coordinate( - {"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim1" - ) - coord2 = xd.Coordinate( - {"tie_indices": [0, 2], "tie_values": [40, 60]}, dim="dim2" - ) + coord1 = xd.Coordinate({"tie_indices": [0, 2], "tie_values": [10, 30]}, dim="dim1") + coord2 = xd.Coordinate({"tie_indices": [0, 2], "tie_values": [40, 60]}, dim="dim2") with pytest.raises(ValueError, match="same dimension"): _concat([coord1, coord2]) diff --git a/tests/test_virtual.py b/tests/test_virtual.py index 687e5f8a..00165758 100644 --- a/tests/test_virtual.py +++ b/tests/test_virtual.py @@ -9,9 +9,11 @@ Selectors, SingleSelector, SliceSelector, + VirtualArray, VirtualLayout, VirtualSource, VirtualStack, + _to_human, ) @@ -385,3 +387,158 @@ def test_get_indexer(self): assert arr[1:-1] == arr[sel[1:-1].get_indexer()] assert arr[1:-1][1:-1] == arr[sel[1:-1][1:-1].get_indexer()] assert arr[1:-1][::2] == arr[sel[1:-1][::2].get_indexer()] + + +class TestVirtualArrayAbstract: + def test_abstract_stubs(self): + va = VirtualArray() + va.__getitem__(0) + va.__array__() + _ = va.shape + _ = va.dtype + va.to_dataset(None, None) + assert isinstance(repr(va), str) + + +class TestVirtualStackExtra: + def _make_stack(self, tmp_path): + data = np.arange(20).reshape(10, 2) + sources = [] + for i, chunk in enumerate(np.split(data, 2, axis=0)): + with h5py.File(tmp_path / f"s{i}.h5", "w") as f: + f.create_dataset("d", data=chunk) + sources.append(VirtualSource(f["d"])) + return VirtualStack(sources, axis=0) + + def test_axis_property(self, tmp_path): + stack = self._make_stack(tmp_path) + assert isinstance(stack, VirtualStack) + assert stack.axis == 0 + + def test_getitem_non_stack_axis(self, tmp_path): + stack = self._make_stack(tmp_path) + result = stack[:, 0:1] + assert isinstance(result, VirtualStack) + + def test_check_dtype_mismatch(self, tmp_path): + path = tmp_path / "x.nc" + da_f32 = xd.DataArray(np.ones((10, 5), dtype=np.float32), dims=("a", "b")) + da_f64 = xd.DataArray(np.ones((10, 5), dtype=np.float64), dims=("a", "b")) + da_f32.to_netcdf(path) + with h5py.File(path, "r") as f: + src_f32 = VirtualSource(f["__values__"]) + da_f64.to_netcdf(path) + with h5py.File(path, "r") as f: + src_f64 = VirtualSource(f["__values__"]) + stack = VirtualStack([src_f32], axis=0) + with pytest.raises(ValueError, match="dtype"): + stack.append(src_f64) + + +class TestVirtualLayoutExtra: + def test_array_with_dtype(self, tmp_path): + da = wavelet_wavefronts() + da.to_netcdf(tmp_path / "c.nc") + da2 = xd.open(tmp_path / "c.nc") + layout = da2.data._to_layout() + result = np.asarray(layout, dtype=np.float32) + assert result.dtype == np.float32 + + def test_setitem_with_virtual_source(self, tmp_path): + da = wavelet_wavefronts() + da.to_netcdf(tmp_path / "d.nc") + with h5py.File(tmp_path / "d.nc", "r") as f: + src = VirtualSource(f["__values__"]) + layout = VirtualLayout(src.shape, src.dtype) + layout[...] = src + assert True + + def test_to_dataset_integer_dtype(self, tmp_path): + da = xd.DataArray(np.ones((5, 3), dtype=np.int16), dims=("a", "b")) + da.to_netcdf(tmp_path / "e.nc") + da2 = xd.open(tmp_path / "e.nc") + layout = da2.data._to_layout() + assert np.issubdtype(layout.dtype, np.integer) + with h5py.File(tmp_path / "out.h5", "w") as f: + layout.to_dataset(f, "test") + + def test_to_dataset_complex_dtype(self, tmp_path): + da = xd.DataArray(np.ones((5, 3), dtype=np.complex128), dims=("a", "b")) + da.to_netcdf(tmp_path / "f.nc") + da2 = xd.open(tmp_path / "f.nc") + layout = da2.data._to_layout() + with h5py.File(tmp_path / "out2.h5", "w") as f: + layout.to_dataset(f, "test") + + +class TestSliceSelectorStopNegative: + def test_get_indexer_stop_negative(self): + sel = SliceSelector(5) + sel._range = range(1, -2, -1) + idx = sel.get_indexer() + assert idx.stop is None + + +class TestToHuman: + def test_small(self): + assert _to_human(500) == "500.0B" + + def test_kb(self): + assert _to_human(2 * 1024) == "2.0KB" + + def test_mb(self): + assert _to_human(2 * 1024 * 1024) == "2.0MB" + + def test_gb(self): + assert _to_human(2 * 1024**3) == "2.0GB" + + def test_tb(self): + assert _to_human(2 * 1024**4) == "2.0TB" + + +class TestVirtualStackGetitemElse: + def test_getitem_axis_beyond_indexers(self, tmp_path): + # VirtualStack with axis=1; indexing with a single key (not covering axis) + # triggers the else branch at line 173: sources=[source[(0,)] for source ...] + data = np.arange(20).reshape(2, 10) + sources = [] + for i, chunk in enumerate(np.split(data, 2, axis=1)): + fname = tmp_path / f"t{i}.h5" + with h5py.File(fname, "w") as f: + f.create_dataset("d", data=chunk) + with h5py.File(fname, "r") as f: + sources.append(VirtualSource(f["d"])) + stack = VirtualStack(sources, axis=1) + result = stack[0] + assert isinstance(result, VirtualStack) + + +class TestVirtualLayoutSetitemNonSource: + def test_setitem_with_h5py_virtual_source(self, tmp_path): + # Passing an h5py.VirtualSource (not xdas.VirtualSource) exercises the False + # branch of `if isinstance(value, VirtualSource)` at line 354->356. + src_path = tmp_path / "src.h5" + data = np.ones((5, 3)) + with h5py.File(src_path, "w") as f: + f.create_dataset("d", data=data) + h5_src = h5py.VirtualSource(str(src_path), "d", shape=data.shape) + layout = VirtualLayout(data.shape, data.dtype) + layout[...] = h5_src + out_path = tmp_path / "out.h5" + with h5py.File(out_path, "w") as f: + layout.to_dataset(f, "result") + + +class TestVirtualLayoutToDatasetOtherDtype: + def test_fillvalue_none_for_bool_dtype(self, tmp_path): + # Boolean dtype triggers the else branch (fillvalue = None) in to_dataset. + src_path = tmp_path / "bool_src.h5" + data = np.ones((5, 3), dtype=bool) + with h5py.File(src_path, "w") as f: + f.create_dataset("d", data=data) + h5_src = h5py.VirtualSource(str(src_path), "d", shape=data.shape) + layout = VirtualLayout(data.shape, data.dtype) + layout[...] = h5_src + out_path = tmp_path / "bool_out.h5" + with h5py.File(out_path, "w") as f: + layout.to_dataset(f, "result") From 1202ec2b8af8f9cad92663f14c167ed3d4ecf22b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 13:18:10 +0200 Subject: [PATCH 13/21] Switch from isort + black to ruff. --- docs/contribute.md | 3 +-- pyproject.toml | 6 +++--- tests/io/test_utils.py | 1 - tests/test_atoms.py | 1 - tests/test_methods.py | 1 - xdas/coordinates/interp.py | 1 - xdas/core/dataarray.py | 6 ++++-- xdas/core/datacollection.py | 1 - xdas/io/febus.py | 1 - xdas/io/prodml.py | 1 - xdas/io/tdms.py | 6 +++--- xdas/io/utils.py | 3 +-- xdas/io/xdas.py | 2 +- xdas/processing/core.py | 3 +-- xdas/signal.py | 7 +++---- 15 files changed, 17 insertions(+), 26 deletions(-) diff --git a/docs/contribute.md b/docs/contribute.md index d1d8a21c..00a49021 100644 --- a/docs/contribute.md +++ b/docs/contribute.md @@ -128,8 +128,7 @@ quick way to add tests to your function. Run (in the root xdas folder or where you are working): ``` -black . -isort . +ruff check --fix && ruff format ``` ## How to add tests diff --git a/pyproject.toml b/pyproject.toml index bb5d3f56..3c8d2b5f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ dependencies = [ ] [project.optional-dependencies] -dev = ["black", "build", "isort", "twine"] +dev = ["build", "ruff", "twine"] docs = [ "ipykernel", "matplotlib", @@ -51,8 +51,8 @@ tests = [ "torch", ] -[tool.isort] -profile = "black" +[tool.ruff.lint] +extend-select = ["I"] [tool.pytest.ini_options] addopts = [ diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py index a8915c22..a5c6d53c 100644 --- a/tests/io/test_utils.py +++ b/tests/io/test_utils.py @@ -8,7 +8,6 @@ class TestCompression: - TEST_FILES = [ # ("ap_sensing_1.hdf5", "DAS"), # TODO: not working for some reason... ("opto_das_1.hdf5", "data"), diff --git a/tests/test_atoms.py b/tests/test_atoms.py index 1e3951e0..b0ceb48b 100644 --- a/tests/test_atoms.py +++ b/tests/test_atoms.py @@ -294,7 +294,6 @@ def test_compare_with_seisbench(self): class TestAtomCoreMissingBranches: def test_repr_with_nested_atoms(self): - from xdas.atoms.core import Atom, State a = [1, 1] b = [1, 1] diff --git a/tests/test_methods.py b/tests/test_methods.py index a568e822..0c44bfb5 100644 --- a/tests/test_methods.py +++ b/tests/test_methods.py @@ -4,7 +4,6 @@ import pytest import xdas as xd -from xdas.synthetics import wavelet_wavefronts @pytest.fixture diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 9a5f5969..385d5dcd 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -6,7 +6,6 @@ import re import numpy as np -import pandas as pd from xinterp import forward, inverse from .core import ( diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index b7979953..79f1e66d 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -132,8 +132,10 @@ def __array__(self, dtype=None): return self.data.__array__(dtype) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .routines import broadcast_coords # TODO: circular import - from .routines import broadcast_to + from .routines import ( + broadcast_coords, + broadcast_to, + ) # TODO: circular import if not method == "__call__": return NotImplemented diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 3b004b31..9ffd341b 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -3,7 +3,6 @@ nested tree structures for grouping multiple :class:`DataArray` objects. """ -import os from fnmatch import fnmatch from pathlib import Path diff --git a/xdas/io/febus.py b/xdas/io/febus.py index 0022d41e..57b25663 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -113,7 +113,6 @@ def open_dataarray(self, fname, overlaps=None, offset=None): dc = [] for t0, chunk in zip(times, chunks): - t0 = np.rint(1e6 * t0).astype("M8[us]").astype("M8[ns]") time = Coordinate[self.ctype["time"]].from_block(t0, nt, dt, dim="time") distance = Coordinate[self.ctype["distance"]].from_block( diff --git a/xdas/io/prodml.py b/xdas/io/prodml.py index 9e1bbe0d..145343d4 100644 --- a/xdas/io/prodml.py +++ b/xdas/io/prodml.py @@ -4,7 +4,6 @@ """ import h5py -import numpy as np import pandas as pd from ..coordinates.core import Coordinate diff --git a/xdas/io/tdms.py b/xdas/io/tdms.py index 58f57f3c..1819c3c8 100644 --- a/xdas/io/tdms.py +++ b/xdas/io/tdms.py @@ -67,9 +67,9 @@ def parse_time_stamp(fractions, seconds): @rtype : datetime.datetime """ if fractions is not None and seconds is not None and fractions + seconds > 0: - return datetime.timedelta( - 0, fractions * 2**-64 + seconds - ) + datetime.datetime(1904, 1, 1) + return datetime.timedelta(0, fractions * 2**-64 + seconds) + datetime.datetime( + 1904, 1, 1 + ) else: return None diff --git a/xdas/io/utils.py b/xdas/io/utils.py index 58af7c19..a24f3aa9 100644 --- a/xdas/io/utils.py +++ b/xdas/io/utils.py @@ -4,7 +4,7 @@ """ import h5py -import hdf5plugin +import hdf5plugin # noqa def compress(src_path: str, dst_path: str, dataset_location: str, encoding: dict): @@ -30,7 +30,6 @@ def compress(src_path: str, dst_path: str, dataset_location: str, encoding: dict encoding.pop("chunks") with h5py.File(src_path, "r") as src_file, h5py.File(dst_path, "w") as dst_file: - dataset_name = "/" + dataset_location.lstrip("/") def _copy(src_group, dst_group, current_path): diff --git a/xdas/io/xdas.py b/xdas/io/xdas.py index 89539924..be0b525e 100644 --- a/xdas/io/xdas.py +++ b/xdas/io/xdas.py @@ -8,7 +8,7 @@ import h5netcdf import h5py -import hdf5plugin +import hdf5plugin # noqa import xarray as xr from dask.array import Array as DaskArray diff --git a/xdas/processing/core.py b/xdas/processing/core.py index b5038541..e4b88f61 100644 --- a/xdas/processing/core.py +++ b/xdas/processing/core.py @@ -121,8 +121,7 @@ def __init__(self, da, chunks, max_buffers=1, max_workers=1): chunk_size = int(chunk_size) if chunk_dim not in da.dims: raise ValueError( - f"chunking dimension {chunk_dim} not " - f"found in `da` dimensions {da.dims}" + f"chunking dimension {chunk_dim} not found in `da` dimensions {da.dims}" ) if chunk_size > da.sizes[chunk_dim]: raise ValueError( diff --git a/xdas/signal.py b/xdas/signal.py index 95cbaaa7..7d1bbbae 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -3,7 +3,6 @@ tapering, detrending, and spectral helpers, all coordinate-aware and multi-threaded via :func:`~xdas.parallel.parallelize`. """ - import numpy as np import scipy.signal as sp @@ -11,7 +10,7 @@ from .coordinates.core import Coordinate, get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize -from .spectral import stft +from .spectral import stft # noqa @atomized @@ -923,8 +922,8 @@ def sliding_mean_removal( shape = tuple(-1 if a == axis else 1 for a in range(da.ndim)) win = np.reshape(win, shape) pad_width = tuple((n // 2, n // 2) if a == axis else (0, 0) for a in range(da.ndim)) - func = lambda x: x - sp.fftconvolve( - np.pad(x, pad_width, mode=pad_mode), win, mode="valid" + func = lambda x: ( + x - sp.fftconvolve(np.pad(x, pad_width, mode=pad_mode), win, mode="valid") ) across = int(axis == 0) func = parallelize(across, across, parallel)(func) From 580921bc5250c6fe981d25ee1a8f870ca5800364 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 19:17:26 +0200 Subject: [PATCH 14/21] FULL TEST COVERAGE !!! --- tests/coordinates/test_interp.py | 23 +++ tests/coordinates/test_sampled.py | 18 ++ tests/io/test_asn.py | 30 +++ tests/io/test_febus.py | 73 ++++++++ tests/io/test_generic.py | 15 ++ tests/io/test_miniseed.py | 57 ++++++ tests/io/test_prodml.py | 34 ++++ tests/io/test_utils.py | 17 ++ tests/io/test_xdas_io.py | 254 +++++++++++++++++++++++++ tests/test_core.py | 15 -- tests/test_processing.py | 149 ++++++++++++++- tests/test_routines.py | 295 +++++++++++++++++++++++++++++- xdas/coordinates/core.py | 4 +- xdas/coordinates/interp.py | 6 +- xdas/coordinates/sampled.py | 21 +-- xdas/core/dataarray.py | 7 +- xdas/core/datacollection.py | 2 +- xdas/core/numpy.py | 6 +- xdas/core/routines.py | 26 ++- xdas/io/__init__.py | 11 +- xdas/io/miniseed.py | 2 +- xdas/io/xdas.py | 23 +-- xdas/processing/core.py | 12 +- xdas/signal.py | 1 + 24 files changed, 1009 insertions(+), 92 deletions(-) create mode 100644 tests/io/test_febus.py create mode 100644 tests/io/test_prodml.py create mode 100644 tests/io/test_xdas_io.py diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index 5285855c..4390dbca 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -421,3 +421,26 @@ def test_decimate_no_collision(self): ) result = coord.decimate(3) assert np.all(np.diff(result.tie_indices) > 0) + + def test_get_split_indices_overlaps_tolerance_false(self): + # Build a coord with an overlap (tie_values go backwards between segments) + coord = InterpCoordinate( + { + "tie_indices": [0, 4, 5, 9], + "tie_values": [0.0, 4.0, 3.0, 7.0], # overlap at index 5 (value 3 < 4) + } + ) + result = coord.get_split_indices(kind="overlaps", tolerance=False) + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, [5], strict=True) + + def test_get_split_indices_overlaps_with_tolerance(self): + coord = InterpCoordinate( + { + "tie_indices": [0, 4, 5, 9], + "tie_values": [0.0, 4.0, 3.0, 7.0], + } + ) + result = coord.get_split_indices(kind="overlaps", tolerance=0.5) + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, [5], strict=True) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 17dbf58b..779b57f6 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -966,3 +966,21 @@ def test_get_indexer_bfill_in_bounds(self): coord = self.make_coord() assert coord.get_indexer(0.0, method="bfill") == 0 assert coord.get_indexer(0.5, method="bfill") == 1 + + def test_get_split_indices_overlaps_tolerance_false(self): + # Build a coord with an actual overlap (segment 2 starts before segment 1 ends) + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [5, 5], "sampling_interval": 1.0} + ) + result = coord.get_split_indices(kind="overlaps", tolerance=False) + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, [5], strict=True) + + def test_get_split_indices_overlaps_with_tolerance(self): + # Build a coord with an actual overlap (segment 2 starts before segment 1 ends) + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [5, 5], "sampling_interval": 1.0} + ) + result = coord.get_split_indices(kind="overlaps", tolerance=1.0) + assert isinstance(result, np.ndarray) + np.testing.assert_array_equal(result, [5], strict=True) diff --git a/tests/io/test_asn.py b/tests/io/test_asn.py index 8696f6bf..55a02709 100644 --- a/tests/io/test_asn.py +++ b/tests/io/test_asn.py @@ -4,6 +4,7 @@ import h5py import numpy as np +import pytest import zmq import xdas as xd @@ -37,6 +38,35 @@ def get_free_local_address(): ) +class TestASNEngineROIBounds: + def test_roi_start_beyond_sensor_distances(self): + from xdas.io.asn import ASNEngine + + engine = ASNEngine() + with pytest.raises(IndexError, match="ROI start lies beyond"): + engine._get_roi_bound_indices( + [0.0, 10.0, 20.0], n_start=5, n_end=3, dx=10.0 + ) + + def test_roi_end_before_sensor_distances(self): + from xdas.io.asn import ASNEngine + + engine = ASNEngine() + with pytest.raises(IndexError, match="ROI end lies before"): + engine._get_roi_bound_indices( + [10.0, 20.0, 30.0], n_start=1, n_end=0, dx=10.0 + ) + + +class TestASNEnginePublisher: + def test_write_method(self): + from xdas.io.asn import ZMQPublisher as ASNZMQPublisher + + address = get_free_local_address() + pub = ASNZMQPublisher(address) + pub.write(da_float32) + + class TestASNEngine: def test_read_handles_exclusive_roi_end(self, tmp_path): path = tmp_path / "sample_asn.hdf5" diff --git a/tests/io/test_febus.py b/tests/io/test_febus.py new file mode 100644 index 00000000..5defdd01 --- /dev/null +++ b/tests/io/test_febus.py @@ -0,0 +1,73 @@ +"""Tests for the Febus HDF5 engine.""" + +import h5py +import numpy as np +import pytest + +import xdas as xd +from xdas.io.febus import FebusEngine + + +def make_febus_file(path, use_freqres=False): + """Create a minimal valid febus HDF5 file. + + Spacing layout (from febus engine): delta = (Spacing[1]/1000, Spacing[0]) + so Spacing[0] = distance step (m), Spacing[1] = time scale (dt*1000). + """ + nchunks, nt, nx = 3, 12, 5 + dx_m = 5.0 + dt_s = 0.001 # 1 ms per sample → Spacing[1] = 1.0 + block_rate_hz = 100.0 # 100 Hz → 10ms per block → nt_real=10, noverlap=2 + data = np.zeros((nchunks, nt, nx), dtype=np.float32) + # chunk starts spaced by 1 real block = 10ms = 0.01s + times = np.arange(nchunks, dtype=np.float64) * 0.01 + + with h5py.File(path, "w") as f: + device = f.create_group("DeviceName") + source = device.create_group("Source1") + source.create_dataset("time", data=times) + zone = source.create_group("Zone1") + if use_freqres: + zone.attrs["FreqRes"] = np.array([block_rate_hz]) + else: + zone.attrs["BlockRate"] = np.array([block_rate_hz]) + # Spacing[0]=dx_m, Spacing[1]=dt_s*1000 → delta=(dt_s, dx_m) + zone.attrs["Spacing"] = np.array([dx_m, dt_s * 1000.0]) + zone.attrs["Extent"] = np.array([0.0, (nx - 1) * dx_m]) + zone.attrs["Origin"] = np.array([0.0, 0.0]) + zone.create_dataset("StrainRate", data=data) + + +class TestFebusEngine: + def test_open_with_freqres_attr(self, tmp_path): + path = tmp_path / "febus_freqres.h5" + make_febus_file(path, use_freqres=True) + da = xd.open(str(path), engine="febus", overlaps=(1, 1), offset=0) + assert isinstance(da, xd.DataArray) + + def test_invalid_overlaps_raises(self, tmp_path): + path = tmp_path / "febus.h5" + make_febus_file(path) + with pytest.raises(ValueError, match="overlaps must be"): + FebusEngine().open_dataarray(str(path), overlaps="bad") + + def test_invalid_offset_raises(self, tmp_path): + path = tmp_path / "febus.h5" + make_febus_file(path) + with pytest.raises(ValueError, match="offset must be an integer"): + FebusEngine().open_dataarray(str(path), overlaps=(1, 1), offset="bad") + + def test_missing_block_rate_raises(self, tmp_path): + path = tmp_path / "febus_no_blockrate.h5" + nchunks, nt, nx = 2, 10, 5 + with h5py.File(path, "w") as f: + device = f.create_group("Dev") + source = device.create_group("Source1") + source.create_dataset("time", data=np.zeros(nchunks)) + zone = source.create_group("Zone1") + zone.attrs["Spacing"] = np.array([1.0, 5.0]) + zone.attrs["Extent"] = np.array([0.0, 20.0]) + zone.attrs["Origin"] = np.array([0.0, 0.0]) + zone.create_dataset("Data", data=np.zeros((nchunks, nt, nx))) + with pytest.raises(KeyError, match="Could not find the block size"): + FebusEngine().open_dataarray(str(path), overlaps=(0, 0), offset=0) diff --git a/tests/io/test_generic.py b/tests/io/test_generic.py index 8cb86cea..0861e3d7 100644 --- a/tests/io/test_generic.py +++ b/tests/io/test_generic.py @@ -31,6 +31,21 @@ def test_auto_engine_fail_message_includes_ctype(self, tmp_path): with pytest.raises(ValueError, match="ctype"): AutoEngine(ctype="dense").open_dataarray(str(fake)) + def test_auto_engine_fail_message_includes_vtype(self, tmp_path): + fake = tmp_path / "fake.h5" + fake.write_bytes(b"not a valid hdf5 file") + with pytest.raises(ValueError, match="vtype"): + AutoEngine(vtype="hdf5").open_dataarray(str(fake)) + + def test_dict_ctype_with_none_value(self): + engine = Engine["asn"](ctype={"time": None, "distance": "interpolated"}) + assert engine.ctype["time"] == "interpolated" + assert engine.ctype["distance"] == "interpolated" + + def test_invalid_ctype_type_raises(self): + with pytest.raises(ValueError, match="ctype must be"): + Engine["asn"](ctype=42) + class TestGenericIO: TEST_FILES = { diff --git a/tests/io/test_miniseed.py b/tests/io/test_miniseed.py index c6e67e92..335975bc 100644 --- a/tests/io/test_miniseed.py +++ b/tests/io/test_miniseed.py @@ -1,7 +1,9 @@ import numpy as np import obspy +import pytest import xdas as xd +from xdas.io.miniseed import MiniSEEDEngine, get_band_code, to_stream def make_network(dirpath, gap=False, samples=100): @@ -156,3 +158,58 @@ def test_miniseed(tmp_path): assert da.coords["network"].values == "DX" assert da.coords["location"].values == "00" assert da.coords["channel"].values.tolist() == ["HHZ", "HHN", "HHE"] + + # trigger read_data by loading values (synchronized case) + sync_paths = sorted(tmp_path.glob("*00.mseed")) + da_sync = xd.open(sync_paths[0], engine="miniseed") + values = da_sync.values + assert values.shape == (3, 100) + + # trigger read_data synchronized with ignore_last_sample + da_sync_trimmed = xd.open(sync_paths[0], engine="miniseed", ignore_last_sample=True) + values_trimmed = da_sync_trimmed.values + assert values_trimmed.shape == (3, 99) + + # trigger read_data for unsynchronized (gapped) case + gapped_paths = sorted(tmp_path.glob("*gap.mseed")) + da_gap = xd.open(gapped_paths[0], engine="miniseed") + values_gap = da_gap.values + assert values_gap.shape == (3, 90) + + # trigger read_data unsynchronized with ignore_last_sample + da_gap_trimmed = xd.open( + gapped_paths[0], engine="miniseed", ignore_last_sample=True + ) + values_gap_trimmed = da_gap_trimmed.values + assert values_gap_trimmed.shape == (3, 89) + + +def test_miniseed_helpers(tmp_path): + # get_band_code with out-of-range sampling rate + assert get_band_code(0.0) == "X" + assert get_band_code(6000.0) == "X" + + # to_stream raises on non-2D data + da_3d = xd.DataArray(np.zeros((2, 3, 4)), dims=("a", "b", "c")) + with pytest.raises(ValueError, match="2D"): + to_stream(da_3d) + + +def test_miniseed_unsynchronized_traces(tmp_path): + path = tmp_path / "unsync.mseed" + st = obspy.Stream() + st.append( + obspy.Trace( + data=np.zeros(100, dtype=np.float32), + header={"station": "AA", "channel": "HHZ", "delta": 0.01}, + ) + ) + st.append( + obspy.Trace( + data=np.zeros(100, dtype=np.float32), + header={"station": "BB", "channel": "HHZ", "delta": 0.005}, + ) + ) + st.write(str(path), format="MSEED") + with pytest.raises(ValueError, match="synchronized"): + MiniSEEDEngine().read_header(str(path), False, "interpolated") diff --git a/tests/io/test_prodml.py b/tests/io/test_prodml.py new file mode 100644 index 00000000..a92eb03e --- /dev/null +++ b/tests/io/test_prodml.py @@ -0,0 +1,34 @@ +import h5py +import numpy as np + +import xdas as xd + + +def make_prodml_file(path, swapped=False): + nt, nd = 10, 5 + dx = 2.0 + data = np.zeros((nd, nt) if swapped else (nt, nd), dtype=np.float32) + with h5py.File(path, "w") as f: + acq = f.create_group("Acquisition") + acq.attrs["SpatialSamplingInterval"] = dx + acq.attrs["StartLocusIndex"] = 0 + raw = acq.create_group("Raw[0]") + ds = raw.create_dataset("RawData", data=data) + ds.attrs["PartStartTime"] = np.bytes_(b"2020-01-01T00:00:00.000+00:00") + ds.attrs["PartEndTime"] = np.bytes_(b"2020-01-01T00:00:00.900+00:00") + + +class TestProdMLEngine: + def test_open_swapped_dims(self, tmp_path): + path = tmp_path / "prodml_swapped.h5" + make_prodml_file(path, swapped=True) + da = xd.open(str(path), engine="prodml", swapped_dims=True) + assert isinstance(da, xd.DataArray) + assert da.dims == ("distance", "time") + + def test_open_normal(self, tmp_path): + path = tmp_path / "prodml.h5" + make_prodml_file(path) + da = xd.open(str(path), engine="prodml") + assert isinstance(da, xd.DataArray) + assert da.dims == ("time", "distance") diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py index a5c6d53c..45bc931b 100644 --- a/tests/io/test_utils.py +++ b/tests/io/test_utils.py @@ -7,6 +7,23 @@ from xdas.io.utils import compress +class TestCompressionUnit: + def test_compress_with_chunks_false(self, tmp_path): + src = tmp_path / "src.h5" + dst = tmp_path / "dst.h5" + data = np.arange(12).reshape(3, 4) + with h5py.File(src, "w") as f: + f.create_dataset("ds", data=data) + compress( + src_path=str(src), + dst_path=str(dst), + dataset_location="/ds", + encoding={"compression": hdf5plugin.Bitshuffle(), "chunks": False}, + ) + with h5py.File(dst, "r") as f: + np.testing.assert_array_equal(f["ds"][()], data) + + class TestCompression: TEST_FILES = [ # ("ap_sensing_1.hdf5", "DAS"), # TODO: not working for some reason... diff --git a/tests/io/test_xdas_io.py b/tests/io/test_xdas_io.py new file mode 100644 index 00000000..d2e14259 --- /dev/null +++ b/tests/io/test_xdas_io.py @@ -0,0 +1,254 @@ +"""Tests for xdas/io/xdas.py covering Engine delegates and edge cases.""" + +import os + +import h5netcdf +import numpy as np +import pytest + +import xdas as xd +from xdas.core.datacollection import DataMapping +from xdas.io.core import Engine +from xdas.io.xdas import ( + open_dataarray, + open_datacollection, + open_datasequence, + save_dataarray, + save_datacollection, + save_datamapping, + save_datasequence, +) + + +def make_da(): + return xd.DataArray( + np.zeros((10, 5), dtype=np.float32), + { + "time": { + "tie_indices": [0, 9], + "tie_values": [ + np.datetime64("2020-01-01T00:00:00.000000000"), + np.datetime64("2020-01-01T00:00:09.000000000"), + ], + }, + "distance": {"tie_indices": [0, 4], "tie_values": [0.0, 40.0]}, + }, + ) + + +class TestXdasEngineDelegates: + def test_save_and_open_dataarray_with_str(self, tmp_path): + da = make_da() + path = str(tmp_path / "da.nc") + engine = Engine["xdas"]() + engine.save_dataarray(da, path) + result = engine.open_dataarray(path) + assert result.equals(da) + + def test_save_and_open_datasequence_with_str(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = str(tmp_path / "dc.nc") + engine = Engine["xdas"]() + engine.save_datacollection(dc, path) + result = engine.open_datacollection(path) + assert result.equals(dc) + + def test_save_and_open_datamapping_with_str(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"A": da, "B": da}) + engine = Engine["xdas"]() + path = str(tmp_path / "dc.nc") + engine.save_datacollection(dc, path) + result = engine.open_datacollection(path) + assert result.equals(dc) + + def test_save_and_open_dataarray_with_path(self, tmp_path): + da = make_da() + path = tmp_path / "da.nc" + engine = Engine["xdas"]() + engine.save_dataarray(da, path) + result = engine.open_dataarray(path) + assert result.equals(da) + + def test_save_and_open_datasequence_with_path(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = tmp_path / "dc.nc" + engine = Engine["xdas"]() + engine.save_datacollection(dc, path) + result = engine.open_datacollection(path) + assert result.equals(dc) + + def test_save_and_open_datamapping_with_path(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"A": da, "B": da}) + engine = Engine["xdas"]() + path = tmp_path / "dc.nc" + engine.save_datacollection(dc, path) + result = engine.open_datacollection(path) + assert result.equals(dc) + + +class TestSaveDataarrayEdgeCases: + def test_encoding_with_virtual_raises(self, tmp_path): + da = make_da() + path = str(tmp_path / "da.nc") + # First save as virtual, then try to re-save with encoding + da.to_netcdf(path) + da_virtual = xd.open_dataarray(path) + path2 = tmp_path / "test2.nc" + with pytest.raises(ValueError, match="encoding"): + save_dataarray(da_virtual, path2, virtual=True, encoding={"chunks": (2, 2)}) + + def test_virtual_true_with_plain_array_raises(self, tmp_path): + da = make_da() + path = str(tmp_path / "da.nc") + with pytest.raises(ValueError, match="virtual array"): + save_dataarray(da, path, virtual=True) + + def test_create_dirs_with_no_dirname(self, tmp_path): + da = make_da() + orig = os.getcwd() + os.chdir(tmp_path) + try: + save_dataarray(da, "bare_file.nc", create_dirs=True) + assert (tmp_path / "bare_file.nc").exists() + finally: + os.chdir(orig) + + +class TestOpenDatacollection: + def test_integer_keys_become_sequence(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = tmp_path / "seq.nc" + dc.to_netcdf(path) + result = open_datacollection(path) + assert result.equals(dc) + + def test_non_integer_keys_stay_mapping(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"a": da, "b": da}) + path = tmp_path / "map.nc" + dc.to_netcdf(path) + result = open_datacollection(path) + assert result.equals(dc) + + +class TestSaveDatacollection: + def test_save_sequence(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = str(tmp_path / "seq.nc") + save_datacollection(dc, path) + result = xd.DataCollection.from_netcdf(path) + assert result.equals(dc) + + def test_save_mapping(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"a": da, "b": da}) + path = str(tmp_path / "map.nc") + save_datacollection(dc, path) + result = xd.DataCollection.from_netcdf(path) + assert result.equals(dc) + + def test_invalid_type_raises(self, tmp_path): + path = str(tmp_path / "bad.nc") + with pytest.raises(ValueError, match="DataCollection"): + save_datacollection("not_a_collection", path) + + +class TestSaveDatamappingOverwrite: + def test_overwrite_existing_file(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"x": da}) + path = str(tmp_path / "overwrite.nc") + dc.to_netcdf(path) + assert os.path.exists(path) + dc.to_netcdf(path) # overwrite with mode="w" + result = xd.DataCollection.from_netcdf(path) + assert result.equals(dc) + + +class TestSaveDatamappingCreateDirs: + def test_create_dirs_no_dirname(self, tmp_path): + da = make_da() + orig = os.getcwd() + os.chdir(tmp_path) + try: + dm = DataMapping({"x": da}) + save_datamapping(dm, "bare_dc.nc", create_dirs=True) + assert (tmp_path / "bare_dc.nc").exists() + finally: + os.chdir(orig) + + +class TestOpenSaveDatasequence: + def test_open_datasequence(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = str(tmp_path / "seq.nc") + dc.to_netcdf(path) + result = open_datasequence(path) + assert result.equals(dc) + + def test_save_datasequence(self, tmp_path): + da = make_da() + ds = xd.DataSequence([da, da]) + path = str(tmp_path / "seq.nc") + save_datasequence(ds, path) + result = xd.DataCollection.from_netcdf(path) + assert result.equals(ds) + + +class TestOpenDataarrayEdgeCases: + def test_multiple_coordinate_vars_raises(self, tmp_path): + path = tmp_path / "multi.nc" + with h5netcdf.File(str(path), "w") as f: + f.attrs["Conventions"] = "CF-1.9" + f.dimensions["time"] = 5 + f.dimensions["distance"] = 3 + v1 = f.create_variable("var1", ("time", "distance"), float) + v1.attrs["coordinate_interpolation"] = "something" + v2 = f.create_variable("var2", ("time", "distance"), float) + v2.attrs["coordinate_interpolation"] = "something" + with pytest.raises(ValueError, match="several possible"): + open_dataarray(str(path)) + + def test_path_object_accepted(self, tmp_path): + da = make_da() + path = tmp_path / "da.nc" + da.to_netcdf(str(path)) + result = open_dataarray(path) # pass Path, not str + assert result.equals(da) + + +class TestOpenDatacollectionPathObject: + def test_path_object_accepted(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = tmp_path / "dc.nc" + dc.to_netcdf(str(path)) + result = open_datacollection(path) # pass Path, not str + assert result.equals(dc) + + +class TestSaveDatacollectionPathObject: + def test_path_object_accepted(self, tmp_path): + da = make_da() + dc = xd.DataCollection([da, da]) + path = tmp_path / "dc.nc" + save_datacollection(dc, path) # pass Path, not str + result = xd.DataCollection.from_netcdf(str(path)) + assert result.equals(dc) + + +class TestOpenDatacollectionNonSequentialKeys: + def test_non_sequential_integers_stay_mapping(self, tmp_path): + da = make_da() + dc = xd.DataCollection({"2": da, "5": da}) + path = str(tmp_path / "nonseq.nc") + dc.to_netcdf(path) + result = open_datacollection(path) + assert result.equals(dc) diff --git a/tests/test_core.py b/tests/test_core.py index 3018be0f..398f288e 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -26,21 +26,6 @@ def generate(self, datetime): }, ) - def test_open_mfdatacollection(self): ... # TODO - - def test_open_mfdatatree(self, tmp_path): - keys = ["LOC01", "LOC02"] - dirnames = [tmp_path / key for key in keys] - for dirname in dirnames: - dirname.mkdir() - for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): - da.to_netcdf(dirname / f"{idx:03d}.nc") - da = wavelet_wavefronts() - dc = xd.open_mfdatatree(tmp_path / "{node}" / "00[acquisition].nc") - assert list(dc.keys()) == keys - for key in keys: - assert dc[key][0].load().equals(da) - def test_open_mfdataarray(self, tmp_path): wavelet_wavefronts().to_netcdf(tmp_path / "sample.nc") for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): diff --git a/tests/test_processing.py b/tests/test_processing.py index 8e5ff17a..4002ff8e 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -1,3 +1,4 @@ +import os import threading import time from pathlib import Path @@ -200,8 +201,14 @@ def test_with_existing_file(self, tmp_path): def test_missing_directory(self, tmp_path): with pytest.raises(OSError): xp.DataFrameWriter(tmp_path / "not_a_directory" / "output.csv") - dirpath = tmp_path / "some_directory" / "output.csv" - xp.DataFrameWriter(dirpath, create_dirs=True) + xp.DataFrameWriter(tmp_path / "some_directory" / "output.csv", create_dirs=True) + orig = os.getcwd() + try: + os.chdir(tmp_path) + dw = xp.DataFrameWriter("output.csv", create_dirs=True) + assert dw.path == "output.csv" + finally: + os.chdir(orig) def test_passing_wrong_input(self, tmp_path): dw = xp.DataFrameWriter(tmp_path / "output.csv") @@ -413,3 +420,141 @@ def test_flat(self, tmp_path): assert path.exists() st = obspy.read(path) assert len(st) == 10 + + +class TestProcessNoNbytes: + def test_loader_without_nbytes(self, tmp_path): + da = xd.DataArray(np.random.rand(100, 10), dims=("time", "distance")) + chunks = xd.split(da, 10, dim="time") + + class SimpleLoader: + chunk_dim = "time" + + def __iter__(self): + return iter(chunks) + + data_writer = xp.DataArrayWriter(tmp_path) + + def atom(x, **kw): + return x + + result = xp.process(atom, SimpleLoader(), data_writer) + assert result.equals(da) + + +class TestDataArrayLoaderMaxBuffers: + def test_max_buffers_exceeds_chunks(self): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + dl = xp.DataArrayLoader(da, {"time": 5}, max_buffers=10) + chunks = list(dl) + assert len(chunks) == 2 + result = xd.concat(chunks) + assert result.equals(da) + + +class TestDataFrameWriterAliases: + def test_write_alias(self, tmp_path): + dw = xp.DataFrameWriter(tmp_path / "output.csv") + df = pd.DataFrame({"A": [1, 2, 3]}) + dw.write(df) # use write() alias + result = dw.result() + assert result.equals(df) + + def test_create_dirs_no_dirname(self, tmp_path): + path = tmp_path / "bare.csv" + dw = xp.DataFrameWriter(path, create_dirs=True) + assert dw.path == str(path) + + +class TestStreamWriterEdgeCases: + def test_flat_missing_directory_raises(self, tmp_path): + with pytest.raises(OSError): + xp.StreamWriter( + tmp_path / "nonexistent_dir" / "out.mseed", "M", output_format="flat" + ) + + def test_invalid_output_format_raises(self, tmp_path): + with pytest.raises(ValueError, match="output_format"): + xp.StreamWriter(tmp_path, "M", output_format="invalid") + + def test_submit_wrong_type_raises(self, tmp_path): + sw = xp.StreamWriter(tmp_path, "M") + with pytest.raises(TypeError): + sw.submit("not_a_stream") + + +class TestZMQPublisherAliases: + def test_write_alias(self): + address = f"tcp://localhost:{xd.io.get_free_port()}" + publisher = xp.ZMQPublisher(address) + da = xd.synthetics.dummy() + publisher.write(da) # use write() alias + + def test_result_returns_none(self): + address = f"tcp://localhost:{xd.io.get_free_port()}" + publisher = xp.ZMQPublisher(address) + assert publisher.result() is None + + +class TestHandlerDirect: + def test_on_closed(self, tmp_path): + from queue import Queue + + from xdas.processing.core import Handler + + da = xd.DataArray( + np.zeros((10, 5), dtype=np.float32), + { + "time": { + "tie_indices": [0, 9], + "tie_values": [ + np.datetime64("2020-01-01T00:00:00.000000000"), + np.datetime64("2020-01-01T00:00:09.000000000"), + ], + }, + "distance": {"tie_indices": [0, 4], "tie_values": [0.0, 40.0]}, + }, + ) + path = str(tmp_path / "test.nc") + da.to_netcdf(path) + + queue = Queue() + handler = Handler(queue, "xdas") + + class MockEvent: + src_path = path + + handler.on_closed(MockEvent()) + result = queue.get() + assert result.equals(da) + + +class TestRealTimeLoader: + def test_iter_and_next(self, tmp_path): + from xdas.processing.core import RealTimeLoader + + loader = RealTimeLoader(str(tmp_path), engine="xdas") + assert iter(loader) is loader + + # put a DataArray directly into the queue + da = xd.DataArray( + np.zeros((5, 3), dtype=np.float32), + { + "time": { + "tie_indices": [0, 4], + "tie_values": [ + np.datetime64("2020-01-01T00:00:00.000000000"), + np.datetime64("2020-01-01T00:00:04.000000000"), + ], + }, + "distance": {"tie_indices": [0, 2], "tie_values": [0.0, 20.0]}, + }, + ) + loader.queue.put(da) + result = next(loader) + assert result.equals(da) + + # put None to trigger StopIteration + loader.queue.put(None) + with pytest.raises(StopIteration): + next(loader) diff --git a/tests/test_routines.py b/tests/test_routines.py index 0bca09c7..38190193 100644 --- a/tests/test_routines.py +++ b/tests/test_routines.py @@ -213,8 +213,41 @@ def test_warn_on_corrupted_files(self, tmp_path): assert result.equals(expected) with (tmp_path / "corrupted.nc").open("wb") as f: f.write(b"corrupted") + + # single worker with pytest.warns(RuntimeWarning): - result = xd.open_mfdataarray(tmp_path / "*.nc") + result = xd.open_mfdataarray(tmp_path / "*.nc", parallel=False) + assert result.equals(expected) + + # multiple workers + with pytest.warns(RuntimeWarning): + result = xd.open_mfdataarray(tmp_path / "*.nc", parallel=2) + assert result.equals(expected) + + def test_verbose_single_worker(self, tmp_path): + expected = xd.DataArray( + np.random.rand(10, 5), + coords={ + "time": np.arange(10), + "space": np.arange(5), + }, # TODO: should work without coords + ) + for index, chunk in enumerate(xd.split(expected, 3, "time"), start=1): + chunk.to_netcdf(tmp_path / f"chunk_{index}.nc") + result = xd.open_mfdataarray(tmp_path / "*.nc", verbose=True, parallel=1) + assert result.equals(expected) + + def test_verbose_multiple_workers(self, tmp_path): + expected = xd.DataArray( + np.random.rand(10, 5), + coords={ + "time": np.arange(10), + "space": np.arange(5), + }, # TODO: should work without coords + ) + for index, chunk in enumerate(xd.split(expected, 3, "time"), start=1): + chunk.to_netcdf(tmp_path / f"chunk_{index}.nc") + result = xd.open_mfdataarray(tmp_path / "*.nc", verbose=True, parallel=2) assert result.equals(expected) @@ -473,3 +506,263 @@ def test_raise_tolerance_not_used(self): xd.split(da, 3, tolerance=1) with pytest.raises(ValueError): xd.split(da, [10], tolerance=1) + + +class TestOpenEdgeCases: + def test_invalid_paths_type_raises(self): + with pytest.raises(Exception, match="paths"): + xd.open(123) + + def test_callable_engine(self, tmp_path): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + path = str(tmp_path / "test.nc") + da.to_netcdf(path) + + def my_engine(fname, **kwargs): + return xd.open_dataarray(fname) + + result = xd.open_dataarray(path, engine=my_engine) + assert result.equals(da) + + def test_invalid_engine_type_raises(self, tmp_path): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + path = str(tmp_path / "test.nc") + da.to_netcdf(path) + with pytest.raises(ValueError, match="engine"): + xd.open_dataarray(path, engine=42) + + +class TestOpenMFDatacollectionEdgeCases: + def test_nonexistent_path_in_list_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + xd.open_mfdatacollection([str(tmp_path / "nonexistent.nc")]) + + def test_empty_glob_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + xd.open_mfdatacollection(str(tmp_path / "*.nc")) + + def test_verbose_single_worker(self, tmp_path): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + dc = xd.DataCollection([da, da]) + path1 = str(tmp_path / "dc1.nc") + path2 = str(tmp_path / "dc2.nc") + dc.to_netcdf(path1) + dc.to_netcdf(path2) + result = xd.open_mfdatacollection( + str(tmp_path / "dc*.nc"), verbose=True, parallel=1 + ) + assert isinstance(result, xd.DataCollection) + + def test_verbose_multiple_worker(self, tmp_path): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + dc = xd.DataCollection([da, da]) + path1 = str(tmp_path / "dc1.nc") + path2 = str(tmp_path / "dc2.nc") + dc.to_netcdf(path1) + dc.to_netcdf(path2) + result = xd.open_mfdatacollection( + str(tmp_path / "dc*.nc"), verbose=True, parallel=2 + ) + assert isinstance(result, xd.DataCollection) + + def test_invalid_path(self): + with pytest.raises(ValueError, match="`paths` must be"): + xd.open_mfdatacollection(42) + + +class TestOpenMFDataArrayEdgeCases: + def test_invalid_paths_type_raises(self): + with pytest.raises(ValueError, match="paths"): + xd.open_mfdataarray(123) + + def test_parallel_path(self, tmp_path): + expected = xd.DataArray( + np.random.rand(10, 5), + coords={"time": np.arange(10), "space": np.arange(5)}, + ) + for i, chunk in enumerate(xd.split(expected, 3, "time"), 1): + chunk.to_netcdf(tmp_path / f"chunk_{i}.nc") + result = xd.open_mfdataarray(tmp_path / "*.nc", parallel=2) + assert result.equals(expected) + + def test_no_files_no_failures_raises(self, tmp_path): + with pytest.raises(FileNotFoundError): + xd.open_mfdataarray(str(tmp_path / "*.nc")) + + +class TestOpenMFDatacollectionParallel: + def test_parallel_path(self, tmp_path): + da = xd.DataArray(np.random.rand(10, 5), dims=("time", "distance")) + dc = xd.DataCollection([da, da]) + path1 = str(tmp_path / "dc1.nc") + path2 = str(tmp_path / "dc2.nc") + dc.to_netcdf(path1) + dc.to_netcdf(path2) + result = xd.open_mfdatacollection(str(tmp_path / "dc*.nc"), parallel=2) + assert isinstance(result, xd.DataCollection) + + +class TestOpenMFDataTree: + def test_one_level_depth(self, tmp_path): + keys = ["LOC01", "LOC02"] + dirnames = [tmp_path / key for key in keys] + for dirname in dirnames: + dirname.mkdir() + for idx, da in enumerate( + xd.synthetics.wavelet_wavefronts(nchunk=3), start=1 + ): + da.to_netcdf(dirname / f"{idx:03d}.nc") + da = xd.synthetics.wavelet_wavefronts() + dc = xd.open_mfdatatree(tmp_path / "{node}" / "00[acquisition].nc") + assert list(dc.keys()) == keys + for key in keys: + assert dc[key][0].load().equals(da) + + def test_two_level_depth(self, tmp_path): + dc = xd.DataCollection( + { + "NET01": { + "STA01": xd.synthetics.wavelet_wavefronts(nchunk=1), + }, + "NET02": { + "STA02": xd.synthetics.wavelet_wavefronts(nchunk=2), + "STA03": xd.synthetics.wavelet_wavefronts(nchunk=3), + }, + } + ) + dc.name = "network" + for network in dc: + dc[network].name = "station" + for station in dc[network]: + for idx, da in enumerate(dc[network][station]): + path = tmp_path / network / station / f"{idx:03d}.nc" + da.to_netcdf(path, create_dirs=True) + dc[network][station] = xd.combine_by_coords(dc[network][station]) + dc[network][station].name = "acquisition" + result = xd.open_mfdatatree( + tmp_path / "{network}" / "{station}" / "00[acquisition].nc" + ) + assert result.equals(dc) + + +class TestAsdataarray: + def test_invalid_type_raises(self): + with pytest.raises(ValueError, match="Cannot convert"): + xd.asdataarray("not_an_array") + + def test_already_dataarray(self): + da = xd.DataArray([1, 2, 3], dims="x") + result = xd.asdataarray(da) + assert result.equals(da) + + +class TestCombineByCoordsDimLast: + def test_dim_last(self): + da1 = xd.DataArray( + np.random.rand(5, 3), + coords={"time": np.arange(5), "space": np.arange(3)}, + ) + da2 = xd.DataArray( + np.random.rand(5, 3), + coords={"time": np.arange(5), "space": np.arange(3, 6)}, + ) + result = xd.combine_by_coords([da1, da2], dim="last", squeeze=True) + assert isinstance(result, xd.DataArray) + + +class TestConcatCoordsEdgeCases: + def test_tolerance_with_dense_coord_raises(self): + da1 = xd.DataArray( + np.random.rand(5), {"x": np.array([0.0, 1.0, 2.0, 3.0, 4.0])} + ) + da2 = xd.DataArray( + np.random.rand(5), {"x": np.array([5.0, 6.0, 7.0, 8.0, 9.0])} + ) + from xdas.core.routines import concat_coords + + with pytest.raises(TypeError, match="tolerance"): + concat_coords([da1["x"], da2["x"]], tolerance=1.0) + + +class TestSplitEdgeCases: + def test_n_zero_raises(self): + da = xd.DataArray(np.random.rand(10), dims=("time",)) + with pytest.raises(ValueError, match="`n` must be larger than 0"): + xd.split(da, 0) + + def test_n_too_large_raises(self): + da = xd.DataArray(np.random.rand(10), dims=("time",)) + with pytest.raises(ValueError, match="`n` must be smaller"): + xd.split(da, 10) + + +class TestBroadcastCoordsScalar: + def test_scalar_coord_skipped(self): + da1 = xd.DataArray( + np.random.rand(5, 3), + {"time": np.arange(5), "space": np.arange(3), "network": "NET"}, + ) + da2 = xd.DataArray( + np.random.rand(5, 3), + {"time": np.arange(5), "space": np.arange(3)}, + ) + result = xd.broadcast_coords(da1, da2) + assert "network" not in result + + +class TestPlotAvailability: + def test_dataarray_plot(self): + da = xd.DataArray( + np.random.rand(100), + { + "time": { + "tie_indices": [0, 99], + "tie_values": [ + np.datetime64("2020-01-01"), + np.datetime64("2020-01-01T00:00:09.900"), + ], + } + }, + ) + fig = xd.plot_availability(da) + assert fig is not None + + def test_datassequence_plot(self): + da = xd.DataArray( + np.random.rand(100), + { + "time": { + "tie_indices": [0, 99], + "tie_values": [ + np.datetime64("2020-01-01"), + np.datetime64("2020-01-01T00:00:09.900"), + ], + } + }, + ) + dc = xd.DataCollection([da, da]) + fig = xd.plot_availability(dc) + assert fig is not None + + def test_datamapping_plot(self): + da = xd.DataArray( + np.random.rand(100), + { + "time": { + "tie_indices": [0, 99], + "tie_values": [ + np.datetime64("2020-01-01"), + np.datetime64("2020-01-01T00:00:09.900"), + ], + } + }, + ) + dm = xd.DataCollection({"a": da, "b": da}) + fig = xd.plot_availability(dm) + assert fig is not None + + def test_invalid_type_raises(self): + from xdas.core.routines import _get_timeline_dataframe + + with pytest.raises(TypeError, match="DataCollection"): + _get_timeline_dataframe("not_valid") diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index c5931a81..9ce6a712 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -369,7 +369,7 @@ def __array__(self, dtype=None): else: return self.data.__array__(dtype) - def __array__ufunc__(self, ufunc, method, *inputs, **kwargs): + def __array__ufunc__(self, ufunc, method, *inputs, **kwargs): # pragma: no cover return self.data.__array__ufunc__(ufunc, method, *inputs, **kwargs) def __array_function__(self, func, types, args, kwargs): @@ -761,7 +761,7 @@ def from_dataset(cls, dataset, name): """Read coordinates named *name* from an xarray *dataset* via each registered subclass.""" coords = {} for subcls in cls.__subclasses__(): - if hasattr(subcls, "from_dataset"): + if hasattr(subcls, "from_dataset"): # pragma: no branch coords |= subcls.from_dataset(dataset, name) return coords diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 385d5dcd..b1440f9b 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -295,7 +295,7 @@ def get_indexer(self, value, method=None): "`da[dim] = da[dim].simplify(tolerance)`, or by specifying a " "tolerance when opening multiple files." ) - else: + else: # pragma: no cover raise e return indexer @@ -393,7 +393,7 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): match kind: case "gaps": mask = deltas >= zero - case "overlaps": + case "overlaps": # pragma: no branch mask = deltas < zero else: @@ -404,7 +404,7 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): mask = np.abs(deltas) > tolerance case "gaps": mask = deltas > tolerance - case "overlaps": + case "overlaps": # pragma: no branch mask = deltas < -tolerance return self.tie_indices[indices[mask]] diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index cf9b6867..8d557084 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -382,7 +382,7 @@ def get_indexer(self, value, method=None): offset = (value - self.tie_values[reference]) / self.sampling_interval - match method: + match method: # pragma: no branch case None: if np.any( (offset % 1 != 0) @@ -399,7 +399,7 @@ def get_indexer(self, value, method=None): if np.any(offset < 0): raise KeyError("index not found") offset = np.minimum(offset, self.tie_lengths[reference] - 1) - case "bfill": + case "bfill": # pragma: no branch offset = np.ceil(offset).astype(int) if np.any(offset > self.tie_lengths[reference] - 1): raise KeyError("index not found") @@ -500,34 +500,25 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): if tolerance is False: zero = np.timedelta64(0) if np.issubdtype(self.dtype, np.datetime64) else 0 - match kind: + match kind: # pragma: no branch case "gaps": mask = deltas >= zero - case "overlaps": + case "overlaps": # pragma: no branch mask = deltas < zero else: tolerance = parse_tolerance(tolerance, self.dtype) - match kind: + match kind: # pragma: no branch case "discontinuities": mask = np.abs(deltas) > tolerance case "gaps": mask = deltas > tolerance - case "overlaps": + case "overlaps": # pragma: no branch mask = deltas < -tolerance return indices[mask] - indices = self.tie_indices[1:] - if tolerance is not None: - tolerance = parse_tolerance(tolerance, self.dtype) - deltas = self.tie_values[1:] - ( - self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] - ) - indices = indices[np.abs(deltas) > tolerance] - return indices - @classmethod def from_array(cls, arr, dim=None, sampling_interval=None): """Not supported — raises :exc:`NotImplementedError`.""" diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 79f1e66d..2ef58d35 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -132,10 +132,7 @@ def __array__(self, dtype=None): return self.data.__array__(dtype) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .routines import ( - broadcast_coords, - broadcast_to, - ) # TODO: circular import + from .routines import broadcast_coords, broadcast_to # TODO: circular import if not method == "__call__": return NotImplemented @@ -966,7 +963,7 @@ def to_dict(self): raise NotImplementedError("cannot convert a virtual array to a dictionary") elif isinstance(self.data, np.ndarray): data = self.data.tolist() - elif isinstance(self.data, DaskArray): + elif isinstance(self.data, DaskArray): # pragma: no branch data = to_dict(self.data) return { "data": data, diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 9ffd341b..2c5c381e 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -148,7 +148,7 @@ def query(self, indexers=None, **indexers_kwargs): ) for name, value in data.items() } - else: + else: # pragma: no cover raise TypeError("unknown type of data collection") return DataCollection(data, self.name) else: diff --git a/xdas/core/numpy.py b/xdas/core/numpy.py index 32173e3a..feff9392 100644 --- a/xdas/core/numpy.py +++ b/xdas/core/numpy.py @@ -50,7 +50,7 @@ def wrapper(*args, **kwargs): da = ba.arguments.get(key) axis = ba.arguments.get("axis") out = ba.arguments.get("out") - if isinstance(da, DataArray): + if isinstance(da, DataArray): # pragma: no branch ba.arguments[key] = da.data if isinstance(out, DataArray): ba.arguments["out"] = out.data @@ -137,9 +137,9 @@ def wrapper(*args, **kwargs): handled(drop_coords=True)(np.diff) handled(drop_coords=True)(np.ediff1d) -if NumpyVersion(np.__version__) < "2.4.0": +if NumpyVersion(np.__version__) < "2.4.0": # pragma: no cover handled(drop_coords=True)(np.trapz) -if NumpyVersion(np.__version__) >= "2.0.0": +if NumpyVersion(np.__version__) >= "2.0.0": # pragma: no branch handled(drop_coords=True)(np.trapezoid) # TODO: gradient diff --git a/xdas/core/routines.py b/xdas/core/routines.py index c8ce1a0e..1da12ee9 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -176,7 +176,7 @@ def open( verbose=verbose, **kwargs, ) - case "tree-like": + case "tree-like": # pragma: no branch return open_mfdatatree( paths, dim, @@ -554,9 +554,9 @@ def open_mfdataarray( for path in iterator: try: objs.append(open_dataarray(path, engine=engine, **kwargs)) - except Exception as e: - failures.append((path, e)) - warnings.warn(f"could not open {path}: {e}", RuntimeWarning) + except Exception as error: + failures.append((path, error)) + warnings.warn(f"could not open {path}: {error}", RuntimeWarning) else: executor = get_reusable_executor(max_workers) futures_to_paths = { @@ -574,19 +574,17 @@ def open_mfdataarray( for future in iterator: try: obj = future.result() - except Exception as e: + except Exception as error: path = futures_to_paths[future] - failures.append((path, e)) - warnings.warn(f"could not open {path}: {e}", RuntimeWarning) + failures.append((path, error)) + warnings.warn(f"could not open {path}: {error}", RuntimeWarning) else: objs.append(obj) - if len(objs) == 0: - if failures: - path, error = failures[0] - raise RuntimeError( - f"could not open any file with; first failure was {path}: {error}" - ) from error - raise FileNotFoundError("no file to open") + if len(objs) == 0: # there must be failures + path, error = failures[0] + raise RuntimeError( + f"could not open any file with engine: {engine}; first failure was {path}: {error}" + ) from error return combine_by_coords(objs, dim, tolerance, squeeze, None, verbose) diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index 5689acce..8d06f61d 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -3,14 +3,5 @@ xdas native, ASN, APSensing, Febus, MiniSEED, ProdML, Silixa, Terra15 formats. """ -from . import ( - apsensing, - asn, - febus, - miniseed, - prodml, - silixa, - terra15, - xdas, -) +from . import apsensing, asn, febus, miniseed, prodml, silixa, terra15, xdas from .core import AutoEngine, Engine, get_free_port diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index a13a2a22..adca0b3e 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -37,7 +37,7 @@ def read_header(self, path, ignore_last_sample, ctype): st = obspy.read(path, headonly=True) dtype = uniquifiy(tr.data.dtype for tr in st) - if not isinstance(dtype, np.dtype): + if not isinstance(dtype, np.dtype): # pragma: no cover raise ValueError("All traces must have the same dtype") stations = [tr.stats.station for tr in st] diff --git a/xdas/io/xdas.py b/xdas/io/xdas.py index be0b525e..fcbb119c 100644 --- a/xdas/io/xdas.py +++ b/xdas/io/xdas.py @@ -32,12 +32,12 @@ def save_dataarray(self, da, fname, **kwargs): return save_dataarray(da, fname, **kwargs) def open_datacollection(self, fname, **kwargs): - """Delegate to module-level :func:`open_datamapping`.""" - return open_datamapping(fname, **kwargs) + """Delegate to module-level :func:`open_datacollection`.""" + return open_datacollection(fname, **kwargs) def save_datacollection(self, dc, fname, **kwargs): - """Delegate to module-level :func:`save_datamapping`.""" - return save_datamapping(dc, fname, **kwargs) + """Delegate to module-level :func:`save_datacollection`.""" + return save_datacollection(dc, fname, **kwargs) def open_dataarray(fname, group=None): @@ -194,26 +194,21 @@ def save_dataarray( def open_datacollection(fname, group=None): """Read a :class:`DataCollection` from *fname*, auto-detecting sequence vs. mapping.""" - if isinstance(fname, Path): - fname = str(fname) dc = open_datamapping(fname, group) try: keys = [int(key) for key in dc.keys()] - if keys == list(range(len(keys))): - return DataSequence.from_mapping(dc) - else: - return dc except ValueError: return dc + if set(keys) == set(range(len(keys))): + return DataSequence([dc[str(key)] for key in range(len(keys))], dc.name) + else: + return dc def save_datacollection( dc, fname, mode="w", group=None, virtual=None, encoding=None, create_dirs=False ): """Write *dc* to *fname*, dispatching to sequence or mapping writer as needed.""" - if isinstance(fname, Path): - fname = str(fname) - if isinstance(dc, DataSequence): save_datasequence(dc, fname, mode, group, virtual, encoding, create_dirs) elif isinstance(dc, DataCollection): @@ -238,7 +233,7 @@ def open_datamapping(fname, group=None): "it looks like you are trying to open a data array as a data collection." ) else: - if not isinstance(group, h5py.Group): + if not isinstance(group, h5py.Group): # pragma: no cover raise RuntimeError( "something went wrong while opening the data collection." ) diff --git a/xdas/processing/core.py b/xdas/processing/core.py index e4b88f61..9cd5ff6e 100644 --- a/xdas/processing/core.py +++ b/xdas/processing/core.py @@ -111,7 +111,7 @@ class DataArrayLoader: def __init__(self, da, chunks, max_buffers=1, max_workers=1): if not isinstance(da, DataArray): raise TypeError(f"`da` must by a DataArray object, not a {type(da)}") - if not isinstance(chunks, dict) and len(chunks) == 1: + if not (isinstance(chunks, dict) and len(chunks) == 1): raise TypeError( "`chunks` must be a dict that maps a unique " "dimension to a unique size: {'dim': int}" @@ -371,7 +371,7 @@ def write(self, df): return self.submit(df) def _write(self, df): - if df is not None: + if df is not None: # pragma: no branch if not os.path.exists(self.path): df.to_csv(self.path, mode="w", header=True, index=False) else: @@ -505,7 +505,7 @@ def _to_SDS(self, st): new_st += tr new_st = new_st[0].split() for new_tr in new_st: - if isinstance(new_tr.data, np.ma.masked_array): + if isinstance(new_tr.data, np.ma.masked_array): # pragma: no cover new_tr.data = new_tr.data.filled() new_tr.stats.mseed["dataquality"] = self.dataquality year = new_st[0].stats.starttime.year @@ -529,7 +529,7 @@ def _to_flat(self, st): tmp_st += tr tmp_st = tmp_st[0].split() for new_tr in tmp_st: - if isinstance(new_tr.data, np.ma.masked_array): + if isinstance(new_tr.data, np.ma.masked_array): # pragma: no cover new_tr.data = new_tr.data.filled() new_st += new_tr new_st.write(os.path.join(self.dirpath, self.fname), **self.kw_write) @@ -569,7 +569,7 @@ def result(self): out = out.merge(**self.kw_merge) if self.output_format == "flat": self._to_flat(out) - elif self.output_format == "SDS": + elif self.output_format == "SDS": # pragma: no branch self._to_SDS(out) files_to_remove = glob(pattern) for file in files_to_remove: @@ -642,7 +642,7 @@ def write(self, da): """Alias for :meth:`submit`.""" self.submit(da) - def result(): + def result(self): """Return ``None`` — ZMQPublisher has no aggregated result.""" return None diff --git a/xdas/signal.py b/xdas/signal.py index 7d1bbbae..58f88c93 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -3,6 +3,7 @@ tapering, detrending, and spectral helpers, all coordinate-aware and multi-threaded via :func:`~xdas.parallel.parallelize`. """ + import numpy as np import scipy.signal as sp From 9c7487aecddc9757f677a74d41ad4024e46e9f3e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 20:06:44 +0200 Subject: [PATCH 15/21] FULL DOSCTRING COVERAGE !!! --- xdas/atoms/core.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/xdas/atoms/core.py b/xdas/atoms/core.py index 54b471e2..84691230 100644 --- a/xdas/atoms/core.py +++ b/xdas/atoms/core.py @@ -144,11 +144,17 @@ def initialized(self): """``True`` if every state key has been initialised (no ``...`` sentinels remain).""" return all(value is not ... for value in self._state.values()) - def initialize(self, x, **flags): ... + def initialize(self, x, **flags): + """Initialise the atom from a first chunks of data.""" + return NotImplemented - def initialize_from_state(self): ... + def initialize_from_state(self): + """Initialise the atom from its current state.""" + return NotImplemented - def call(self, x, **flags): ... + def call(self, x, **flags): + """Process a chunk of data.""" + return NotImplemented def __call__(self, x, **flags): chunk_dim = flags.get("chunk_dim", None) From 206e25ca5e8227af29f0f8c3c40a629fea766c53 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 20:08:41 +0200 Subject: [PATCH 16/21] Update release-notes and prepare to 0.2.7. --- docs/conf.py | 2 +- docs/release-notes.md | 15 +++++++++++++++ pyproject.toml | 2 +- xdas/__init__.py | 2 +- 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 0f67dae5..3a4188cc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,7 +11,7 @@ author = "Alister Trabattoni" # The full version, including alpha/beta/rc tags -release = "0.2.6" +release = "0.2.7" # -- General configuration --------------------------------------------------- diff --git a/docs/release-notes.md b/docs/release-notes.md index b40b0ff5..ace4abba 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,20 @@ # Release notes +## 0.2.7 + +### Documentation +- Achieved **100% docstring coverage** (excluding `__magic__` and private `_methods`) (@atrabattoni). +- Improved *User Guide* index (@atrabattoni). +- Added new *Sampled Coordinates* page (@atrabattoni). +- Enhanced *Processing* documentation (@atrabattoni). +- Improved *FAQ* page (@atrabattoni). +- Added missing API documentation for several methods (@atrabattoni). + +### Refactoring +- Achieved **100% test coverage** across the codebase (@atrabattoni). +- Reduced test suite execution time by ~50% (@atrabattoni). +- Migrated formatting tooling from `isort` + `black` to `ruff` (@atrabattoni). + ## 0.2.6 ### New features diff --git a/pyproject.toml b/pyproject.toml index 3c8d2b5f..2960d321 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "xdas" -version = "0.2.6" +version = "0.2.7" requires-python = ">= 3.10" authors = [ { name = "Alister Trabattoni", email = "alister.trabattoni@gmail.com" }, diff --git a/xdas/__init__.py b/xdas/__init__.py index db6bbe56..7075e600 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -6,7 +6,7 @@ for common DAS instrument formats. """ -__version__ = "0.2.6" +__version__ = "0.2.7" from . import ( atoms, From f88bdf9163594d9bbf987bfeebad9ed4fade0da8 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 20:17:45 +0200 Subject: [PATCH 17/21] Remove pytest-timeout from tests dependencies. --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2960d321..dc5914d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,6 @@ tests = [ "dascore", "pytest", "pytest-cov", - "pytest-timeout", "psutil", "seisbench", "torch", From 3d4ab67585d0aed217c4abca0a9e2ac98979792a Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 21:09:18 +0200 Subject: [PATCH 18/21] FULL RUFF CHECKS !!! --- docs/conf.py | 10 +++---- tests/test_atoms.py | 10 ++++++- tests/test_datacollection.py | 3 -- tests/test_processing.py | 53 ++++++++++++++++++------------------ xdas/__init__.py | 52 ++++++++++++++++++++++++++++++++++- xdas/atoms/__init__.py | 17 ++++++++++++ xdas/atoms/core.py | 6 ++-- xdas/coordinates/__init__.py | 11 ++++++++ xdas/core/dataarray.py | 3 +- xdas/dask/__init__.py | 2 ++ xdas/fft.py | 10 +++++-- xdas/io/__init__.py | 14 ++++++++++ xdas/processing/__init__.py | 11 ++++++++ xdas/signal.py | 19 +++++++++---- 14 files changed, 173 insertions(+), 48 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 3a4188cc..22940c8e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -86,13 +86,13 @@ } # -- Generate dummy data ----------------------------------------------------- -import os +import os # noqa: E402 -import h5py -import numpy as np +import h5py # noqa: E402 +import numpy as np # noqa: E402 -import xdas as xd -from xdas.synthetics import wavelet_wavefronts +import xdas as xd # noqa: E402 +from xdas.synthetics import wavelet_wavefronts # noqa: E402 dirpath = os.path.join(os.path.split(__file__)[0], "_data") if not os.path.exists(dirpath): diff --git a/tests/test_atoms.py b/tests/test_atoms.py index b0ceb48b..83d6854f 100644 --- a/tests/test_atoms.py +++ b/tests/test_atoms.py @@ -7,6 +7,7 @@ import xdas as xd import xdas.signal as xs from xdas.atoms import ( + Atom, DownSample, FIRFilter, IIRFilter, @@ -20,6 +21,14 @@ from xdas.synthetics import randn_wavefronts, wavelet_wavefronts +class TestAbstractAtom: + def test(self): + atom = Atom() + assert atom.initialize(None) is NotImplemented + assert atom.initialize_from_state() is NotImplemented + assert atom.call(None) is NotImplemented + + class TestPartialAtom: def test_init(self): Sequential( @@ -353,7 +362,6 @@ def call(self, x, **flags): np.testing.assert_array_equal(recovered.buf, atom.buf) def test_atomized_two_atom_args_raises(self): - da = wavelet_wavefronts() atom1 = xs.integrate(...) atom2 = xs.integrate(...) with pytest.raises(ValueError, match="Only one Atom"): diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index 24af96cd..d7107439 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -382,9 +382,6 @@ def test_parse_tuple_with_name_given(self): def test_parse_datacollection_propagates_name(self): da = wavelet_wavefronts() dm = xd.DataCollection({"a": da}, "original_name") - # Passing existing DataCollection without explicit name propagates the name - dm2 = xd.DataCollection({"a": da}, "original_name") - dc_copy = xd.DataCollection.__new__(xd.DataCollection, dm2) # just verify parse propagates name from xdas.core.datacollection import parse diff --git a/tests/test_processing.py b/tests/test_processing.py index 4002ff8e..8d7c60bd 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -130,20 +130,17 @@ def test_small_last_chunk(self, tmp_path): sequence = Sequential([Partial(sosfilt, sos, ..., dim="time", zi=...)]) # monolithic processing - result1 = sequence(da) + sequence(da) # chunked processing data_loader = xp.DataArrayLoader(da, chunks={"time": 100}) for da in data_loader: - pass + pass # TODO # data_writer = xp.DataArrayWriter(tmp_path) # result2 = xp.process( # sequence, data_loader, data_writer # ) # resets the sequence by default - # # test - # assert result1.equals(result2) - class TestDataFrameWriter: def test_init(self, tmp_path): @@ -273,13 +270,14 @@ def test_without_gap(self, tmp_path): }, ) - atom = lambda da, **kwargs: da.to_stream( - network="NT", - station="ST{:03}", - channel="HN1", - location="00", - dim={"distance": "time"}, - ) + def atom(da, **kwargs): + return da.to_stream( + network="NT", + station="ST{:03}", + channel="HN1", + location="00", + dim={"distance": "time"}, + ) data_loader = xp.DataArrayLoader(da, chunks={"time": 100}) @@ -330,13 +328,15 @@ def test_with_gap(self, tmp_path): "distance": 5.0 * np.arange(10), }, ) - atom = lambda da, **kwargs: da.to_stream( - network="NT", - station="ST{:03}", - channel="HN1", - location="00", - dim={"distance": "time"}, - ) + + def atom(da, **kwargs): + return da.to_stream( + network="NT", + station="ST{:03}", + channel="HN1", + location="00", + dim={"distance": "time"}, + ) data_loader = xp.DataArrayLoader(da, chunks={"time": 100}) @@ -388,13 +388,14 @@ def test_flat(self, tmp_path): }, ) - atom = lambda da, **kwargs: da.to_stream( - network="NT", - station="ST{:03}", - channel="HN1", - location="00", - dim={"distance": "time"}, - ) + def atom(da, **kwargs): + return da.to_stream( + network="NT", + station="ST{:03}", + channel="HN1", + location="00", + dim={"distance": "time"}, + ) data_loader = xp.DataArrayLoader(da, chunks={"time": 100}) diff --git a/xdas/__init__.py b/xdas/__init__.py index 7075e600..d4f1e09c 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -8,6 +8,56 @@ __version__ = "0.2.7" +__all__ = [ + # submodules + "atoms", + "config", + "coordinates", + "dataarray", + "datacollection", + "fft", + "io", + "methods", + "numpy", + "parallel", + "processing", + "routines", + "signal", + "synthetics", + "virtual", + # classes + "Coordinate", + "Coordinates", + "DataArray", + "DataCollection", + "DataMapping", + "DataSequence", + "DefaultCoordinate", + "DenseCoordinate", + "InterpCoordinate", + "SampledCoordinate", + "ScalarCoordinate", + # functions + "align", + "asdataarray", + "broadcast_coords", + "broadcast_to", + "combine_by_coords", + "combine_by_field", + "concat", + "concat_coords", + "concatenate", + "get_sampling_interval", + "open", + "open_dataarray", + "open_datacollection", + "open_mfdataarray", + "open_mfdatacollection", + "open_mfdatatree", + "plot_availability", + "split", +] + from . import ( atoms, config, @@ -33,7 +83,7 @@ from .core import dataarray, datacollection, methods, numpy, routines from .core.dataarray import DataArray from .core.datacollection import DataCollection, DataMapping, DataSequence -from .core.methods import * +from .core.methods import * # noqa: F403 from .core.routines import ( align, asdataarray, diff --git a/xdas/atoms/__init__.py b/xdas/atoms/__init__.py index 7149a1bf..7a251e2b 100644 --- a/xdas/atoms/__init__.py +++ b/xdas/atoms/__init__.py @@ -5,6 +5,23 @@ :func:`atomized`, signal-processing atoms, and the ML-based :class:`MLPicker`. """ +__all__ = [ + "Atom", + "DownSample", + "FIRFilter", + "IIRFilter", + "LFilter", + "MLPicker", + "Partial", + "ResamplePoly", + "SOSFilter", + "Sequential", + "State", + "Trigger", + "UpSample", + "atomized", +] + from ..trigger import Trigger from .core import Atom, Partial, Sequential, State, atomized from .ml import MLPicker diff --git a/xdas/atoms/core.py b/xdas/atoms/core.py index 84691230..c76ff078 100644 --- a/xdas/atoms/core.py +++ b/xdas/atoms/core.py @@ -144,15 +144,15 @@ def initialized(self): """``True`` if every state key has been initialised (no ``...`` sentinels remain).""" return all(value is not ... for value in self._state.values()) - def initialize(self, x, **flags): + def initialize(self, x, **flags): """Initialise the atom from a first chunks of data.""" return NotImplemented - def initialize_from_state(self): + def initialize_from_state(self): """Initialise the atom from its current state.""" return NotImplemented - def call(self, x, **flags): + def call(self, x, **flags): """Process a chunk of data.""" return NotImplemented diff --git a/xdas/coordinates/__init__.py b/xdas/coordinates/__init__.py index 7bd99542..1bcd6ae6 100644 --- a/xdas/coordinates/__init__.py +++ b/xdas/coordinates/__init__.py @@ -7,6 +7,17 @@ :class:`SampledCoordinate`, :class:`ScalarCoordinate`. """ +__all__ = [ + "Coordinate", + "Coordinates", + "DefaultCoordinate", + "DenseCoordinate", + "InterpCoordinate", + "SampledCoordinate", + "ScalarCoordinate", + "get_sampling_interval", +] + from .core import Coordinate, Coordinates, get_sampling_interval from .default import DefaultCoordinate from .dense import DenseCoordinate diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 2ef58d35..239474cb 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -4,6 +4,7 @@ """ import copy +import warnings from functools import partial import numpy as np @@ -18,8 +19,6 @@ HANDLED_NUMPY_FUNCTIONS = {} HANDLED_METHODS = {} -import warnings - class DataArray(NDArrayOperatorsMixin): """ diff --git a/xdas/dask/__init__.py b/xdas/dask/__init__.py index 362b88fb..389ae985 100644 --- a/xdas/dask/__init__.py +++ b/xdas/dask/__init__.py @@ -3,4 +3,6 @@ inside xdas HDF5 files. """ +__all__ = ["create_variable", "dumps", "loads"] + from .core import create_variable, dumps, loads diff --git a/xdas/fft.py b/xdas/fft.py index 413d4439..a3d5ef49 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -65,7 +65,10 @@ def fft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): axis = da.get_axis_num(olddim) d = get_sampling_interval(da, olddim) f = np.fft.fftshift(np.fft.fftfreq(n, d)) - func = lambda x: np.fft.fftshift(np.fft.fft(x, n, axis, norm), axis) + + def func(x): + return np.fft.fftshift(np.fft.fft(x, n, axis, norm), axis) + across = int(axis == 0) func = parallelize(across, across, parallel)(func) data = func(da.values) @@ -195,7 +198,10 @@ def ifft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): axis = da.get_axis_num(olddim) d = get_sampling_interval(da, olddim) f = np.fft.ifftshift(np.fft.fftfreq(n, d)) - func = lambda x: np.fft.ifft(np.fft.ifftshift(x, axis), n, axis, norm) + + def func(x): + return np.fft.ifft(np.fft.ifftshift(x, axis), n, axis, norm) + across = int(axis == 0) func = parallelize(across, across, parallel)(func) data = func(da.values) diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index 8d06f61d..2097ba65 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -3,5 +3,19 @@ xdas native, ASN, APSensing, Febus, MiniSEED, ProdML, Silixa, Terra15 formats. """ +__all__ = [ + "AutoEngine", + "Engine", + "apsensing", + "asn", + "febus", + "get_free_port", + "miniseed", + "prodml", + "silixa", + "terra15", + "xdas", +] + from . import apsensing, asn, febus, miniseed, prodml, silixa, terra15, xdas from .core import AutoEngine, Engine, get_free_port diff --git a/xdas/processing/__init__.py b/xdas/processing/__init__.py index fd6a3e9b..4559a0ca 100644 --- a/xdas/processing/__init__.py +++ b/xdas/processing/__init__.py @@ -3,6 +3,17 @@ the :func:`process` orchestrator for larger-than-memory datasets. """ +__all__ = [ + "DataArrayLoader", + "DataArrayWriter", + "DataFrameWriter", + "RealTimeLoader", + "StreamWriter", + "ZMQPublisher", + "ZMQSubscriber", + "process", +] + from .core import ( DataArrayLoader, DataArrayWriter, diff --git a/xdas/signal.py b/xdas/signal.py index 58f88c93..08721419 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -795,7 +795,10 @@ def integrate(da, midpoints=False, dim="last", parallel=None): """ axis = da.get_axis_num(dim) d = get_sampling_interval(da, dim) - func = lambda x: np.cumsum(x, axis=axis) * d + + def func(x): + return np.cumsum(x, axis=axis) * d + across = int(axis == 0) func = parallelize(across, across, parallel)(func) data = func(da.values) @@ -835,7 +838,10 @@ def differentiate(da, midpoints=False, dim="last", parallel=None): """ axis = da.get_axis_num(dim) d = get_sampling_interval(da, dim) - func = lambda x: np.diff(x, axis=axis) / d + + def func(x): + return np.diff(x, axis=axis) / d + across = int(axis == 0) func = parallelize(across, across, parallel)(func) data = func(da.values) @@ -923,9 +929,12 @@ def sliding_mean_removal( shape = tuple(-1 if a == axis else 1 for a in range(da.ndim)) win = np.reshape(win, shape) pad_width = tuple((n // 2, n // 2) if a == axis else (0, 0) for a in range(da.ndim)) - func = lambda x: ( - x - sp.fftconvolve(np.pad(x, pad_width, mode=pad_mode), win, mode="valid") - ) + + def func(x): + return x - sp.fftconvolve( + np.pad(x, pad_width, mode=pad_mode), win, mode="valid" + ) + across = int(axis == 0) func = parallelize(across, across, parallel)(func) data = func(da.values) From 593150f02076cd024f6615deab86dae2be0f994c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 21:43:03 +0200 Subject: [PATCH 19/21] Add docstring checks (except D105). --- pyproject.toml | 10 +++++++++- xdas/atoms/core.py | 17 ++++++++++------- xdas/atoms/signal.py | 13 ++++++------- xdas/coordinates/core.py | 18 ++++++++++-------- xdas/coordinates/default.py | 5 +++-- xdas/coordinates/dense.py | 4 +--- xdas/coordinates/interp.py | 14 +++++++------- xdas/coordinates/sampled.py | 5 +++-- xdas/coordinates/scalar.py | 5 +++-- xdas/core/__init__.py | 6 ++++-- xdas/core/dataarray.py | 18 +++++++++--------- xdas/core/datacollection.py | 17 +++++++++++------ xdas/core/methods.py | 5 +++-- xdas/core/numpy.py | 7 ++++--- xdas/core/routines.py | 8 +++++--- xdas/dask/__init__.py | 5 +++-- xdas/dask/core.py | 5 +++-- xdas/dask/serial.py | 10 ++++++---- xdas/fft.py | 8 +++++--- xdas/io/__init__.py | 6 ++++-- xdas/io/apsensing.py | 4 +--- xdas/io/asn.py | 10 ++++++---- xdas/io/core.py | 6 ++++-- xdas/io/febus.py | 12 +++++------- xdas/io/miniseed.py | 4 +--- xdas/io/prodml.py | 5 +++-- xdas/io/silixa.py | 4 +--- xdas/io/tdms.py | 28 +++++++++++++--------------- xdas/io/terra15.py | 4 +--- xdas/io/utils.py | 6 +++--- xdas/io/xdas.py | 5 +++-- xdas/parallel.py | 7 ++++--- xdas/picking.py | 7 ++++--- xdas/processing/__init__.py | 6 ++++-- xdas/processing/core.py | 6 ++++-- xdas/processing/monitor.py | 4 +--- xdas/signal.py | 15 ++++++++------- xdas/spectral.py | 5 +++-- xdas/synthetics.py | 6 +++--- xdas/trigger.py | 8 +++++--- xdas/virtual.py | 8 +++++--- 41 files changed, 191 insertions(+), 155 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dc5914d1..2c072eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,15 @@ tests = [ ] [tool.ruff.lint] -extend-select = ["I"] +extend-select = ["I", "D"] +extend-ignore = ["D105"] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.lint.per-file-ignores] +"tests/**" = ["D"] +"docs/**" = ["D"] [tool.pytest.ini_options] addopts = [ diff --git a/xdas/atoms/core.py b/xdas/atoms/core.py index c76ff078..e3cea04c 100644 --- a/xdas/atoms/core.py +++ b/xdas/atoms/core.py @@ -1,6 +1,8 @@ """ -Base classes for stateful processing atoms: :class:`Atom`, :class:`State`, -:class:`Sequential`, :class:`Partial`, and the :func:`atomized` decorator. +Base classes for stateful processing atoms. + +Includes :class:`Atom`, :class:`State`, :class:`Sequential`, :class:`Partial`, +and the :func:`atomized` decorator. """ import importlib @@ -24,7 +26,6 @@ class State: Examples -------- - In practice the State object is used when implementing new Atom objects. Bellow a dummy example without any class declaration. @@ -157,6 +158,7 @@ def call(self, x, **flags): return NotImplemented def __call__(self, x, **flags): + """Process input data, initializing state if needed and resetting after final chunk.""" chunk_dim = flags.get("chunk_dim", None) if not self.initialized or chunk_dim is None: self.initialize(x, **flags) @@ -203,10 +205,11 @@ def load_state(self, path): class Sequential(Atom, list): """ - A class to handle a sequence of operations. Each operation is represented by an - Atom class object, which contains the function and its arguments. + A class to handle a sequence of operations. - Sequence inherits from list, and therefore behaves as it. + Each operation is represented by an Atom class object, which contains the + function and its arguments. Sequence inherits from list, and therefore + behaves as it. Parameters ---------- @@ -309,7 +312,7 @@ def __repr__(self) -> str: return s def reset(self) -> None: - """Resets the state of all StateAtom of the sequence.""" + """Reset the state of all stateful atoms in the sequence.""" for atom in self: if isinstance(atom, Partial): atom.reset() diff --git a/xdas/atoms/signal.py b/xdas/atoms/signal.py index 2b77aeee..a077573d 100644 --- a/xdas/atoms/signal.py +++ b/xdas/atoms/signal.py @@ -1,8 +1,8 @@ """ -Signal-processing atoms: stateful wrappers around common filtering and -resampling operations (:class:`ResamplePoly`, :class:`IIRFilter`, -:class:`FIRFilter`, :class:`LFilter`, :class:`SOSFilter`, -:class:`DownSample`, :class:`UpSample`). +Signal-processing atoms: stateful wrappers around filtering and resampling. + +Includes :class:`ResamplePoly`, :class:`IIRFilter`, :class:`FIRFilter`, +:class:`LFilter`, :class:`SOSFilter`, :class:`DownSample`, :class:`UpSample`. """ from fractions import Fraction @@ -19,9 +19,9 @@ class ResamplePoly(Atom): """ - Pipeline implementation of polyphase-filter resampling from the - original sampling rate to the ``target`` sampling rate. + Pipeline implementation of polyphase-filter resampling. + Resamples from the original sampling rate to the ``target`` sampling rate. This is achieved by an upsampling of the data, followed by the application of a low-pass FIR filter, and finally by downsampling of the data. The ratio of the @@ -30,7 +30,6 @@ class ResamplePoly(Atom): Parameters ---------- - target : float The target sampling rate of the new data maxfactor : int diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 9ce6a712..5b6928dc 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -1,7 +1,9 @@ """ -Core coordinate infrastructure: :class:`Coordinates` container, -:class:`Coordinate` factory/base class, and shared helpers used by all -concrete coordinate types (parsing, interpolation, tolerance handling). +Core coordinate infrastructure. + +Includes the :class:`Coordinates` container, :class:`Coordinate` factory/base +class, and shared helpers used by all concrete coordinate types (parsing, +interpolation, tolerance handling). """ import weakref @@ -14,7 +16,7 @@ def wraps_first_last(func): - """Decorator that resolves ``"first"`` and ``"last"`` dim aliases before calling *func*.""" + """Resolve ``"first"`` and ``"last"`` dim aliases before calling *func*.""" @wraps(func) def wrapper(self, dim, *args, **kwargs): @@ -214,7 +216,6 @@ def to_dict(self): Examples -------- - >>> import xdas as xd >>> coords = xd.Coordinates( @@ -326,6 +327,7 @@ def __class_getitem__(cls, item): return cls._registry[item] def __new__(cls, data=None, dim=None, dtype=None): + """Instantiate the appropriate Coordinate subclass based on *data*.""" # class factory if instantiating Coordinate directly if cls is Coordinate: if data is None: @@ -609,7 +611,7 @@ def get_split_indices(self, kind="discontinuities", tolerance=False): def get_discontinuities(self, tolerance=None): """ - Returns a DataFrame containing information about the discontinuities. + Return a DataFrame containing information about the discontinuities. Returns ------- @@ -664,7 +666,7 @@ def get_discontinuities(self, tolerance=None): def get_availabilities(self): """ - Returns a DataFrame containing information about the data availability. + Return a DataFrame containing information about the data availability. Returns ------- @@ -836,7 +838,7 @@ def parse_tolerance(tolerance, dtype): def get_sampling_interval(da, dim, cast=True): """ - Returns the sample spacing along a given dimension. + Return the sample spacing along a given dimension. Parameters ---------- diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 88b02025..70c5e2f9 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -1,6 +1,7 @@ """ -:class:`DefaultCoordinate`: integer-range coordinate used when no coordinate -is explicitly provided for an axis. +:class:`DefaultCoordinate`: integer-range coordinate. + +Used when no coordinate is explicitly provided for an axis. """ import numpy as np diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py index ef981924..39743f2a 100644 --- a/xdas/coordinates/dense.py +++ b/xdas/coordinates/dense.py @@ -1,6 +1,4 @@ -""" -:class:`DenseCoordinate`: coordinate backed by a full numpy array. -""" +""":class:`DenseCoordinate`: coordinate backed by a full numpy array.""" import numpy as np import pandas as pd diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index b1440f9b..debf0364 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -1,6 +1,7 @@ """ -:class:`InterpCoordinate`: piecewise-linear coordinate defined by tie points, -using ``xinterp`` for forward and inverse interpolation. +:class:`InterpCoordinate`: piecewise-linear coordinate. + +Defined by tie points, using ``xinterp`` for forward and inverse interpolation. """ import re @@ -19,12 +20,11 @@ class InterpCoordinate(Coordinate, name="interpolated"): """ - Array-like object used to represent piecewise evenly spaced coordinates using the - CF convention. + Array-like object representing piecewise evenly spaced coordinates (CF convention). - The coordinate ticks are describes by the mean of tie points that are interpolated - when intermediate values are required. Coordinate objects provides label based - selections methods. + The coordinate ticks are described by tie points that are interpolated when + intermediate values are required. Coordinate objects provide label-based + selection methods. Parameters ---------- diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 8d557084..2b6e7ac8 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -1,6 +1,7 @@ """ -:class:`SampledCoordinate`: regularly-sampled coordinate described by tie -points and a fixed ``sampling_interval`` between them. +:class:`SampledCoordinate`: regularly-sampled coordinate. + +Described by tie points and a fixed ``sampling_interval`` between them. """ import re diff --git a/xdas/coordinates/scalar.py b/xdas/coordinates/scalar.py index e7beff3a..dd89cbeb 100644 --- a/xdas/coordinates/scalar.py +++ b/xdas/coordinates/scalar.py @@ -1,6 +1,7 @@ """ -:class:`ScalarCoordinate`: non-dimensional (scalar) coordinate that carries a -single value without being tied to an array axis. +:class:`ScalarCoordinate`: non-dimensional (scalar) coordinate. + +Carries a single value without being tied to an array axis. """ import numpy as np diff --git a/xdas/core/__init__.py b/xdas/core/__init__.py index ea86c4a9..c0c68b57 100644 --- a/xdas/core/__init__.py +++ b/xdas/core/__init__.py @@ -1,4 +1,6 @@ """ -Core data types for xdas: :class:`DataArray`, :class:`DataCollection`, -and supporting routines, methods, and NumPy dispatch. +Core data types for xdas. + +Includes :class:`DataArray`, :class:`DataCollection`, and supporting routines, +methods, and NumPy dispatch. """ diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 239474cb..f2d8994f 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -1,6 +1,8 @@ """ -:class:`DataArray`: the primary N-dimensional array object with labeled -coordinates, NumPy/Dask backing, and lazy :class:`VirtualArray` support. +:class:`DataArray`: the primary N-dimensional labeled array object. + +Features labeled coordinates, NumPy/Dask backing, and lazy +:class:`VirtualArray` support. """ import copy @@ -321,8 +323,7 @@ def get_axis_num(self, dim): def isel(self, indexers=None, drop=False, **indexers_kwargs): """ - Return a new DataArray whose data is given by selecting indexes along the - specified dimension(s). + Return a new DataArray selecting indexes along the specified dimension(s). Parameters ---------- @@ -355,8 +356,7 @@ def sel( self, indexers=None, method=None, endpoint=True, drop=False, **indexers_kwargs ): """ - Return a new DataArray whose data is given by selecting index labels along the - specified dimension(s). + Return a new DataArray selecting index labels along the specified dimension(s). In contrast to DataArray.isel, indexers for this method should use labels instead of integers. @@ -428,7 +428,7 @@ def drop_coords(self, *names): def copy(self, deep=True, data=None): """ - Returns a copy of this array + Return a copy of this array. If deep=True, a deep copy is made of the data array. Otherwise, a shallow copy is made, and the returned data array's values are a new view of this data @@ -468,7 +468,7 @@ def copy(self, deep=True, data=None): def rename(self, new_name_or_name_dict=None, **names): """ - Returns a new DataArray with renamed coordinates, dimensions or a new name. + Return a new DataArray with renamed coordinates, dimensions or a new name. Parameters ---------- @@ -588,7 +588,7 @@ def assign_coords(self, coords=None, **coords_kwargs): def swap_dims(self, dims_dict=None, **dims_kwargs): """ - Returns a new DataArray with swapped dimensions. + Return a new DataArray with swapped dimensions. Parameters ---------- diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 2c5c381e..b4259367 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -1,6 +1,8 @@ """ -:class:`DataCollection`, :class:`DataSequence`, and :class:`DataMapping`: -nested tree structures for grouping multiple :class:`DataArray` objects. +Nested tree structures for grouping multiple :class:`DataArray` objects. + +Includes :class:`DataCollection`, :class:`DataSequence`, and +:class:`DataMapping`. """ from fnmatch import fnmatch @@ -23,7 +25,7 @@ class DataCollection: name: str The name of the current level of nesting. - Returns: + Returns ------- DataCollection: The nested data as a DataSequence or DataMapping. @@ -55,6 +57,7 @@ class DataCollection: """ def __new__(cls, data, name=None): + """Dispatch to :class:`DataSequence` or :class:`DataMapping` based on *data* type.""" data, name = parse(data, name) if isinstance(data, list): return list.__new__(DataSequence) @@ -87,7 +90,7 @@ def query(self, indexers=None, **indexers_kwargs): The keyword arguments form of indexers. Overwrite indexers input if both are provided. - Returns: + Returns ------- DataCollection: The queried data. @@ -202,6 +205,7 @@ class DataMapping(DataCollection, dict): """ def __new__(cls, data, name=None): + """Allocate a new dict-backed DataMapping instance.""" return dict.__new__(cls) def __init__(self, data, name=None): @@ -361,7 +365,7 @@ def load(self): def map(self, atom): """ - Apply an atom to each data array of the data collection + Apply an atom to each data array of the data collection. Parameters ---------- @@ -413,6 +417,7 @@ class DataSequence(DataCollection, list): """ def __new__(cls, data, name=None): + """Allocate a new list-backed DataSequence instance.""" return list.__new__(cls) def __init__(self, data, name=None): @@ -566,7 +571,7 @@ def load(self): def map(self, atom): """ - Apply an atom to each data array of the data collection + Apply an atom to each data array of the data collection. Parameters ---------- diff --git a/xdas/core/methods.py b/xdas/core/methods.py index 13da0f09..6d43ec5f 100644 --- a/xdas/core/methods.py +++ b/xdas/core/methods.py @@ -1,6 +1,7 @@ """ -Registration helpers and implementations for :class:`DataArray` instance -methods dispatched through ``HANDLED_METHODS``. +Registration helpers and implementations for :class:`DataArray` instance methods. + +Methods are dispatched through ``HANDLED_METHODS``. """ import numpy as np diff --git a/xdas/core/numpy.py b/xdas/core/numpy.py index feff9392..43a3529a 100644 --- a/xdas/core/numpy.py +++ b/xdas/core/numpy.py @@ -1,6 +1,7 @@ """ -NumPy function dispatch for :class:`DataArray` via ``__array_function__``, -mapping NumPy functions to coordinate-aware implementations. +NumPy function dispatch for :class:`DataArray` via ``__array_function__``. + +Maps NumPy functions to coordinate-aware implementations. """ from inspect import signature @@ -24,7 +25,7 @@ def decorator(func): def handled(reduce=False, drop_coords=False, **defaults): """ - Decorator factory that wraps a NumPy function to be coordinate-aware. + Wrap a NumPy function to be coordinate-aware. Parameters ---------- diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 1da12ee9..32b0d4f4 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -1,6 +1,7 @@ """ -Top-level routines for opening, concatenating, aligning, and splitting -:class:`DataArray` and :class:`DataCollection` objects, including +Top-level routines for opening, concatenating, aligning, and splitting arrays. + +Operates on :class:`DataArray` and :class:`DataCollection` objects; includes multi-file helpers (``open_mfdataarray``, ``open_mfdatacollection``). """ @@ -404,7 +405,7 @@ def collect( **kwargs, ): """ - Collects the data from a tree of paths using `fields` as level names. + Collect the data from a tree of paths using `fields` as level names. Parameters ---------- @@ -1012,6 +1013,7 @@ def concat_coords(objs, *, sort=False, return_order=False, tolerance=False): The tolerance to consider that the end of a coordinate object is continuous with beginning of the following, For time coordinates, numeric values are considered as seconds. No simplification by default. + Returns ------- coord diff --git a/xdas/dask/__init__.py b/xdas/dask/__init__.py index 389ae985..04046f48 100644 --- a/xdas/dask/__init__.py +++ b/xdas/dask/__init__.py @@ -1,6 +1,7 @@ """ -Dask integration helpers for serializing and deserializing dask arrays -inside xdas HDF5 files. +Dask integration helpers for xdas HDF5 files. + +Serializes and deserializes dask arrays inside xdas HDF5 files. """ __all__ = ["create_variable", "dumps", "loads"] diff --git a/xdas/dask/core.py b/xdas/dask/core.py index cc7d1ee6..b0a8e1e4 100644 --- a/xdas/dask/core.py +++ b/xdas/dask/core.py @@ -1,6 +1,7 @@ """ -Functions to store and restore dask arrays as HDF5 variables using msgpack -serialization of the dask task graph. +Functions to store and restore dask arrays as HDF5 variables. + +Uses msgpack serialization of the dask task graph. """ import numpy as np diff --git a/xdas/dask/serial.py b/xdas/dask/serial.py index f4f2ca4c..cf03ead6 100644 --- a/xdas/dask/serial.py +++ b/xdas/dask/serial.py @@ -1,6 +1,8 @@ """ -msgpack-based serialization for dask task graphs, handling tuples, slices, -callables, ``methodcaller``, and ``itemgetter`` objects. +msgpack-based serialization for dask task graphs. + +Handles tuples, slices, callables, ``methodcaller``, and ``itemgetter`` +objects. """ import importlib @@ -19,7 +21,7 @@ def encode(obj): """ - msgpack *default* hook — encode non-native types as :class:`msgpack.ExtType`. + Msgpack *default* hook — encode non-native types as :class:`msgpack.ExtType`. Handles ``tuple``, ``slice``, ``callable``, :class:`methodcaller`, and :class:`itemgetter`. @@ -56,7 +58,7 @@ def encode(obj): def decode(code, data): """ - msgpack *ext_hook* — decode an :class:`msgpack.ExtType` back to the original object. + Msgpack *ext_hook* — decode an :class:`msgpack.ExtType` back to the original object. Parameters ---------- diff --git a/xdas/fft.py b/xdas/fft.py index a3d5ef49..3c498268 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -1,6 +1,8 @@ """ -FFT functions that preserve :class:`DataArray` coordinates: :func:`fft`, -:func:`ifft`, :func:`rfft`, :func:`irfft`, :func:`fftfreq`, :func:`rfftfreq`. +FFT functions that preserve :class:`DataArray` coordinates. + +Includes :func:`fft`, :func:`ifft`, :func:`rfft`, :func:`irfft`, +:func:`fftfreq`, :func:`rfftfreq`. """ import numpy as np @@ -217,7 +219,7 @@ def func(x): @atomized def irfft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): """ - Computes the inverse of `rfft`. + Compute the inverse of `rfft`. Parameters ---------- diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index 2097ba65..81bff4fe 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -1,6 +1,8 @@ """ -I/O subsystem: plugin-based :class:`Engine` registry and concrete engines for -xdas native, ASN, APSensing, Febus, MiniSEED, ProdML, Silixa, Terra15 formats. +I/O subsystem: plugin-based :class:`Engine` registry and concrete engines. + +Supports xdas native, ASN, APSensing, Febus, MiniSEED, ProdML, Silixa, and +Terra15 formats. """ __all__ = [ diff --git a/xdas/io/apsensing.py b/xdas/io/apsensing.py index 4a04f1e9..54630129 100644 --- a/xdas/io/apsensing.py +++ b/xdas/io/apsensing.py @@ -1,6 +1,4 @@ -""" -I/O engine for APSensing HDF5 files (:class:`APSensingEngine`). -""" +"""I/O engine for APSensing HDF5 files (:class:`APSensingEngine`).""" import h5py import numpy as np diff --git a/xdas/io/asn.py b/xdas/io/asn.py index caac424b..49b7ea5e 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -1,6 +1,8 @@ """ -I/O engine for ASN HDF5 files (:class:`ASNEngine`) and a ZMQ-based -real-time subscriber (:class:`ZMQSubscriber`) for live ASN streams. +I/O engine for ASN HDF5 files and live ZMQ streams. + +Includes :class:`ASNEngine` and a ZMQ-based real-time subscriber +(:class:`ZMQSubscriber`) for live ASN streams. """ import json @@ -101,7 +103,7 @@ class ZMQSubscriber: def __init__(self, address): """ - Initializes a ZMQStream object. + Initialize a ZMQStream object. Parameters ---------- @@ -297,7 +299,7 @@ def _send_message(self, message): def float_to_timedelta(value, unit): """ - Converts a floating-point value to a timedelta object. + Convert a floating-point value to a timedelta object. Parameters ---------- diff --git a/xdas/io/core.py b/xdas/io/core.py index 4faf0651..a96d39dd 100644 --- a/xdas/io/core.py +++ b/xdas/io/core.py @@ -1,6 +1,8 @@ """ -Plugin base class :class:`Engine` for file format handlers, plus -:class:`AutoEngine` for format auto-detection and :func:`get_free_port`. +Plugin base class :class:`Engine` for file format handlers. + +Also provides :class:`AutoEngine` for format auto-detection and +:func:`get_free_port`. """ import socket diff --git a/xdas/io/febus.py b/xdas/io/febus.py index 57b25663..6915a5f4 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -1,6 +1,4 @@ -""" -I/O engine for Febus HDF5 files (:class:`FebusEngine`). -""" +"""I/O engine for Febus HDF5 files (:class:`FebusEngine`).""" import warnings @@ -36,8 +34,8 @@ def open_dataarray(self, fname, overlaps=None, offset=None): recommended to manually specify the overlap and offset parameters. If not provided, the function will attempt to determine the correct values at your own risk. - Parameters: - ----------- + Parameters + ---------- fname : str The filename of the Febus file to read. overlaps : tuple of int, optional @@ -49,8 +47,8 @@ def open_dataarray(self, fname, overlaps=None, offset=None): from the beginning. If not provided, the function will attempt to determine the correct offset at you own risk. - Returns: - -------- + Returns + ------- DataArray A data array containing the data from the Febus file. diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index adca0b3e..4af52943 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -1,6 +1,4 @@ -""" -I/O engine for MiniSEED files via ObsPy (:class:`MiniSEEDEngine`). -""" +"""I/O engine for MiniSEED files via ObsPy (:class:`MiniSEEDEngine`).""" import dask import numpy as np diff --git a/xdas/io/prodml.py b/xdas/io/prodml.py index 145343d4..a71a0694 100644 --- a/xdas/io/prodml.py +++ b/xdas/io/prodml.py @@ -1,6 +1,7 @@ """ -I/O engine for ProdML HDF5 files (:class:`ProdML`), also known as -OptaSense and Sintela format. +I/O engine for ProdML HDF5 files (:class:`ProdML`). + +Also known as OptaSense and Sintela format. """ import h5py diff --git a/xdas/io/silixa.py b/xdas/io/silixa.py index 91f35a2e..d158d0ef 100644 --- a/xdas/io/silixa.py +++ b/xdas/io/silixa.py @@ -1,6 +1,4 @@ -""" -I/O engine for Silixa TDMS files (:class:`SilixaEngine`). -""" +"""I/O engine for Silixa TDMS files (:class:`SilixaEngine`).""" import dask import numpy as np diff --git a/xdas/io/tdms.py b/xdas/io/tdms.py index 1819c3c8..ab16f7a5 100644 --- a/xdas/io/tdms.py +++ b/xdas/io/tdms.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """ -Copyright (c) 2018 Silixa Ltd +Copyright (c) 2018 Silixa Ltd. + Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to use the Software for the sole purpose of private, non-commercial use and/or in-house company research and development meaning the right to use, copy, modify, merge, share the Software, and to permit persons to whom @@ -55,16 +56,16 @@ def write_property_dict(prop_dict, out_file): def type_not_supported(vargin): - """Function raises a NotImplementedException.""" + """Raise a NotImplementedException for unsupported tdsDataTypes.""" raise NotImplementedError("Reading of this tdsDataType is not implemented") def parse_time_stamp(fractions, seconds): """ - Convert time TDMS time representation to datetime - fractions -- fractional seconds (2^-64) - seconds -- The number of seconds since 1/1/1904 - @rtype : datetime.datetime + Convert TDMS time representation to datetime. + + Parameters: fractions (fractional seconds, 2^-64), seconds (seconds since + 1/1/1904). Returns datetime.datetime or None. """ if fractions is not None and seconds is not None and fractions + seconds > 0: return datetime.timedelta(0, fractions * 2**-64 + seconds) + datetime.datetime( @@ -139,7 +140,7 @@ def parse_time_stamp(fractions, seconds): class TdmsReader(object): - """A TDMS file reader object for reading properties and data""" + """A TDMS file reader object for reading properties and data.""" def __init__(self, filename): self._properties = None @@ -197,9 +198,7 @@ def _get_channel_length(self): channel_length = property(_get_channel_length) def get_properties(self, mapped=False): - """ - Return a dictionary of properties. Read from file only if necessary. - """ + """Return a dictionary of properties. Read from file only if necessary.""" # Check if already hold properties in memory if self._properties is None: self._properties = self._read_properties() @@ -233,6 +232,7 @@ def get_properties(self, mapped=False): def _read_property(self): """ Read a single property from the TDMS file. + Return the name, type and value of the property as a list. """ # Read length of object path: @@ -250,7 +250,7 @@ def _read_property(self): return name, data_type, value def _read_properties(self): - """Read the properties from the file""" + """Read the properties from the file.""" self._tdms_file.seek(LEAD_IN_LENGTH, 0) # Number of channels is total objects - file objects - group objects self.fileinfo["n_channels"] = struct.unpack("i", self._tdms_file.read(4))[0] - 2 @@ -302,10 +302,8 @@ def _read_chunk_size(self): def get_data(self, first_ch=0, last_ch=None, first_s=0, last_s=None): """ Get a block of data from the TDMS file. - first_ch -- The first channel to load - last_ch -- The last channel to load - first_s -- The first sample to load - last_s -- The last sample to load + + Parameters: first_ch, last_ch (channel range), first_s, last_s (sample range). """ if self._raw_data is None: self._initialise_data() diff --git a/xdas/io/terra15.py b/xdas/io/terra15.py index cc0ad6f6..162ac324 100644 --- a/xdas/io/terra15.py +++ b/xdas/io/terra15.py @@ -1,6 +1,4 @@ -""" -I/O engine for Terra15 HDF5 files (:class:`Terra15Engine`). -""" +"""I/O engine for Terra15 HDF5 files (:class:`Terra15Engine`).""" import h5py import pandas as pd diff --git a/xdas/io/utils.py b/xdas/io/utils.py index a24f3aa9..26e8eef4 100644 --- a/xdas/io/utils.py +++ b/xdas/io/utils.py @@ -1,6 +1,7 @@ """ -HDF5 utility functions for compressing datasets while preserving file -structure and metadata. +HDF5 utility functions for compressing datasets. + +Preserves file structure and metadata during compression. """ import h5py @@ -25,7 +26,6 @@ def compress(src_path: str, dst_path: str, dataset_location: str, encoding: dict - 'compression': the compression algorithm to use and its parameters, part of the hdf5plugin library - 'chunks': the chunk size for the dataset, should be a tuple of integers, default to False for no chunking """ - if "chunks" in encoding.keys() and not encoding["chunks"]: encoding.pop("chunks") diff --git a/xdas/io/xdas.py b/xdas/io/xdas.py index fcbb119c..8c835a21 100644 --- a/xdas/io/xdas.py +++ b/xdas/io/xdas.py @@ -1,6 +1,7 @@ """ -I/O engine for the native xdas HDF5/NetCDF4 format (:class:`XdasEngine`), -supporting :class:`DataArray`, :class:`DataSequence`, and :class:`DataMapping`. +I/O engine for the native xdas HDF5/NetCDF4 format (:class:`XdasEngine`). + +Supports :class:`DataArray`, :class:`DataSequence`, and :class:`DataMapping`. """ import os diff --git a/xdas/parallel.py b/xdas/parallel.py index 90ad3c01..637575bd 100644 --- a/xdas/parallel.py +++ b/xdas/parallel.py @@ -1,6 +1,7 @@ """ -Thread-parallelism decorator :func:`parallelize` for splitting array axes -across workers using :class:`~concurrent.futures.ThreadPoolExecutor`. +Thread-parallelism decorator :func:`parallelize` for splitting array axes. + +Splits across workers using :class:`~concurrent.futures.ThreadPoolExecutor`. """ import os @@ -14,7 +15,7 @@ def parallelize(split_axis=0, concat_axis=0, parallel=None): """ - Decorator factory that splits array positional arguments across threads. + Split array positional arguments across threads. Parameters ---------- diff --git a/xdas/picking.py b/xdas/picking.py index 34aed185..fdcffede 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -1,6 +1,7 @@ """ -Phase-pick utilities: tapered selection and cross-correlation based -picking of onset times in :class:`DataArray` objects. +Phase-pick utilities for :class:`DataArray` objects. + +Includes tapered selection and cross-correlation based picking of onset times. """ import numpy as np @@ -12,7 +13,7 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): """ - Selects and tapers a DataArray based on `start` and `end` values. + Select and taper a DataArray based on `start` and `end` values. Coordinates with NaN or NaT `start` or `end` values are ignored. If no `size` is provided, the length of the resulting data is determined by the next fast length diff --git a/xdas/processing/__init__.py b/xdas/processing/__init__.py index 4559a0ca..9e24e055 100644 --- a/xdas/processing/__init__.py +++ b/xdas/processing/__init__.py @@ -1,6 +1,8 @@ """ -Chunked processing pipeline: loaders, writers, real-time streaming, and -the :func:`process` orchestrator for larger-than-memory datasets. +Chunked processing pipeline for larger-than-memory datasets. + +Provides loaders, writers, real-time streaming, and the :func:`process` +orchestrator. """ __all__ = [ diff --git a/xdas/processing/core.py b/xdas/processing/core.py index 9cd5ff6e..fb3ad699 100644 --- a/xdas/processing/core.py +++ b/xdas/processing/core.py @@ -1,7 +1,9 @@ """ -Core processing infrastructure: :class:`DataArrayLoader`, :class:`DataArrayWriter`, +Core processing infrastructure for chunked pipeline execution. + +Includes :class:`DataArrayLoader`, :class:`DataArrayWriter`, :class:`DataFrameWriter`, :class:`StreamWriter`, :class:`ZMQPublisher`, -:class:`ZMQSubscriber`, :class:`RealTimeLoader`, and the :func:`process` function. +:class:`ZMQSubscriber`, :class:`RealTimeLoader`, and :func:`process`. """ import os diff --git a/xdas/processing/monitor.py b/xdas/processing/monitor.py index 9f01aacb..1d3756fd 100644 --- a/xdas/processing/monitor.py +++ b/xdas/processing/monitor.py @@ -1,6 +1,4 @@ -""" -:class:`Monitor`: tqdm-based throughput tracker for chunked processing pipelines. -""" +""":class:`Monitor`: tqdm-based throughput tracker for chunked processing pipelines.""" from time import perf_counter diff --git a/xdas/signal.py b/xdas/signal.py index 08721419..c93e2fe9 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -1,7 +1,8 @@ """ -Signal processing functions for :class:`DataArray`: filtering, resampling, -tapering, detrending, and spectral helpers, all coordinate-aware and -multi-threaded via :func:`~xdas.parallel.parallelize`. +Signal processing functions for :class:`DataArray`. + +Includes filtering, resampling, tapering, detrending, and spectral helpers, +all coordinate-aware and multi-threaded via :func:`~xdas.parallel.parallelize`. """ import numpy as np @@ -17,7 +18,7 @@ @atomized def detrend(da, type="linear", dim="last", parallel=None): """ - Detrend data along given dimension + Detrend data along given dimension. Parameters ---------- @@ -52,7 +53,7 @@ def detrend(da, type="linear", dim="last", parallel=None): @atomized def taper(da, window="hann", fftbins=False, dim="last", parallel=None): """ - Apply a tapering window along the given dimension + Apply a tapering window along the given dimension. Parameters ---------- @@ -633,7 +634,7 @@ def sosfilt(sos, da, dim="last", zi=None, parallel=None): @atomized def sosfiltfilt(sos, da, dim="last", padtype="odd", padlen=None, parallel=None): """ - A forward-backward digital filter using cascaded second-order sections. + Apply a forward-backward digital filter using cascaded second-order sections. Parameters ---------- @@ -944,7 +945,7 @@ def func(x): @atomized def medfilt(da, kernel_dim): # TODO: parallelize """ - Perform a median filter along given dimensions + Perform a median filter along given dimensions. Apply a median filter to the input using a local window-size given by kernel_size. The array will automatically be zero-padded. diff --git a/xdas/spectral.py b/xdas/spectral.py index 7ff0d948..2e92d990 100644 --- a/xdas/spectral.py +++ b/xdas/spectral.py @@ -1,6 +1,7 @@ """ -Spectral analysis functions for :class:`DataArray`: Short-Time Fourier -Transform (:func:`stft`) and related helpers. +Spectral analysis functions for :class:`DataArray`. + +Includes Short-Time Fourier Transform (:func:`stft`) and related helpers. """ import numpy as np diff --git a/xdas/synthetics.py b/xdas/synthetics.py index 9ebf50d9..ecf593b8 100644 --- a/xdas/synthetics.py +++ b/xdas/synthetics.py @@ -1,6 +1,7 @@ """ -Synthetic DAS data generators used in doctests and test fixtures: -:func:`wavelet_wavefronts` and :func:`randn_wavefronts`. +Synthetic DAS data generators used in doctests and test fixtures. + +Includes :func:`wavelet_wavefronts` and :func:`randn_wavefronts`. """ import numpy as np @@ -33,7 +34,6 @@ def wavelet_wavefronts( Examples -------- - >>> import os >>> import xdas as xd >>> from xdas.synthetics import wavelet_wavefronts diff --git a/xdas/trigger.py b/xdas/trigger.py index 5eb41b7d..c9e2465e 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -1,6 +1,8 @@ """ -Threshold-based triggering atom :class:`Trigger` for detecting phase -arrivals in :class:`DataArray` objects using an on/off mechanism. +Threshold-based triggering atom :class:`Trigger`. + +Detects phase arrivals in :class:`DataArray` objects using an on/off +mechanism. """ import numpy as np @@ -82,7 +84,7 @@ def __init__(self, thresh, dim="last"): def initialize(self, cft, **flags): """ - Initialize the trigger with the following states: + Initialize the trigger with the following states. - "axis": An integer indicating the axis number of the dimension along which to find picks. diff --git a/xdas/virtual.py b/xdas/virtual.py index b2bb56ee..c539c8bf 100644 --- a/xdas/virtual.py +++ b/xdas/virtual.py @@ -1,7 +1,9 @@ """ -Virtual (lazy) array types: :class:`VirtualArray` base, :class:`VirtualSource` -for a single HDF5/NetCDF4 dataset slice, and :class:`VirtualStack` for -concatenating multiple sources along an axis. +Virtual (lazy) array types for deferred HDF5/NetCDF4 access. + +Includes :class:`VirtualArray` base, :class:`VirtualSource` for a single +dataset slice, and :class:`VirtualStack` for concatenating sources along +an axis. """ import os From f4b315726c1be7bb92fa91c2e8d8aefd1d1aa1c1 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Mon, 25 May 2026 21:45:58 +0200 Subject: [PATCH 20/21] Update release notes. --- docs/release-notes.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index ace4abba..d7f8fd49 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -12,8 +12,9 @@ ### Refactoring - Achieved **100% test coverage** across the codebase (@atrabattoni). -- Reduced test suite execution time by ~50% (@atrabattoni). -- Migrated formatting tooling from `isort` + `black` to `ruff` (@atrabattoni). +- *Reduced* test suite execution time by **~50%** (@atrabattoni). +- Migrated formatting tooling from `isort` + `black` to `ruff` **including docstring checks** (@atrabattoni). +- Ensure all ruff checks pass (@atrabattoni). ## 0.2.6 From 95926de69f2266d80796b04d67962277f711aa83 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 28 May 2026 14:40:39 +0200 Subject: [PATCH 21/21] Fix nasty python 3.10 error. --- xdas/virtual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdas/virtual.py b/xdas/virtual.py index c539c8bf..a94d1033 100644 --- a/xdas/virtual.py +++ b/xdas/virtual.py @@ -468,7 +468,7 @@ def __getitem__(self, key): def __array__(self, dtype=None): with h5py.File(self.vsource.path) as file: dataset = file[self.vsource.name] - return dataset[self._sel.get_indexer()].__array__(dtype) + return np.asarray(dataset[self._sel.get_indexer()], dtype=dtype) # We used to create an temporary file: # return self._to_layout().__array__(dtype)