diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 59157de0f..dd6e82bbd 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -8,9 +8,9 @@ from pandas.api.types import is_string_dtype from .._warnings import ImplicitModificationWarning -from ..compat import XDataset, pandas_as_str +from ..compat import pandas_as_str +from ..types import DataFrameLike from ..utils import warn -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Iterable @@ -25,7 +25,7 @@ def _gen_dataframe( source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, -) -> pd.DataFrame: # pragma: no cover +) -> DataFrameLike: # pragma: no cover msg = f"Cannot convert {type(anno)} to {attr} DataFrame" raise ValueError(msg) @@ -69,9 +69,9 @@ def mk_index(l: int) -> pd.Index: return df -@_gen_dataframe.register(pd.DataFrame) +@_gen_dataframe.register(DataFrameLike) def _gen_dataframe_df( - anno: pd.DataFrame, + anno: DataFrameLike, index_names: Iterable[str], *, source: Literal["X", "shape"], @@ -129,27 +129,3 @@ def _mk_df_error( f"({actual} {what}s instead of {expected})" ) return ValueError(msg) - - -@_gen_dataframe.register(Dataset2D) -def _gen_dataframe_xr( - anno: Dataset2D, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return anno - - -@_gen_dataframe.register(XDataset) -def _gen_dataframe_xdataset( - anno: XDataset, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return Dataset2D(anno) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 3ac1c33d7..bfbc2a05b 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -10,7 +10,8 @@ import pandas as pd from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning 
-from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset +from ..compat import AwkArray, CSArray, CSMatrix, CupyArray +from ..types import DataFrameLike from ..utils import ( axis_len, convert_to_dict, @@ -24,7 +25,6 @@ from .index import _subset from .storage import coerce_array from .views import as_view, view_update -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Mapping @@ -36,8 +36,8 @@ OneDIdx = Sequence[int] | Sequence[bool] | slice TwoDIdx = tuple[OneDIdx, OneDIdx] -# TODO: pd.DataFrame only allowed in AxisArrays? -Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray +# DataFrameLike encompasses pd.DataFrame and Dataset2D +Value = DataFrameLike | CSMatrix | CSArray | np.ndarray class AlignedMappingBase[I: OneDIdx](MutableMapping[str, Value], ABC): @@ -74,8 +74,6 @@ def _validate_value(self, val: Value, key: str) -> Value: warn_once(msg, ExperimentalFeatureWarning) elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1: val = val.reshape((val.shape[0], 1)) - elif isinstance(val, XDataset): - val = Dataset2D(val) for i, axis in enumerate(self.axes): if self.parent.shape[axis] == axis_len(val, i): continue diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index f582c5dfe..8a9b8f1af 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -61,7 +61,7 @@ from zarr.storage import StoreLike - from ..compat import XDataset + from ..types import DataFrameLike from ..typing import Index1D, _Index1DNorm, _XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView from .index import Index @@ -772,7 +772,7 @@ def n_vars(self) -> int: """Number of variables/features.""" return len(self.var_names) - def _set_dim_df(self, value: pd.DataFrame | XDataset, attr: Literal["obs", "var"]): + def _set_dim_df(self, value: DataFrameLike, attr: Literal["obs", "var"]): value = _gen_dataframe( value, [f"{attr}_names", f"{'row' 
if attr == 'obs' else 'col'}_names"], @@ -836,12 +836,12 @@ def _set_dim_index(self, value: pd.Index, attr: str): v.index = value @property - def obs(self) -> pd.DataFrame | Dataset2D: + def obs(self) -> DataFrameLike: """One-dimensional annotation of observations (`pd.DataFrame`).""" return self._obs @obs.setter - def obs(self, value: pd.DataFrame | XDataset): + def obs(self, value: DataFrameLike): self._set_dim_df(value, "obs") @obs.deleter @@ -859,12 +859,12 @@ def obs_names(self, names: Sequence[str]): self._set_dim_index(names, "obs") @property - def var(self) -> pd.DataFrame | Dataset2D: + def var(self) -> DataFrameLike: """One-dimensional annotation of variables/ features (`pd.DataFrame`).""" return self._var @var.setter - def var(self, value: pd.DataFrame | XDataset): + def var(self, value: DataFrameLike): self._set_dim_df(value, "var") @var.deleter diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 398b08a4b..d1fb4cdd4 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -11,11 +11,11 @@ from scipy.sparse import issparse from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray -from .xarray import Dataset2D if TYPE_CHECKING: from numpy.typing import NDArray + from ..types import DataFrameLike from ..typing import Index, Index1D, _Index1DNorm @@ -36,7 +36,7 @@ def _normalize_index( # noqa: PLR0911, PLR0912 ) -> _Index1DNorm | int | np.integer: # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough. 
if not isinstance(index, pd.RangeIndex) and index.dtype in (np.float64, np.int64): - msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}" + msg = f"Don't call _normalize_index with non-categorical/string names and non-range index {index}" raise TypeError(msg) if isinstance(indexer, pd.Index | pd.Series): @@ -92,7 +92,7 @@ def name_idx(i): elif issubclass(indexer.dtype.type, np.bool_): if indexer.shape != index.shape: msg = ( - f"Boolean index does not match AnnData’s shape along this " + f"Boolean index does not match AnnData's shape along this " f"dimension. Boolean index has shape {indexer.shape} while " f"AnnData index has shape {index.shape}." ) @@ -171,9 +171,14 @@ def unpack_index(index: Index) -> tuple[Index1D, Index1D]: @singledispatch def _subset( - a: np.ndarray | pd.DataFrame, + a: np.ndarray | DataFrameLike, subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm], ): + # Check for DataFrameLike objects (pd.DataFrame, Dataset2D, etc.) 
+ from ..types import DataFrameLike + + if isinstance(a, DataFrameLike): + return a.iloc[subset_idx] # Select as combination of indexes, not coordinates # Correcting for indexing behaviour of np.ndarray if all(isinstance(x, Iterable) for x in subset_idx): @@ -207,15 +212,6 @@ def _subset_sparse( return a[subset_idx] -@_subset.register(pd.DataFrame) -@_subset.register(Dataset2D) -def _subset_df( - df: pd.DataFrame | Dataset2D, - subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm], -): - return df.iloc[subset_idx] - - @_subset.register(AwkArray) def _subset_awkarray( a: AwkArray, subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm] diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index d9d3f2049..4d477ef38 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -29,6 +29,7 @@ CupySparseMatrix, DaskArray, ) +from ..types import DataFrameLike from ..utils import asarray, axis_len, warn, warn_once from .anndata import AnnData from .index import _subset, make_slice @@ -557,7 +558,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911 """ if self.no_change and (axis_len(el, axis) == len(self.old_idx)): return el - if isinstance(el, pd.DataFrame | Dataset2D): + if isinstance(el, DataFrameLike): return self._apply_to_df_like(el, axis=axis, fill_value=fill_value) elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) @@ -570,7 +571,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911 else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) - def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=None): + def _apply_to_df_like(self, el: DataFrameLike, *, axis, fill_value=None): if fill_value is None: fill_value = np.nan return el.reindex(self.new_idx, axis=axis, fill_value=fill_value) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 
94dcb3868..ac87a6e91 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -3,13 +3,12 @@ from typing import TYPE_CHECKING import numpy as np -import pandas as pd from scipy import sparse from anndata.compat import CSArray, CSMatrix +from anndata.types import DataFrameLike from .._warnings import ImplicitModificationWarning -from ..compat import XDataset from ..utils import ( ensure_df_homogeneous, get_union_members, @@ -17,7 +16,6 @@ raise_value_error_if_multiindex_columns, warn, ) -from .xarray import Dataset2D if TYPE_CHECKING: from typing import Any @@ -38,9 +36,7 @@ def coerce_array( return value # If value is one of the allowed types, return it array_data_structure_types = get_union_members(_ArrayDataStructureTypes) - if isinstance(value, XDataset): - value = Dataset2D(value) - if isinstance(value, (*array_data_structure_types, Dataset2D)): + if isinstance(value, array_data_structure_types): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warn(msg, ImplicitModificationWarning) @@ -56,7 +52,7 @@ def coerce_array( if any(is_non_csc_r_array_or_matrix): msg = f"Only CSR and CSC {'matrices' if isinstance(value, sparse.spmatrix) else 'arrays'} are supported." 
raise ValueError(msg) - if isinstance(value, pd.DataFrame): + if isinstance(value, DataFrameLike): if allow_df: raise_value_error_if_multiindex_columns(value, name) return value if allow_df else ensure_df_homogeneous(value, name) diff --git a/src/anndata/_core/views.py b/src/anndata/_core/views.py index 95054139f..58e0e06a6 100644 --- a/src/anndata/_core/views.py +++ b/src/anndata/_core/views.py @@ -11,6 +11,7 @@ from scipy import sparse from anndata._warnings import ImplicitModificationWarning +from anndata.types import DataFrameLike from .._settings import settings from ..compat import ( @@ -23,7 +24,6 @@ ) from ..utils import warn from .access import ElementRef -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Callable, Iterable, KeysView, Sequence @@ -366,8 +366,8 @@ def as_view_cupy_csc(mtx, view_args): return CupySparseCSCView(mtx, view_args=view_args) -@as_view.register(Dataset2D) -def _(a: Dataset2D, view_args): +@as_view.register(DataFrameLike) +def _(a: DataFrameLike, view_args): return a diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 0e75d604a..0d74e1d05 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -1,10 +1,9 @@ from __future__ import annotations import warnings -from collections.abc import Hashable, Mapping from dataclasses import dataclass from functools import wraps -from typing import TYPE_CHECKING, Self, overload +from typing import TYPE_CHECKING, overload import numpy as np import pandas as pd @@ -14,7 +13,14 @@ from ..compat import XDataArray, XDataset, XVariable, pandas_as_str if TYPE_CHECKING: - from collections.abc import Callable, Collection, Iterable, Iterator + from collections.abc import ( + Callable, + Collection, + Hashable, + Iterable, + Iterator, + Mapping, + ) from typing import Any, Literal from .._types import Dataset2DIlocIndexer @@ -33,16 +39,14 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return wrapper -class 
Dataset2D(Mapping[Hashable, XDataArray | Self]): +class Dataset2D: r""" A wrapper class meant to enable working with lazy dataframe data according to :class:`~anndata.AnnData`'s internal API. This class ensures that "dataframe-invariants" are respected, namely that there is only one 1d dim and coord with the same name i.e., like a :class:`pandas.DataFrame`. - You should not have to initiate this class yourself. Setting an :class:`xarray.Dataset` - into a relevant part of the :class:`~anndata.AnnData` object will attempt to wrap that - object in this object, trying to enforce the "dataframe-invariants." + You will need to wrap an :class:`xarray.Dataset` in this class if you wish to assign it to :attr:`~anndata.AnnData.obs` or :attr:`~anndata.AnnData.var`. Because xarray requires :attr:`xarray.Dataset.coords` to be in-memory, this class provides handling for an out-of-memory index via :attr:`~anndata.experimental.backed.Dataset2D.true_index`. @@ -191,7 +195,7 @@ def shape(self) -> tuple[int, int]: ------- The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`. """ - return (self.ds.sizes[self.index_dim], len(self.ds)) + return (len(self), len(self.ds)) @property def iloc(self) -> Dataset2DIlocIndexer: @@ -363,7 +367,7 @@ def __iter__(self) -> Iterator[Hashable]: return iter(self.ds) def __len__(self) -> int: - return len(self.ds) + return self.ds.sizes[self.index_dim] @property def dtypes(self) -> Mapping[Hashable, np.dtype]: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 6006b31c3..e278e4041 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Literal, Protocol -from . import typing from .compat import H5Array, H5Group, ZarrArray, ZarrGroup from .utils import set_module @@ -16,6 +15,7 @@ from anndata._core.xarray import Dataset2D + from . 
import typing from ._io.specs.registry import ( IOSpec, LazyDataStructures, @@ -40,7 +40,6 @@ # These two are not public, so we don’t make them `type`s _ArrayStorageType: TypeAlias = ZarrArray | H5Array # noqa: UP040 _GroupStorageType: TypeAlias = ZarrGroup | H5Group # noqa: UP040 - type StorageType = _ArrayStorageType | _GroupStorageType diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 1271ac9b5..bb0480af8 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -12,6 +12,7 @@ Write, WriteCallback, ) +from ..types import DataFrameLike, DataFrameLikeIlocIndexer from ..utils import module_get_attr_redirect from ._dispatch_io import read_dispatched, write_dispatched from .backed import read_lazy @@ -53,6 +54,8 @@ def __getattr__(attr_name: str) -> Any: __all__ = [ "AnnCollection", "AnnLoader", + "DataFrameLike", + "DataFrameLikeIlocIndexer", "Dataset2DIlocIndexer", "IOSpec", "Read", diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 3ff344a16..8079d567f 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -337,9 +337,9 @@ def gen_adata( # noqa: PLR0913 if has_xr := find_spec("xarray"): if obs_xdataset: - obs = XDataset.from_dataframe(obs) + obs = Dataset2D(XDataset.from_dataframe(obs)) if var_xdataset: - var = XDataset.from_dataframe(var) + var = Dataset2D(XDataset.from_dataframe(var)) if X_type is None: X = None @@ -361,11 +361,11 @@ def gen_adata( # noqa: PLR0913 da=da.random.random((N, 50)), ) if has_xr: - obsm["xdataset"] = XDataset.from_dataframe( - gen_typed_df(M, obs_names, dtypes=obs_dtypes) + obsm["xdataset"] = Dataset2D( + XDataset.from_dataframe(gen_typed_df(M, obs_names, dtypes=obs_dtypes)) ) - varm["xdataset"] = XDataset.from_dataframe( - gen_typed_df(N, var_names, dtypes=var_dtypes) + varm["xdataset"] = Dataset2D( + XDataset.from_dataframe(gen_typed_df(N, var_names, dtypes=var_dtypes)) ) obsm = {k: v for k, v in 
obsm.items() if type(v) in obsm_types} obsm = maybe_add_sparse_array( diff --git a/src/anndata/types.py b/src/anndata/types.py index ed3a293bd..a2f6f4953 100644 --- a/src/anndata/types.py +++ b/src/anndata/types.py @@ -3,6 +3,10 @@ from typing import TYPE_CHECKING, Protocol, runtime_checkable if TYPE_CHECKING: + from typing import Any, Literal, Self + + from pandas import Index + from ._core.anndata import AnnData @@ -20,3 +24,78 @@ def __init__(self, adata: AnnData) -> None: """ Used to enforce the correct signature for extension namespaces. """ + + +@runtime_checkable +class DataFrameLikeIlocIndexer(Protocol): + """Protocol for iloc-style indexers on DataFrame-like objects. + + Only requires `__getitem__`. + """ + + def __getitem__(self, idx: Any) -> Any: ... + + +@runtime_checkable +class DataFrameLike(Protocol): + """Protocol for DataFrame-like objects usable in AnnData. + + See Also + -------- + :class:`~anndata.experimental.backed.Dataset2D` + An xarray-based implementation of this protocol. + """ + + def __len__(self) -> int: + """Number of rows in this object""" + ... + + @property + def index(self) -> Index: + """Row labels of the DataFrame-like object.""" + ... + + @property + def columns(self) -> Index: + """Column labels of the DataFrame-like object.""" + ... + + @columns.setter + def columns(self, v: Any) -> None: + """Setter for columns""" + ... + + @property + def shape(self) -> tuple[int, int]: + """Shape of the DataFrame-like object as (n_rows, n_columns).""" + ... + + @property + def iloc(self) -> DataFrameLikeIlocIndexer: + """Positional indexer for the DataFrame-like object.""" + ... + + def reindex( + self, + *, + index: Index | None = None, + axis: Literal[0, 1] | None = 0, + fill_value: Any = ..., + **kwargs, + ) -> Self: + """Reindex the DataFrame-like object to match a new index. + + Parameters + ---------- + index + New index to conform to. + axis + Axis to reindex along (only 0 is supported). 
+ fill_value + Value to use for missing values. + + Returns + ------- + Reindexed DataFrame-like object. + """ + ... diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 62ac5f90f..8110020c6 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -269,8 +269,8 @@ def test_concatenate_roundtrip( if backwards_compat and use_xdataset: import xarray as xr - result.var = xr.Dataset.from_dataframe( - result.var + result.var = Dataset2D( + xr.Dataset.from_dataframe(result.var) ) # backwards compat always returns a dataframe # Correcting for known differences