From 8143e073bec17fc0650ecbe4d2688409757d19d2 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Wed, 9 Oct 2024 09:32:52 +0200 Subject: [PATCH 01/51] WIP --- src/anndata/_core/aligned_mapping.py | 72 +++++++++++++++------------- src/anndata/_core/anndata.py | 58 +++++----------------- 2 files changed, 51 insertions(+), 79 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index e2f6e4352..470f3fbac 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -38,12 +38,13 @@ # TODO: pd.DataFrame only allowed in AxisArrays? Value = Union[pd.DataFrame, spmatrix, np.ndarray] +K = TypeVar("K", str, str | None) P = TypeVar("P", bound="AlignedMappingBase") """Parent mapping an AlignedView is based on.""" I = TypeVar("I", OneDIdx, TwoDIdx) -class AlignedMappingBase(MutableMapping[str, Value], ABC): +class AlignedMappingBase(MutableMapping[K, Value], ABC, Generic[K]): """\ An abstract base class for Mappings containing array-like values aligned to either one or both AnnData axes. @@ -61,13 +62,14 @@ class AlignedMappingBase(MutableMapping[str, Value], ABC): _parent: AnnData | Raw """The parent object that this mapping is aligned to.""" - def __repr__(self): - return f"{type(self).__name__} with keys: {', '.join(self.keys())}" + def __repr__(self) -> str: + str_keys = (k for k in self.keys() if k is not None) + return f"{type(self).__name__} with keys: {', '.join(str_keys)}" - def _ipython_key_completions_(self) -> list[str]: + def _ipython_key_completions_(self) -> list[K]: return list(self.keys()) - def _validate_value(self, val: Value, key: str) -> Value: + def _validate_value(self, val: Value, key: K) -> Value: """Raises an error if value is invalid""" if isinstance(val, AwkArray): warn_once( @@ -117,13 +119,13 @@ def is_view(self) -> bool: ... 
def parent(self) -> AnnData | Raw: return self._parent - def copy(self) -> dict[str, Value]: + def copy(self) -> dict[K, Value]: # Shallow copy for awkward array since their buffers are immutable return { k: copy(v) if isinstance(v, AwkArray) else v.copy() for k, v in self.items() } - def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[Self, I]: + def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[K, Self, I]: """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) @@ -132,7 +134,7 @@ def as_dict(self) -> dict: return dict(self) -class AlignedView(AlignedMappingBase, Generic[P, I]): +class AlignedView(AlignedMappingBase[K], Generic[K, P, I]): is_view: ClassVar[Literal[True]] = True # override docstring @@ -156,13 +158,13 @@ def __init__(self, parent_mapping: P, parent_view: AnnData, subset_idx: I): # LayersBase has no _axis, the rest does self._axis = parent_mapping._axis # type: ignore - def __getitem__(self, key: str) -> Value: + def __getitem__(self, key: K) -> Value: return as_view( _subset(self.parent_mapping[key], self.subset_idx), ElementRef(self.parent, self.attrname, (key,)), ) - def __setitem__(self, key: str, value: Value) -> None: + def __setitem__(self, key: K, value: Value) -> None: value = self._validate_value(value, key) # Validate before mutating warnings.warn( f"Setting element `.{self.attrname}['{key}']` of view, " @@ -173,7 +175,7 @@ def __setitem__(self, key: str, value: Value) -> None: with view_update(self.parent, self.attrname, ()) as new_mapping: new_mapping[key] = value - def __delitem__(self, key: str) -> None: + def __delitem__(self, key: K) -> None: if key not in self: raise KeyError( "'{key!r}' not found in view of {self.attrname}" @@ -187,49 +189,49 @@ def __delitem__(self, key: str) -> None: with view_update(self.parent, self.attrname, ()) as new_mapping: del new_mapping[key] - def __contains__(self, key: str) -> bool: + def __contains__(self, key: K) -> bool: 
return key in self.parent_mapping - def __iter__(self) -> Iterator[str]: + def __iter__(self) -> Iterator[K]: return iter(self.parent_mapping) def __len__(self) -> int: return len(self.parent_mapping) -class AlignedActual(AlignedMappingBase): +class AlignedActual(AlignedMappingBase[K], Generic[K]): is_view: ClassVar[Literal[False]] = False - _data: MutableMapping[str, Value] + _data: MutableMapping[K, Value] """Underlying mapping to the data""" - def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): + def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[K, Value]): self._parent = parent self._data = store for k, v in self._data.items(): self._data[k] = self._validate_value(v, k) - def __getitem__(self, key: str) -> Value: + def __getitem__(self, key: K) -> Value: return self._data[key] - def __setitem__(self, key: str, value: Value): + def __setitem__(self, key: K, value: Value): value = self._validate_value(value, key) self._data[key] = value - def __contains__(self, key: str) -> bool: + def __contains__(self, key: K) -> bool: return key in self._data - def __delitem__(self, key: str): + def __delitem__(self, key: K): del self._data[key] - def __iter__(self) -> Iterator[str]: + def __iter__(self) -> Iterator[K]: return iter(self._data) def __len__(self) -> int: return len(self._data) -class AxisArraysBase(AlignedMappingBase): +class AxisArraysBase(AlignedMappingBase[str]): """\ Mapping of key→array-like, where array-like is aligned to an axis of parent AnnData. 
@@ -283,7 +285,7 @@ def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis] -class AxisArrays(AlignedActual, AxisArraysBase): +class AxisArrays(AlignedActual[str], AxisArraysBase): def __init__( self, parent: AnnData | Raw, @@ -297,7 +299,7 @@ def __init__( super().__init__(parent, store=store) -class AxisArraysView(AlignedView[AxisArraysBase, OneDIdx], AxisArraysBase): +class AxisArraysView(AlignedView[str, AxisArraysBase, OneDIdx], AxisArraysBase): pass @@ -305,7 +307,7 @@ class AxisArraysView(AlignedView[AxisArraysBase, OneDIdx], AxisArraysBase): AxisArraysBase._actual_class = AxisArrays -class LayersBase(AlignedMappingBase): +class LayersBase(AlignedMappingBase[str | None]): """\ Mapping of key: array-like, where array-like is aligned to both axes of the parent anndata. @@ -316,11 +318,11 @@ class LayersBase(AlignedMappingBase): axes: ClassVar[tuple[Literal[0], Literal[1]]] = (0, 1) -class Layers(AlignedActual, LayersBase): +class Layers(AlignedActual[str | None], LayersBase): pass -class LayersView(AlignedView[LayersBase, TwoDIdx], LayersBase): +class LayersView(AlignedView[str | None, LayersBase, TwoDIdx], LayersBase): pass @@ -328,7 +330,7 @@ class LayersView(AlignedView[LayersBase, TwoDIdx], LayersBase): LayersBase._actual_class = Layers -class PairwiseArraysBase(AlignedMappingBase): +class PairwiseArraysBase(AlignedMappingBase[str]): """\ Mapping of key: array-like, where both axes of array-like are aligned to one axis of the parent anndata. 
@@ -354,7 +356,7 @@ def dim(self) -> str: return self._dimnames[self._axis] -class PairwiseArrays(AlignedActual, PairwiseArraysBase): +class PairwiseArrays(AlignedActual[str], PairwiseArraysBase): def __init__( self, parent: AnnData, @@ -368,7 +370,9 @@ def __init__( super().__init__(parent, store=store) -class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArraysBase): +class PairwiseArraysView( + AlignedView[str, PairwiseArraysBase, OneDIdx], PairwiseArraysBase +): pass @@ -384,7 +388,7 @@ class PairwiseArraysView(AlignedView[PairwiseArraysBase, OneDIdx], PairwiseArray @dataclass -class AlignedMappingProperty(property, Generic[T]): +class AlignedMappingProperty(property, Generic[K, T]): """A :class:`property` that creates an ephemeral AlignedMapping. The actual data is stored as `f'_{self.name}'` in the parent object. @@ -397,7 +401,7 @@ class AlignedMappingProperty(property, Generic[T]): axis: Literal[0, 1] | None = None """Axis of the parent to align to.""" - def construct(self, obj: AnnData, *, store: MutableMapping[str, Value]) -> T: + def construct(self, obj: AnnData, *, store: MutableMapping[K, Value]) -> T: if self.axis is None: return self.cls(obj, store=store) return self.cls(obj, axis=self.axis, store=store) @@ -426,7 +430,7 @@ def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T: return parent._view(obj, tuple(idxs[ax] for ax in parent.axes)) def __set__( - self, obj: AnnData, value: Mapping[str, Value] | Iterable[tuple[str, Value]] + self, obj: AnnData, value: Mapping[K, Value] | Iterable[tuple[K, Value]] | None ) -> None: value = convert_to_dict(value) _ = self.construct(obj, store=value) # Validate @@ -434,5 +438,5 @@ def __set__( obj._init_as_actual(obj.copy()) setattr(obj, f"_{self.name}", value) - def __delete__(self, obj) -> None: + def __delete__(self, obj: AnnData) -> None: setattr(obj, self.name, dict()) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 
45382c588..ea11b5ea8 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -17,14 +17,13 @@ import numpy as np import pandas as pd from natsort import natsorted -from numpy import ma from pandas.api.types import infer_dtype from scipy import sparse from scipy.sparse import issparse from .. import utils from .._settings import settings -from ..compat import DaskArray, SpArray, ZarrArray, _move_adj_mtx +from ..compat import SpArray, _move_adj_mtx from ..logging import anndata_logger as logger from ..utils import ( axis_len, @@ -73,18 +72,6 @@ def _gen_keys_from_multicol_key(key_multicol, n_keys): return keys -def _check_2d_shape(X): - """\ - Check shape of array or sparse matrix. - - Assure that X is always 2D: Unlike numpy we always deal with 2D arrays. - """ - if X.dtype.names is None and len(X.shape) != 2: - raise ValueError( - f"X needs to be 2-dimensional, not {len(X.shape)}-dimensional." - ) - - class AnnData(metaclass=utils.DeprecationMixinMeta): """\ An annotated data matrix. @@ -228,7 +215,6 @@ def __init__( varm: np.ndarray | Mapping[str, Sequence[Any]] | None = None, layers: Mapping[str, np.ndarray | sparse.spmatrix] | None = None, raw: Mapping[str, Any] | None = None, - dtype: np.dtype | type | str | None = None, shape: tuple[int, int] | None = None, filename: PathLike | None = None, filemode: Literal["r", "r+"] | None = None, @@ -257,7 +243,6 @@ def __init__( varm=varm, raw=raw, layers=layers, - dtype=dtype, shape=shape, obsp=obsp, varp=varp, @@ -329,7 +314,6 @@ def _init_as_actual( obsp=None, raw=None, layers=None, - dtype=None, shape=None, filename=None, filemode=None, @@ -359,8 +343,7 @@ def _init_as_actual( raise ValueError( "If `X` is a dict no further arguments must be provided." 
) - X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( - X._X, + obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( X.obs, X.var, X.uns, @@ -371,6 +354,7 @@ def _init_as_actual( X.layers, X.raw, ) + X = X.layers.get(None) # init from DataFrame elif isinstance(X, pd.DataFrame): @@ -394,22 +378,6 @@ def _init_as_actual( X = coerce_array(X, name="X") if shape is not None: raise ValueError("`shape` needs to be `None` if `X` is not `None`.") - _check_2d_shape(X) - # if type doesn’t match, a copy is made, otherwise, use a view - if dtype is not None: - warnings.warn( - "The dtype argument is deprecated and will be removed in late 2024.", - FutureWarning, - ) - if issparse(X) or isinstance(X, ma.MaskedArray): - # TODO: maybe use view on data attribute of sparse matrix - # as in readwrite.read_10x_h5 - if X.dtype != np.dtype(dtype): - X = X.astype(dtype) - elif isinstance(X, (ZarrArray, DaskArray)): - X = X.astype(dtype) - else: # is np.ndarray or a subclass, convert to true np.ndarray - X = np.asarray(X, dtype) # data matrix and shape self._X = X n_obs, n_vars = X.shape @@ -656,11 +624,11 @@ def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None): ) @X.deleter - def X(self): - self.X = None + def X(self) -> None: + del self.layers[None] - layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty( - "layers", Layers + layers: AlignedMappingProperty[str | None, Layers | LayersView] = ( + AlignedMappingProperty("layers", Layers) ) """\ Dictionary-like object with values of the same dimensions as :attr:`X`. 
@@ -870,8 +838,8 @@ def uns(self, value: MutableMapping): def uns(self): self.uns = OrderedDict() - obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - "obsm", AxisArrays, 0 + obsm: AlignedMappingProperty[str, AxisArrays | AxisArraysView] = ( + AlignedMappingProperty("obsm", AxisArrays, 0) ) """\ Multi-dimensional annotation of observations @@ -882,8 +850,8 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ - varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - "varm", AxisArrays, 1 + varm: AlignedMappingProperty[str, AxisArrays | AxisArraysView] = ( + AlignedMappingProperty("varm", AxisArrays, 1) ) """\ Multi-dimensional annotation of variables/features @@ -894,7 +862,7 @@ def uns(self): Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. """ - obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + obsp: AlignedMappingProperty[str, PairwiseArrays | PairwiseArraysView] = ( AlignedMappingProperty("obsp", PairwiseArrays, 0) ) """\ @@ -906,7 +874,7 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. 
""" - varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + varp: AlignedMappingProperty[str, PairwiseArrays | PairwiseArraysView] = ( AlignedMappingProperty("varp", PairwiseArrays, 1) ) """\ From 2bfdd7a0bb4f8d2cc833ae6e90540ed76f1f7a4e Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Thu, 10 Oct 2024 16:20:26 +0200 Subject: [PATCH 02/51] make all test modules import --- src/anndata/_core/aligned_mapping.py | 5 +- src/anndata/_core/anndata.py | 129 ++------------------------- src/anndata/_core/raw.py | 4 +- 3 files changed, 15 insertions(+), 123 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 470f3fbac..2a78f3f8b 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -209,13 +209,16 @@ def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[K, Value]): self._parent = parent self._data = store for k, v in self._data.items(): + if v is None: + continue self._data[k] = self._validate_value(v, k) def __getitem__(self, key: K) -> Value: return self._data[key] def __setitem__(self, key: K, value: Value): - value = self._validate_value(value, key) + if value is not None: + value = self._validate_value(value, key) self._data[key] = value def __contains__(self, key: K) -> bool: diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index ea11b5ea8..4b32179ee 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -31,19 +31,17 @@ ensure_df_homogeneous, raise_value_error_if_multiindex_columns, ) -from .access import ElementRef from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays from .file_backing import AnnDataFileManager, to_memory from .index import _normalize_indices, _subset, get_vector from .raw import Raw -from .sparse_dataset import BaseCompressedSparseDataset, sparse_dataset +from .sparse_dataset import 
BaseCompressedSparseDataset from .storage import coerce_array from .views import ( DataFrameView, DictView, _resolve_idxs, - as_view, ) if TYPE_CHECKING: @@ -290,10 +288,6 @@ def _init_as_view(self, adata_ref: AnnData, oidx: Index, vidx: Index): self._var = DataFrameView(var_sub, view_args=(self, "var")) self._uns = uns - # set data - if self.isbacked: - self._X = None - # set raw, easy, as it’s immutable anyways... if adata_ref._raw is not None: # slicing along variables axis is ignored @@ -379,11 +373,9 @@ def _init_as_actual( if shape is not None: raise ValueError("`shape` needs to be `None` if `X` is not `None`.") # data matrix and shape - self._X = X n_obs, n_vars = X.shape source = "X" else: - self._X = None n_obs, n_vars = (None, None) if shape is None else shape source = "shape" @@ -436,6 +428,7 @@ def _init_as_actual( # layers self.layers = layers + self.X = X def __sizeof__(self, show_stratified=None, with_disk: bool = False) -> int: def get_size(X) -> int: @@ -511,117 +504,11 @@ def shape(self) -> tuple[int, int]: @property def X(self) -> ArrayDataStructureType | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" - if self.isbacked: - if not self.file.is_open: - self.file.open() - X = self.file["X"] - if isinstance(X, h5py.Group): - X = sparse_dataset(X) - # This is so that we can index into a backed dense dataset with - # indices that aren’t strictly increasing - if self.is_view: - X = _subset(X, (self._oidx, self._vidx)) - elif self.is_view and self._adata_ref.X is None: - X = None - elif self.is_view: - X = as_view( - _subset(self._adata_ref.X, (self._oidx, self._vidx)), - ElementRef(self, "X"), - ) - else: - X = self._X - return X - # if self.n_obs == 1 and self.n_vars == 1: - # return X[0, 0] - # elif self.n_obs == 1 or self.n_vars == 1: - # if issparse(X): X = X.toarray() - # return X.flatten() - # else: - # return X + return self.layers.get(None) @X.setter def X(self, value: np.ndarray | sparse.spmatrix | SpArray | None): - if value 
is None: - if self.isbacked: - raise NotImplementedError( - "Cannot currently remove data matrix from backed object." - ) - if self.is_view: - self._init_as_actual(self.copy()) - self._X = None - return - value = coerce_array(value, name="X", allow_array_like=True) - - # If indices are both arrays, we need to modify them - # so we don’t set values like coordinates - # This can occur if there are successive views - if ( - self.is_view - and isinstance(self._oidx, np.ndarray) - and isinstance(self._vidx, np.ndarray) - ): - oidx, vidx = np.ix_(self._oidx, self._vidx) - else: - oidx, vidx = self._oidx, self._vidx - if ( - np.isscalar(value) - or (hasattr(value, "shape") and (self.shape == value.shape)) - or (self.n_vars == 1 and self.n_obs == len(value)) - or (self.n_obs == 1 and self.n_vars == len(value)) - ): - if not np.isscalar(value): - if self.is_view and any( - isinstance(idx, np.ndarray) - and len(np.unique(idx)) != len(idx.ravel()) - for idx in [oidx, vidx] - ): - msg = ( - "You are attempting to set `X` to a matrix on a view which has non-unique indices. " - "The resulting `adata.X` will likely not equal the value to which you set it. " - "To avoid this potential issue, please make a copy of the data first. " - "In the future, this operation will throw an error." 
- ) - warnings.warn(msg, FutureWarning, stacklevel=1) - if self.shape != value.shape: - # For assigning vector of values to 2d array or matrix - # Not necessary for row of 2d array - value = value.reshape(self.shape) - if self.isbacked: - if self.is_view: - X = self.file["X"] - if isinstance(X, h5py.Group): - X = sparse_dataset(X) - X[oidx, vidx] = value - else: - self._set_backed("X", value) - else: - if self.is_view: - if sparse.issparse(self._adata_ref._X) and isinstance( - value, np.ndarray - ): - if isinstance(self._adata_ref.X, SpArray): - memory_class = sparse.coo_array - else: - memory_class = sparse.coo_matrix - value = memory_class(value) - elif sparse.issparse(value) and isinstance( - self._adata_ref._X, np.ndarray - ): - warnings.warn( - "Trying to set a dense array with a sparse array on a view." - "Densifying the sparse array." - "This may incur excessive memory usage", - stacklevel=2, - ) - value = value.toarray() - self._adata_ref._X[oidx, vidx] = value - else: - self._X = value - else: - raise ValueError( - f"Data matrix has wrong shape {value.shape}, " - f"need to be {self.shape}." - ) + self.layers[None] = value @X.deleter def X(self) -> None: @@ -960,8 +847,8 @@ def filename(self, filename: PathLike | None): self.write(filename, as_dense=as_dense) # open new file for accessing self.file.open(filename, "r+") - # as the data is stored on disk, we can safely set self._X to None - self._X = None + # as the data is stored on disk, we can safely set self.X to None + del self.X def _set_backed(self, attr, value): from .._io.utils import write_attribute @@ -976,7 +863,7 @@ def __delitem__(self, index: Index): obs, var = self._normalize_indices(index) # TODO: does this really work? 
if not self.isbacked: - del self._X[obs, var] + del self.X[obs, var] else: X = self.file["X"] del X[obs, var] @@ -1145,7 +1032,7 @@ def __setitem__( raise ValueError("Object is view and cannot be accessed with `[]`.") obs, var = self._normalize_indices(index) if not self.isbacked: - self._X[obs, var] = val + self.X[obs, var] = val else: X = self.file["X"] X[obs, var] = val diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 7237c06b4..b58d40edf 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -51,7 +51,9 @@ def __init__( self.varm = varm elif X is None: # construct from adata # Move from GPU to CPU since it's large and not always used - if isinstance(adata.X, (CupyArray, CupySparseMatrix)): + if adata.X is None: + self._X = None + elif isinstance(adata.X, (CupyArray, CupySparseMatrix)): self._X = adata.X.get() else: self._X = adata.X.copy() From 2d136180ffc3f014ac9b37f0c98b5592295a8a32 Mon Sep 17 00:00:00 2001 From: Phil Schaf Date: Thu, 10 Oct 2024 17:06:17 +0200 Subject: [PATCH 03/51] some misc fixes --- src/anndata/_core/aligned_mapping.py | 3 +-- src/anndata/_io/h5ad.py | 7 ++++++- src/anndata/tests/helpers.py | 11 +++++++++-- tests/test_base.py | 10 ++++++---- tests/test_layers.py | 2 +- 5 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 2a78f3f8b..4f99152e2 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -63,8 +63,7 @@ class AlignedMappingBase(MutableMapping[K, Value], ABC, Generic[K]): """The parent object that this mapping is aligned to.""" def __repr__(self) -> str: - str_keys = (k for k in self.keys() if k is not None) - return f"{type(self).__name__} with keys: {', '.join(str_keys)}" + return f"{type(self).__name__} with keys: {', '.join(map(repr, self.keys()))}" def _ipython_key_completions_(self) -> list[K]: return list(self.keys()) diff --git a/src/anndata/_io/h5ad.py 
b/src/anndata/_io/h5ad.py index 36429403d..6d60fdf9b 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -106,7 +106,12 @@ def write_h5ad( write_elem(f, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs) write_elem(f, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) write_elem(f, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) - write_elem(f, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs) + write_elem( + f, + "layers", + {k: v for k, v in adata.layers.items() if k is not None}, + dataset_kwargs=dataset_kwargs, + ) write_elem(f, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index d149e5108..a9fd3d214 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -524,7 +524,7 @@ def subset_func(request): ################### -def format_msg(elem_name): +def format_msg(elem_name: str | None) -> str: if elem_name is not None: return f"Error raised from element {elem_name!r}." 
else: @@ -562,6 +562,11 @@ def _assert_equal(a, b): @singledispatch def assert_equal(a, b, exact=False, elem_name=None): + a_handler, b_handler, default_handler = map( + assert_equal.dispatch, (type(a), type(b), object) + ) + if (a_handler is default_handler) and (b_handler is not default_handler): + return assert_equal(b, a, exact=exact, elem_name=elem_name) _assert_equal(a, b, _elem_name=elem_name) @@ -654,7 +659,9 @@ def assert_equal_awkarray(a, b, exact=False, elem_name=None): @assert_equal.register(Mapping) def assert_equal_mapping(a, b, exact=False, elem_name=None): - assert set(a.keys()) == set(b.keys()), format_msg(elem_name) + assert set(a.keys()) == set(b.keys()), ( + format_msg(elem_name) + f" {a.keys()} != {b.keys()}" + ) for k in a.keys(): if elem_name is None: elem_name = "" diff --git a/tests/test_base.py b/tests/test_base.py index e1401ed74..28160b209 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -207,13 +207,15 @@ def test_convert_matrix(attr, when): assert not isinstance(arr, np.matrix), f"{arr} is still a matrix" -def test_attr_deletion(): +@pytest.mark.parametrize( + "attr", ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] +) +def test_attr_deletion(attr: str): full = gen_adata((30, 30)) # Empty has just X, obs_names, var_names empty = AnnData(None, obs=full.obs[[]], var=full.var[[]]) - for attr in ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"]: - delattr(full, attr) - assert_equal(getattr(full, attr), getattr(empty, attr)) + delattr(full, attr) + assert_equal(getattr(full, attr), getattr(empty, attr)) assert_equal(full, empty, exact=True) diff --git a/tests/test_layers.py b/tests/test_layers.py index ba1f96e49..4b95419a0 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -19,7 +19,7 @@ def test_creation(): adata = AnnData(X=X, layers=dict(L=L.copy())) - assert list(adata.layers.keys()) == ["L"] + assert adata.layers.keys() == {"L", None} assert "L" in adata.layers assert "X" 
not in adata.layers assert "some_other_thing" not in adata.layers From f12f4d7cb152afd08eee370e4bf90cc4071b371c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 25 Oct 2024 17:15:02 +0200 Subject: [PATCH 04/51] small fixes --- src/anndata/_core/aligned_mapping.py | 8 ++++++-- src/anndata/_core/anndata.py | 2 +- src/anndata/_core/raw.py | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 45cd221b1..30e496e30 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -172,7 +172,10 @@ def __setitem__(self, key: K, value: Value) -> None: stacklevel=2, ) with view_update(self.parent, self.attrname, ()) as new_mapping: - new_mapping[key] = value + if value is None: + del new_mapping[key] + else: + new_mapping[key] = value def __delitem__(self, key: K) -> None: if key not in self: @@ -444,4 +447,5 @@ def __set__( setattr(obj, f"_{self.name}", value) def __delete__(self, obj: AnnData) -> None: - setattr(obj, self.name, dict()) + new = {None: x} if (x := getattr(obj, self.name).get(None)) is not None else {} + setattr(obj, self.name, new) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 2eb4e50c8..67431ba53 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -421,7 +421,7 @@ def _init_as_actual( elif isinstance(raw, Mapping): self._raw = Raw(self, **raw) else: # is a Raw from another AnnData - self._raw = Raw(self, raw._X, raw.var, raw.varm) + self._raw = Raw(self, raw.X, raw.var, raw.varm) # clean up old formats self._clean_up_old_format(uns) diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 883a0cc3c..1804b3e0b 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -164,7 +164,7 @@ def to_adata(self) -> AnnData: from anndata import AnnData return AnnData( - X=self.X.copy(), + X=None if self.X is None else self.X.copy(), 
var=self.var.copy(), varm=None if self._varm is None else self._varm.copy(), obs=self._adata.obs.copy(), From 95e8f1d78a4517a505fca94747abde96151b1e4c Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Fri, 25 Oct 2024 17:18:16 +0200 Subject: [PATCH 05/51] fix write_anndata --- src/anndata/_io/specs/methods.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 8e02b9283..cae2b3f94 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -280,7 +280,12 @@ def write_anndata( _writer.write_elem(g, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "layers", dict(adata.layers), dataset_kwargs=dataset_kwargs) + _writer.write_elem( + g, + "layers", + {k: v for k, v in adata.layers.items() if k is not None}, + dataset_kwargs=dataset_kwargs, + ) _writer.write_elem(g, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) _writer.write_elem(g, "raw", adata.raw, dataset_kwargs=dataset_kwargs) From a6555f86c96d6e36743d8495f042345b7a7c5a38 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Fri, 25 Oct 2024 17:37:32 +0200 Subject: [PATCH 06/51] some more fixes --- src/anndata/_core/aligned_mapping.py | 6 +++++- src/anndata/_core/anndata.py | 4 ---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 30e496e30..a2fda845f 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -5,6 +5,7 @@ from collections.abc import MutableMapping, Sequence from copy import copy from dataclasses import dataclass +from types import NoneType from typing import TYPE_CHECKING, Generic, TypeVar import numpy as np @@ -121,7 +122,8 @@ def parent(self) -> AnnData | Raw: def copy(self) -> dict[K, Value]: # Shallow copy for awkward array since their buffers are immutable return { - k: copy(v) if isinstance(v, AwkArray) else v.copy() for k, v in self.items() + k: copy(v) if isinstance(v, AwkArray | NoneType) else v.copy() + for k, v in self.items() } def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[K, Self, I]: @@ -158,6 +160,8 @@ def __init__(self, parent_mapping: P, parent_view: AnnData, subset_idx: I): self._axis = parent_mapping._axis # type: ignore def __getitem__(self, key: K) -> Value: + if self.parent_mapping[key] is None: + return None return as_view( _subset(self.parent_mapping[key], self.subset_idx), ElementRef(self.parent, self.attrname, (key,)), diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 67431ba53..94c434a23 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1224,10 +1224,6 @@ def _mutated_copy(self, **kwargs): new[key] = kwargs[key] else: new[key] = getattr(self, key).copy() - if "X" in kwargs: - new["X"] = kwargs["X"] - elif self._has_X(): - new["X"] = self.X.copy() if "uns" in kwargs: new["uns"] = kwargs["uns"] else: From 77024ba7771f5c8a780ff0df5453b8c248978e6d Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 15 Jul 2025 11:59:08 +0200 Subject: [PATCH 07/51] make setting X to None equivalent to deleting it --- src/anndata/_core/aligned_mapping.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index e832299f3..39bc7aa90 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -227,13 +227,19 @@ def __getitem__(self, key: K) -> Value: def __setitem__(self, key: K, value: Value): if value is not None: value = self._validate_value(value, key) - self._data[key] = value + if key is None and value is None: + del self[key] + else: + self._data[key] = value def __contains__(self, key: K) -> bool: return key in self._data def __delitem__(self, key: K): - del self._data[key] + if key is None: + self._data.pop(key, None) + else: + del self._data[key] def __iter__(self) -> Iterator[K]: return iter(self._data) From 208b8bbe7e439832a1a476809a8583f2fddc52c4 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 15 Jul 2025 12:14:34 +0200 Subject: [PATCH 08/51] handle X from layers --- src/anndata/_core/anndata.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index c4d9607ab..290743982 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -351,6 +351,14 @@ def _init_as_actual( # noqa: PLR0912, PLR0913, PLR0915 ) X = X.layers.get(None) + if layers is not None and None in layers: + if X is not None and X is not layers[None]: + msg = ( + "If you provide `layers[None]` and `X`, they must be identical." + ) + raise ValueError(msg) + X = layers[None] + # init from DataFrame elif isinstance(X, pd.DataFrame): # to verify index matching, we wait until obs and var are DataFrames From fca1a8772f772920a371766cf68dbac157f3785d Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 15 Jul 2025 13:49:29 +0200 Subject: [PATCH 09/51] simplify copy --- src/anndata/_core/anndata.py | 36 +++++++----------------------------- 1 file changed, 7 insertions(+), 29 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 290743982..2de706091 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -34,7 +34,7 @@ from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays from .file_backing import AnnDataFileManager, to_memory -from .index import _normalize_indices, _subset, get_vector +from .index import _normalize_indices, get_vector from .raw import Raw from .sparse_dataset import BaseCompressedSparseDataset from .storage import coerce_array @@ -1236,30 +1236,18 @@ def _get_var_array(self, k, use_raw=False, layer=None): # noqa: FBT002 else: return self.raw.var_vector(k) - def _mutated_copy(self, **kwargs): - """Creating AnnData with attributes optionally specified via kwargs.""" - if self.isbacked and ( - "X" not in kwargs or (self.raw is not None and "raw" not in kwargs) - ): + def _copy(self) -> AnnData: + if self.isbacked and self.raw is not None: msg = ( "This function does not currently handle backed objects " "internally, this should be dealt with before." 
) raise NotImplementedError(msg) new = {} - for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: - if key in kwargs: - new[key] = kwargs[key] - else: - new[key] = getattr(self, key).copy() - if "uns" in kwargs: - new["uns"] = kwargs["uns"] - else: - new["uns"] = deepcopy(self._uns) - if "raw" in kwargs: - new["raw"] = kwargs["raw"] - elif self.raw is not None: + new[key] = getattr(self, key).copy() + new["uns"] = deepcopy(self._uns) + if self.raw is not None: new["raw"] = self.raw.copy() return AnnData(**new) @@ -1283,7 +1271,6 @@ def to_memory(self, *, copy: bool = False) -> AnnData: """ new = {} for attr_name in [ - "X", "obs", "var", "obsm", @@ -1312,16 +1299,7 @@ def to_memory(self, *, copy: bool = False) -> AnnData: def copy(self, filename: PathLike[str] | str | None = None) -> AnnData: """Full copy, optionally on disk.""" if not self.isbacked: - if self.is_view and self._has_X(): - # TODO: How do I unambiguously check if this is a copy? - # Subsetting this way means we don’t have to have a view type - # defined for the matrix, which is needed for some of the - # current distributed backend. Specifically Dask. - return self._mutated_copy( - X=_subset(self._adata_ref.X, (self._oidx, self._vidx)).copy() - ) - else: - return self._mutated_copy() + return self._copy() else: from ..io import read_h5ad, write_h5ad From 5ac31c49fa90da271c38a65944efb72eb84cd5c0 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 15 Jul 2025 14:56:23 +0200 Subject: [PATCH 10/51] delegate transpose to layers --- docs/concatenation.rst | 7 +++++++ src/anndata/_core/anndata.py | 2 -- src/anndata/_core/merge.py | 27 --------------------------- 3 files changed, 7 insertions(+), 29 deletions(-) diff --git a/docs/concatenation.rst b/docs/concatenation.rst index ce6547b66..ed71944a2 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -29,6 +29,7 @@ Let's start off with an example: uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups' obsm: 'X_pca', 'X_umap' varm: 'PCs' + layers: None obsp: 'distances', 'connectivities' If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. @@ -42,6 +43,7 @@ If we split this object up by clusters of observations, then stack those subsets var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable' obsm: 'X_pca', 'X_umap' varm: 'PCs' + layers: None Note that we concatenated along the observations by default, and that most elements aligned to the observations were concatenated as well. A notable exception is :attr:`~anndata.AnnData.obsp`, which can be re-enabled with the `pairwise` keyword argument. @@ -168,6 +170,7 @@ First, our example case: uns: 'pca' obsm: 'X_pca' varm: 'PCs' + layers: None Now we will split this object by the categorical `"blobs"` and recombine it to illustrate different merge strategies. @@ -184,6 +187,7 @@ Now we will split this object by the categorical `"blobs"` and recombine it to i uns: 'pca' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc' + layers: None `adatas` is now a list of datasets with disjoint sets of observations and a common set of variables. Each object has had QC metrics computed, with observation-wise metrics stored under `"qc"` in `.obsm`, and variable-wise metrics stored with a unique key for each subset. 
@@ -193,16 +197,19 @@ Taking a look at how this affects concatenation: AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' + layers: None >>> ad.concat(adatas, merge="same") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs' + layers: None >>> ad.concat(adatas, merge="unique") AnnData object with n_obs × n_vars = 640 × 30 obs: 'blobs' obsm: 'X_pca', 'qc' varm: 'PCs', '0_qc', '1_qc', '2_qc', '3_qc', '4_qc' + layers: None Note that comparisons are made after indices are aligned. That is, if the objects only share a subset of indices on the alternative axis, it's only required that values for those indices match when using a strategy like `"same"`. diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 2de706091..0ab522b79 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1074,7 +1074,6 @@ def transpose(self) -> AnnData: """ from anndata.compat import _safe_transpose - X = self.X if not self.isbacked else self.file["X"] if self.is_view: msg = ( "You’re trying to transpose a view of an `AnnData`, " @@ -1083,7 +1082,6 @@ def transpose(self) -> AnnData: raise ValueError(msg) return AnnData( - X=_safe_transpose(X) if X is not None else None, layers={k: _safe_transpose(v) for k, v in self.layers.items()}, obs=self.var, var=self.obs, diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index b8a959f48..b0e70477d 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1164,30 +1164,6 @@ def axis_indices(adata: AnnData, axis: Literal["obs", 0, "var", 1]) -> pd.Index: return attr.index -# TODO: Resolve https://github.com/scverse/anndata/issues/678 and remove this function -def concat_Xs(adatas, reindexers, axis, fill_value): - """ - Shimy until support for some missing X's is implemented. - - Basically just checks if it's one of the two supported cases, or throws an error. 
- - This is not done inline in `concat` because we don't want to maintain references - to the values of a.X. - """ - Xs = [a.X for a in adatas] - if all(X is None for X in Xs): - return None - elif any(X is None for X in Xs): - msg = ( - "Some (but not all) of the AnnData's to be concatenated had no .X value. " - "Concatenation is currently only implemented for cases where all or none of" - " the AnnData's have .X assigned." - ) - raise NotImplementedError(msg) - else: - return concat_arrays(Xs, reindexers, axis=axis, fill_value=fill_value) - - def make_dask_col_from_extension_dtype( col: XDataArray, *, use_only_object_dtype: bool = False ) -> DaskArray: @@ -1720,8 +1696,6 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 ) alt_annot.true_index_dim = "merge_index" - X = concat_Xs(adatas, reindexers, axis=axis, fill_value=fill_value) - if join == "inner": concat_aligned_mapping = inner_concat_aligned_mapping join_keys = intersect_keys @@ -1799,7 +1773,6 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 warn(msg, UserWarning, stacklevel=2) return AnnData( **{ - "X": X, "layers": layers, axis_name: concat_annot, alt_axis_name: alt_annot, From 375697495f9928fc376110250a10f32f3dd764ed Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Fri, 29 Aug 2025 13:35:16 +0200 Subject: [PATCH 11/51] fix: most immutable setting situations (#2095) Co-authored-by: Philipp A. 
--- .github/workflows/test-cpu.yml | 1 + src/anndata/_core/anndata.py | 80 ++++++++++++++++++++++++++----- src/anndata/_core/file_backing.py | 2 +- src/anndata/tests/helpers.py | 12 ++++- tests/test_backed_sparse.py | 7 +-- tests/test_views.py | 11 ++++- 6 files changed, 93 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test-cpu.yml b/.github/workflows/test-cpu.yml index bcef7228c..203f5d52f 100644 --- a/.github/workflows/test-cpu.yml +++ b/.github/workflows/test-cpu.yml @@ -41,6 +41,7 @@ jobs: needs: get-environments runs-on: ubuntu-latest strategy: + fail-fast: false matrix: env: ${{ fromJSON(needs.get-environments.outputs.envs) }} io_mark: ["zarr_io", "not zarr_io"] diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 735ac8d9c..3be2dc81e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -21,6 +21,9 @@ from scipy import sparse from scipy.sparse import issparse +from anndata._core.access import ElementRef +from anndata._core.sparse_dataset import sparse_dataset + from .. 
import utils from .._settings import settings from ..compat import CSArray, _move_adj_mtx, old_positionals @@ -34,7 +37,7 @@ from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays from .file_backing import AnnDataFileManager, to_memory -from .index import _normalize_indices, get_vector +from .index import _normalize_indices, _subset, get_vector from .raw import Raw from .sparse_dataset import BaseCompressedSparseDataset from .storage import coerce_array @@ -448,19 +451,20 @@ def _init_as_actual( # noqa: PLR0912, PLR0913, PLR0915 ) if {"raw", "raw.X"} & set(self.file): raw = dict(X=None, **raw) - if not raw: - self._raw = None - elif isinstance(raw, Mapping): - self._raw = Raw(self, **raw) - else: # is a Raw from another AnnData - self._raw = Raw(self, raw.X, raw.var, raw.varm) # clean up old formats self._clean_up_old_format(uns) # layers self.layers = layers - self.X = X + if X is not None: + self.X = X + if not raw: + self._raw = None + elif isinstance(raw, Mapping): + self._raw = Raw(self, **raw) + else: # is a Raw from another AnnData + self._raw = Raw(self, raw.X, raw.var, raw.varm) @old_positionals("show_stratified", "with_disk") def __sizeof__( @@ -537,15 +541,64 @@ def shape(self) -> tuple[int, int]: @property def X(self) -> XDataType | None: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" + if self.isbacked: + if not self.file.is_open: + self.file.open() + X = self.file["X"] + if isinstance(X, h5py.Group): + X = sparse_dataset(X) + # This is so that we can index into a backed dense dataset with + # indices that aren’t strictly increasing + if self.is_view: + return _subset(X, (self._oidx, self._vidx)) + return X + elif self.is_view and self._adata_ref.X is None: + return None + elif self.is_view: + return as_view( + _subset(self._adata_ref.X, (self._oidx, self._vidx)), + ElementRef(self, "X"), + ) return self.layers.get(None) @X.setter def X(self, value: XDataType) -> None: 
- self.layers[None] = value + if value is None and self.isbacked: + msg = "Cannot currently remove data matrix from backed object." + raise NotImplementedError(msg) + # If indices are both arrays, we need to modify them + # so we don’t set values like coordinates + # This can occur if there are successive views + if ( + self.is_view + and isinstance(self._oidx, np.ndarray) + and isinstance(self._vidx, np.ndarray) + ): + oidx, vidx = np.ix_(self._oidx, self._vidx) + else: + oidx, vidx = self._oidx, self._vidx + if self.isbacked and ( + np.isscalar(value) + or (hasattr(value, "shape") and (self.shape == value.shape)) + or (self.n_vars == 1 and self.n_obs == len(value)) + or (self.n_obs == 1 and self.n_vars == len(value)) + ): + if self.is_view: + X = self.file["X"] + if isinstance(X, h5py.Group): + X = sparse_dataset(X) + X[oidx, vidx] = value + else: + self._set_backed("X", value) + # TODO: should we add support for `layers[None] = None`? + if value is not None: + self.layers[None] = value + else: + self.layers.pop(None) @X.deleter def X(self) -> None: - del self.layers[None] + self.X = None layers: AlignedMappingProperty[str | None, Layers | LayersView] = ( AlignedMappingProperty("layers", Layers) @@ -880,8 +933,10 @@ def filename(self, filename: PathLike[str] | str | None): self.write(filename, as_dense=as_dense) # open new file for accessing self.file.open(filename, "r+") - # as the data is stored on disk, we can safely set self.X to None - del self.X + # As the data is stored on disk, we can safely set remove if it previously was + # in layers. Setting `X` to `None` now would raise an error because `self.isbacked`. + if None in self.layers: + self.layers.pop(None) def _set_backed(self, attr, value): from .._io.utils import write_attribute @@ -1307,6 +1362,7 @@ def to_memory(self, *, copy: bool = False) -> AnnData: } if self.isbacked: + new["layers"][None] = self.X[...] 
self.file.close() return AnnData(**new) diff --git a/src/anndata/_core/file_backing.py b/src/anndata/_core/file_backing.py index 0e1dbf336..35d61ffe2 100644 --- a/src/anndata/_core/file_backing.py +++ b/src/anndata/_core/file_backing.py @@ -106,7 +106,7 @@ def close(self): def _to_memory_mode(self): """Close the backing file, forget filename, *do* change to memory mode.""" - self._adata._X = self._adata.X[()] + self._adata.X = self._adata.X[()] self._file.close() self._file = None self._filename = None diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index a1f4511db..43f15b471 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -848,9 +848,17 @@ def fmt_name(x): "varp", "raw", ]: + a_elem, b_elem = getattr(a, attr), getattr(b, attr) + # TODO: This is helpful in backed mode where `X is not None` but `None not in layers`. + # Does this filter make sense in general? Is there a case where we explicitly want to check `None in layers`? + if attr == "layers" and any(adata.isbacked for adata in [a, b]): + a_elem, b_elem = [ + {k: v for k, v in elem.items() if k is not None} + for elem in [a_elem, b_elem] + ] assert_equal( - getattr(a, attr), - getattr(b, attr), + a_elem, + b_elem, exact=exact, elem_name=fmt_name(attr), ) diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 79d64ee4a..f1e84e843 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -236,13 +236,13 @@ def test_consecutive_bool( assert ( spy.call_count == 2 if should_trigger_optimization else not spy.call_count ) - assert_equal(csr_disk[mask, :], csr_disk[np.where(mask)]) + assert_equal(csr_disk[mask, :].X, csr_disk[np.where(mask)].X) if should_trigger_optimization is not None: assert ( spy.call_count == 3 if should_trigger_optimization else not spy.call_count ) subset = csc_disk[:, mask] - assert_equal(subset, csc_disk[:, np.where(mask)[0]]) + assert_equal(subset.X, csc_disk[:, np.where(mask)[0]].X) if 
should_trigger_optimization is not None: assert ( spy.call_count == 4 if should_trigger_optimization else not spy.call_count @@ -255,7 +255,8 @@ def test_consecutive_bool( else: subset_subset_mask = make_one_elem_mask(size) assert_equal( - subset[:, subset_subset_mask], subset[:, np.where(subset_subset_mask)[0]] + subset[:, subset_subset_mask].X, + subset[:, np.where(subset_subset_mask)[0]].X, ) assert ( spy.call_count == 5 if should_trigger_optimization else not spy.call_count diff --git a/tests/test_views.py b/tests/test_views.py index d52f9adfc..e15f43831 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -496,8 +496,15 @@ def test_view_delattr(attr, subset_func): delattr(subset, attr) assert not subset.is_view - # Should now have same value as default - assert_equal(getattr(subset, attr), getattr(empty, attr)) + # Should now have same value as default, except for `layers`, which still has the `None` key for `subset` + if attr == "layers": + assert_equal( + {k: v for k, v in getattr(subset, attr).items() if k is not None}, + getattr(empty, attr), + ) + else: + assert_equal(getattr(subset, attr), getattr(empty, attr)) + assert orig_hash == tokenize(base) # Original should not be modified From 28436ecea8dde0a928029c95bef2264a5d41100a Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Fri, 29 Aug 2025 13:50:16 +0200 Subject: [PATCH 12/51] make them falsy --- src/anndata/_core/aligned_mapping.py | 3 +++ tests/test_views.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 39bc7aa90..57cb87b93 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -337,6 +337,9 @@ class LayersBase(AlignedMappingBase[str | None]): attrname: ClassVar[Literal["layers"]] = "layers" axes: ClassVar[tuple[Literal[0], Literal[1]]] = (0, 1) + def __bool__(self) -> bool: + return not self.keys() <= {None} + class Layers(AlignedActual[str | None], LayersBase): pass diff --git a/tests/test_views.py b/tests/test_views.py index e15f43831..c7b09f751 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -4,6 +4,7 @@ from copy import deepcopy from operator import mul from typing import TYPE_CHECKING +from warnings import filterwarnings import joblib import numpy as np @@ -488,6 +489,9 @@ def test_view_delitem(attr): "attr", ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] ) def test_view_delattr(attr, subset_func): + # we shouldn’t trigger a warning here + filterwarnings("error", category=ad.ImplicitModificationWarning) + base = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) orig_hash = tokenize(base) subset = base[subset_func(base.obs_names), subset_func(base.var_names)] @@ -505,6 +509,11 @@ def test_view_delattr(attr, subset_func): else: assert_equal(getattr(subset, attr), getattr(empty, attr)) + if attr in {"obs", "var"}: + assert getattr(subset, attr).empty + else: + assert not getattr(subset, attr), "should be falsy" + assert orig_hash == tokenize(base) # Original should not be modified From 77d218d8046b01de6de955ca24f3e31db273406f Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 15:37:25 +0100 Subject: [PATCH 13/51] fix: dont accidentally open backed mode --- src/anndata/_core/anndata.py | 6 +++--- 
1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index ce7f0ade9..f8836254e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -943,9 +943,9 @@ def uns_keys(self) -> list[str]: def isbacked(self) -> bool: """`True` if object is backed on disk, `False` otherwise.""" is_filename_none = self.filename is not None - is_x_none = ( - getattr(self._adata_ref if self._is_view else self, "_X", None) is None - ) + is_x_none = (self._adata_ref.layers if self._is_view else self.layers).get( + None, None + ) is None return is_filename_none and is_x_none @property From 5c118e09bcb28780b4031cd93ad8747090ac4143 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 15:42:47 +0100 Subject: [PATCH 14/51] fix: remove expectant test --- tests/test_deprecations.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 7a1b4308a..00d7eadc4 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -74,30 +74,6 @@ def test_obsvar_vector_Xlayer(adata): adata.obs_vector("a", layer="X") -# This should break in 0.9 -def test_dtype_warning(): - # Tests a warning is thrown - with pytest.warns(FutureWarning): - a = AnnData(np.ones((3, 3)), dtype=np.float32) - assert a.X.dtype == np.float32 - - # This shouldn't warn, shouldn't copy - with warnings.catch_warnings(record=True) as record: - b_X = np.ones((3, 3), dtype=np.float64) - b = AnnData(b_X) - assert not record - assert b_X is b.X - assert b.X.dtype == np.float64 - - # Should warn, should copy - c_X = np.ones((3, 3), dtype=np.float32) - with pytest.warns(FutureWarning): - c = AnnData(c_X, dtype=np.float64) - assert not record - assert c_X is not c.X - assert c.X.dtype == np.float64 - - def test_deprecated_write_attribute(tmp_path): pth = tmp_path / "file.h5" A = np.random.randn(20, 10) From 9342e91f59963da33128ca169c1874ca24130a93 Mon 
Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 15:53:32 +0100 Subject: [PATCH 15/51] fix: collection dtype checking --- src/anndata/experimental/multi_files/_anncollection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/multi_files/_anncollection.py b/src/anndata/experimental/multi_files/_anncollection.py index 9345d9f60..e0bf5290e 100644 --- a/src/anndata/experimental/multi_files/_anncollection.py +++ b/src/anndata/experimental/multi_files/_anncollection.py @@ -56,7 +56,7 @@ def check_type(attr, key=None): arrs = [] for a in adatas: attr_arr = getattr(a, attr) - if key is not None: + if (key is None and attr == "layers") or key is not None: attr_arr = attr_arr[key] arrs.append(attr_arr) # hacky but numpy find_common_type doesn't work with categoricals From 8513bdeea01292f9db58cbec220c64f6b2917b16 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 16:12:19 +0100 Subject: [PATCH 16/51] fix: layers test --- tests/test_layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_layers.py b/tests/test_layers.py index 063d95708..c153eac10 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -21,7 +21,8 @@ def X(request): def test_creation(X: np.ndarray | None): adata = AnnData(X=X, layers=dict(L=L.copy())) - assert adata.layers.keys() == {"L", None} + # TODO: phil, when reviewing this, you had { "L", None } in both cases before - but if `X` is `None` should the key be there? 
+ assert adata.layers.keys() == {"L", None} if X is not None else {"L"} assert "L" in adata.layers assert "X" not in adata.layers assert "some_other_thing" not in adata.layers From c9b096a5ec0886161a5376299840412e2897ce95 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 16:21:42 +0100 Subject: [PATCH 17/51] fix: copy changes --- src/anndata/_core/anndata.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index f8836254e..ff6b96942 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -602,8 +602,8 @@ def X(self, value: _XDataType | None): msg = "Automatic reshaping when setting X will be removed in the future." warn(msg, FutureWarning) value = value.reshape(self.shape) - can_set_direct_if_not_none = value is None or ( - np.isscalar(value) + can_set_direct_if_not_none = ( + value is None or (hasattr(value, "shape") and (self.shape == value.shape)) or (self.n_vars == 1 and self.n_obs == len(value)) or (self.n_obs == 1 and self.n_vars == len(value)) @@ -614,9 +614,6 @@ def X(self, value: _XDataType | None): if self.is_view: msg = "Setting element `.X` of view, initializing view as actual." 
warn(msg, ImplicitModificationWarning) - new = self._mutated_copy(X=value) - self._init_as_actual(new) - return if value is not None: self.layers[None] = value else: From a41f72570cf6a6bbafff065aa172a3fe6fa1c76a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 16:31:09 +0100 Subject: [PATCH 18/51] fix: removal --- tests/test_repr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_repr.py b/tests/test_repr.py index 94f39e25e..bd3df126d 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -59,4 +59,7 @@ def test_removal(adata, adata_attr): attr = adata_attr assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) delattr(adata, attr) - assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) is None + if attr != "layers": + assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) is None + else: + assert re.search(r"^\s+layers: None.*$", repr(adata), flags=re.MULTILINE) From c8d968319cf22651d1fbd695903eea6ba9b8fcbf Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 16:54:12 +0100 Subject: [PATCH 19/51] fix: warn --- src/anndata/_core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 09e80fa51..6b3cd9de7 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1783,7 +1783,7 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 "Only some AnnData objects have `.raw` attribute, " "not concatenating `.raw` attributes." ) - warn(msg, UserWarning, stacklevel=2) + warn(msg, UserWarning) return AnnData(**{ "layers": layers, axis_name: concat_annot, From 9330978a849f6e9fca419aa5aaf50d08c32ffe0d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 17:03:49 +0100 Subject: [PATCH 20/51] maybe don't warn on deleting? 
--- src/anndata/_core/anndata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index ff6b96942..0e77f3cf9 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -611,7 +611,7 @@ def X(self, value: _XDataType | None): if not can_set_direct_if_not_none: msg = f"Data matrix has wrong shape {value.shape}, need to be {self.shape}." raise ValueError(msg) - if self.is_view: + if self.is_view and value is not None: msg = "Setting element `.X` of view, initializing view as actual." warn(msg, ImplicitModificationWarning) if value is not None: From b2651e0624f335b6f0410ae8354548c98caccf8a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 17:21:12 +0100 Subject: [PATCH 21/51] fix: del test --- tests/test_base.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_base.py b/tests/test_base.py index 501710b66..17f6b8931 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -221,15 +221,13 @@ def test_convert_matrix(attr, when): assert not isinstance(arr, np.matrix), f"{arr} is still a matrix" -@pytest.mark.parametrize( - "attr", ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"] -) -def test_attr_deletion(attr: str): +def test_attr_deletion(): full = gen_adata((30, 30)) # Empty has just X, obs_names, var_names empty = AnnData(None, obs=full.obs[[]], var=full.var[[]]) - delattr(full, attr) - assert_equal(getattr(full, attr), getattr(empty, attr)) + for attr in ["X", "obs", "var", "obsm", "varm", "obsp", "varp", "layers", "uns"]: + delattr(full, attr) + assert_equal(getattr(full, attr), getattr(empty, attr)) assert_equal(full, empty, exact=True) From 24f168433f60d6075cda69c08aef515b5af6c809 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 18:10:07 +0100 Subject: [PATCH 22/51] fix: del yes! 
--- src/anndata/_core/aligned_mapping.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index ec4ea19de..6660b1edf 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -186,11 +186,12 @@ def __delitem__(self, key: K) -> None: if key not in self: msg = f"{key!r} not found in view of {self.attrname}" raise KeyError(msg) # Make sure it exists before bothering with a copy - msg = ( - f"Removing element `.{self.attrname}['{key}']` of view, " - "initializing view as actual." - ) - warn(msg, ImplicitModificationWarning) + if key is not None and self.attrname != "layers": + msg = ( + f"Removing element `.{self.attrname}['{key}']` of view, " + "initializing view as actual." + ) + warn(msg, ImplicitModificationWarning) with view_update(self.parent, self.attrname, ()) as new_mapping: del new_mapping[key] From 0ee2a12ffc98882653b926b5cbb22f0320cc79ff Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 12 Feb 2026 18:15:25 +0100 Subject: [PATCH 23/51] fix: correct condition --- src/anndata/_core/aligned_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 6660b1edf..e00d7e935 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -186,7 +186,7 @@ def __delitem__(self, key: K) -> None: if key not in self: msg = f"{key!r} not found in view of {self.attrname}" raise KeyError(msg) # Make sure it exists before bothering with a copy - if key is not None and self.attrname != "layers": + if not (key is None and self.attrname == "layers"): msg = ( f"Removing element `.{self.attrname}['{key}']` of view, " "initializing view as actual." 
From 1cdd856882501605038c81d6ca751deaa4ea5617 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 17:17:50 +0000 Subject: [PATCH 24/51] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_deprecations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index bedc7afa1..6bbff0fd9 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -6,8 +6,6 @@ from __future__ import annotations -import warnings - import h5py import numpy as np import pytest From 0d187b448b26e51cef7848ffd3fe1b236478ec47 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 26 Feb 2026 14:08:34 +0100 Subject: [PATCH 25/51] fix: sizeof --- src/anndata/_core/anndata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 8fa8775cc..15ac08b9a 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -510,7 +510,7 @@ def cs_to_bytes(X) -> int: return X.__sizeof__() sizes = {} - attrs = ["X", "_obs", "_var"] + attrs = ["_obs", "_var"] + (["X"] if self.isbacked else []) attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"] for attr in attrs + attrs_multi: if attr in attrs_multi: From 4ea784753907166837de1488c83b124198627590 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 26 Feb 2026 17:04:12 +0100 Subject: [PATCH 26/51] fix: update view setting --- src/anndata/_core/anndata.py | 11 ++++++++--- src/anndata/_core/merge.py | 2 +- .../experimental/multi_files/_anncollection.py | 2 ++ tests/test_x.py | 5 +++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 15ac08b9a..e873630c5 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -588,7 +588,7 @@ def X(self) -> 
_XDataType | None: return self.layers.get(None) @X.setter - def X(self, value: _XDataType | None): + def X(self, value: _XDataType | None) -> None: value = ( coerce_array(value, name="X", allow_array_like=True) if value is not None @@ -611,13 +611,16 @@ def X(self, value: _XDataType | None): if not can_set_direct_if_not_none: msg = f"Data matrix has wrong shape {value.shape}, need to be {self.shape}." raise ValueError(msg) - if self.is_view and value is not None: + if self.is_view: msg = "Setting element `.X` of view, initializing view as actual." warn(msg, ImplicitModificationWarning) + self._init_as_actual(self._copy(X=value)) + return None if value is not None: self.layers[None] = value else: self.layers.pop(None) + return None @X.deleter def X(self) -> None: @@ -1325,7 +1328,7 @@ def var_vector(self, k: str, /, *, layer: str | None = None) -> np.ndarray: """ return _get_vector_ambiguous(self, k, "var", layer=layer) - def _copy(self) -> AnnData: + def _copy(self, *, X: _XDataType | None = None) -> AnnData: if self.isbacked and self.raw is not None: msg = ( "This function does not currently handle backed objects " @@ -1335,6 +1338,8 @@ def _copy(self) -> AnnData: new = {} for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: new[key] = getattr(self, key).copy() + if X is not None and key == "layers": + new[key][None] = X new["uns"] = deepcopy(self._uns) if self.raw is not None: new["raw"] = self.raw.copy() diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 6b3cd9de7..578981449 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -1532,7 +1532,7 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 >>> outer AnnData object with n_obs × n_vars = 4 × 3 obs: 'group', 'measure' - layers: Noneå + layers: None >>> outer.var_names.astype("string") Index(['var1', 'var2', 'var3'], dtype='string') >>> outer.to_df() # Sparse arrays are padded with zeroes by default diff --git 
a/src/anndata/experimental/multi_files/_anncollection.py b/src/anndata/experimental/multi_files/_anncollection.py index e0bf5290e..22ce5ad85 100644 --- a/src/anndata/experimental/multi_files/_anncollection.py +++ b/src/anndata/experimental/multi_files/_anncollection.py @@ -653,11 +653,13 @@ class AnnCollection(_ConcatViewMixin, _IterateViewMixin): AnnCollection object with n_obs × n_vars = 3338 × 208 constructed from 2 AnnData objects view of obsm: 'X_pca', 'X_umap' + view of layers: None obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch = dc[100:200] # AnnCollectionView >>> batch AnnCollectionView object with n_obs × n_vars = 100 × 208 obsm: 'X_pca', 'X_umap' + layers: None obs: 'n_genes', 'percent_mito', 'n_counts', 'louvain' >>> batch.X.shape (100, 208) diff --git a/tests/test_x.py b/tests/test_x.py index 69508a94e..8f3be7b2a 100644 --- a/tests/test_x.py +++ b/tests/test_x.py @@ -88,8 +88,9 @@ def test_del_set_equiv_X(): assert_equal(orig, copy) # Check that deleting again is still fine - del orig.X - assert orig.X is None + # TODO: Do we even want to keep supporting this operation i.e., del adata.X if X is None? 
+ # del orig.X + # assert orig.X is None @pytest.mark.parametrize( From 4f97aaa8717e37edc991815435bbb60d6b48e9ba Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 26 Feb 2026 17:32:19 +0100 Subject: [PATCH 27/51] fix: copying X in setting None --- src/anndata/_core/anndata.py | 11 +++++++++-- tests/test_views.py | 8 ++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index e873630c5..cd6a12bc7 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -40,6 +40,7 @@ deprecated, deprecation_msg, ensure_df_homogeneous, + get_union_members, raise_value_error_if_multiindex_columns, set_module, warn, @@ -1328,7 +1329,11 @@ def var_vector(self, k: str, /, *, layer: str | None = None) -> np.ndarray: """ return _get_vector_ambiguous(self, k, "var", layer=layer) - def _copy(self, *, X: _XDataType | None = None) -> AnnData: + def _copy( + self, *, X: _XDataType | None | Literal["no_set_X"] = "no_set_X" + ) -> AnnData: + from ..typing import _XDataType + if self.isbacked and self.raw is not None: msg = ( "This function does not currently handle backed objects " @@ -1338,7 +1343,9 @@ def _copy(self, *, X: _XDataType | None = None) -> AnnData: new = {} for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: new[key] = getattr(self, key).copy() - if X is not None and key == "layers": + if key == "layers" and isinstance( + X, (*get_union_members(_XDataType), type(None)) + ): new[key][None] = X new["uns"] = deepcopy(self._uns) if self.raw is not None: diff --git a/tests/test_views.py b/tests/test_views.py index 561ba051c..777319b99 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -6,7 +6,6 @@ from importlib.metadata import version from importlib.util import find_spec from typing import TYPE_CHECKING -from warnings import filterwarnings import joblib import numpy as np @@ -546,15 +545,12 @@ def test_view_delitem(attr): "attr", ["X", "obs", "var", "obsm", 
"varm", "obsp", "varp", "layers", "uns"] ) def test_view_delattr(attr, subset_func): - # we shouldn’t trigger a warning here - filterwarnings("error", category=ad.ImplicitModificationWarning) - base = gen_adata((10, 10), **GEN_ADATA_DASK_ARGS) orig_hash = tokenize(base) subset = base[subset_func(base.obs_names), subset_func(base.var_names)] empty = ad.AnnData(obs=subset.obs[[]], var=subset.var[[]]) - - delattr(subset, attr) + with pytest.warns(ad.ImplicitModificationWarning) if attr == "X" else nullcontext(): + delattr(subset, attr) assert not subset.is_view # Should now have same value as default, except for `layers`, which still has the `None` key for `subset` From ad93198a06d4ad94b7a3daaf69e605bbb07d7f3a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 26 Feb 2026 18:10:30 +0100 Subject: [PATCH 28/51] fix: virtualenv --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index e849e3ebc..00751ff51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -286,3 +286,6 @@ fragment.perf.name = "Performance" fragment.chore.name = "Miscellaneous changes" fragment.revert.name = "Revert" fragment.breaking.name = "Breaking changes" # add `!` to commit type (e.g. 
“feature!:”) + +[tool.uv] +override-dependencies = [ "virtualenv<21" ] From b595e91d4316d262ccbdaa5d04b836fa101b84a1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 12:02:25 +0200 Subject: [PATCH 29/51] Merge branch 'main' into x-layers-unification --- .github/workflows/benchmark.yml | 2 +- .github/workflows/check-pr.yml | 2 +- .pre-commit-config.yaml | 6 +- .readthedocs.yml | 2 +- benchmarks/benchmarks/sparse_dataset.py | 67 ++- docs/api.md | 1 + docs/release-notes/0.6.0.md | 2 +- docs/release-notes/0.9.0.md | 2 +- docs/release-notes/2326.perf.md | 1 + docs/release-notes/2358.fix.md | 1 + docs/release-notes/2367.breaking.md | 1 + docs/release-notes/2370.breaking.md | 1 + docs/release-notes/2372.feat.md | 1 + docs/release-notes/2395.perf.md | 1 + docs/release-notes/2399.fix.md | 1 + docs/release-notes/2406.fix.md | 1 + pyproject.toml | 5 +- src/anndata/_core/aligned_mapping.py | 11 +- src/anndata/_core/anndata.py | 627 +++++++++--------------- src/anndata/_core/extensions.py | 233 +-------- src/anndata/_core/merge.py | 38 +- src/anndata/_core/raw.py | 2 +- src/anndata/_core/xarray.py | 12 +- src/anndata/_io/h5ad.py | 47 +- src/anndata/_io/read.py | 10 +- src/anndata/_io/specs/methods.py | 30 +- src/anndata/_io/write.py | 10 +- src/anndata/_io/zarr.py | 59 ++- src/anndata/_types.py | 32 ++ src/anndata/tests/helpers.py | 10 +- src/anndata/types.py | 38 +- src/anndata/utils.py | 68 ++- tests/lazy/conftest.py | 3 +- tests/lazy/test_read.py | 6 +- tests/test_backed_dense.py | 12 +- tests/test_backed_hdf5.py | 6 +- tests/test_backed_sparse.py | 19 +- tests/test_base.py | 91 +++- tests/test_concatenate.py | 219 ++++----- tests/test_deprecations.py | 7 + tests/test_extensions.py | 45 +- tests/test_get_vector.py | 2 +- tests/test_helpers.py | 15 + tests/test_readwrite.py | 50 +- tests/test_xarray.py | 1 - 45 files changed, 833 insertions(+), 967 deletions(-) create mode 100644 docs/release-notes/2326.perf.md create mode 100644 docs/release-notes/2358.fix.md 
create mode 100644 docs/release-notes/2367.breaking.md create mode 100644 docs/release-notes/2370.breaking.md create mode 100644 docs/release-notes/2372.feat.md create mode 100644 docs/release-notes/2395.perf.md create mode 100644 docs/release-notes/2399.fix.md create mode 100644 docs/release-notes/2406.fix.md diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index e90cfb204..66c4ada30 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -64,4 +64,4 @@ jobs: working-directory: ${{ env.ASV_DIR }} run: | asv machine --yes - asv run --quick --show-stderr --verbose + asv run --dry-run --quick --show-stderr --verbose HEAD^! diff --git a/.github/workflows/check-pr.yml b/.github/workflows/check-pr.yml index 5f87e6b88..17f79942c 100644 --- a/.github/workflows/check-pr.yml +++ b/.github/workflows/check-pr.yml @@ -63,7 +63,7 @@ jobs: id: changes with: filters: | # this is intentionally a string - relnotes: 'docs/release-notes/${{ github.event.pull_request.number }}.${{ needs.check-milestone.outputs.type }}.md' + relnotes: 'docs/release-notes/${{ github.event.pull_request.number }}.${{ (contains(github.event.pull_request.title, '!') && 'breaking') || needs.check-milestone.outputs.type }}.md' - name: Check if a relevant release fragment is added uses: flying-sheep/check@v1 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 214c2804e..d5186f976 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ ci: repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.15.4 + rev: v0.15.11 hooks: - id: ruff-check args: ["--fix"] @@ -13,7 +13,7 @@ repos: id: ruff args: ["--preview", "--select=PLR0917"] - repo: https://github.com/biomejs/pre-commit - rev: v2.4.4 + rev: v2.4.12 hooks: - id: biome-format - repo: https://github.com/ComPWA/taplo-pre-commit @@ -34,7 +34,7 @@ repos: - id: no-commit-to-branch args: ["--branch=main"] - repo: 
https://github.com/codespell-project/codespell - rev: v2.4.1 + rev: v2.4.2 hooks: - id: codespell additional_dependencies: diff --git a/.readthedocs.yml b/.readthedocs.yml index ebe00200c..fe008d0e0 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -14,7 +14,7 @@ build: - asdf global uv latest pre_build: # run towncrier to preview the next version’s release notes - - ( find docs/release-notes -regex '[^.]+[.][^.]+.md' | grep -q . ) && towncrier build --keep || true + - ( find docs/release-notes -regex '[^.]+[.][^.]+.md' | grep -q . ) && uvx hatch run docs:towncrier build --keep || true build: html: - uvx hatch run docs:build diff --git a/benchmarks/benchmarks/sparse_dataset.py b/benchmarks/benchmarks/sparse_dataset.py index 66f5b221a..da8a1b4d6 100644 --- a/benchmarks/benchmarks/sparse_dataset.py +++ b/benchmarks/benchmarks/sparse_dataset.py @@ -1,8 +1,10 @@ from __future__ import annotations from types import MappingProxyType +from typing import TYPE_CHECKING import numpy as np +import pandas as pd import zarr from dask.array.core import Array as DaskArray from scipy import sparse @@ -12,6 +14,9 @@ from anndata._io.specs import write_elem from anndata.experimental import read_elem_lazy +if TYPE_CHECKING: + from typing import Literal + def make_alternating_mask(n): mask_alternating = np.ones(10_000, dtype=bool) @@ -79,9 +84,12 @@ def peakmem_getitem_adata(self, *_): res.compute() -class SparseCSRDask: +class SparseCSRDaskConcat: filepath = "data.zarr" + params = (["inner", "outer"], [0, -1]) + param_names = ("join", "fill_value") + def setup_cache(self): X = sparse.random( 10_000, @@ -93,18 +101,59 @@ def setup_cache(self): g = zarr.group(self.filepath) write_elem(g, "X", X) - def setup(self): + def setup(self, *_): self.group = zarr.group(self.filepath) - self.adata = AnnData(X=read_elem_lazy(self.group["X"])) + self.adatas = [ + AnnData( + var=pd.DataFrame( + index=[ + f"gene_{j}{f'_{i}' if (j % 500 == 0) else ''}" + for j in range(10_000) + ] + ), + 
X=read_elem_lazy(self.group["X"]), + ) + for i in range(5) + ] + + def time_concat(self, join: Literal["inner", "outer"], fill_value: Literal[0, -1]): + concat(self.adatas, join=join, fill_value=fill_value) + + def peakmem_concat( + self, join: Literal["inner", "outer"], fill_value: Literal[0, -1] + ): + concat(self.adatas, join=join, fill_value=fill_value) + + def time_concat_with_mem( + self, join: Literal["inner", "outer"], fill_value: Literal[0, -1] + ): + concat(self.adatas, join=join, fill_value=fill_value).to_memory() + + def peakmem_concat_with_mem( + self, join: Literal["inner", "outer"], fill_value: Literal[0, -1] + ): + concat(self.adatas, join=join, fill_value=fill_value).to_memory() - def time_concat(self): - concat([self.adata for i in range(100)]) - def peakmem_concat(self): - concat([self.adata for i in range(100)]) +class SparseCSRDask: + filepath = "data.zarr" + + def setup_cache(self): + X = sparse.random( + 10_000, + 10_000, + density=0.01, + format="csr", + random_state=np.random.default_rng(42), + ) + g = zarr.group(self.filepath) + write_elem(g, "X", X) + + def setup(self, *_): + self.group = zarr.group(self.filepath) - def time_read(self): + def time_read(self, *_): AnnData(X=read_elem_lazy(self.group["X"])) - def peakmem_read(self): + def peakmem_read(self, *_): AnnData(X=read_elem_lazy(self.group["X"])) diff --git a/docs/api.md b/docs/api.md index 279070e50..14883f1e7 100644 --- a/docs/api.md +++ b/docs/api.md @@ -92,6 +92,7 @@ Writing a complete {class}`AnnData` object to disk in anndata’s native formats AnnData.write_h5ad AnnData.write_zarr + AnnData.unwriteable .. 
diff --git a/docs/release-notes/0.6.0.md b/docs/release-notes/0.6.0.md index dc9f2e981..127f4d72c 100644 --- a/docs/release-notes/0.6.0.md +++ b/docs/release-notes/0.6.0.md @@ -2,7 +2,7 @@ ### 0.6.0 {small}`1 May, 2018` - compatibility with Seurat converter -- tremendous speedup for {meth}`~anndata.AnnData.concatenate` +- tremendous speedup for `~anndata.AnnData.concatenate` - bug fix for deep copy of unstructured annotation after slicing - bug fix for reading HDF5 stored single-category annotations - `'outer join'` concatenation: adds zeros for concatenation of sparse data and nans for dense data diff --git a/docs/release-notes/0.9.0.md b/docs/release-notes/0.9.0.md index 3481ade4c..6f1af32b9 100644 --- a/docs/release-notes/0.9.0.md +++ b/docs/release-notes/0.9.0.md @@ -39,7 +39,7 @@ #### Deprecations -- {meth}`AnnData.concatenate() ` is now deprecated in favour of {func}`anndata.concat` {pr}`845` {user}`ivirshup` +- `AnnData.concatenate()` is now deprecated in favour of {func}`anndata.concat` {pr}`845` {user}`ivirshup` #### Bug fixes diff --git a/docs/release-notes/2326.perf.md b/docs/release-notes/2326.perf.md new file mode 100644 index 000000000..6b423245e --- /dev/null +++ b/docs/release-notes/2326.perf.md @@ -0,0 +1 @@ +Use {doc}`zarrs-python ` by default for {func}`anndata.io.read_zarr` and {func}`anndata.io.write_zarr` if it is installed {user}`ilan-gold` diff --git a/docs/release-notes/2358.fix.md b/docs/release-notes/2358.fix.md new file mode 100644 index 000000000..2dd4847ae --- /dev/null +++ b/docs/release-notes/2358.fix.md @@ -0,0 +1 @@ +Ensure the index name along `obs` and `var` are not lost when reading/writing to disk after reading with {func}`~anndata.experimental.read_lazy` {user}`ilan-gold` diff --git a/docs/release-notes/2367.breaking.md b/docs/release-notes/2367.breaking.md new file mode 100644 index 000000000..2541b0fc5 --- /dev/null +++ b/docs/release-notes/2367.breaking.md @@ -0,0 +1 @@ +Remove `Anndata.__{set,del}item__` {user}`ilan-gold` 
diff --git a/docs/release-notes/2370.breaking.md b/docs/release-notes/2370.breaking.md new file mode 100644 index 000000000..a3bdee43c --- /dev/null +++ b/docs/release-notes/2370.breaking.md @@ -0,0 +1 @@ +Remove `AnnData.concatenate` {user}`ilan-gold` diff --git a/docs/release-notes/2372.feat.md b/docs/release-notes/2372.feat.md new file mode 100644 index 000000000..61e9fa0bc --- /dev/null +++ b/docs/release-notes/2372.feat.md @@ -0,0 +1 @@ +New {meth}`anndata.AnnData.unwriteable` for checking if an `AnnData` can be written {user}`ilan-gold` diff --git a/docs/release-notes/2395.perf.md b/docs/release-notes/2395.perf.md new file mode 100644 index 000000000..e0d6fd08e --- /dev/null +++ b/docs/release-notes/2395.perf.md @@ -0,0 +1 @@ +Accelerate outer joins on dask-sparse matrices with unchunked minor axes in {func}`anndata.concat` {user}`ilan-gold` diff --git a/docs/release-notes/2399.fix.md b/docs/release-notes/2399.fix.md new file mode 100644 index 000000000..9c3ccafc1 --- /dev/null +++ b/docs/release-notes/2399.fix.md @@ -0,0 +1 @@ +Disallow {meth}`anndata.AnnData.transpose` when `X` or `layers` contains {class}`h5py.Dataset`, {class}`zarr.Array` ,{class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold`. 
diff --git a/docs/release-notes/2406.fix.md b/docs/release-notes/2406.fix.md new file mode 100644 index 000000000..fe7c668d2 --- /dev/null +++ b/docs/release-notes/2406.fix.md @@ -0,0 +1 @@ +Fix {meth}`anndata.AnnData.copy` so that it provides an informative error when trying to `copy` and object that contains {class}`h5py.Dataset`, {class}`zarr.Array`, {class}`anndata.abc.CSRDataset`, or {class}`anndata.abc.CSCDataset` {user}`ilan-gold` diff --git a/pyproject.toml b/pyproject.toml index 00751ff51..0db2a7fa6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ dependencies = [ "legacy-api-wrap", "zarr >=3.1", "typing-extensions; python_version<'3.13'", + "scverse-misc>=0.0.3", ] dynamic = [ "version" ] @@ -65,7 +66,7 @@ doc = [ "sphinx-issues>=5.0.1", "sphinx-copybutton", "sphinxext.opengraph", - "myst-nb", + "myst-nb>=1.4", "scanpydoc[theme,typehints] >=0.17.1", "awkward>=2.6.3", "IPython", # For syntax highlighting in notebooks @@ -173,7 +174,7 @@ filterwarnings_when_strict = [ "default::dask.array.core.PerformanceWarning", "default:anndata will no longer support zarr v2:DeprecationWarning", "default:Consolidated metadata is:UserWarning", - "default:.*Structured:zarr.core.dtype.common.UnstableSpecificationWarning", + "default:.*Struct:zarr.core.dtype.common.UnstableSpecificationWarning", "default:.*FixedLengthUTF32:zarr.core.dtype.common.UnstableSpecificationWarning", "default:Automatic shard shape inference is experimental", "default:Writing zarr v2:UserWarning", diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index e00d7e935..d56f8c6f1 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -9,13 +9,13 @@ import numpy as np import pandas as pd +from scverse_misc import Deprecation, deprecated from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset from ..utils import ( 
axis_len, convert_to_dict, - deprecated, deprecation_msg, raise_value_error_if_multiindex_columns, warn, @@ -130,7 +130,7 @@ def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[K, Self, I]: """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) - @deprecated(deprecation_msg("as_dict", "dict(obj)")) + @deprecated(Deprecation("0.10.2", deprecation_msg("as_dict", "dict(obj)"))) def as_dict(self) -> dict: return dict(self) @@ -419,12 +419,12 @@ class AlignedMappingProperty[T: AlignedMapping, K: (str, str | None)](property): The actual data is stored as `f'_{self.name}'` in the parent object. """ - name: str - """Name of the attribute in the parent object.""" cls: type[T] """Concrete type that will be constructed.""" axis: Literal[0, 1] | None = None """Axis of the parent to align to.""" + name: str | None = None + """Name of the attribute in the parent object.""" def construct(self, obj: AnnData, *, store: MutableMapping[K, Value]) -> T: if self.axis is None: @@ -440,6 +440,9 @@ def fake(): ... fake.__annotations__ = {"return": self.cls._actual_class | self.cls._view_class} return fake + def __set_name__(self, owner: AnnData, name: str): + self.name = name + def __get__(self, obj: None | AnnData, objtype: type | None = None) -> T: if obj is None: # When accessed from the class, e.g. 
via `AnnData.obs`, diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index cd6a12bc7..ffb19795c 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -4,10 +4,10 @@ from __future__ import annotations -from collections import OrderedDict +from collections import OrderedDict, defaultdict from collections.abc import Mapping, MutableMapping, Sequence from copy import copy, deepcopy -from functools import partial, singledispatchmethod +from functools import singledispatchmethod from pathlib import Path from textwrap import dedent from typing import TYPE_CHECKING, cast, overload @@ -17,8 +17,8 @@ import pandas as pd from natsort import natsorted from pandas.api.types import infer_dtype -from scipy import sparse from scipy.sparse import issparse +from scverse_misc import Deprecation, deprecated from anndata._core.access import ElementRef from anndata._core.sparse_dataset import sparse_dataset @@ -27,8 +27,11 @@ from .._settings import settings from .._warnings import ImplicitModificationWarning from ..compat import ( + AwkArray, CSArray, IndexManager, + XDataset, + ZarrArray, _move_adj_mtx, has_xp, old_positionals, @@ -37,10 +40,10 @@ from ..logging import anndata_logger as logger from ..utils import ( axis_len, - deprecated, deprecation_msg, ensure_df_homogeneous, get_union_members, + iter_outer, raise_value_error_if_multiindex_columns, set_module, warn, @@ -60,16 +63,21 @@ from os import PathLike from typing import Any, ClassVar, Literal + from scipy import sparse from zarr.storage import StoreLike + from anndata._types import AnnDataElem + from anndata.typing import RWAble + + from .._types import ReduceFunc from ..acc import AdRef, Array, MapAcc, RefAcc - from ..compat import XDataset - from ..typing import Index, Index1D, _Index1DNorm, _XDataType + from ..compat import CSArray, CSMatrix + from ..typing import AxisStorable, Index, Index1D, _Index1DNorm, _XDataType from .aligned_mapping import AxisArraysView, LayersView, 
PairwiseArraysView @set_module("anndata") -class AnnData(metaclass=utils.DeprecationMixinMeta): # noqa: PLW1641 +class AnnData: # noqa: PLW1641 """\ An annotated data matrix. @@ -275,12 +283,6 @@ def _init_as_view( oidx: _Index1DNorm | int | np.integer, vidx: _Index1DNorm | int | np.integer, ): - if adata_ref.isbacked and adata_ref.is_view: - msg = ( - "Currently, you cannot index repeatedly into a backed AnnData, " - "that is, you cannot make a view of a view." - ) - raise ValueError(msg) self._is_view = True if isinstance(oidx, int | np.integer): if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): @@ -497,53 +499,54 @@ def _init_as_actual( # noqa: PLR0912, PLR0913, PLR0915 def __sizeof__( self, *, show_stratified: bool = False, with_disk: bool = False ) -> int: - def get_size(X) -> int: - def cs_to_bytes(X) -> int: - return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes) + def cs_to_bytes(X: CSArray | CSMatrix) -> int: + return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes) + def get_size(X: RWAble) -> int: if isinstance(X, h5py.Dataset) and with_disk: return int(np.array(X.shape).prod() * X.dtype.itemsize) elif isinstance(X, BaseCompressedSparseDataset) and with_disk: return cs_to_bytes(X._to_backed()) elif issparse(X): return cs_to_bytes(X) + elif isinstance(X, dict | MutableMapping): + return sum(get_size(v) for v in X.values()) else: return X.__sizeof__() - sizes = {} - attrs = ["_obs", "_var"] + (["X"] if self.isbacked else []) - attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"] - for attr in attrs + attrs_multi: - if attr in attrs_multi: - keys = getattr(self, attr).keys() - s = sum(get_size(getattr(self, attr)[k]) for k in keys) + def fold_size( + elem: _XDataType | AxisStorable | pd.DataFrame | XDataset, + *, + accumulate: dict[str, int], + attr_name: str | None, # TODO: type + ): + if elem is None: + size = 0 + elif elem is self.raw: + size = ( + get_size(elem.X) + + get_size(elem.var) + + sum(get_size(v) for v in 
elem.varm.values()) + ) else: - s = get_size(getattr(self, attr)) - if s > 0 and show_stratified: + size = get_size(elem) + accumulate[attr_name] = size + if size > 0 and show_stratified: from tqdm import tqdm - print( - f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}" - ) - sizes[attr] = s - return sum(sizes.values()) + print(f"Size of {attr_name}: {tqdm.format_sizeof(size, 'B')}") + return accumulate + + return sum(self._reduce(fold_size, init=defaultdict(int)).values()) def _gen_repr(self, n_obs, n_vars) -> str: backed_at = f" backed at {str(self.filename)!r}" if self.isbacked else "" descr = f"AnnData object with n_obs × n_vars = {n_obs} × {n_vars}{backed_at}" - for attr in [ - "obs", - "var", - "uns", - "obsm", - "varm", - "layers", - "obsp", - "varp", - ]: - keys = getattr(self, attr).keys() - if len(keys) > 0: - descr += f"\n {attr}: {str(list(keys))[1:-1]}" + for attr_name, elem in iter_outer(self): + if attr_name not in {"raw", "X"}: + keys = elem.keys() + if len(keys) > 0: + descr += f"\n {attr_name}: {str(list(keys))[1:-1]}" return descr def __repr__(self) -> str: @@ -627,9 +630,7 @@ def X(self, value: _XDataType | None) -> None: def X(self) -> None: self.X = None - layers: AlignedMappingProperty[str | None, Layers | LayersView] = ( - AlignedMappingProperty("layers", Layers) - ) + layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(Layers) """\ Dictionary-like object with values of the same dimensions as :attr:`X`. @@ -844,8 +845,8 @@ def uns(self, value: MutableMapping): def uns(self): self.uns = OrderedDict() - obsm: AlignedMappingProperty[str, AxisArrays | AxisArraysView] = ( - AlignedMappingProperty("obsm", AxisArrays, 0) + obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( + AxisArrays, 0 ) """\ Multi-dimensional annotation of observations @@ -856,8 +857,8 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. 
""" - varm: AlignedMappingProperty[str, AxisArrays | AxisArraysView] = ( - AlignedMappingProperty("varm", AxisArrays, 1) + varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( + AxisArrays, 1 ) """\ Multi-dimensional annotation of variables/features @@ -868,8 +869,8 @@ def uns(self): Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. """ - obsp: AlignedMappingProperty[str, PairwiseArrays | PairwiseArraysView] = ( - AlignedMappingProperty("obsp", PairwiseArrays, 0) + obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + AlignedMappingProperty(PairwiseArrays, 0) ) """\ Pairwise annotation of observations, @@ -880,8 +881,8 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ - varp: AlignedMappingProperty[str, PairwiseArrays | PairwiseArraysView] = ( - AlignedMappingProperty("varp", PairwiseArrays, 1) + varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + AlignedMappingProperty(PairwiseArrays, 1) ) """\ Pairwise annotation of variables/features, @@ -893,9 +894,12 @@ def uns(self): """ @deprecated( - deprecation_msg( - *("obs_keys", "obs"), - "(e.g. `k in adata.obs` or `str(adata.obs.columns.tolist())`)", + Deprecation( + "0.12.3", + deprecation_msg( + *("obs_keys", "obs"), + "(e.g. `k in adata.obs` or `str(adata.obs.columns.tolist())`)", + ), ) ) def obs_keys(self) -> list[str]: @@ -903,9 +907,12 @@ def obs_keys(self) -> list[str]: return self._obs.keys().tolist() @deprecated( - deprecation_msg( - *("var_keys", "var"), - "(e.g. `k in adata.var` or `str(adata.var.columns.tolist())`)", + Deprecation( + "0.12.3", + deprecation_msg( + *("var_keys", "var"), + "(e.g. `k in adata.var` or `str(adata.var.columns.tolist())`)", + ), ) ) def var_keys(self) -> list[str]: @@ -913,9 +920,12 @@ def var_keys(self) -> list[str]: return self._var.keys().tolist() @deprecated( - deprecation_msg( - *("obsm_keys", "obsm"), - "(e.g. 
`k in adata.obsm` or `adata.obsm.keys() | {'u'}`)", + Deprecation( + "0.12.3", + deprecation_msg( + *("obsm_keys", "obsm"), + "(e.g. `k in adata.obsm` or `adata.obsm.keys() | {'u'}`)", + ), ) ) def obsm_keys(self) -> list[str]: @@ -923,9 +933,12 @@ def obsm_keys(self) -> list[str]: return list(self.obsm.keys()) @deprecated( - deprecation_msg( - *("varm_keys", "varm"), - "(e.g. `k in adata.varm` or `adata.varm.keys() | {'u'}`)", + Deprecation( + "0.12.3", + deprecation_msg( + *("varm_keys", "varm"), + "(e.g. `k in adata.varm` or `adata.varm.keys() | {'u'}`)", + ), ) ) def varm_keys(self) -> list[str]: @@ -933,8 +946,11 @@ def varm_keys(self) -> list[str]: return list(self.varm.keys()) @deprecated( - deprecation_msg( - "uns_keys", "uns", "(e.g. `k in adata.uns` or `sorted(adata.uns)`)" + Deprecation( + "0.13", + deprecation_msg( + "uns_keys", "uns", "(e.g. `k in adata.uns` or `sorted(adata.uns)`)" + ), ) ) def uns_keys(self) -> list[str]: @@ -1011,21 +1027,6 @@ def _normalize_indices( ) -> tuple[_Index1DNorm | int | np.integer, _Index1DNorm | int | np.integer]: return _normalize_indices(index, self.obs_names, self.var_names) - # TODO: this is not quite complete... - def __delitem__(self, index: Index) -> None: - obs, var = self._normalize_indices(index) - # TODO: does this really work? - if not self.isbacked: - del self.X[obs, var] - else: - X = self.file["X"] - del X[obs, var] - self._set_backed("X", X) - if var == slice(None): - del self._obs.iloc[obs, :] - if obs == slice(None): - del self._var.iloc[var, :] - @overload def __getitem__(self, index: AdRef) -> Array: ... @overload @@ -1194,19 +1195,6 @@ def _inplace_subset_obs(self, index: Index1D): self._init_as_actual(adata_subset) - # TODO: Update, possibly remove - def __setitem__(self, index: Index, val: float | _XDataType): - if self.is_view: - msg = "Object is view and cannot be accessed with `[]`." 
- raise ValueError(msg) - obs, var = self._normalize_indices(index) - if not self.isbacked: - self.X[obs, var] = val - else: - X = self.file["X"] - X[obs, var] = val - self._set_backed("X", X) - def __len__(self) -> int: return self.shape[0] @@ -1225,6 +1213,12 @@ def transpose(self) -> AnnData: "which is currently not implemented. Call `.copy()` before transposing." ) raise ValueError(msg) + if any( + isinstance(elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset) + for elem in (self.X, *self.layers.values()) + ): + msg = "Cannot transpose anndata object that has raw zarr arrays or h5py arrays backing X or layers" + raise ValueError(msg) return AnnData( layers={k: _safe_transpose(v) for k, v in self.layers.items()}, @@ -1272,16 +1266,18 @@ def to_df(self, layer: str | None = None) -> pd.DataFrame: return pd.DataFrame(X, index=self.obs_names, columns=self.var_names) @deprecated( - deprecation_msg( - "obs_vector", - "anndata.acc.A", - "E.g. `vec = adata[A.obs['foo']]` or `vec = adata[A.layers['l']['bar', :]]`", + Deprecation( + "0.13", + deprecation_msg( + "obs_vector", + "anndata.acc.A", + "E.g. `vec = adata[A.obs['foo']]` or `vec = adata[A.layers['l']['bar', :]]`", + ), ) ) def obs_vector(self, k: str, /, *, layer: str | None = None) -> np.ndarray: """\ - Convenience function for returning a 1 dimensional ndarray of values - from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. + Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. @@ -1301,16 +1297,18 @@ def obs_vector(self, k: str, /, *, layer: str | None = None) -> np.ndarray: return _get_vector_ambiguous(self, k, "obs", layer=layer) @deprecated( - deprecation_msg( - "var_vector", - "anndata.acc.A", - "E.g. 
`vec = adata[A.var['foo']]` or `vec = adata[A.layers['l'][:, 'bar']]`", + Deprecation( + "0.13", + deprecation_msg( + "var_vector", + "anndata.acc.A", + "E.g. `vec = adata[A.var['foo']]` or `vec = adata[A.layers['l'][:, 'bar']]`", + ), ) ) def var_vector(self, k: str, /, *, layer: str | None = None) -> np.ndarray: """\ - Convenience function for returning a 1 dimensional ndarray of values - from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. + Convenience function for returning a 1 dimensional ndarray of values from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. Made for convenience, not performance. Intentionally permissive about arguments, for easy iterative use. @@ -1371,26 +1369,16 @@ def to_memory(self, *, copy: bool = False) -> AnnData: mem = backed[backed.obs["cluster"] == "a", :].to_memory() """ new = {} - for attr_name in [ - "obs", - "var", - "obsm", - "varm", - "obsp", - "varp", - "layers", - "uns", - ]: - attr = getattr(self, attr_name, None) + for attr_name, attr in iter_outer(self): if attr is not None: - new[attr_name] = to_memory(attr, copy=copy) - - if self.raw is not None: - new["raw"] = { - "X": to_memory(self.raw.X, copy=copy), - "var": to_memory(self.raw.var, copy=copy), - "varm": to_memory(self.raw.varm, copy=copy), - } + if attr is self.raw: + new["raw"] = { + "X": to_memory(self.raw.X, copy=copy), + "var": to_memory(self.raw.var, copy=copy), + "varm": to_memory(self.raw.varm, copy=copy), + } + else: + new[attr_name] = to_memory(attr, copy=copy) if self.isbacked: new["layers"][None] = self.X[...] 
@@ -1398,9 +1386,32 @@ def to_memory(self, *, copy: bool = False) -> AnnData: return AnnData(**new) + def _has_raw_zarr_or_h5_array(self) -> bool: + def predicate( + elem: RWAble, + *, + accumulate: bool, + attr_name: AnnDataElem | None = None, + ): + if isinstance(elem, MutableMapping): + return accumulate or any( + isinstance( + v, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset + ) + for v in elem.values() + ) + return accumulate or isinstance( + elem, ZarrArray | BaseCompressedSparseDataset | h5py.Dataset + ) + + return self._reduce(predicate, init=False) + def copy(self, filename: PathLike[str] | str | None = None) -> AnnData: """Full copy, optionally on disk.""" if not self.isbacked: + if self._has_raw_zarr_or_h5_array(): + msg = "Copy is not implemented for anndatas which have backing raw h5 (not in backed mode) or zarr arrays" + raise NotImplementedError(msg) return self._copy() else: from ..io import read_h5ad, write_h5ad @@ -1416,295 +1427,99 @@ def copy(self, filename: PathLike[str] | str | None = None) -> AnnData: write_h5ad(filename, self) return read_h5ad(filename, backed=mode) - @deprecated( - deprecation_msg( - *("AnnData.concatenate", "anndata.concat"), - "See the tutorial for concat at: " - "https://anndata.readthedocs.io/en/latest/concatenation.html", - ) - ) - def concatenate( + def _reduce[T]( self, - *adatas: AnnData, - join: str = "inner", - batch_key: str = "batch", - batch_categories: Sequence[Any] | None = None, - uns_merge: str | None = None, - index_unique: str | None = "-", - fill_value=None, - ) -> AnnData: - """\ - Concatenate along the observations axis. - - The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored. + func: ReduceFunc[T], + *, + init: T, + ) -> T: + """Accumulate a value starting from init by iterating over the parent "elems"of the AnnData object i.e., raw, obs, varp etc. - Currently, this works only in `'memory'` mode. + Parameters + ---------- + func + The function that performs the accumulation. 
+ init + The starting value - .. note:: + Returns + ------- + An accumulated value + """ + accumulate = init + for attr_name, attr in iter_outer(self): + accumulate = func(attr, accumulate=accumulate, attr_name=attr_name) + return accumulate - For more flexible and efficient concatenation, see: :func:`~anndata.concat`. + def unwriteable(self, *, store_type: Literal["h5", "zarr"] | None) -> bool: + """Whether or not an `AnnData` object can be written to disk for a given store type. Parameters ---------- - adatas - AnnData matrices to concatenate with. Each matrix is referred to as - a “batch”. - join - Use intersection (`'inner'`) or union (`'outer'`) of variables. - batch_key - Add the batch annotation to :attr:`obs` using this key. - batch_categories - Use these as categories for the batch annotation. By default, use increasing numbers. - uns_merge - Strategy to use for merging entries of uns. These strategies are applied recusivley. - Currently implemented strategies include: - - * `None`: The default. The concatenated object will just have an empty dict for `uns`. - * `"same"`: Only entries which have the same value in all AnnData objects are kept. - * `"unique"`: Only entries which have one unique value in all AnnData objects are kept. - * `"first"`: The first non-missing value is used. - * `"only"`: A value is included if only one of the AnnData objects has a value at this - path. - index_unique - Make the index unique by joining the existing index names with the - batch category, using `index_unique='-'`, for instance. Provide - `None` to keep existing indices. - fill_value - Scalar value to fill newly missing values in arrays with. Note: only applies to arrays - and sparse matrices (not dataframes) and will only be used if `join="outer"`. - - .. note:: - If not provided, the default value is `0` for sparse matrices and `np.nan` - for numpy arrays. See the examples below for more information. 
+ store_type + Which backing store - `None` indicates that it can be writeable to either. Returns ------- - :class:`~anndata.AnnData` - The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]` - stores a categorical variable labeling the batch. - - Notes - ----- - - .. warning:: - - If you use `join='outer'` this fills 0s for sparse data when - variables are absent in a batch. Use this with care. Dense data is - filled with `NaN`. See the examples. - - Examples - -------- - Joining on intersection of variables. - - >>> adata1 = AnnData( - ... np.array([[1, 2, 3], [4, 5, 6]]), - ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), - ... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]), - ... ) - >>> adata2 = AnnData( - ... np.array([[1, 2, 3], [4, 5, 6]]), - ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), - ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), - ... ) - >>> adata3 = AnnData( - ... np.array([[1, 2, 3], [4, 5, 6]]), - ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), - ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), - ... ) - >>> adata = adata1.concatenate(adata2, adata3) - >>> adata - AnnData object with n_obs × n_vars = 6 × 2 - obs: 'anno1', 'anno2', 'batch' - var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' - layers: None - >>> adata.X - array([[2, 3], - [5, 6], - [3, 2], - [6, 5], - [3, 2], - [6, 5]]) - >>> adata.obs - anno1 anno2 batch - s1-0 c1 NaN 0 - s2-0 c2 NaN 0 - s3-1 c3 NaN 1 - s4-1 c4 NaN 1 - s1-2 NaN d3 2 - s2-2 NaN d4 2 - >>> adata.var.T - b c - annoA-0 1 2 - annoA-1 2 1 - annoA-2 3 2 - annoB-2 2 1 - - Joining on the union of variables. 
- - >>> outer = adata1.concatenate(adata2, adata3, join='outer') - >>> outer - AnnData object with n_obs × n_vars = 6 × 4 - obs: 'anno1', 'anno2', 'batch' - var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' - layers: None - >>> outer.var.T - a b c d - annoA-0 0.0 1.0 2.0 NaN - annoA-1 NaN 2.0 1.0 0.0 - annoA-2 NaN 3.0 2.0 0.0 - annoB-2 NaN 2.0 1.0 0.0 - >>> outer.var_names.astype("string") - Index(['a', 'b', 'c', 'd'], dtype='string') - >>> outer.X - array([[ 1., 2., 3., nan], - [ 4., 5., 6., nan], - [nan, 3., 2., 1.], - [nan, 6., 5., 4.], - [nan, 3., 2., 1.], - [nan, 6., 5., 4.]]) - >>> outer.X.sum(axis=0) - array([nan, 25., 23., nan]) - >>> import pandas as pd - >>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names) - >>> Xdf - a b c d - 0 1.0 2.0 3.0 NaN - 1 4.0 5.0 6.0 NaN - 2 NaN 3.0 2.0 1.0 - 3 NaN 6.0 5.0 4.0 - 4 NaN 3.0 2.0 1.0 - 5 NaN 6.0 5.0 4.0 - >>> Xdf.sum() - a 5.0 - b 25.0 - c 23.0 - d 10.0 - dtype: float64 - - One way to deal with missing values is to use masked arrays: - - >>> from numpy import ma - >>> outer.X = ma.masked_invalid(outer.X) - >>> outer.X - masked_array( - data=[[1.0, 2.0, 3.0, --], - [4.0, 5.0, 6.0, --], - [--, 3.0, 2.0, 1.0], - [--, 6.0, 5.0, 4.0], - [--, 3.0, 2.0, 1.0], - [--, 6.0, 5.0, 4.0]], - mask=[[False, False, False, True], - [False, False, False, True], - [ True, False, False, False], - [ True, False, False, False], - [ True, False, False, False], - [ True, False, False, False]], - fill_value=1e+20) - >>> outer.X.sum(axis=0).data - array([ 5., 25., 23., 10.]) - - The masked array is not saved but has to be reinstantiated after saving. - - >>> outer.write('./test.h5ad') - >>> from anndata import read_h5ad - >>> outer = read_h5ad('./test.h5ad') - >>> outer.X - array([[ 1., 2., 3., nan], - [ 4., 5., 6., nan], - [nan, 3., 2., 1.], - [nan, 6., 5., 4.], - [nan, 3., 2., 1.], - [nan, 6., 5., 4.]]) - - For sparse data, everything behaves similarly, - except that for `join='outer'`, zeros are added. 
- - >>> from scipy.sparse import csr_matrix - >>> adata1 = AnnData( - ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), - ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), - ... dict(var_names=['a', 'b', 'c']), - ... ) - >>> adata2 = AnnData( - ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), - ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), - ... dict(var_names=['d', 'c', 'b']), - ... ) - >>> adata3 = AnnData( - ... csr_matrix([[1, 2, 0], [0, 5, 6]], dtype=np.float32), - ... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']), - ... dict(var_names=['d', 'c', 'b']), - ... ) - >>> adata = adata1.concatenate(adata2, adata3, join='outer') - >>> adata.var_names.astype("string") - Index(['a', 'b', 'c', 'd'], dtype='string') - >>> adata.X.toarray() - array([[0., 2., 3., 0.], - [0., 5., 6., 0.], - [0., 3., 2., 0.], - [0., 6., 5., 0.], - [0., 0., 2., 1.], - [0., 6., 5., 0.]], dtype=float32) + Whether or not this object is writeable. + While the return type may change to include richer output about which elements cannot be written, + this new type's evaluation as a boolean will not change from the current behavior i.e., + `bool(adata.unwriteable())` will always evaluate the same. """ - from .merge import concat, merge_dataframes, merge_outer, merge_same - if self.isbacked: - msg = "Currently, concatenate only works in memory mode." 
- raise ValueError(msg) + from anndata._io.specs.registry import _REGISTRY - if len(adatas) == 0: - return self.copy() - elif len(adatas) == 1 and not isinstance(adatas[0], AnnData): - adatas = adatas[0] # backwards compatibility - all_adatas = (self, *adatas) - - out = concat( - all_adatas, - axis=0, - join=join, - label=batch_key, - keys=batch_categories, - uns_merge=uns_merge, - fill_value=fill_value, - index_unique=index_unique, - pairwise=False, - ) + writeable_elems = { + src_type + for (dest_type, src_type, __) in _REGISTRY.write + if store_type is None or store_type in dest_type.__module__ + } - # Backwards compat (some of this could be more efficient) - # obs used to always be an outer join - sparse_class = sparse.csr_matrix - if any(isinstance(a.X, CSArray) for a in all_adatas): - sparse_class = sparse.csr_array - out.obs = concat( - [AnnData(sparse_class(a.shape), obs=a.obs) for a in all_adatas], - axis=0, - join="outer", - label=batch_key, - keys=batch_categories, - index_unique=index_unique, - ).obs - # Removing varm - del out.varm - # Implementing old-style merging of var - if batch_categories is None: - batch_categories = np.arange(len(all_adatas)).astype(str) - pat = rf"-({'|'.join(batch_categories)})$" - out.var = merge_dataframes( - [a.var for a in all_adatas], - out.var_names, - partial(merge_outer, batch_keys=batch_categories, merge=merge_same), - ) - out.var = out.var.iloc[ - :, - ( - out.var.columns.str - .extract(pat, expand=False) - .fillna("") - .argsort(kind="stable") - ), - ] + def predicate( # noqa: PLR0911 + elem: RWAble, + *, + accumulate: bool, + attr_name: AnnDataElem | None = None, + ): + if elem is None: + return accumulate + if isinstance(elem, AnnData): + return accumulate and elem.unwriteable(store_type=store_type) + if isinstance(elem, pd.Categorical): + return accumulate and predicate(elem.categories, accumulate=accumulate) + if isinstance(elem, pd.Series | pd.Index): + # matches behavior in methods.py + return accumulate and 
predicate(elem._values, accumulate=accumulate) + if isinstance(elem, AwkArray): + import awkward as ak + + container = ak.to_buffers(ak.to_packed(elem)) + return accumulate and all( + predicate(v, accumulate=accumulate) for v in container[2].values() + ) + if attr_name == "raw": + accumulate = accumulate and type(elem.X) in writeable_elems + return accumulate and all( + predicate(e[attr], accumulate=accumulate) + for e in [elem.var, elem.varm] + for attr in e + ) + if attr_name in { + "obs", + "obsm", + "varm", + "var", + "layers", + "varp", + "obsp", + "uns", + } or isinstance(elem, pd.DataFrame | XDataset | MutableMapping): + return accumulate and all( + predicate(elem[k], accumulate=accumulate) for k in elem + ) + return accumulate and type(elem) in writeable_elems - return out + return self._reduce(predicate, init=True) def var_names_make_unique(self, join: str = "-") -> None: # Important to go through the setter so obsm dataframes are updated too @@ -1892,8 +1707,11 @@ def write_csvs( write_csvs(dirname, self, skip_data=skip_data, sep=sep) @deprecated( - "Deprecated in favor of other formats, e.g. `write_h5ad`. " - "Loom isn’t well-maintained and supports only a subset of anndata features." + Deprecation( + "0.13", + "Deprecated in favor of other formats, e.g. `write_h5ad`. 
" + "Loom isn’t well-maintained and supports only a subset of anndata features.", + ) ) @old_positionals("write_obsm_varm") def write_loom( @@ -2031,8 +1849,9 @@ def _has_X(self) -> bool: # -------------------------------------------------------------------------- @property - @deprecated(deprecation_msg("isview", "is_view")) + @deprecated(Deprecation("0.7.2", deprecation_msg("isview", "is_view"))) def isview(self) -> bool: + """Whether or not this object is a view.""" return self.is_view def _clean_up_old_format(self, uns): diff --git a/src/anndata/_core/extensions.py b/src/anndata/_core/extensions.py index a983668bf..9835c0e9d 100644 --- a/src/anndata/_core/extensions.py +++ b/src/anndata/_core/extensions.py @@ -1,237 +1,12 @@ from __future__ import annotations -import inspect -from typing import TYPE_CHECKING, get_type_hints, overload +from scverse_misc import make_register_namespace_decorator -from ..types import ExtensionNamespace -from ..utils import warn from .anndata import AnnData -if TYPE_CHECKING: - from collections.abc import Callable - - -# Based off of the extension framework in Polars -# https://github.com/pola-rs/polars/blob/main/py-polars/polars/api.py - __all__ = ["register_anndata_namespace"] -# Reserved namespaces include accessors built into AnnData (currently there are none) -# and all current attributes of AnnData -_reserved_namespaces: set[str] = set(dir(AnnData)) - - -class AccessorNameSpace[NameSpT: ExtensionNamespace](ExtensionNamespace): - """Establish property-like namespace object for user-defined functionality.""" - - def __init__(self, name: str, namespace: type[NameSpT]) -> None: - self._accessor = name - self._ns = namespace - - @overload - def __get__[T](self, instance: None, cls: type[T]) -> type[NameSpT]: ... - - @overload - def __get__[T](self, instance: T, cls: type[T]) -> NameSpT: ... 
- - def __get__[T](self, instance: T | None, cls: type[T]) -> NameSpT | type[NameSpT]: - if instance is None: - return self._ns - - ns_instance = self._ns(instance) # type: ignore[call-arg] - setattr(instance, self._accessor, ns_instance) - return ns_instance - - -def _check_namespace_signature(ns_class: type) -> None: - """Validate the signature of a namespace class for AnnData extensions. - - This function ensures that any class intended to be used as an extension namespace - has a properly formatted `__init__` method such that: - - 1. Accepts at least two parameters (self and adata) - 2. Has 'adata' as the name of the second parameter - 3. Has the second parameter properly type-annotated as 'AnnData' or any equivalent import alias - - The function performs runtime validation of these requirements before a namespace - can be registered through the `register_anndata_namespace` decorator. - - Parameters - ---------- - ns_class - The namespace class to validate. - - Raises - ------ - TypeError - If the `__init__` method has fewer than 2 parameters (missing the AnnData parameter). - AttributeError - If the second parameter of `__init__` lacks a type annotation. - TypeError - If the second parameter of `__init__` is not named 'adata'. - TypeError - If the second parameter of `__init__` is not annotated as the 'AnnData' class. - TypeError - If both the name and type annotation of the second parameter are incorrect. - - """ - sig = inspect.signature(ns_class.__init__) - params = list(sig.parameters.values()) - - # Ensure there are at least two parameters (self and adata) - if len(params) < 2: - error_msg = "Namespace initializer must accept an AnnData instance as the second parameter." - raise TypeError(error_msg) - - # Get the second parameter (expected to be 'adata') - param = params[1] - if param.annotation is inspect._empty: - err_msg = "Namespace initializer's second parameter must be annotated as the 'AnnData' class, got empty annotation." 
- raise AttributeError(err_msg) - - name_ok = param.name == "adata" - - # Resolve the annotation using get_type_hints to handle forward references and aliases. - try: - type_hints = get_type_hints(ns_class.__init__) - resolved_type = type_hints.get(param.name, param.annotation) - except NameError as e: - err_msg = f"Namespace initializer's second parameter must be named 'adata', got '{param.name}'." - raise NameError(err_msg) from e - - type_ok = resolved_type is AnnData - - match (name_ok, type_ok): - case (True, True): - return # Signature is correct. - case (False, True): - msg = f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}." - raise TypeError(msg) - case (True, False): - type_repr = getattr(resolved_type, "__name__", str(resolved_type)) - msg = f"Namespace initializer's second parameter must be annotated as the 'AnnData' class, got '{type_repr}'." - raise TypeError(msg) - case _: - type_repr = getattr(resolved_type, "__name__", str(resolved_type)) - msg = ( - f"Namespace initializer's second parameter must be named 'adata', got {param.name!r}. " - f"And must be annotated as 'AnnData', got {type_repr!r}." 
- ) - raise TypeError(msg) - - -def _create_namespace[NameSpT: ExtensionNamespace]( - name: str, cls: type[AnnData] -) -> Callable[[type[NameSpT]], type[NameSpT]]: - """Register custom namespace against the underlying AnnData class.""" - - def namespace(ns_class: type[NameSpT]) -> type[NameSpT]: - _check_namespace_signature(ns_class) # Perform the runtime signature check - if name in _reserved_namespaces: - msg = f"cannot override reserved attribute {name!r}" - raise AttributeError(msg) - elif name in cls._accessors: - warn( - f"Overriding existing custom namespace {name!r} (on {cls.__name__!r})", - UserWarning, - ) - setattr(cls, name, AccessorNameSpace(name, ns_class)) - cls._accessors.add(name) - return ns_class - - return namespace - - -def register_anndata_namespace[NameSpT: ExtensionNamespace]( - name: str, -) -> Callable[[type[NameSpT]], type[NameSpT]]: - """Decorator for registering custom functionality with an :class:`~anndata.AnnData` object. - - This decorator allows you to extend AnnData objects with custom methods and properties - organized under a namespace. The namespace becomes accessible as an attribute on AnnData - instances, providing a clean way to you to add domain-specific functionality without modifying - the AnnData class itself, or extending the class with additional methods as you see fit in your workflow. - - Parameters - ---------- - name - Name under which the accessor should be registered. This will be the attribute name - used to access your namespace's functionality on AnnData objects (e.g., `adata.{name}`). - Cannot conflict with existing AnnData attributes like `obs`, `var`, `X`, etc. The list of reserved - attributes includes everything outputted by `dir(AnnData)`. - - Returns - ------- - A decorator that registers the decorated class as a custom namespace. - - Notes - ----- - Implementation requirements: - - 1. 
The decorated class must have an `__init__` method that accepts exactly one parameter - (besides `self`) named `adata` and annotated with type :class:`~anndata.AnnData`. - 2. The namespace will be initialized with the AnnData object on first access and then - cached on the instance. - 3. If the namespace name conflicts with an existing namespace, a warning is issued. - 4. If the namespace name conflicts with a built-in AnnData attribute, an AttributeError is raised. - - Examples - -------- - Simple transformation namespace with two methods: - - >>> import anndata as ad - >>> import numpy as np - >>> - >>> @ad.register_anndata_namespace("transform") - ... class TransformX: - ... def __init__(self, adata: ad.AnnData): - ... self._adata = adata - ... - ... def log1p( - ... self, layer: str = None, inplace: bool = False - ... ) -> ad.AnnData | None: - ... '''Log1p transform the data.''' - ... data = self._adata.layers[layer] if layer else self._adata.X - ... log1p_data = np.log1p(data) - ... - ... if layer: - ... layer_name = f"{layer}_log1p" if not inplace else layer - ... else: - ... layer_name = "log1p" - ... - ... self._adata.layers[layer_name] = log1p_data - ... - ... if not inplace: - ... return self._adata - ... - ... def arcsinh( - ... self, layer: str = None, scale: float = 1.0, inplace: bool = False - ... ) -> ad.AnnData | None: - ... '''Arcsinh transform the data with optional scaling.''' - ... data = self._adata.layers[layer] if layer else self._adata.X - ... asinh_data = np.arcsinh(data / scale) - ... - ... if layer: - ... layer_name = f"{layer}_arcsinh" if not inplace else layer - ... else: - ... layer_name = "arcsinh" - ... - ... self._adata.layers[layer_name] = asinh_data - ... - ... if not inplace: - ... 
return self._adata - >>> - >>> # Create an AnnData object - >>> rng = np.random.default_rng(42) - >>> adata = ad.AnnData(X=rng.poisson(1, size=(100, 2000))) - >>> - >>> # Use the registered namespace - >>> adata.transform.log1p() # Transforms X and returns the AnnData object - AnnData object with n_obs × n_vars = 100 × 2000 - layers: None, 'log1p' - >>> adata.transform.arcsinh() # Transforms X and returns the AnnData object - AnnData object with n_obs × n_vars = 100 × 2000 - layers: None, 'log1p', 'arcsinh' - """ - return _create_namespace(name, AnnData) +register_anndata_namespace = make_register_namespace_decorator( + AnnData, "adata", "register_anndata_namespace", "numpy" +) diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 578981449..b26bc5f50 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -583,6 +583,29 @@ def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=No def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None): import dask.array as da + indexer = self.idx + is_outer = any(indexer == -1) + # Fast path for the majority of sparse matrices whose minor-axis is unchunked and is being reindexed. + # This prevents 0's from being stored explicitly in the sparse matrices when outer joining, for example (see below). 
+ if ( + is_sparse_sub := isinstance(el._meta, CSArray | CSMatrix) + and el.chunksize[minor_axis := int(el._meta.format == "csr")] + == el.shape[minor_axis] + and axis == minor_axis + and is_outer + ): + return el.map_blocks( + partial( + self._apply_to_sparse, + axis=axis, + fill_value=fill_value, + keep_format=True, + ), + chunks=(el.chunks[0], len(self.new_idx)) + if minor_axis == 1 + else (len(self.new_idx), el.chunks[1]), + meta=el._meta, + ) if fill_value is None: fill_value = default_fill_value([el]) shape = list(el.shape) @@ -591,12 +614,11 @@ def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None): shape[axis] = len(self.new_idx) return da.broadcast_to(fill_value, tuple(shape)) - indexer = self.idx sub_el = _subset(el, make_slice(indexer, axis, len(shape))) - if any(indexer == -1): + if is_outer: # TODO: Remove this condition once https://github.com/dask/dask/pull/12078 is released - if isinstance(sub_el._meta, CSArray | CSMatrix) and np.isscalar(fill_value): + if is_sparse_sub and np.isscalar(fill_value): fill_value = np.array([[fill_value]]) sub_el[make_slice(indexer == -1, axis, len(shape))] = fill_value @@ -658,7 +680,7 @@ def _apply_to_array_api( return xp.where(mask, fv, taken) def _apply_to_sparse( # noqa: PLR0912 - self, el: CSMatrix | CSArray, *, axis, fill_value=None + self, el: CSMatrix | CSArray, *, axis, fill_value=None, keep_format: bool = True ) -> CSMatrix: if isinstance(el, CupySparseMatrix): from cupyx.scipy import sparse @@ -730,7 +752,8 @@ def _apply_to_sparse( # noqa: PLR0912 if fill_idxer is not None: out[fill_idxer] = fill_value - + if keep_format: + out = out.tocsr() if el.format == "csr" else out.tocsc() return out def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): @@ -1289,7 +1312,8 @@ def make_xarray_extension_dtypes_dask( ) -DS_CONCAT_DUMMY_INDEX_NAME = "concat_index" +DS_CONCAT_DUMMY_INDEX_NAME = "_anndata_concat_index" +DS_MERGE_DUMMY_INDEX_NAME = "_anndata_merge_index" def 
concat_dataset2d_on_annot_axis( @@ -1707,7 +1731,7 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 if a.true_index_dim != a.index_dim: a.index = a.true_index annotations_with_only_dask = [ - a.ds.rename({a.true_index_dim: "merge_index"}) + a.ds.rename({a.true_index_dim: DS_MERGE_DUMMY_INDEX_NAME}) for a in annotations_with_only_dask ] alt_annot = Dataset2D( diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 790c52b50..6f5a77b7b 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -118,7 +118,7 @@ def n_obs(self) -> int: return self._n_obs varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - "varm", AxisArrays, 1 + AxisArrays, 1 ) @property diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 0e75d604a..c312644c6 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -243,6 +243,11 @@ def to_memory(self, *, copy: bool = False) -> pd.DataFrame: ------- :class:`pandas.DataFrame` with index set accordingly. 
""" + from anndata._core.merge import ( + DS_CONCAT_DUMMY_INDEX_NAME, + DS_MERGE_DUMMY_INDEX_NAME, + ) + index_key = self.ds.attrs.get("indexing_key", None) all_columns = {*self.columns, *([] if index_key is None else [index_key])} # https://github.com/pydata/xarray/issues/10419 @@ -258,7 +263,12 @@ def to_memory(self, *, copy: bool = False) -> pd.DataFrame: ) if df.index.name != index_key and index_key is not None: df = df.set_index(index_key) - df.index.name = None # matches old AnnData object + if df.index.name in { + "_index", + DS_CONCAT_DUMMY_INDEX_NAME, + DS_MERGE_DUMMY_INDEX_NAME, + }: + df.index.name = None # matches old AnnData object return df @property diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 253bc5bfc..13f920428 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -1,6 +1,7 @@ from __future__ import annotations import re +from collections.abc import MutableMapping from functools import partial from pathlib import Path from types import MappingProxyType @@ -23,7 +24,7 @@ _from_fixed_length_strings, ) from ..experimental import read_dispatched -from ..utils import warn +from ..utils import iter_outer, warn from .specs import read_elem, write_elem from .specs.registry import IOSpec, write_spec from .utils import ( @@ -84,28 +85,28 @@ def write_h5ad( f = cast("h5py.Group", f["/"]) f.attrs.setdefault("encoding-type", "anndata") f.attrs.setdefault("encoding-version", "0.1.0") - - _write_x( - f, - adata, # accessing adata.X reopens adata.file if it’s backed - is_backed=adata.isbacked and adata.filename == filepath, - as_dense=as_dense, - dataset_kwargs=dataset_kwargs, - ) - _write_raw(f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs) - write_elem(f, "obs", adata.obs, dataset_kwargs=dataset_kwargs) - write_elem(f, "var", adata.var, dataset_kwargs=dataset_kwargs) - write_elem(f, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs) - write_elem(f, "varm", dict(adata.varm), 
dataset_kwargs=dataset_kwargs) - write_elem(f, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) - write_elem(f, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) - write_elem( - f, - "layers", - {k: v for k, v in adata.layers.items() if k is not None}, - dataset_kwargs=dataset_kwargs, - ) - write_elem(f, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) + for k, elem in iter_outer(adata): + if k == "X": + _write_x( + f, + adata, # accessing adata.X reopens adata.file if it’s backed + is_backed=adata.isbacked and adata.filename == filepath, + as_dense=as_dense, + dataset_kwargs=dataset_kwargs, + ) + elif k == "raw": + _write_raw( + f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs + ) + else: + if k == "layers": + elem = {k: v for k, v in elem.items() if k is not None} + write_elem( + f, + k, + dict(elem) if isinstance(elem, MutableMapping) else elem, + dataset_kwargs=dataset_kwargs, + ) def _write_x( diff --git a/src/anndata/_io/read.py b/src/anndata/_io/read.py index 2211864ed..8a2cbd5e5 100644 --- a/src/anndata/_io/read.py +++ b/src/anndata/_io/read.py @@ -12,10 +12,11 @@ import numpy as np import pandas as pd from scipy import sparse +from scverse_misc import Deprecation, deprecated from .. import AnnData from ..compat import old_positionals, pandas_as_str -from ..utils import deprecated, warn +from ..utils import warn from .utils import is_float if TYPE_CHECKING: @@ -157,8 +158,11 @@ def _fmt_loom_axis_attrs( @deprecated( - "Deprecated in favor of other formats, e.g. (`write_h5ad` and then) `read_h5ad`. " - "Loom isn’t well-maintained and supports only a subset of anndata features.", + Deprecation( + "0.13", + "Deprecated in favor of other formats, e.g. (`write_h5ad` and then) `read_h5ad`. 
" + "Loom isn’t well-maintained and supports only a subset of anndata features.", + ) ) @old_positionals( "sparse", diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 91f9350b4..125fb6f45 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -1,6 +1,6 @@ from __future__ import annotations -from collections.abc import Mapping +from collections.abc import Mapping, MutableMapping from copy import copy from functools import partial from itertools import product @@ -41,7 +41,7 @@ from ..._settings import settings from ...compat import PANDAS_STRING_ARRAY_TYPES, PANDAS_SUPPORTS_NA_VALUE -from ...utils import warn +from ...utils import iter_outer, warn from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial if TYPE_CHECKING: @@ -286,22 +286,16 @@ def write_anndata( dataset_kwargs: Mapping[str, Any] = MappingProxyType({}), ): g = f.require_group(k) - if adata.X is not None: - _writer.write_elem(g, "X", adata.X, dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "obs", adata.obs, dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "var", adata.var, dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "obsm", dict(adata.obsm), dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "varm", dict(adata.varm), dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "obsp", dict(adata.obsp), dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "varp", dict(adata.varp), dataset_kwargs=dataset_kwargs) - _writer.write_elem( - g, - "layers", - {k: v for k, v in adata.layers.items() if k is not None}, - dataset_kwargs=dataset_kwargs, - ) - _writer.write_elem(g, "uns", dict(adata.uns), dataset_kwargs=dataset_kwargs) - _writer.write_elem(g, "raw", adata.raw, dataset_kwargs=dataset_kwargs) + for sub_key, elem in iter_outer(adata): + if not (sub_key == "X" and elem is None): + if sub_key == "layers": + elem = {k: v for k, v in elem.items() if k is not None} + _writer.write_elem( + g, + sub_key, + 
dict(elem) if isinstance(elem, MutableMapping) else elem, + dataset_kwargs=dataset_kwargs, + ) @_REGISTRY.register_read(H5Group, IOSpec("anndata", "0.1.0")) diff --git a/src/anndata/_io/write.py b/src/anndata/_io/write.py index 3691f929a..f765f85f7 100644 --- a/src/anndata/_io/write.py +++ b/src/anndata/_io/write.py @@ -8,13 +8,14 @@ import numpy as np import pandas as pd from scipy.sparse import issparse +from scverse_misc import Deprecation, deprecated from anndata._io.utils import no_write_dataset_2d from .._warnings import WriteWarning from ..compat import old_positionals from ..logging import get_logger -from ..utils import deprecated, warn +from ..utils import warn if TYPE_CHECKING: from os import PathLike @@ -84,8 +85,11 @@ def write_csvs( @deprecated( - "Deprecated in favor of other formats, e.g. `write_h5ad`. " - "Loom isn’t well-maintained and supports only a subset of anndata features." + Deprecation( + "0.13", + "Deprecated in favor of other formats, e.g. `write_h5ad`. " + "Loom isn’t well-maintained and supports only a subset of anndata features.", + ) ) @no_write_dataset_2d @old_positionals("write_obsm_varm") diff --git a/src/anndata/_io/zarr.py b/src/anndata/_io/zarr.py index 06d16d909..229315b0b 100644 --- a/src/anndata/_io/zarr.py +++ b/src/anndata/_io/zarr.py @@ -1,5 +1,8 @@ from __future__ import annotations +import warnings +from contextlib import contextmanager, nullcontext +from importlib.util import find_spec from typing import TYPE_CHECKING import numpy as np @@ -24,6 +27,25 @@ from zarr.storage import StoreLike +@contextmanager +def zarrs_context(): + with ( + ( + zarr.config.set({"codec_pipeline.path": "zarrs.ZarrsCodecPipeline"}) + if find_spec("zarrs") + else nullcontext() + ), + warnings.catch_warnings() if find_spec("zarrs") else nullcontext(), + ): + if find_spec("zarrs"): + warnings.filterwarnings( + "ignore", + message=r".*unsupported by ZarrsCodecPipeline.*", + category=UserWarning, + ) + yield + + @no_write_dataset_2d def 
write_zarr( store: StoreLike, @@ -38,10 +60,6 @@ def write_zarr( adata.strings_to_categoricals() if adata.raw is not None: adata.strings_to_categoricals(adata.raw.var) - # TODO: Use spec writing system for this - f = open_write_group(store) - f.attrs.setdefault("encoding-type", "anndata") - f.attrs.setdefault("encoding-version", "0.1.0") def callback( write_func, store, elem_name: str, elem, *, dataset_kwargs, iospec @@ -54,8 +72,14 @@ def callback( dataset_kwargs = dict(dataset_kwargs, chunks=chunks) write_func(store, elem_name, elem, dataset_kwargs=dataset_kwargs) - write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) - zarr.consolidate_metadata(f.store) + with zarrs_context(): + # TODO: Use spec writing system for this + f = open_write_group(store) + f.attrs.setdefault("encoding-type", "anndata") + f.attrs.setdefault("encoding-version", "0.1.0") + + write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) + zarr.consolidate_metadata(f.store) def read_zarr(store: PathLike[str] | str | MutableMapping | zarr.Group) -> AnnData: @@ -67,10 +91,9 @@ def read_zarr(store: PathLike[str] | str | MutableMapping | zarr.Group) -> AnnDa store The filename, a :class:`~typing.MutableMapping`, or a Zarr storage class. 
""" - f = store if isinstance(store, zarr.Group) else zarr.open(store, mode="r") - # Read with handling for backwards compat def callback(func, elem_name: str, elem, iospec): + """Read with handling for backwards compat""" if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData(**{ k: read_dispatched(v, callback) @@ -86,17 +109,19 @@ def callback(func, elem_name: str, elem, iospec): return _read_legacy_raw(f, func(elem), read_dataframe, func) return func(elem) - adata = read_dispatched(f, callback=callback) + with zarrs_context(): + f = store if isinstance(store, zarr.Group) else zarr.open(store, mode="r") + adata = read_dispatched(f, callback=callback) - # Backwards compat (should figure out which version) - if "raw.X" in f: - raw = AnnData(**_read_legacy_raw(f, adata.raw, read_dataframe, read_elem)) - raw.obs_names = adata.obs_names - adata.raw = raw + # Backwards compat (should figure out which version) + if "raw.X" in f: + raw = AnnData(**_read_legacy_raw(f, adata.raw, read_dataframe, read_elem)) + raw.obs_names = adata.obs_names + adata.raw = raw - # Backwards compat for <0.7 - if isinstance(f["obs"], zarr.Array): - _clean_uns(adata) + # Backwards compat for <0.7 + if isinstance(f["obs"], zarr.Array): + _clean_uns(adata) return adata diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 6006b31c3..514b8b1e1 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -14,7 +14,10 @@ from collections.abc import Mapping from typing import Any, TypeAlias + from pandas import DataFrame + from anndata._core.xarray import Dataset2D + from anndata.typing import AxisStorable, _XDataType from ._io.specs.registry import ( IOSpec, @@ -23,6 +26,9 @@ Reader, Writer, ) + from ._types import AnnDataElem + from .compat import XDataset + else: # https://github.com/tox-dev/sphinx-autodoc-typehints/issues/580 type S = StorageType type RWAble = typing.RWAble @@ -216,3 +222,29 @@ def __call__( ] type Join_T = Literal["inner", "outer"] + + 
+class ReduceFunc[T](Protocol): + def __call__( + self, + elem: _XDataType | AxisStorable | DataFrame | XDataset, + *, + accumulate: T, + attr_name: AnnDataElem | None, + ) -> T: + """Function to be called on each visit within `anndata.AnnData._reduce`. + + Parameters + ---------- + elem + The current element. + accumulate + The value being accumulated. + attr_name + A reference to help users distinguish where they are in the `AnnData` object. + + Returns + ------- + An accumulated value + """ + ... diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 290a2bd36..f264eee67 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -688,6 +688,11 @@ def assert_equal_sparse( exact: bool = False, elem_name: str | None = None, ): + if exact and sparse.issparse(b) and hasattr(a, "indptr") and hasattr(b, "indptr"): + assert a.indptr.dtype == b.indptr.dtype, f"{elem_name}: indptr dtype mismatch" + assert a.indices.dtype == b.indices.dtype, ( + f"{elem_name}: indices dtype mismatch" + ) a = asarray(a) assert_equal(b, a, exact=exact, elem_name=elem_name) @@ -1201,12 +1206,7 @@ class AccessTrackingStore(LocalStore): _accessed_keys: defaultdict[str, list[str]] def __init__(self, *args, **kwargs): - import traceback - - traceback.print_stack() - print(kwargs) super().__init__(*args, **kwargs) - print(self._read_only) self._access_count = Counter() self._accessed = defaultdict(set) self._accessed_keys = defaultdict(list) diff --git a/src/anndata/types.py b/src/anndata/types.py index aa23d10f2..aa2bc795b 100644 --- a/src/anndata/types.py +++ b/src/anndata/types.py @@ -8,24 +8,6 @@ from array_api.latest import ArrayNamespace - from ._core.anndata import AnnData - - -@runtime_checkable -class ExtensionNamespace(Protocol): - """Protocol for extension namespaces. - - Enforces that the namespace initializer accepts a class with the proper `__init__` method. - Protocol's can't enforce that the `__init__` accepts the correct types. 
See - `_check_namespace_signature` for that. This is mainly useful for static type - checking with mypy and IDEs. - """ - - def __init__(self, adata: AnnData) -> None: - """ - Used to enforce the correct signature for extension namespaces. - """ - @runtime_checkable class SupportsArrayApi(Protocol): @@ -48,3 +30,23 @@ def __dlpack__( copy: bool | None = None, ) -> Any: ... def __dlpack_device__(self) -> tuple[int, int]: ... + + +def __getattr__(key: str): + match key: + case "ExtensionNamespace": + from scverse_misc import ExtensionNamespace + + from .utils import warn + + msg = ( + "Importing ExtensionNamespace from `types` is deprecated. " + "Please use scverse_misc instead." + ) + warn(msg, FutureWarning) + return ExtensionNamespace + case "SupportsArrayApi": + return SupportsArrayApi + case _: + msg = f"types has no attribute {key!r}" + raise AttributeError(msg) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 9089f90a5..bee9882f7 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import re -import sys import warnings from functools import partial, singledispatch from types import FunctionType, UnionType @@ -20,9 +19,13 @@ from .logging import get_logger if TYPE_CHECKING: - from collections.abc import Callable, Iterable, Mapping, Sequence + from collections.abc import Callable, Generator, Iterable, Mapping, Sequence from typing import Any, LiteralString + from ._core.xarray import Dataset2D + from ._types import AnnDataElem + from .typing import AxisStorable, _XDataType + logger = get_logger(__name__) @@ -365,50 +368,15 @@ def warn_once(msg: str, category: type[Warning]) -> None: warnings.filterwarnings("ignore", category=category, message=re.escape(msg)) -if TYPE_CHECKING: - from warnings import deprecated -else: - if sys.version_info >= (3, 13): - from warnings import deprecated as _deprecated - else: - from typing_extensions import deprecated as _deprecated - deprecated = 
partial(_deprecated, category=FutureWarning) - - def deprecation_msg( name: LiteralString, new_name: LiteralString, add_msg: LiteralString | None = None ) -> LiteralString: - msg = ( - f"Use {new_name} instead of {name}, " - f"{name} is deprecated and will be removed in the future." - ) + msg = f"Use {new_name} instead of {name}." if add_msg is not None: msg += f" {add_msg}" return msg -class DeprecationMixinMeta(type): - """\ - Use this as superclass so deprecated methods and properties - do not appear in vars(MyClass)/dir(MyClass) - """ - - def __dir__(cls): - dont_hide = getattr(cls, "_DONT_HIDE_DEPRECATED", set()) - - def is_hidden(attr: object) -> bool: - if isinstance(attr, property): - attr = attr.fget - is_deprecated = bool(getattr(attr, "__deprecated__", None)) - return is_deprecated and getattr(attr, "__name__", None) not in dont_hide - - return [ - item - for item in type.__dir__(cls) - if not is_hidden(getattr(cls, item, None)) - ] - - def set_module[C: FunctionType | type](name: str, /) -> Callable[[C], C]: def decorator(f: C) -> C: f.__module__ = name @@ -468,3 +436,27 @@ def module_get_attr_redirect( return getattr(mod, new_path) msg = f"module {full_old_module_path} has no attribute {attr_name!r}" raise AttributeError(msg) + + +def iter_outer( + adata, +) -> Generator[ + tuple[AnnDataElem, AxisStorable | _XDataType | Dataset2D | pd.DataFrame] +]: + """Iterate over key-value pairs of the parent "elems" like raw, obs, varp etc""" + for attr_name in [ + "X", + "obs", + "var", + "uns", + "obsm", + "varm", + "obsp", + "varp", + "layers", + "raw", + ]: + was_closed = adata.isbacked and not adata.file.is_open + yield (attr_name, getattr(adata, attr_name)) + if was_closed: + adata.file.close() diff --git a/tests/lazy/conftest.py b/tests/lazy/conftest.py index 5048a526b..a92bb6437 100644 --- a/tests/lazy/conftest.py +++ b/tests/lazy/conftest.py @@ -124,7 +124,8 @@ def adata_remote_with_store_tall_skinny_path( orig_path = 
tmp_path_factory.mktemp(f"orig_{worker_id}.zarr") M = 1000 N = 5 - obs_names = pd.Index(f"cell{i}" for i in range(M)) + # One named, one unnamed + obs_names = pd.Index((f"cell{i}" for i in range(M)), name="obs_names") var_names = pd.Index(f"gene{i}" for i in range(N)) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) diff --git a/tests/lazy/test_read.py b/tests/lazy/test_read.py index 5f203264b..c8e3a3104 100644 --- a/tests/lazy/test_read.py +++ b/tests/lazy/test_read.py @@ -94,16 +94,16 @@ def test_access_count_index( ) -> None: adata_orig = read_zarr(adata_remote_with_store_tall_skinny_path) - remote_store_tall_skinny.initialize_key_trackers(["obs/_index"]) + remote_store_tall_skinny.initialize_key_trackers(["obs/obs_names"]) read_lazy(remote_store_tall_skinny, load_annotation_index=False) - remote_store_tall_skinny.assert_access_count("obs/_index", 0) + remote_store_tall_skinny.assert_access_count("obs/obs_names", 0) read_lazy(remote_store_tall_skinny) n_chunks = 4 count_expected = ( # *2 when mask exists n_chunks * 2 if adata_orig.obs.index.dtype == "string" else n_chunks ) - remote_store_tall_skinny.assert_access_count("obs/_index", count_expected) + remote_store_tall_skinny.assert_access_count("obs/obs_names", count_expected) def test_access_count_dtype( diff --git a/tests/test_backed_dense.py b/tests/test_backed_dense.py index 01165b921..62cf32415 100644 --- a/tests/test_backed_dense.py +++ b/tests/test_backed_dense.py @@ -74,13 +74,19 @@ def test_create_delete( del getattr(adata, attr)["a"] -def test_assign_x_subset(file: h5py.File | zarr.Group): +@pytest.mark.parametrize( + "num_indexing_ops", [1, 2], ids=["single_index", "double_index"] +) +def test_assign_x_subset(file: h5py.File | zarr.Group, num_indexing_ops: Literal[1, 2]): x = np.ones((10, 10)) write_elem(file, "a", x) adata = AnnData(file["a"]) - - view = adata[3:7, 6:8] + if num_indexing_ops == 1: + view = adata[3:7, 6:8] # (3 : 7-3=4), (6 : 8-6=2) + else: + view = adata[2:8, 1:8] # 
first a wider window … + view = view[1:5, 5:7] # (2+1=3 : 5-1=4), (1+5=6 : 7-5=2) view.X = np.zeros((4, 2)) expected = x.copy() diff --git a/tests/test_backed_hdf5.py b/tests/test_backed_hdf5.py index 2d584a5be..a4939387f 100644 --- a/tests/test_backed_hdf5.py +++ b/tests/test_backed_hdf5.py @@ -290,10 +290,10 @@ def test_to_memory_full( def test_double_index(adata: ad.AnnData, backing_h5ad: Path): + adata_mem = adata.to_memory(copy=True) + adata_mem.strings_to_categoricals() adata.filename = backing_h5ad - with pytest.raises(ValueError, match=r"cannot make a view of a view"): - # no view of view of backed object currently - adata[:2][:, 0] + assert_equal(adata[:2][:, [0, 2]].to_memory(), adata_mem[:2][:, [0, 2]]) # close backing file adata.write() diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index be64cb223..91ba76133 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from functools import partial from itertools import product from typing import TYPE_CHECKING, Literal @@ -489,6 +490,7 @@ def width_idx_kinds( ], ids=["sparse_dataset", "read_elem_lazy"], ) +@pytest.mark.parametrize("read_data", [True, False], ids=["read", "no_read"]) def test_data_access( tmp_path: Path, sparse_format: Callable[[ArrayLike], CSMatrix], @@ -496,12 +498,16 @@ def test_data_access( idx_min: Idx, exp: list[str], open_func: Callable[[ZarrGroup], CSRDataset | CSCDataset | DaskArray], - zarr_metadata_key, - zarr_separator, + zarr_metadata_key: str, + zarr_separator: str, + *, + read_data: bool, ): exp = [ e.format(zarr_metadata_key=zarr_metadata_key, zarr_separator=zarr_separator) for e in exp + if ((is_data := (len(re.findall(r"/\d(?!\d)", e)) == 1)) and read_data) + or (not is_data) ] path = tmp_path / "test.zarr" a = sparse_format(np.eye(10, 10)) @@ -520,8 +526,13 @@ def test_data_access( store.initialize_key_trackers(["X/data"]) f = zarr.open_group(store, mode="r") a_disk = 
AnnData(X=open_func(f["X"])) - subset = a_disk[idx_maj, idx_min] if a.format == "csr" else a_disk[idx_min, idx_maj] - if isinstance(subset.X, DaskArray): + subset = ( + a_disk[idx_maj, :][:, idx_min] + if a.format == "csr" + else a_disk[idx_min, :][:, idx_maj] + ) + # Accessing X reads data if backed, otherwise call compute + if read_data and isinstance(subset.X, DaskArray): subset.X.compute(scheduler="single-threaded") # zarr v2 fetches all and not just metadata for that node in 3.X.X python package # TODO: https://github.com/zarr-developers/zarr-python/discussions/2760 diff --git a/tests/test_base.py b/tests/test_base.py index 254e483b8..096829394 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -5,9 +5,11 @@ from itertools import product from typing import TYPE_CHECKING +import h5py import numpy as np import pandas as pd import pytest +import zarr from numpy import ma from scipy import sparse as sp from scipy.sparse import csr_matrix, issparse @@ -15,6 +17,7 @@ import anndata as ad from anndata import AnnData, ImplicitModificationWarning from anndata._core.raw import Raw +from anndata._core.sparse_dataset import sparse_dataset from anndata._settings import settings from anndata.acc import A from anndata.tests.helpers import ( @@ -39,6 +42,47 @@ ) +@pytest.fixture(params=["zarr", "h5ad"], scope="session") +def diskfmt(request) -> Literal["zarr", "h5ad"]: + return request.param + + +@pytest.fixture( + params=[adata_sparse, adata_dense], ids=["sparse", "dense"], scope="session" +) +def adata(request) -> AnnData: + return request.param + + +@pytest.fixture(scope="session") +def adata_on_disk( + diskfmt: Literal["h5ad", "zarr"], + adata: AnnData, + tmp_path_factory: pytest.TempPathFactory, +) -> Path: + path = ( + tmp_path_factory.mktemp("disk_backed") + / f"{'sparse' if adata is adata_sparse else 'dense'}.{diskfmt}" + ) + getattr(adata, f"write_{diskfmt}")(path) + return path + + +@pytest.mark.parametrize("elem_name", ["X", "obsm", "layers"]) +def 
test_cant_copy_disk_backed( + adata_on_disk: Path, elem_name: Literal["X", "obsm", "layers"] +): + is_sparse = "sparse" in str(adata_on_disk) + is_zarr = adata_on_disk.suffix == ".zarr" + root = (zarr.open if is_zarr else h5py.File)(adata_on_disk) + X = ad.io.sparse_dataset(root["X"]) if is_sparse else root["X"] + adata = AnnData(**{elem_name: (X if elem_name == "X" else {"X": X})}) + with pytest.raises(NotImplementedError, match=r"Copy is not implemented"): + adata.copy() + if not is_zarr: + root.close() + + def test_creation(): AnnData(np.array([[1, 2], [3, 4]])) AnnData(np.array([[1, 2], [3, 4]]), {}, {}) @@ -186,6 +230,31 @@ def test_df_warnings(): adata.X = df +@pytest.mark.parametrize("use_raw", [True, False], ids=["raw", "no_raw"]) +@pytest.mark.parametrize("use_uns", [True, False], ids=["uns", "no_uns"]) +def test_sizeof_print_stratified(capsys, *, use_raw: bool, use_uns: bool): + adata = gen_adata((10, 20)) + if use_uns: + adata.uns = {"foo": np.arange(10), "nested": {"here": np.arange(10)}} + else: + adata.uns = {} + if use_raw: + adata.raw = adata.copy() + adata.__sizeof__(show_stratified=True) + captured = capsys.readouterr() + for attr in [ + "X", + "layers", + "obsm", + "varm", + "obsp", + "varp", + ]: + assert attr in captured.out + assert use_uns == ("uns" in captured.out) + assert use_raw == ("raw" in captured.out) + + @pytest.mark.parametrize("attr", ["X", "layers", "obsm", "varm", "obsp", "varp"]) @pytest.mark.parametrize("when", ["init", "assign"]) def test_convert_matrix(attr, when): @@ -634,7 +703,7 @@ def test_to_df_dense(): pd.testing.assert_index_equal(X_df.index, layer_df.index) -@pytest.mark.filterwarnings("ignore:Use anndata.acc.A instead of:FutureWarning") +@pytest.mark.filterwarnings("ignore:.*Use anndata.acc.A instead of.*:FutureWarning") def test_convenience(subtests: pytest.Subtests) -> None: adata = adata_sparse.copy() adata.layers["x2"] = adata.X * 2 @@ -809,3 +878,23 @@ def test_create_adata_from_single_axis_elem( 
in_memory.write_h5ad(tmp_path / "adata.h5ad") from_disk = ad.read_h5ad(tmp_path / "adata.h5ad") assert_equal(from_disk, in_memory) + + +@pytest.mark.parametrize("in_x", [True, False], ids=["X", "layers"]) +@pytest.mark.parametrize("is_sparse", [True, False], ids=["sparse", "dense"]) +@pytest.mark.parametrize("storage", ["h5ad", "zarr"]) +def test_transpose_errors_with_backed_arrays( + tmp_path: Path, storage: str, *, is_sparse: bool, in_x: bool +): + adata = AnnData(X=csr_matrix(np.ones((3, 4))) if is_sparse else np.ones((3, 4))) + path = tmp_path / f"test.{storage}" + getattr(adata, f"write_{storage}")(path) + f = (h5py.File if storage == "h5ad" else zarr.open)(path) + raw_array = sparse_dataset(f["X"]) if is_sparse else f["X"] + + adata = AnnData(**({"X": raw_array} if in_x else {"layers": {"test": raw_array}})) + + with pytest.raises(ValueError, match=r"Cannot transpose anndata object"): + adata.transpose() + if storage == "h5ad": + f.close() diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index cfa57719c..2fa87d651 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -48,10 +48,6 @@ from anndata._types import Join_T -mark_legacy_concatenate = pytest.mark.filterwarnings( - r"ignore:.*AnnData\.concatenate is deprecated:FutureWarning" -) - @singledispatch def filled_like(a, fill_value=None): @@ -167,9 +163,7 @@ def force_lazy(request): return request.param -def fix_known_differences( - orig: AnnData, result: AnnData, *, backwards_compat: bool = True -): +def fix_known_differences(orig: AnnData, result: AnnData): """ Helper function for reducing anndata's to only the elements we expect to be equivalent after concatenation. 
@@ -181,14 +175,6 @@ def fix_known_differences( orig = orig.copy() result = result.copy() - if backwards_compat: - del orig.varm - del orig.varp - if isinstance(result.obs, Dataset2D): - result.obs = result.obs.ds.drop_vars(["batch"]) - else: - result.obs.drop(columns=["batch"], inplace=True) - for attrname in ("obs", "var"): if isinstance(getattr(result, attrname), Dataset2D): for adata in (orig, result): @@ -240,28 +226,11 @@ def test_concat_interface_errors(use_xdataset): concat([]) -@pytest.mark.parametrize( - ("concat_func", "backwards_compat"), - [ - pytest.param(partial(concat, merge="unique"), False, id="concat"), - pytest.param( - lambda x, **kwargs: x[0].concatenate(x[1:], **kwargs), - True, - marks=mark_legacy_concatenate, - id="concatenate", - ), - ], -) def test_concatenate_roundtrip( join_type, array_type, - concat_func, - backwards_compat, use_xdataset, - force_lazy, ): - if backwards_compat and force_lazy: - pytest.skip("unsupported") adata = gen_adata( (100, 10), X_type=array_type, @@ -277,17 +246,12 @@ def test_concatenate_roundtrip( subset_idx = np.random.choice(remaining, n, replace=False) subsets.append(adata[subset_idx]) remaining = remaining.difference(subset_idx) - result = concat_func(subsets, join=join_type, uns_merge="same", index_unique=None) - if backwards_compat and use_xdataset: - import xarray as xr - - # backwards compat always returns a dataframe - result.var = xr.Dataset.from_dataframe(result.var) + result = concat( + subsets, join=join_type, uns_merge="same", index_unique=None, merge="unique" + ) # Correcting for known differences - orig, result = fix_known_differences( - adata, result, backwards_compat=backwards_compat - ) + orig, result = fix_known_differences(adata, result) assert_equal(result[orig.obs_names].copy(), orig) base_type = type(orig.X) @@ -298,7 +262,6 @@ def test_concatenate_roundtrip( assert isinstance(result.X, base_type) -@mark_legacy_concatenate def test_concatenate_dense(): # dense data X1 = np.array([[1, 2, 
3], [4, 5, 6]]) @@ -328,25 +291,24 @@ def test_concatenate_dense(): ) # inner join - adata = adata1.concatenate(adata2, adata3) - X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]] - assert adata.X.astype(int).tolist() == X_combined - assert adata.layers["Xs"].astype(int).tolist() == X_combined - assert adata.obs.columns.tolist() == ["anno1", "anno2", "batch"] - assert adata.var.columns.tolist() == ["annoA-0", "annoA-1", "annoB-2"] - assert adata.var.values.tolist() == [[1, 2, 2], [2, 1, 1]] + adata = concat([adata1, adata2, adata3], merge="first", label="batch") + X_combined = np.array([[2, 3], [5, 6], [3, 2], [6, 5], [3, 2], [6, 5]]) + assert_equal(X_combined, adata.X) + assert_equal(adata.layers["Xs"], X_combined) + assert adata.obs.columns.tolist() == ["batch"] + assert adata.var.columns.tolist() == ["annoA", "annoB"] + assert adata.var.values.tolist() == [[1, 2], [2, 1]] assert adata.obsm.keys() == {"X_1", "X_2"} assert adata.obsm["X_1"].tolist() == np.concatenate([X1, X1, X1]).tolist() - # with batch_key and batch_categories - adata = adata1.concatenate(adata2, adata3, batch_key="batch1") - assert adata.obs.columns.tolist() == ["anno1", "anno2", "batch1"] - adata = adata1.concatenate(adata2, adata3, batch_categories=["a1", "a2", "a3"]) - assert adata.obs["batch"].cat.categories.tolist() == ["a1", "a2", "a3"] + adata = concat([adata1, adata2, adata3], label="batch1") + assert adata.obs.columns.tolist() == ["batch1"] + adata = concat([adata1, adata2, adata3], label="batch1", keys=["a1", "a2", "a3"]) + assert adata.obs["batch1"].cat.categories.tolist() == ["a1", "a2", "a3"] assert adata.var_names.tolist() == ["b", "c"] # outer join - adata = adata1.concatenate(adata2, adata3, join="outer") + adata = concat([adata1, adata2, adata3], join="outer", merge="first") X_ref = np.array([ [1.0, 2.0, 3.0, np.nan], @@ -360,24 +322,23 @@ def test_concatenate_dense(): var_ma = ma.masked_invalid(adata.var.values.tolist()) var_ma_ref = ma.masked_invalid( np.array([ - 
[0.0, np.nan, np.nan], - [1.0, 2.0, 2.0], - [2.0, 1.0, 1.0], - [np.nan, 0.0, 0.0], + [0.0, np.nan], + [1.0, 2.0], + [2.0, 1.0], + [np.nan, 0.0], ]) ) assert np.array_equal(var_ma.mask, var_ma_ref.mask) assert np.allclose(var_ma.compressed(), var_ma_ref.compressed()) -@mark_legacy_concatenate def test_concatenate_layers(array_type, join_type): adatas = [] for _ in range(5): a = array_type(sparse.random(100, 200, format="csr")) adatas.append(AnnData(X=a, layers={"a": a})) - merged = adatas[0].concatenate(adatas[1:], join=join_type) + merged = concat(adatas, join=join_type) assert_equal(merged.X, merged.layers["a"]) @@ -430,9 +391,8 @@ def gen_index(n): ] -@mark_legacy_concatenate def test_concatenate_obsm_inner(obsm_adatas): - adata = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") + adata = concat(obsm_adatas, join="inner") assert set(adata.obsm.keys()) == {"dense", "df"} assert adata.obsm["dense"].shape == (9, 2) @@ -460,13 +420,10 @@ def test_concatenate_obsm_inner(obsm_adatas): pd.testing.assert_frame_equal(true_df, cur_df) -@mark_legacy_concatenate def test_concatenate_obsm_outer(obsm_adatas, fill_val): - outer = obsm_adatas[0].concatenate( - obsm_adatas[1:], join="outer", fill_value=fill_val - ) + outer = concat(obsm_adatas, join="outer", fill_value=fill_val) - inner = obsm_adatas[0].concatenate(obsm_adatas[1:], join="inner") + inner = concat(obsm_adatas, join="inner") for k, inner_v in inner.obsm.items(): assert np.array_equal( _subset(outer.obsm[k], (slice(None), slice(None, inner_v.shape[1]))), @@ -536,7 +493,6 @@ def test_concat_annot_join(obsm_adatas, join_type): ) -@mark_legacy_concatenate def test_concatenate_layers_misaligned(array_type, join_type): adatas = [] for _ in range(5): @@ -546,11 +502,10 @@ def test_concatenate_layers_misaligned(array_type, join_type): adata[:, np.random.choice(adata.var_names, 150, replace=False)].copy() ) - merged = adatas[0].concatenate(adatas[1:], join=join_type) + merged = concat(adatas, join=join_type) 
assert_equal(merged.X, merged.layers["a"]) -@mark_legacy_concatenate def test_concatenate_layers_outer(array_type, fill_val): # Testing that issue #368 is fixed a = AnnData( @@ -559,14 +514,15 @@ def test_concatenate_layers_outer(array_type, fill_val): ) b = AnnData(X=np.ones((10, 20))) - c = a.concatenate(b, join="outer", fill_value=fill_val, batch_categories=["a", "b"]) + c = concat( + [a, b], join="outer", fill_value=fill_val, label="batch", keys=["a", "b"] + ) np.testing.assert_array_equal( asarray(c[c.obs["batch"] == "b"].layers["a"]), fill_val ) -@mark_legacy_concatenate def test_concatenate_fill_value(fill_val): def get_obs_els(adata): return { @@ -598,7 +554,7 @@ def get_obs_els(adata): for k in [k for k, v in tmp_ad.varm.items() if isinstance(v, AwkArray)]: del tmp_ad.varm[k] - joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) + joined = concat([adata1, adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 for orig in [adata1, adata2, adata3]: @@ -612,8 +568,19 @@ def get_obs_els(adata): ptr += orig.n_obs -@mark_legacy_concatenate -def test_concatenate_dense_duplicates(): +@pytest.mark.parametrize( + ("merge", "expected_cols"), + [ + ("first", ["annoA", "annoB", "annoC", "annoD", "annoE"]), + ("same", ["annoA", "annoB"]), + ("unique", ["annoA", "annoB", "annoC", "annoE"]), + ("only", ["annoE"]), + (None, []), + ], +) +def test_concatenate_merge( + merge: Literal["first", "unique", "same", "only"] | None, expected_cols: list[str] +): X1 = np.array([[1, 2, 3], [4, 5, 6]]) X2 = np.array([[1, 2, 3], [4, 5, 6]]) X3 = np.array([[1, 2, 3], [4, 5, 6]]) @@ -649,22 +616,14 @@ def test_concatenate_dense_duplicates(): annoA=[0, 1, 2], annoB=[1.1, 1.0, 2.0], annoD=[2.1, 2.0, 3.1], + annoE=[2.1, 2.0, 3.1], ), ) - adata = adata1.concatenate(adata2, adata3) - assert adata.var.columns.tolist() == [ - "annoA", - "annoB", - "annoC-0", - "annoD-0", - "annoC-1", - "annoD-1", - "annoD-2", - ] + adata = concat([adata1, adata2, adata3], 
merge=merge) + assert adata.var.columns.tolist() == expected_cols -@mark_legacy_concatenate def test_concatenate_sparse(): # sparse data from scipy.sparse import csr_matrix @@ -693,13 +652,13 @@ def test_concatenate_sparse(): ) # inner join - adata = adata1.concatenate(adata2, adata3) + adata = concat([adata1, adata2, adata3]) X_combined = [[2, 3], [5, 6], [3, 2], [6, 5], [0, 2], [6, 5]] assert adata.X.toarray().astype(int).tolist() == X_combined assert adata.layers["Xs"].toarray().astype(int).tolist() == X_combined # outer join - adata = adata1.concatenate(adata2, adata3, join="outer") + adata = concat([adata1, adata2, adata3], join="outer") assert adata.X.toarray().tolist() == [ [0.0, 2.0, 3.0, 0.0], [0.0, 5.0, 6.0, 0.0], @@ -710,7 +669,6 @@ def test_concatenate_sparse(): ] -@mark_legacy_concatenate def test_concatenate_mixed(): X1 = sparse.csr_matrix(np.array([[1, 2, 0], [4, 0, 6], [0, 0, 9]])) X2 = sparse.csr_matrix(np.array([[0, 2, 3], [4, 0, 0], [7, 0, 9]])) @@ -741,12 +699,11 @@ def test_concatenate_mixed(): layers=dict(counts=X2), # sic ) - adata_all = AnnData.concatenate(adata1, adata2, adata3, adata4) + adata_all = concat([adata1, adata2, adata3, adata4]) assert isinstance(adata_all.X, sparse.csr_matrix) assert isinstance(adata_all.layers["counts"], sparse.csr_matrix) -@mark_legacy_concatenate def test_concatenate_with_raw(): # dense data X1 = np.array([[1, 2, 3], [4, 5, 6]]) @@ -785,20 +742,20 @@ def test_concatenate_with_raw(): adata2.raw = adata2.copy() adata3.raw = adata3.copy() - adata_all = AnnData.concatenate(adata1, adata2, adata3) + adata_all = concat([adata1, adata2, adata3]) assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == {"b", "c"} assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(adata_all.raw.X, adata_all.X) - adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") + adata_all = concat([adata1, adata2, adata3], join="outer") assert isinstance(adata_all.raw, Raw) assert 
set(adata_all.raw.var_names) == set("abcd") assert_equal(adata_all.raw.to_adata().obs, adata_all.obs) assert np.array_equal(np.nan_to_num(adata_all.raw.X), np.nan_to_num(adata_all.X)) adata3.raw = adata4.copy() - adata_all = AnnData.concatenate(adata1, adata2, adata3, join="outer") + adata_all = concat([adata1, adata2, adata3], join="outer") assert isinstance(adata_all.raw, Raw) assert set(adata_all.raw.var_names) == set("abcdz") assert set(adata_all.var_names) == set("abcd") @@ -814,13 +771,13 @@ def test_concatenate_with_raw(): "not concatenating `.raw` attributes." ), ): - adata_all = AnnData.concatenate(adata1, adata2, adata3) + adata_all = concat([adata1, adata2, adata3]) assert adata_all.raw is None del adata1.raw del adata2.raw assert all(_adata.raw is None for _adata in (adata1, adata2, adata3)) - adata_all = AnnData.concatenate(adata1, adata2, adata3) + adata_all = concat([adata1, adata2, adata3]) assert adata_all.raw is None @@ -1232,11 +1189,9 @@ def test_concatenate_uns(unss, merge_strategy, result, value_gen): to `[{"a": [1, 2, 3]}, {"a": [1, 2, 3]}]`. 
""" # So we can see what the initial pattern was meant to be - print(merge_strategy, "\n", unss, "\n", result) result, *unss = permute_nested_values([result, *unss], value_gen) adatas = [uns_ad(uns) for uns in unss] - with pytest.warns(FutureWarning, match=r"concatenate is deprecated"): - merged = AnnData.concatenate(*adatas, uns_merge=merge_strategy).uns + merged = concat(adatas, uns_merge=merge_strategy).uns assert_equal(merged, result, elem_name="uns") @@ -1634,7 +1589,6 @@ def test_concat_outer_aligned_mapping(elem, axis, use_xdataset, force_lazy): check_filled_like(result, elem_name=f"{axis}m/{elem}") -@mark_legacy_concatenate def test_concatenate_size_0_axis(): # https://github.com/scverse/anndata/issues/526 @@ -1642,8 +1596,7 @@ def test_concatenate_size_0_axis(): b = gen_adata((5, 0)) # Mostly testing that this doesn't error - assert a.concatenate([b]).shape == (10, 0) - assert b.concatenate([a]).shape == (10, 0) + assert concat([a, b]).shape == (10, 0) def test_concat_null_X(use_xdataset): @@ -1861,22 +1814,60 @@ def test_concat_on_var_outer_join(array_type): _ = concat([a, b], join="outer", axis=1) -def test_concat_dask_sparse_matches_memory(join_type, merge_strategy): +@pytest.mark.parametrize("format", ["csr", "csc"]) +@pytest.mark.parametrize( + "unchunked_minor_axis", [True, False], ids=["unchunked_minor", "chunked_minor"] +) +@pytest.mark.parametrize("fill_value", [0, -1]) +def test_concat_dask_sparse_matches_memory( + join_type, + merge_strategy, + format: Literal["csr", "csc"], + axis_name: Literal["obs", "var"], + fill_value: Literal[-1, 0], + *, + unchunked_minor_axis: bool, +): import dask.array as da - X = sparse.random(50, 20, density=0.5, format="csr") - X_dask = da.from_array(X, chunks=(5, 20)) - var_names_1 = [f"gene_{i}" for i in range(20)] - var_names_2 = [f"gene_{i}{'_foo' if (i % 2) else ''}" for i in range(20)] + X = sparse.random(50, 20, density=0.5, format=format) + X_dask = da.from_array( + X, + chunks=( + X.shape[0] if format == 
"csc" else 10, + X.shape[1] if format == "csr" else 5, + ) + if unchunked_minor_axis + else (5, 10), + ) + off_axis_idx = int(axis_name == "obs") + concat_axis_idx = int(axis_name == "var") + off_axis = "var" if axis_name == "obs" else "obs" + axis_names_1 = [f"off_axis_{i}" for i in range(X.shape[off_axis_idx])] + axis_names_2 = [ + f"off_axis_{i}{'_foo' if (i % 2) else ''}" for i in range(X.shape[off_axis_idx]) + ] - ad1 = AnnData(X=X, var=pd.DataFrame(index=var_names_1)) - ad2 = AnnData(X=X, var=pd.DataFrame(index=var_names_2)) + ad1 = AnnData(X=X, **{off_axis: pd.DataFrame(index=axis_names_1)}) + ad2 = AnnData(X=X, **{off_axis: pd.DataFrame(index=axis_names_2)}) - ad1_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_1)) - ad2_dask = AnnData(X=X_dask, var=pd.DataFrame(index=var_names_2)) + ad1_dask = AnnData(X=X_dask, **{off_axis: pd.DataFrame(index=axis_names_1)}) + ad2_dask = AnnData(X=X_dask, **{off_axis: pd.DataFrame(index=axis_names_2)}) - res_in_memory = concat([ad1, ad2], join=join_type, merge=merge_strategy) - res_dask = concat([ad1_dask, ad2_dask], join=join_type, merge=merge_strategy) + res_in_memory = concat( + [ad1, ad2], + join=join_type, + merge=merge_strategy, + axis=concat_axis_idx, + fill_value=fill_value, + ) + res_dask = concat( + [ad1_dask, ad2_dask], + join=join_type, + merge=merge_strategy, + axis=concat_axis_idx, + fill_value=fill_value, + ) assert_equal(res_in_memory, res_dask) diff --git a/tests/test_deprecations.py b/tests/test_deprecations.py index 6bbff0fd9..3c81a8166 100644 --- a/tests/test_deprecations.py +++ b/tests/test_deprecations.py @@ -75,6 +75,13 @@ def test_warn_on_deprecated__io_module(): from anndata._io import read_h5ad # noqa +def test_warn_on_deprecated_extension_namespace(): + with pytest.warns( + FutureWarning, match=r"Importing ExtensionNamespace from `types`" + ): + from anndata.types import ExtensionNamespace # noqa + + @pytest.mark.parametrize("name", ["obs", "var", "obsm", "varm", "uns"]) def 
test_keys_function_warns(adata: AnnData, name) -> None: with pytest.warns(FutureWarning, match=rf"{name}_keys is deprecated"): diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 2724ba5b2..05dd33046 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -6,7 +6,6 @@ import pytest import anndata as ad -from anndata._core import extensions if TYPE_CHECKING: from collections.abc import Generator @@ -46,46 +45,6 @@ def adata() -> ad.AnnData: return ad.AnnData(X=rng.poisson(1, size=(10, 10))) -def test_accessor_namespace() -> None: - """Test the behavior of the AccessorNameSpace descriptor. - - This test verifies that: - - When accessed at the class level (i.e., without an instance), the descriptor - returns the namespace type. - - When accessed via an instance, the descriptor instantiates the namespace, - passing the instance to its constructor. - - The instantiated namespace is then cached on the instance such that subsequent - accesses of the same attribute return the cached namespace instance. - """ - - # Define a dummy namespace class to be used via the descriptor. - class DummyNamespace: - def __init__(self, adata: ad.AnnData) -> None: - self._adata = adata - - def foo(self) -> str: - return "foo" - - class Dummy: - pass - - descriptor = extensions.AccessorNameSpace("dummy", DummyNamespace) - - # When accessed on the class, it should return the namespace type. - ns_class = descriptor.__get__(None, Dummy) - assert ns_class is DummyNamespace - - # When accessed via an instance, it should instantiate DummyNamespace. - dummy_obj = Dummy() - ns_instance = descriptor.__get__(dummy_obj, Dummy) - assert isinstance(ns_instance, DummyNamespace) - assert ns_instance._adata is dummy_obj - - # __get__ should cache the namespace instance on the object. - # Subsequent access should return the same cached instance. 
- assert dummy_obj.dummy is ns_instance - - def test_descriptor_instance_caching(dummy_namespace: type, adata: ad.AnnData) -> None: """Test that namespace instances are cached on individual AnnData objects.""" # First access creates the instance @@ -101,8 +60,6 @@ def test_register_namespace_basic(dummy_namespace: type, adata: ad.AnnData) -> N def test_register_namespace_override(dummy_namespace: type) -> None: """Test namespace registration and override behavior.""" - assert "dummy" in ad.AnnData._accessors - # Override should warn and update the namespace with pytest.warns( UserWarning, match="Overriding existing custom namespace 'dummy'" @@ -156,7 +113,7 @@ def test_missing_param() -> None: """Test that a namespace missing the second parameter is rejected.""" with pytest.raises( TypeError, - match=r"Namespace initializer must accept an AnnData instance as the second parameter\.", + match=r"Namespace initializer must accept a AnnData instance as the second parameter\.", ): @ad.register_anndata_namespace("missing_param") diff --git a/tests/test_get_vector.py b/tests/test_get_vector.py index 9c6def276..882ca7c89 100644 --- a/tests/test_get_vector.py +++ b/tests/test_get_vector.py @@ -8,7 +8,7 @@ import anndata as ad pytestmark = [ - pytest.mark.filterwarnings("ignore:Use anndata.acc.A instead of:FutureWarning"), + pytest.mark.filterwarnings("ignore:.*Use anndata.acc.A instead of.*:FutureWarning"), ] OBS_KEYS = [ diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 4534f2beb..b525d8d14 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -248,6 +248,21 @@ def test_assert_equal_dask_arrays(): assert_equal(c, d) +@pytest.mark.parametrize("attr", ["indices", "indptr"]) +def test_assert_equal_sparse_index_dtype(attr): + """assert_equal(exact=True) should detect indptr/indices dtype mismatches.""" + a = sparse.csr_matrix(np.eye(3)) + b = sparse.csr_matrix(np.eye(3)) + setattr(b, attr, getattr(b, attr).astype(np.int64)) + + # Non-exact comparison 
should pass (values are identical) + assert_equal(a, b, exact=False) + + # Exact comparison should catch the dtype mismatch + with pytest.raises(AssertionError, match=attr): + assert_equal(a, b, exact=True) + + def test_assert_equal_dask_sparse_arrays(): import dask.array as da from scipy import sparse diff --git a/tests/test_readwrite.py b/tests/test_readwrite.py index 3359b2ff8..d9ea1e595 100644 --- a/tests/test_readwrite.py +++ b/tests/test_readwrite.py @@ -14,7 +14,6 @@ import pandas as pd import pytest import zarr -import zarr.convenience from scipy.sparse import csc_array, csc_matrix, csr_array, csr_matrix import anndata as ad @@ -99,7 +98,7 @@ def dataset_kwargs(request): @pytest.fixture -def rw(backing_h5ad): +def rw(backing_h5ad) -> tuple[ad.AnnData, ad.AnnData]: M, N = 100, 101 orig = gen_adata((M, N), **GEN_ADATA_NO_XARRAY_ARGS) orig.write(backing_h5ad) @@ -126,6 +125,53 @@ def dtype(request): # ------------------------------------------------------------------------------ +@pytest.mark.parametrize("store_type", ["h5", "zarr", None]) +def test_can_write( + rw: tuple[ad.AnnData, ad.AnnData], store_type: Literal["h5", "zarr"] | None +): + adata, _ = rw + assert adata.unwriteable(store_type=store_type) + + +@pytest.mark.parametrize("store_type", ["h5", "zarr", None]) +def test_can_not_write_bad_categorical( + rw: tuple[ad.AnnData, ad.AnnData], store_type: Literal["h5", "zarr"] | None +): + + adata, _ = rw + adata.var["arrow_categorical_array"] = pd.Categorical.from_codes( + [i % 2 for i in range(adata.shape[1])], + categories=pd.arrays.IntervalArray.from_tuples([(0, 10), (20, 30)]), + ) + assert not adata.unwriteable(store_type=store_type) + + +@pytest.mark.parametrize("store_type", ["h5", "zarr", None]) +@pytest.mark.parametrize("should_nest", [True, False], ids=["nest", "no_nest"]) +@pytest.mark.parametrize("parent_elem", ["var", "uns", "raw"]) +def test_can_not_write_with_custom_array( + rw: tuple[ad.AnnData, ad.AnnData], + store_type: Literal["h5", 
"zarr"] | None, + parent_elem: Literal["obs", "uns", "raw"], + *, + should_nest: bool, +): + import pyarrow as pa + + adata, _ = rw + if parent_elem == "raw": + adata.raw = adata.copy() + getter = lambda adata: getattr(adata, parent_elem).var + else: + getter = lambda adata: getattr(adata, parent_elem) + if should_nest: + adata.uns["adata"] = adata.copy() + getter(adata.uns["adata"] if should_nest else adata)["arrow_array"] = ( + pd.arrays.ArrowExtensionArray(pa.array([{"x": 1, "y": True}] * adata.shape[1])) + ) + assert not adata.unwriteable(store_type=store_type) + + @pytest.mark.parametrize("typ", ARRAY_TYPES) def test_readwrite_roundtrip(typ, tmp_path, diskfmt, diskfmt2): pth1 = tmp_path / f"first.{diskfmt}" diff --git a/tests/test_xarray.py b/tests/test_xarray.py index d458fd429..4e31870e3 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -86,7 +86,6 @@ def test_true_index_dim_column_subset(dataset2d, df): df_expected = dataset2d[cols].to_memory() # account for the fact that we manually set `true_index_dim` df.index = df[col] - df.index.name = None pd.testing.assert_frame_equal(df_expected, df[cols]) From 5cd4b45d6f2b5c3d2b44dd926c3c44ad92667463 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 14:32:36 +0200 Subject: [PATCH 30/51] fix: iteration hadnling --- src/anndata/_core/anndata.py | 4 ---- src/anndata/_io/h5ad.py | 10 +++++++++- src/anndata/_io/specs/methods.py | 7 +++++++ src/anndata/utils.py | 7 +++++-- tests/test_backed_sparse.py | 15 +++++++++++---- tests/test_base.py | 1 - 6 files changed, 32 insertions(+), 12 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index ffb19795c..ea089812f 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1380,10 +1380,6 @@ def to_memory(self, *, copy: bool = False) -> AnnData: else: new[attr_name] = to_memory(attr, copy=copy) - if self.isbacked: - new["layers"][None] = self.X[...] 
- self.file.close() - return AnnData(**new) def _has_raw_zarr_or_h5_array(self) -> bool: diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 13f920428..d85320a14 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -46,7 +46,7 @@ @no_write_dataset_2d -def write_h5ad( +def write_h5ad( # noqa: PLR0912 filepath: PathLike[str] | str, adata: AnnData, *, @@ -100,7 +100,15 @@ def write_h5ad( ) else: if k == "layers": + if None in elem: + write_elem( + f, + "X", + elem[None], + dataset_kwargs=dataset_kwargs, + ) elem = {k: v for k, v in elem.items() if k is not None} + write_elem( f, k, diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 125fb6f45..d6c975b17 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -289,6 +289,13 @@ def write_anndata( for sub_key, elem in iter_outer(adata): if not (sub_key == "X" and elem is None): if sub_key == "layers": + if None in elem: + _writer.write_elem( + g, + "X", + elem[None], + dataset_kwargs=dataset_kwargs, + ) elem = {k: v for k, v in elem.items() if k is not None} _writer.write_elem( g, diff --git a/src/anndata/utils.py b/src/anndata/utils.py index bee9882f7..8ee3d0252 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -445,7 +445,6 @@ def iter_outer( ]: """Iterate over key-value pairs of the parent "elems" like aw, obs, varp etc""" for attr_name in [ - "X", "obs", "var", "uns", @@ -457,6 +456,10 @@ def iter_outer( "raw", ]: was_closed = adata.isbacked and not adata.file.is_open - yield (attr_name, getattr(adata, attr_name)) + yield ( + attr_name, + getattr(adata, attr_name), + ) + if was_closed: adata.file.close() diff --git a/tests/test_backed_sparse.py b/tests/test_backed_sparse.py index 91ba76133..2873f54ed 100644 --- a/tests/test_backed_sparse.py +++ b/tests/test_backed_sparse.py @@ -503,11 +503,15 @@ def test_data_access( *, read_data: bool, ): + # sparse_dataset without reading doesn't even read in metadata, 
but dask does for dtype resolution. exp = [ e.format(zarr_metadata_key=zarr_metadata_key, zarr_separator=zarr_separator) for e in exp - if ((is_data := (len(re.findall(r"/\d(?!\d)", e)) == 1)) and read_data) - or (not is_data) + if ( + ((is_data := (len(re.findall(r"/\d(?!\d)", e)) == 1)) and read_data) + or (not is_data) + ) + and not (open_func is sparse_dataset and not read_data) ] path = tmp_path / "test.zarr" a = sparse_format(np.eye(10, 10)) @@ -525,7 +529,8 @@ def test_data_access( store = AccessTrackingStore(path, read_only=True) store.initialize_key_trackers(["X/data"]) f = zarr.open_group(store, mode="r") - a_disk = AnnData(X=open_func(f["X"])) + dataset = open_func(f["X"]) + a_disk = AnnData(X=dataset) subset = ( a_disk[idx_maj, :][:, idx_min] if a.format == "csr" @@ -536,7 +541,9 @@ def test_data_access( subset.X.compute(scheduler="single-threaded") # zarr v2 fetches all and not just metadata for that node in 3.X.X python package # TODO: https://github.com/zarr-developers/zarr-python/discussions/2760 - if ad.settings.zarr_write_format == 2: + if ( + ad.settings.zarr_write_format == 2 and read_data + ) or open_func is not sparse_dataset: exp = [*exp, "X/data/.zgroup", "X/data/.zattrs"] assert store.get_access_count("X/data") == len(exp), store.get_accessed_keys( diff --git a/tests/test_base.py b/tests/test_base.py index 096829394..726b125b5 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -243,7 +243,6 @@ def test_sizeof_print_stratified(capsys, *, use_raw: bool, use_uns: bool): adata.__sizeof__(show_stratified=True) captured = capsys.readouterr() for attr in [ - "X", "layers", "obsm", "varm", From 1f1386090eb4a049bb163fc9fae314f49d05826e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 15:28:05 +0200 Subject: [PATCH 31/51] fix: small fixes --- src/anndata/_core/anndata.py | 5 ++--- src/anndata/_io/h5ad.py | 19 ++++++------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/anndata/_core/anndata.py 
b/src/anndata/_core/anndata.py index ea089812f..a7fae916c 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1339,8 +1339,8 @@ def _copy( ) raise NotImplementedError(msg) new = {} - for key in ["obs", "var", "obsm", "varm", "obsp", "varp", "layers"]: - new[key] = getattr(self, key).copy() + for key, elem in iter_outer(self): + new[key] = elem.copy() if key == "layers" and isinstance( X, (*get_union_members(_XDataType), type(None)) ): @@ -1379,7 +1379,6 @@ def to_memory(self, *, copy: bool = False) -> AnnData: } else: new[attr_name] = to_memory(attr, copy=copy) - return AnnData(**new) def _has_raw_zarr_or_h5_array(self) -> bool: diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index d85320a14..17c94ba34 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -46,7 +46,7 @@ @no_write_dataset_2d -def write_h5ad( # noqa: PLR0912 +def write_h5ad( filepath: PathLike[str] | str, adata: AnnData, *, @@ -86,25 +86,18 @@ def write_h5ad( # noqa: PLR0912 f.attrs.setdefault("encoding-type", "anndata") f.attrs.setdefault("encoding-version", "0.1.0") for k, elem in iter_outer(adata): - if k == "X": - _write_x( - f, - adata, # accessing adata.X reopens adata.file if it’s backed - is_backed=adata.isbacked and adata.filename == filepath, - as_dense=as_dense, - dataset_kwargs=dataset_kwargs, - ) - elif k == "raw": + if k == "raw": _write_raw( f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs ) else: if k == "layers": if None in elem: - write_elem( + _write_x( f, - "X", - elem[None], + adata, # accessing adata.X reopens adata.file if it’s backed + is_backed=adata.isbacked and adata.filename == filepath, + as_dense=as_dense, dataset_kwargs=dataset_kwargs, ) elem = {k: v for k, v in elem.items() if k is not None} From 03e58a24595e1c2c798103f8284922dd3fe6dd50 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 15:29:57 +0200 Subject: [PATCH 32/51] fix: concat docs --- docs/concatenation.rst | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concatenation.rst b/docs/concatenation.rst index 109c63e2b..0abdfdfdf 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -29,8 +29,8 @@ Let's start off with an example: uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups' obsm: 'X_pca', 'X_umap' varm: 'PCs' + obsp: 'connectivities', 'distances' layers: None - obsp: ... If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. From 21588258783cf93916775e41f5436b3d539efa18 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 15:31:03 +0200 Subject: [PATCH 33/51] fix: remove `_copy` condition --- src/anndata/_core/anndata.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index a7fae916c..1dccb4bf8 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1332,12 +1332,6 @@ def _copy( ) -> AnnData: from ..typing import _XDataType - if self.isbacked and self.raw is not None: - msg = ( - "This function does not currently handle backed objects " - "internally, this should be dealt with before." 
- ) - raise NotImplementedError(msg) new = {} for key, elem in iter_outer(self): new[key] = elem.copy() From b7eea4e737f8f87bbea35b40324aefb5beeb5930 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 16:30:35 +0200 Subject: [PATCH 34/51] fix: copy --- src/anndata/_core/anndata.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 1dccb4bf8..1b4804aa8 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -1334,7 +1334,9 @@ def _copy( new = {} for key, elem in iter_outer(self): - new[key] = elem.copy() + if elem is not None: + elem = elem.copy() + new[key] = elem if key == "layers" and isinstance( X, (*get_union_members(_XDataType), type(None)) ): From c5bcb6bd727921ac9039a80dfc5458dba44dfbcb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 17:20:47 +0200 Subject: [PATCH 35/51] fix: add `backed` hack for `layers` specifically --- src/anndata/_core/aligned_mapping.py | 20 +++++++++++++++++++- src/anndata/_core/anndata.py | 6 +++--- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index d56f8c6f1..a75a6e8ab 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -7,6 +7,7 @@ from types import NoneType from typing import TYPE_CHECKING +import h5py import numpy as np import pandas as pd from scverse_misc import Deprecation, deprecated @@ -340,7 +341,24 @@ def __bool__(self) -> bool: class Layers(AlignedActual[str | None], LayersBase): - pass + def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): + super().__init__(parent, store=store) + if None not in self._data: + self._data[None] = None + self._is_x_none = self._data[None] is None + + def __getitem__(self, key: str) -> Value: + res = super().__getitem__(key) + if res is None and key is None and self.parent.file is not None: + 
if not self.parent.file.is_open: + self.parent.file.open() + X = self.parent.file["X"] + if isinstance(X, h5py.Group): + from ..io import sparse_dataset + + X = sparse_dataset(X) + return X + return res class LayersView(AlignedView[str | None, LayersBase, TwoDIdx], LayersBase): diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 1b4804aa8..a5fb6e905 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -961,9 +961,9 @@ def uns_keys(self) -> list[str]: def isbacked(self) -> bool: """`True` if object is backed on disk, `False` otherwise.""" is_filename_none = self.filename is not None - is_x_none = (self._adata_ref.layers if self._is_view else self.layers).get( - None, None - ) is None + is_x_none = ( + self._adata_ref.layers if self._is_view else self.layers + )._is_x_none return is_filename_none and is_x_none @property From dfc8d527712969b4d2a66ec6e4c766b59885c06e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 17:42:17 +0200 Subject: [PATCH 36/51] fix: backed check --- src/anndata/_core/aligned_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index a75a6e8ab..f76389c62 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -349,7 +349,7 @@ def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): def __getitem__(self, key: str) -> Value: res = super().__getitem__(key) - if res is None and key is None and self.parent.file is not None: + if res is None and key is None and self.parent.filename is not None: if not self.parent.file.is_open: self.parent.file.open() X = self.parent.file["X"] From ec003457e740b3d6588fe624ed61c03f3156e1bd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 17:53:35 +0200 Subject: [PATCH 37/51] fix: array api handling --- src/anndata/typing.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/anndata/typing.py b/src/anndata/typing.py index 3f4f01ab3..12432cc0b 100644 --- a/src/anndata/typing.py +++ b/src/anndata/typing.py @@ -86,6 +86,7 @@ | DaskArray | CupyArray | CupySparseMatrix + | SupportsArrayApi ) """An Array that is possibly stored in Memory (Dask Arrays are possibly stored on disk).""" From 7327ccbbac8ddff1670e7d5a17674ab33703aa45 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 27 Apr 2026 18:08:19 +0200 Subject: [PATCH 38/51] fix: ok back out of that, needs to be even more explicit --- src/anndata/_core/aligned_mapping.py | 34 +++++++++++++++++++++------- src/anndata/_core/anndata.py | 2 +- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index f76389c62..83b85d498 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -4,6 +4,7 @@ from collections.abc import MutableMapping, Sequence from copy import copy from dataclasses import dataclass +from itertools import chain from types import NoneType from typing import TYPE_CHECKING @@ -340,16 +341,16 @@ def __bool__(self) -> bool: return not self.keys() <= {None} -class Layers(AlignedActual[str | None], LayersBase): +class Layers[K: str | None](AlignedActual[K], LayersBase): def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): super().__init__(parent, store=store) - if None not in self._data: - self._data[None] = None - self._is_x_none = self._data[None] is None + if None not in self._data and self.parent.filename is not None: + self.is_none_backed = True + else: + self.is_none_backed = False - def __getitem__(self, key: str) -> Value: - res = super().__getitem__(key) - if res is None and key is None and self.parent.filename is not None: + def __getitem__(self, key: K) -> Value: + if key is None and self.is_none_backed: if not self.parent.file.is_open: self.parent.file.open() X = self.parent.file["X"] @@ -358,7 +359,24 @@ def 
__getitem__(self, key: str) -> Value: X = sparse_dataset(X) return X - return res + return super().__getitem__(key) + + def __iter__(self) -> K: + keys_iter = super().__iter__() + if self.is_none_backed: + yield from chain([None], keys_iter) + yield from keys_iter + + def __len__(self) -> int: + data_length = super().__len__() + if self.is_none_backed: + return data_length + 1 + return data_length + + def __contains__(self, key: K) -> bool: + if key is None and self.is_none_backed: + return True + return super().__contains__(key) class LayersView(AlignedView[str | None, LayersBase, TwoDIdx], LayersBase): diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index a5fb6e905..dd0f55880 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -963,7 +963,7 @@ def isbacked(self) -> bool: is_filename_none = self.filename is not None is_x_none = ( self._adata_ref.layers if self._is_view else self.layers - )._is_x_none + ).is_none_backed return is_filename_none and is_x_none @property From 38858fcf22ebc5b38bccb7c5fffcc62952613b90 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 28 Apr 2026 11:20:57 +0200 Subject: [PATCH 39/51] fix: `obsp` ellipsis --- docs/concatenation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concatenation.rst b/docs/concatenation.rst index 0abdfdfdf..4bd30430a 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -29,7 +29,7 @@ Let's start off with an example: uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups' obsm: 'X_pca', 'X_umap' varm: 'PCs' - obsp: 'connectivities', 'distances' + obsp: ... layers: None If we split this object up by clusters of observations, then stack those subsets we'll obtain the same values – just ordered differently. From ecfdd8ce984bd35f77243fb592a27363b55d3037 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 28 Apr 2026 14:06:20 +0200 Subject: [PATCH 40/51] types --- src/anndata/_core/access.py | 4 +-- src/anndata/_core/aligned_mapping.py | 54 ++++++++++++++-------------- src/anndata/_core/anndata.py | 16 +++++---- src/anndata/_core/raw.py | 4 +-- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/src/anndata/_core/access.py b/src/anndata/_core/access.py index 9a2ffbbda..30e63f8b3 100644 --- a/src/anndata/_core/access.py +++ b/src/anndata/_core/access.py @@ -7,10 +7,10 @@ from anndata import AnnData -class ElementRef(NamedTuple): +class ElementRef[K: (str, str | None)](NamedTuple): parent: AnnData attrname: str - keys: tuple[str, ...] = () + keys: tuple[K, ...] = () def __str__(self) -> str: return f".{self.attrname}" + "".join(f"[{x!r}]" for x in self.keys) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 83b85d498..a4f1e2fc7 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -34,7 +34,6 @@ from typing import ClassVar, Literal, Self from .anndata import AnnData - from .raw import Raw OneDIdx = Sequence[int] | Sequence[bool] | slice @@ -43,8 +42,8 @@ Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray -class AlignedMappingBase[I: OneDIdx, K: (str, str | None)]( - MutableMapping[str, Value], ABC +class AlignedMappingBase[I: (OneDIdx, TwoDIdx), K: (str, str | None)]( + MutableMapping[K, Value], ABC ): """\ An abstract base class for Mappings containing array-like values aligned @@ -60,7 +59,7 @@ class AlignedMappingBase[I: OneDIdx, K: (str, str | None)]( _actual_class: ClassVar[type[AlignedActual]] """The actual class (which has it’s own data) for this aligned mapping.""" - _parent: AnnData | Raw + _parent: AnnData # technically also can be Raw for .varm """The parent object that this mapping is aligned to.""" def __repr__(self) -> str: @@ -118,7 +117,7 @@ def axes(self) -> tuple[Literal[0, 1], ...]: def is_view(self) -> bool: ... 
@property - def parent(self) -> AnnData | Raw: + def parent(self) -> AnnData: return self._parent def copy(self) -> dict[K, Value]: @@ -128,7 +127,7 @@ def copy(self) -> dict[K, Value]: for k, v in self.items() } - def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[K, Self, I]: + def _view(self, parent: AnnData, subset_idx: I) -> AlignedView[Self, I, K]: """Returns a subset copy-on-write view of the object.""" return self._view_class(self, parent, subset_idx) @@ -138,7 +137,7 @@ def as_dict(self) -> dict: class AlignedView[P: AlignedMappingBase, I: (OneDIdx, TwoDIdx), K: (str, str | None)]( - AlignedMappingBase + AlignedMappingBase[I, K] ): is_view: ClassVar[Literal[True]] = True @@ -207,13 +206,15 @@ def __len__(self) -> int: return len(self.parent_mapping) -class AlignedActual[K: (str, str | None)](AlignedMappingBase[K]): +class AlignedActual[I: (OneDIdx, TwoDIdx), K: (str, str | None)]( + AlignedMappingBase[I, K] +): is_view: ClassVar[Literal[False]] = False _data: MutableMapping[K, Value] """Underlying mapping to the data""" - def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[K, Value]): + def __init__(self, parent: AnnData, *, store: MutableMapping[K, Value]): self._parent = parent self._data = store for k, v in self._data.items(): @@ -248,7 +249,7 @@ def __len__(self) -> int: return len(self._data) -class AxisArraysBase(AlignedMappingBase[str]): +class AxisArraysBase(AlignedMappingBase[OneDIdx, str]): """\ Mapping of key→array-like, where array-like is aligned to an axis of parent AnnData. 
@@ -305,10 +306,10 @@ def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis] -class AxisArrays(AlignedActual[str], AxisArraysBase): +class AxisArrays(AlignedActual[OneDIdx, str], AxisArraysBase): def __init__( self, - parent: AnnData | Raw, + parent: AnnData, *, axis: Literal[0, 1], store: MutableMapping[str, Value] | AxisArraysBase, @@ -319,7 +320,7 @@ def __init__( super().__init__(parent, store=store) -class AxisArraysView(AlignedView[str, AxisArraysBase, OneDIdx], AxisArraysBase): +class AxisArraysView(AlignedView[AxisArraysBase, OneDIdx, str], AxisArraysBase): pass @@ -327,7 +328,7 @@ class AxisArraysView(AlignedView[str, AxisArraysBase, OneDIdx], AxisArraysBase): AxisArraysBase._actual_class = AxisArrays -class LayersBase(AlignedMappingBase[str | None]): +class LayersBase(AlignedMappingBase[TwoDIdx, str | None]): """\ Mapping of key: array-like, where array-like is aligned to both axes of the parent anndata. @@ -341,15 +342,14 @@ def __bool__(self) -> bool: return not self.keys() <= {None} -class Layers[K: str | None](AlignedActual[K], LayersBase): - def __init__(self, parent: AnnData | Raw, *, store: MutableMapping[str, Value]): +class Layers(AlignedActual[TwoDIdx, str | None], LayersBase): + def __init__(self, parent: AnnData, *, store: MutableMapping[str | None, Value]): super().__init__(parent, store=store) - if None not in self._data and self.parent.filename is not None: - self.is_none_backed = True - else: - self.is_none_backed = False + self.is_none_backed = ( + None not in self._data and self.parent.filename is not None + ) - def __getitem__(self, key: K) -> Value: + def __getitem__(self, key: str | None) -> Value: if key is None and self.is_none_backed: if not self.parent.file.is_open: self.parent.file.open() @@ -361,7 +361,7 @@ def __getitem__(self, key: K) -> Value: return X return super().__getitem__(key) - def __iter__(self) -> K: + def __iter__(self) -> str | None: keys_iter = super().__iter__() if 
self.is_none_backed: yield from chain([None], keys_iter) @@ -373,13 +373,13 @@ def __len__(self) -> int: return data_length + 1 return data_length - def __contains__(self, key: K) -> bool: + def __contains__(self, key: str | None) -> bool: if key is None and self.is_none_backed: return True return super().__contains__(key) -class LayersView(AlignedView[str | None, LayersBase, TwoDIdx], LayersBase): +class LayersView(AlignedView[LayersBase, TwoDIdx, str | None], LayersBase): pass @@ -387,7 +387,7 @@ class LayersView(AlignedView[str | None, LayersBase, TwoDIdx], LayersBase): LayersBase._actual_class = Layers -class PairwiseArraysBase(AlignedMappingBase[str]): +class PairwiseArraysBase(AlignedMappingBase[OneDIdx, str]): """\ Mapping of key: array-like, where both axes of array-like are aligned to one axis of the parent anndata. @@ -413,7 +413,7 @@ def dim(self) -> str: return self._dimnames[self._axis] -class PairwiseArrays(AlignedActual[str], PairwiseArraysBase): +class PairwiseArrays(AlignedActual[OneDIdx, str], PairwiseArraysBase): def __init__( self, parent: AnnData, @@ -428,7 +428,7 @@ def __init__( class PairwiseArraysView( - AlignedView[str, PairwiseArraysBase, OneDIdx], PairwiseArraysBase + AlignedView[PairwiseArraysBase, OneDIdx, str], PairwiseArraysBase ): pass diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index dd0f55880..ec9ef9b6e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -630,7 +630,9 @@ def X(self, value: _XDataType | None) -> None: def X(self) -> None: self.X = None - layers: AlignedMappingProperty[Layers | LayersView] = AlignedMappingProperty(Layers) + layers: AlignedMappingProperty[Layers | LayersView, str | None] = ( + AlignedMappingProperty(Layers) + ) """\ Dictionary-like object with values of the same dimensions as :attr:`X`. 
@@ -845,8 +847,8 @@ def uns(self, value: MutableMapping): def uns(self): self.uns = OrderedDict() - obsm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - AxisArrays, 0 + obsm: AlignedMappingProperty[AxisArrays | AxisArraysView, str] = ( + AlignedMappingProperty(AxisArrays, 0) ) """\ Multi-dimensional annotation of observations @@ -857,8 +859,8 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ - varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - AxisArrays, 1 + varm: AlignedMappingProperty[AxisArrays | AxisArraysView, str] = ( + AlignedMappingProperty(AxisArrays, 1) ) """\ Multi-dimensional annotation of variables/features @@ -869,7 +871,7 @@ def uns(self): Is sliced with `data` and `var` but behaves otherwise like a :term:`mapping`. """ - obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + obsp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView, str] = ( AlignedMappingProperty(PairwiseArrays, 0) ) """\ @@ -881,7 +883,7 @@ def uns(self): Is sliced with `data` and `obs` but behaves otherwise like a :term:`mapping`. """ - varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView] = ( + varp: AlignedMappingProperty[PairwiseArrays | PairwiseArraysView, str] = ( AlignedMappingProperty(PairwiseArrays, 1) ) """\ diff --git a/src/anndata/_core/raw.py b/src/anndata/_core/raw.py index 6f5a77b7b..eab85a57e 100644 --- a/src/anndata/_core/raw.py +++ b/src/anndata/_core/raw.py @@ -117,8 +117,8 @@ def n_vars(self) -> int: def n_obs(self) -> int: return self._n_obs - varm: AlignedMappingProperty[AxisArrays | AxisArraysView] = AlignedMappingProperty( - AxisArrays, 1 + varm: AlignedMappingProperty[AxisArrays | AxisArraysView, str] = ( + AlignedMappingProperty(AxisArrays, 1) ) @property From 09f5e4cefd2d2d87e88dbf52393317a20067b19c Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 28 Apr 2026 14:12:46 +0200 Subject: [PATCH 41/51] double del --- src/anndata/_core/anndata.py | 12 +++++------- tests/test_x.py | 7 +++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index ec9ef9b6e..cc2c2f9ce 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -619,12 +619,11 @@ def X(self, value: _XDataType | None) -> None: msg = "Setting element `.X` of view, initializing view as actual." warn(msg, ImplicitModificationWarning) self._init_as_actual(self._copy(X=value)) - return None + return if value is not None: self.layers[None] = value else: - self.layers.pop(None) - return None + self.layers.pop(None, None) @X.deleter def X(self) -> None: @@ -1014,10 +1013,9 @@ def filename(self, filename: PathLike[str] | str | None): self.write(filename, as_dense=as_dense) # open new file for accessing self.file.open(filename, "r+") - # As the data is stored on disk, we can safely set remove if it previously was - # in layers. Setting `X` to `None` now would raise an error because `self.isbacked`. - if None in self.layers: - self.layers.pop(None) + # As the data is stored on disk, we can safely remove it. + # Setting `X` to `None` now would raise an error because `self.isbacked`. + self.layers.pop(None, None) def _set_backed(self, attr, value): from .._io.utils import write_attribute diff --git a/tests/test_x.py b/tests/test_x.py index 8f3be7b2a..42ee7a1f7 100644 --- a/tests/test_x.py +++ b/tests/test_x.py @@ -75,7 +75,7 @@ def test_set_x_is_none(): assert adata.X is None -def test_del_set_equiv_X(): +def test_del_set_equiv_x() -> None: """Tests that `del adata.X` is equivalent to `adata.X = None`""" # test setter and deleter orig = gen_adata((10, 10)) @@ -88,9 +88,8 @@ def test_del_set_equiv_X(): assert_equal(orig, copy) # Check that deleting again is still fine - # TODO: Do we even want to keep supporting this operation i.e., del adata.X if X is None? 
- # del orig.X - # assert orig.X is None + del orig.X + assert orig.X is None @pytest.mark.parametrize( From dd7722b9d2b5eb415e041df1ccd807d4311a19b3 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 28 Apr 2026 14:18:46 +0200 Subject: [PATCH 42/51] simpler is_none_backed --- src/anndata/_core/aligned_mapping.py | 8 +++++++- src/anndata/_core/anndata.py | 6 +----- tests/test_backed_hdf5.py | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index a4f1e2fc7..bef930346 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -338,6 +338,8 @@ class LayersBase(AlignedMappingBase[TwoDIdx, str | None]): attrname: ClassVar[Literal["layers"]] = "layers" axes: ClassVar[tuple[Literal[0], Literal[1]]] = (0, 1) + is_none_backed: bool + def __bool__(self) -> bool: return not self.keys() <= {None} @@ -380,7 +382,11 @@ def __contains__(self, key: str | None) -> bool: class LayersView(AlignedView[LayersBase, TwoDIdx, str | None], LayersBase): - pass + def __init__( + self, parent_mapping: LayersBase, parent_view: AnnData, subset_idx: TwoDIdx + ) -> None: + super().__init__(parent_mapping, parent_view, subset_idx) + self.is_none_backed = parent_mapping.is_none_backed LayersBase._view_class = LayersView diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index cc2c2f9ce..6dd5ce16b 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -961,11 +961,7 @@ def uns_keys(self) -> list[str]: @property def isbacked(self) -> bool: """`True` if object is backed on disk, `False` otherwise.""" - is_filename_none = self.filename is not None - is_x_none = ( - self._adata_ref.layers if self._is_view else self.layers - ).is_none_backed - return is_filename_none and is_x_none + return self.filename is not None and self.layers.is_none_backed @property def is_view(self) -> bool: diff --git a/tests/test_backed_hdf5.py 
b/tests/test_backed_hdf5.py index a4939387f..34571793c 100644 --- a/tests/test_backed_hdf5.py +++ b/tests/test_backed_hdf5.py @@ -246,6 +246,7 @@ def test_backed_raw_subset( backed_adata = ad.read_h5ad(backed_pth, backed="r") backed_v = backed_adata[obs_idx, var_idx] assert backed_v.is_view + assert backed_v.isbacked mem_v = mem_adata[obs_idx, var_idx] # Value equivalent From 05364d0ec7bcf9812a04159183a028662a0aa6d4 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 28 Apr 2026 14:20:34 +0200 Subject: [PATCH 43/51] style --- src/anndata/_io/h5ad.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/anndata/_io/h5ad.py b/src/anndata/_io/h5ad.py index 17c94ba34..a77af5219 100644 --- a/src/anndata/_io/h5ad.py +++ b/src/anndata/_io/h5ad.py @@ -90,24 +90,25 @@ def write_h5ad( _write_raw( f, adata.raw, as_dense=as_dense, dataset_kwargs=dataset_kwargs ) - else: - if k == "layers": - if None in elem: - _write_x( - f, - adata, # accessing adata.X reopens adata.file if it’s backed - is_backed=adata.isbacked and adata.filename == filepath, - as_dense=as_dense, - dataset_kwargs=dataset_kwargs, - ) - elem = {k: v for k, v in elem.items() if k is not None} - - write_elem( - f, - k, - dict(elem) if isinstance(elem, MutableMapping) else elem, - dataset_kwargs=dataset_kwargs, - ) + continue + + if k == "layers": + if None in elem: + _write_x( + f, + adata, # accessing adata.X reopens adata.file if it’s backed + is_backed=adata.isbacked and adata.filename == filepath, + as_dense=as_dense, + dataset_kwargs=dataset_kwargs, + ) + elem = {k: v for k, v in elem.items() if k is not None} + + write_elem( + f, + k, + dict(elem) if isinstance(elem, MutableMapping) else elem, + dataset_kwargs=dataset_kwargs, + ) def _write_x( From 691cac016a41928d88ece80d4647d927fe1429b2 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 28 Apr 2026 14:20:59 +0200 Subject: [PATCH 44/51] style --- src/anndata/experimental/multi_files/_anncollection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/experimental/multi_files/_anncollection.py b/src/anndata/experimental/multi_files/_anncollection.py index 22ce5ad85..b1ed27b78 100644 --- a/src/anndata/experimental/multi_files/_anncollection.py +++ b/src/anndata/experimental/multi_files/_anncollection.py @@ -56,7 +56,7 @@ def check_type(attr, key=None): arrs = [] for a in adatas: attr_arr = getattr(a, attr) - if (key is None and attr == "layers") or key is not None: + if key is not None or attr == "layers": attr_arr = attr_arr[key] arrs.append(attr_arr) # hacky but numpy find_common_type doesn't work with categoricals From 4e2e76d68b3c40fdc5103e32fd707a4b2f80e34b Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 28 Apr 2026 14:21:25 +0200 Subject: [PATCH 45/51] style --- src/anndata/tests/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index f264eee67..3e5a45341 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -767,7 +767,7 @@ def assert_equal_mapping( a: Mapping, b: object, *, exact: bool = False, elem_name: str | None = None ): assert isinstance(b, Mapping) - assert set(a) == set(b), format_msg(elem_name) + f" {a.keys()} != {b.keys()}" + assert set(a) == set(b), f"{format_msg(elem_name)} {a.keys()} != {b.keys()}" for k in a: if elem_name is None: elem_name = "" From 233520c32e6c9b87bfb85a8072ee6fecda3beb61 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 28 Apr 2026 14:21:52 +0200 Subject: [PATCH 46/51] resolve TODO --- tests/test_layers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_layers.py b/tests/test_layers.py index c153eac10..9147a2db2 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -21,7 +21,6 @@ def X(request): def test_creation(X: np.ndarray | None): adata = AnnData(X=X, layers=dict(L=L.copy())) - # TODO: phil, when reviewing this, you had { "L", None } in both cases before - but if `X` is `None` should the key be there? assert adata.layers.keys() == {"L", None} if X is not None else {"L"} assert "L" in adata.layers assert "X" not in adata.layers From cb30a50d4382391bc65307fa8c7d9b55cbbe23eb Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Tue, 28 Apr 2026 14:22:41 +0200 Subject: [PATCH 47/51] comment --- tests/test_repr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_repr.py b/tests/test_repr.py index bd3df126d..a73f37518 100644 --- a/tests/test_repr.py +++ b/tests/test_repr.py @@ -60,6 +60,6 @@ def test_removal(adata, adata_attr): assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) delattr(adata, attr) if attr != "layers": - assert re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) is None - else: + assert not re.search(rf"^\s+{attr}:.*$", repr(adata), flags=re.MULTILINE) + else: # `del adata.layers` doesn’t delete `X` for backwards compat reasons assert re.search(r"^\s+layers: None.*$", repr(adata), flags=re.MULTILINE) From 49978e7c9185c921ae6505a369ed690fff1e441d Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Tue, 28 Apr 2026 14:40:52 +0200 Subject: [PATCH 48/51] relnotes --- docs/release-notes/1707.breaking.md | 1 + docs/release-notes/1707.feat.md | 1 + pyproject.toml | 3 --- 3 files changed, 2 insertions(+), 3 deletions(-) create mode 100644 docs/release-notes/1707.breaking.md create mode 100644 docs/release-notes/1707.feat.md diff --git a/docs/release-notes/1707.breaking.md b/docs/release-notes/1707.breaking.md new file mode 100644 index 000000000..22d1ada6c --- /dev/null +++ b/docs/release-notes/1707.breaking.md @@ -0,0 +1 @@ +Remove `dtype` argument from {class}`~anndata.AnnData` constructor. {user}`flying-sheep` diff --git a/docs/release-notes/1707.feat.md b/docs/release-notes/1707.feat.md new file mode 100644 index 000000000..f8127bcd5 --- /dev/null +++ b/docs/release-notes/1707.feat.md @@ -0,0 +1 @@ +Move {attr}`~anndata.AnnData.X` into {attr}`anndata.AnnData.layers` under the `None` key {user}`flying-sheep` & {user}`ilan-gold` diff --git a/pyproject.toml b/pyproject.toml index 0db2a7fa6..c775e211b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -287,6 +287,3 @@ fragment.perf.name = "Performance" fragment.chore.name = "Miscellaneous changes" fragment.revert.name = "Revert" fragment.breaking.name = "Breaking changes" # add `!` to commit type (e.g. 
“feature!:”) - -[tool.uv] -override-dependencies = [ "virtualenv<21" ] From 64188b1c9c72141626db85f172234f67fca7699b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 5 May 2026 13:34:13 +0200 Subject: [PATCH 49/51] fix: add warning --- docs/conf.py | 1 + src/anndata/_core/merge.py | 16 ++++++++++++++-- src/anndata/utils.py | 9 +++++++++ tests/test_concatenate.py | 14 +++++++++++--- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 1304933e3..359de62c1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -181,6 +181,7 @@ def res( ("py:class", "anndata.acc.GenericAlias"), ("py:obj", "typing.R"), ("py:class", "_M"), + ("py:class", "anndata.utils.Default"), ] # -- Social cards --------------------------------------------------------- diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index b26bc5f50..b63b7bf84 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -30,7 +30,7 @@ DaskArray, has_xp, ) -from ..utils import asarray, axis_len, warn, warn_once +from ..utils import Default, asarray, axis_len, warn, warn_once from .anndata import AnnData from .index import _subset, make_slice from .xarray import Dataset2D @@ -1422,7 +1422,7 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 adatas: Collection[AnnData] | Mapping[str, AnnData], *, axis: Literal["obs", 0, "var", 1] = "obs", - join: Join_T = "inner", + join: Join_T | Default = Default("inner"), # noqa: B008 merge: StrategiesLiteral | Callable | None = None, uns_merge: StrategiesLiteral | Callable | None = None, label: str | None = None, @@ -1653,6 +1653,18 @@ def concat( # noqa: PLR0912, PLR0913, PLR0915 else: adatas = list(adatas) + if isinstance(join, Default): + join = join.val + if (num_xs := sum(a.X is not None for a in adatas)) > 0 and num_xs < len( + adatas + ): + msg = ( + "Some Xs are None and non-explicit join found - Xs will be dropped, which matches the behavior of `layers`." 
+ "This warning will be removed in the next minor release, 0.14." + "To silence this warning pass in an explicit `join` parameter." + ) + warn(msg, UserWarning) + if keys is None: keys = np.arange(len(adatas)).astype(str) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 2dabbe0b6..19d8172e2 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -2,6 +2,7 @@ import re import warnings +from dataclasses import dataclass from functools import partial, singledispatch from types import FunctionType, UnionType from typing import TYPE_CHECKING, Literal, TypeAliasType, get_args, get_origin @@ -459,3 +460,11 @@ def iter_outer( yield (attr_name, getattr(adata, attr_name)) if was_closed: adata.file.close() + + +@dataclass +class Default[T]: + val: T + + def __repr__(self) -> str: + return repr(self.val) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 2fa87d651..71ab664f3 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -2,6 +2,7 @@ import warnings from collections.abc import Hashable +from contextlib import nullcontext from copy import deepcopy from functools import partial, singledispatch from importlib.metadata import version @@ -1599,7 +1600,8 @@ def test_concatenate_size_0_axis(): assert concat([a, b]).shape == (10, 0) -def test_concat_null_X(use_xdataset): +@pytest.mark.parametrize("all_none", [True, False], ids=["all_none", "some_none"]) +def test_concat_null_X(*, use_xdataset: bool, all_none: bool): adatas_orig = { k: gen_adata((20, 10), obs_xdataset=use_xdataset, var_xdataset=use_xdataset) for k in list("abc") @@ -1607,11 +1609,17 @@ def test_concat_null_X(use_xdataset): adatas_no_X = {} for k, v in adatas_orig.items(): v = v.copy() - del v.X + if k == "a" or all_none: + del v.X adatas_no_X[k] = v orig = concat(adatas_orig, index_unique="-") - no_X = concat(adatas_no_X, index_unique="-") + with ( + pytest.warns(UserWarning, match=r"Some Xs are None") + if not all_none + else nullcontext() + ): + 
no_X = concat(adatas_no_X, index_unique="-") del orig.X assert_equal(no_X, orig) From eab6a5d4b7fa9caf446e33646bc54688fffdf884 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 7 May 2026 13:57:53 +0200 Subject: [PATCH 50/51] relnote --- docs/release-notes/1707.feat.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release-notes/1707.feat.md b/docs/release-notes/1707.feat.md index f8127bcd5..3ca22dea7 100644 --- a/docs/release-notes/1707.feat.md +++ b/docs/release-notes/1707.feat.md @@ -1 +1,3 @@ -Move {attr}`~anndata.AnnData.X` into {attr}`anndata.AnnData.layers` under the `None` key {user}`flying-sheep` & {user}`ilan-gold` +Move {attr}`~anndata.AnnData.X` into {attr}`anndata.AnnData.layers` under the `None` key. +As a result, `layers: X` shows up in `AnnData`’s text representation, `None` appears in `.layers.items()`/`.keys()`, and {func}`~anndata.concat` now works when some `AnnData`s have no `X`. +{user}`flying-sheep` & {user}`ilan-gold` From 3d8f4bd85cc6b59cef06678e81e28fd71f861919 Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 7 May 2026 14:05:59 +0200 Subject: [PATCH 51/51] test for warning --- tests/test_concatenate.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 71ab664f3..37a8e6b75 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -1600,8 +1600,12 @@ def test_concatenate_size_0_axis(): assert concat([a, b]).shape == (10, 0) -@pytest.mark.parametrize("all_none", [True, False], ids=["all_none", "some_none"]) -def test_concat_null_X(*, use_xdataset: bool, all_none: bool): +@pytest.mark.parametrize( + ("all_none", "implicit_join"), + [(True, False), (False, False), (False, True)], + ids=["all_none", "some_none", "some_none-warn"], +) +def test_concat_null_X(*, use_xdataset: bool, all_none: bool, implicit_join: bool): adatas_orig = { k: gen_adata((20, 10), obs_xdataset=use_xdataset, var_xdataset=use_xdataset) for k in list("abc") @@ -1616,10 +1620,14 @@ def test_concat_null_X(*, use_xdataset: bool, all_none: bool): orig = concat(adatas_orig, index_unique="-") with ( pytest.warns(UserWarning, match=r"Some Xs are None") - if not all_none + if not all_none and implicit_join else nullcontext() ): - no_X = concat(adatas_no_X, index_unique="-") + no_X = ( + concat(adatas_no_X, index_unique="-") + if implicit_join + else concat(adatas_no_X, index_unique="-", join="inner") + ) del orig.X assert_equal(no_X, orig)