From ed4a1d7c65d7638889226a716b9fc0787808d7a5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 Dec 2025 11:05:49 +0100 Subject: [PATCH 01/11] feat: api for dataframes --- src/anndata/_core/aligned_df.py | 9 +- src/anndata/_core/aligned_mapping.py | 5 +- src/anndata/_core/anndata.py | 11 +- src/anndata/_core/index.py | 20 +- src/anndata/_core/merge.py | 5 +- src/anndata/_core/storage.py | 10 + src/anndata/_types.py | 108 ++++++- tests/test_dataframe_protocol.py | 407 +++++++++++++++++++++++++++ 8 files changed, 548 insertions(+), 27 deletions(-) create mode 100644 tests/test_dataframe_protocol.py diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 722e881b7..880c08d38 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -7,6 +7,7 @@ import pandas as pd from pandas.api.types import is_string_dtype +from .._types import DataFrameLike from .._warnings import ImplicitModificationWarning from ..compat import XDataset from ..utils import warn @@ -25,7 +26,13 @@ def _gen_dataframe( source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, -) -> pd.DataFrame: # pragma: no cover +) -> DataFrameLike: # pragma: no cover + # Check if anno satisfies the DataFrameLike protocol + # This allows any DataFrameLike-compliant object to be used as obs/var + if isinstance(anno, DataFrameLike): + if length is not None and anno.shape[0] != length: + raise _mk_df_error(source, attr, length, anno.shape[0]) + return anno msg = f"Cannot convert {type(anno)} to {attr} DataFrame" raise ValueError(msg) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 3ac1c33d7..32827cb94 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd +from .._types import DataFrameLike from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset from ..utils import ( @@ -36,8 +37,8 @@ OneDIdx = Sequence[int] | Sequence[bool] | slice TwoDIdx = tuple[OneDIdx, OneDIdx] -# TODO: pd.DataFrame only allowed in AxisArrays? -Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray +# DataFrameLike encompasses pd.DataFrame and Dataset2D +Value = DataFrameLike | CSMatrix | CSArray | np.ndarray class AlignedMappingBase[I: OneDIdx](MutableMapping[str, Value], ABC): diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 35a679d20..c2c27fd8e 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -53,6 +53,7 @@ from zarr.storage import StoreLike + from .._types import DataFrameLike from ..compat import Index1D, Index1DNorm, XDataset from ..typing import XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView @@ -757,7 +758,7 @@ def n_vars(self) -> int: """Number of variables/features.""" return len(self.var_names) - def _set_dim_df(self, value: pd.DataFrame | XDataset, attr: Literal["obs", "var"]): + def _set_dim_df(self, value: DataFrameLike | XDataset, attr: Literal["obs", "var"]): value = _gen_dataframe( value, [f"{attr}_names", f"{'row' if attr == 'obs' else 'col'}_names"], @@ -819,12 +820,12 @@ def _set_dim_index(self, value: pd.Index, attr: str): v.index = value @property - def obs(self) -> pd.DataFrame | Dataset2D: + def obs(self) -> DataFrameLike: """One-dimensional annotation of observations (`pd.DataFrame`).""" return self._obs @obs.setter - def obs(self, value: pd.DataFrame | XDataset): + def obs(self, value: DataFrameLike | XDataset): self._set_dim_df(value, "obs") @obs.deleter @@ -842,12 +843,12 @@ def obs_names(self, names: Sequence[str]): self._set_dim_index(names, "obs") @property - def var(self) -> pd.DataFrame | Dataset2D: + def var(self) -> DataFrameLike: """One-dimensional annotation of variables/ features (`pd.DataFrame`).""" return self._var @var.setter - def var(self, value: pd.DataFrame | XDataset): + def var(self, value: DataFrameLike | XDataset): self._set_dim_df(value, "var") @var.deleter diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 3b92a99ac..5e3fe2c72 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -10,8 +10,8 @@ import pandas as pd from scipy.sparse import issparse +from .._types import DataFrameLike from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray -from .xarray import Dataset2D if TYPE_CHECKING: from numpy.typing import NDArray @@ -42,7 +42,7 @@ def _normalize_index( # noqa: PLR0911, PLR0912 ) -> Index1DNorm | int | np.integer: # TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough. if not isinstance(index, pd.RangeIndex) and index.dtype in (np.float64, np.int64): - msg = f"Don’t call _normalize_index with non-categorical/string names and non-range index {index}" + msg = f"Don't call _normalize_index with non-categorical/string names and non-range index {index}" raise TypeError(msg) # the following is insanely slow for sequences, @@ -90,7 +90,7 @@ def name_idx(i): elif issubclass(indexer.dtype.type, np.bool_): if indexer.shape != index.shape: msg = ( - f"Boolean index does not match AnnData’s shape along this " + f"Boolean index does not match AnnData's shape along this " f"dimension. Boolean index has shape {indexer.shape} while " f"AnnData index has shape {index.shape}." ) @@ -164,9 +164,12 @@ def unpack_index(index: Index) -> tuple[Index1D, Index1D]: @singledispatch def _subset( - a: np.ndarray | pd.DataFrame, + a: np.ndarray | DataFrameLike, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm], ): + # Check for DataFrameLike objects (pd.DataFrame, Dataset2D, etc.) + if isinstance(a, DataFrameLike): + return a.iloc[subset_idx] # Select as combination of indexes, not coordinates # Correcting for indexing behaviour of np.ndarray if all(isinstance(x, Iterable) for x in subset_idx): @@ -200,15 +203,6 @@ def _subset_sparse( return a[subset_idx] -@_subset.register(pd.DataFrame) -@_subset.register(Dataset2D) -def _subset_df( - df: pd.DataFrame | Dataset2D, - subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm], -): - return df.iloc[subset_idx] - - @_subset.register(AwkArray) def _subset_awkarray( a: AwkArray, subset_idx: tuple[Index1DNorm] | tuple[Index1DNorm, Index1DNorm] diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index a4bec22c3..9922c85ce 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -20,6 +20,7 @@ from anndata._core.file_backing import to_memory from anndata._warnings import ExperimentalFeatureWarning +from .._types import DataFrameLike from ..compat import ( AwkArray, CSArray, @@ -574,7 +575,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911 """ if self.no_change and (axis_len(el, axis) == len(self.old_idx)): return el - if isinstance(el, pd.DataFrame | Dataset2D): + if isinstance(el, DataFrameLike): return self._apply_to_df_like(el, axis=axis, fill_value=fill_value) elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) @@ -587,7 +588,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911 else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) - def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=None): + def _apply_to_df_like(self, el: DataFrameLike, *, axis, fill_value=None): if fill_value is None: fill_value = np.nan return el.reindex(self.new_idx, axis=axis, fill_value=fill_value) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index b7a63d785..95de80614 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -8,6 +8,7 @@ from anndata.compat import CSArray, CSMatrix +from .._types import DataFrameLike from .._warnings import ImplicitModificationWarning from ..compat import XDataset from ..utils import ( @@ -59,6 +60,15 @@ def coerce_array( if allow_df: raise_value_error_if_multiindex_columns(value, name) return value if allow_df else ensure_df_homogeneous(value, name) + # Handle other DataFrameLike objects (not pd.DataFrame) + if isinstance(value, DataFrameLike): + if allow_df: + return value + # For non-DataFrames, we can't use ensure_df_homogeneous + # so we convert to array via iloc + msg = f"DataFrameLike object used for {name} will be converted to array." + warn(msg, ImplicitModificationWarning) + return np.array(value.iloc[:, :]) # if value is an array-like object, try to convert it e = None if allow_array_like: diff --git a/src/anndata/_types.py b/src/anndata/_types.py index cbec38aa1..da23bdb5b 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,17 +4,19 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Protocol +from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable -from . import typing from .compat import H5Array, H5Group, ZarrArray, ZarrGroup if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any + from typing import Any, Self + + import pandas as pd from anndata._core.xarray import Dataset2D + from . import typing from ._io.specs.registry import ( IOSpec, LazyDataStructures, @@ -26,6 +28,8 @@ __all__ = [ "ArrayStorageType", + "DataFrameLike", + "DataFrameLikeIlocIndexer", "GroupStorageType", "StorageType", "_ReadInternal", @@ -39,7 +43,103 @@ # circumvent https://github.com/tox-dev/sphinx-autodoc-typehints/issues/580 type S = StorageType -type RWAble = typing.RWAble +type RWAble = "typing.RWAble" + + +@runtime_checkable +class DataFrameLikeIlocIndexer(Protocol): + """Protocol for iloc-style indexers on DataFrame-like objects. + + This protocol defines the minimal interface for positional-based indexing + that AnnData requires. Both :class:`pandas.DataFrame` and + :class:`~anndata.experimental.backed.Dataset2D` provide compatible + ``iloc`` accessors. + + Examples + -------- + >>> import pandas as pd + >>> from anndata._types import DataFrameLikeIlocIndexer + >>> df = pd.DataFrame({"a": [1, 2, 3]}) + >>> isinstance(df.iloc, DataFrameLikeIlocIndexer) + True + """ + + def __getitem__(self, idx: Any) -> Self: ... + + +@runtime_checkable +class DataFrameLike(Protocol): + """Protocol for DataFrame-like objects usable in AnnData. + + This runtime-checkable protocol defines the minimal DataFrame API that + AnnData uses internally for ``obs``, ``var``, and similar dataframe-like + data containers. Any class implementing this protocol can be used as a + drop-in replacement for :class:`pandas.DataFrame` in these contexts. + + The required interface includes: + + - :attr:`index`: Row labels as a :class:`pandas.Index` + - :attr:`columns`: Column labels as a :class:`pandas.Index` + - :attr:`shape`: Tuple of (n_rows, n_columns) + - :attr:`iloc`: Positional indexer returning a :class:`DataFrameLikeIlocIndexer` + - :meth:`reindex`: Method to reindex rows + + Examples + -------- + >>> import pandas as pd + >>> from anndata._types import DataFrameLike + >>> df = pd.DataFrame({"a": [1, 2, 3]}) + >>> isinstance(df, DataFrameLike) + True + + See Also + -------- + :class:`~anndata.experimental.backed.Dataset2D` + An xarray-based implementation of this protocol. + """ + + @property + def index(self) -> pd.Index: + """Row labels of the DataFrame-like object.""" + ... + + @property + def columns(self) -> pd.Index: + """Column labels of the DataFrame-like object.""" + ... + + @property + def shape(self) -> tuple[int, int]: + """Shape of the DataFrame-like object as (n_rows, n_columns).""" + ... + + @property + def iloc(self) -> DataFrameLikeIlocIndexer: + """Positional indexer for the DataFrame-like object.""" + ... + + def reindex( + self, + index: pd.Index | None = None, + axis: Literal[0] = 0, + fill_value: Any = ..., + ) -> Self: + """Reindex the DataFrame-like object to match a new index. + + Parameters + ---------- + index + New index to conform to. + axis + Axis to reindex along (only 0 is supported). + fill_value + Value to use for missing values. + + Returns + ------- + Reindexed DataFrame-like object. + """ + ... class Dataset2DIlocIndexer(Protocol): diff --git a/tests/test_dataframe_protocol.py b/tests/test_dataframe_protocol.py new file mode 100644 index 000000000..62b9371b5 --- /dev/null +++ b/tests/test_dataframe_protocol.py @@ -0,0 +1,407 @@ +"""Tests for the DataFrameLike protocol.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import numpy as np +import pandas as pd +import pytest + +from anndata._types import DataFrameLike, DataFrameLikeIlocIndexer + +if TYPE_CHECKING: + from typing import Any, Literal, Self + + +class MockIlocIndexer: + """Mock iloc indexer for testing.""" + + def __init__(self, data: pd.DataFrame): + self._data = data + + def __getitem__(self, idx: Any) -> MockDataFrame: + result = self._data.iloc[idx] + if isinstance(result, pd.DataFrame): + return MockDataFrame(result) + # For single row selection, wrap in DataFrame + return MockDataFrame(pd.DataFrame([result])) + + +class MockDataFrame: + """A minimal DataFrame-like class for testing the protocol.""" + + def __init__(self, data: pd.DataFrame): + self._data = data + + @property + def index(self) -> pd.Index: + return self._data.index + + @property + def columns(self) -> pd.Index: + return self._data.columns + + @property + def shape(self) -> tuple[int, int]: + return self._data.shape + + @property + def iloc(self) -> MockIlocIndexer: + return MockIlocIndexer(self._data) + + def reindex( + self, + index: pd.Index | None = None, + axis: Literal[0] = 0, + fill_value: Any = np.nan, + ) -> Self: + # axis=0 is the default; don't pass it when index is specified + # since pandas doesn't allow both keyword arguments together + return MockDataFrame(self._data.reindex(index=index, fill_value=fill_value)) + + +class TestDataFrameLikeProtocol: + """Test the DataFrameLike protocol with different implementations.""" + + @pytest.fixture + def sample_df(self) -> pd.DataFrame: + """Create a sample pandas DataFrame for testing.""" + return pd.DataFrame( + {"a": [1, 2, 3], "b": [4.0, 5.0, 6.0], "c": ["x", "y", "z"]}, + index=["row1", "row2", "row3"], + ) + + def test_pandas_dataframe_is_dataframe_like(self, sample_df: pd.DataFrame): + """pd.DataFrame should satisfy the DataFrameLike protocol.""" + assert isinstance(sample_df, DataFrameLike) + + def test_pandas_iloc_is_iloc_indexer(self, sample_df: pd.DataFrame): + """pd.DataFrame.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" + assert isinstance(sample_df.iloc, DataFrameLikeIlocIndexer) + + def test_mock_dataframe_is_dataframe_like(self, sample_df: pd.DataFrame): + """MockDataFrame should satisfy the DataFrameLike protocol.""" + mock_df = MockDataFrame(sample_df) + assert isinstance(mock_df, DataFrameLike) + + def test_mock_iloc_is_iloc_indexer(self, sample_df: pd.DataFrame): + """MockDataFrame.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" + mock_df = MockDataFrame(sample_df) + assert isinstance(mock_df.iloc, DataFrameLikeIlocIndexer) + + def test_dataframe_like_has_required_properties(self, sample_df: pd.DataFrame): + """Verify DataFrameLike objects have the required properties.""" + for df in [sample_df, MockDataFrame(sample_df)]: + assert hasattr(df, "index") + assert hasattr(df, "columns") + assert hasattr(df, "shape") + assert hasattr(df, "iloc") + assert hasattr(df, "reindex") + + def test_dataframe_like_index(self, sample_df: pd.DataFrame): + """Verify index property returns a pd.Index.""" + mock_df = MockDataFrame(sample_df) + assert isinstance(mock_df.index, pd.Index) + pd.testing.assert_index_equal(mock_df.index, sample_df.index) + + def test_dataframe_like_columns(self, sample_df: pd.DataFrame): + """Verify columns property returns a pd.Index.""" + mock_df = MockDataFrame(sample_df) + assert isinstance(mock_df.columns, pd.Index) + pd.testing.assert_index_equal(mock_df.columns, sample_df.columns) + + def test_dataframe_like_shape(self, sample_df: pd.DataFrame): + """Verify shape property returns correct tuple.""" + mock_df = MockDataFrame(sample_df) + assert mock_df.shape == sample_df.shape + assert mock_df.shape == (3, 3) + + def test_dataframe_like_iloc(self, sample_df: pd.DataFrame): + """Verify iloc indexer works correctly.""" + mock_df = MockDataFrame(sample_df) + + # Test single row selection + result = mock_df.iloc[0] + assert isinstance(result, DataFrameLike) + + # Test slice selection + result = mock_df.iloc[0:2] + assert isinstance(result, DataFrameLike) + assert result.shape[0] == 2 + + def test_dataframe_like_reindex(self, sample_df: pd.DataFrame): + """Verify reindex method works correctly.""" + mock_df = MockDataFrame(sample_df) + new_index = pd.Index(["row1", "row2", "row4"]) + + result = mock_df.reindex(index=new_index, fill_value=-1) + assert isinstance(result, DataFrameLike) + pd.testing.assert_index_equal(result.index, new_index) + + def test_non_dataframe_is_not_dataframe_like(self): + """Objects that don't implement the protocol should not match.""" + assert not isinstance([], DataFrameLike) + assert not isinstance({}, DataFrameLike) + assert not isinstance("string", DataFrameLike) + assert not isinstance(42, DataFrameLike) + assert not isinstance(np.array([1, 2, 3]), DataFrameLike) + + +@pytest.mark.usefixtures("xr_available") +class TestDataset2DIsDataFrameLike: + """Test that Dataset2D satisfies the DataFrameLike protocol.""" + + @pytest.fixture + def xr_available(self): + """Skip tests if xarray is not available.""" + pytest.importorskip("xarray") + + @pytest.fixture + def sample_dataset2d(self): + """Create a sample Dataset2D for testing.""" + from anndata._core.xarray import Dataset2D + from anndata.compat import XDataset + + ds = XDataset( + { + "a": (["idx"], [1, 2, 3]), + "b": (["idx"], [4.0, 5.0, 6.0]), + }, + coords={"idx": ["row1", "row2", "row3"]}, + ) + return Dataset2D(ds) + + def test_dataset2d_is_dataframe_like(self, sample_dataset2d): + """Dataset2D should satisfy the DataFrameLike protocol.""" + assert isinstance(sample_dataset2d, DataFrameLike) + + def test_dataset2d_iloc_is_iloc_indexer(self, sample_dataset2d): + """Dataset2D.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" + assert isinstance(sample_dataset2d.iloc, DataFrameLikeIlocIndexer) + + def test_dataset2d_has_required_properties(self, sample_dataset2d): + """Verify Dataset2D has the required properties.""" + assert hasattr(sample_dataset2d, "index") + assert hasattr(sample_dataset2d, "columns") + assert hasattr(sample_dataset2d, "shape") + assert hasattr(sample_dataset2d, "iloc") + assert hasattr(sample_dataset2d, "reindex") + + def test_dataset2d_properties_return_correct_types(self, sample_dataset2d): + """Verify Dataset2D properties return correct types.""" + assert isinstance(sample_dataset2d.index, pd.Index) + assert isinstance(sample_dataset2d.columns, pd.Index) + assert isinstance(sample_dataset2d.shape, tuple) + assert len(sample_dataset2d.shape) == 2 + + +class TestDataFrameLikeWithAnnData: + """Test that DataFrameLike protocol works correctly with AnnData objects.""" + + @pytest.fixture + def simple_adata(self): + """Create a simple AnnData object for testing.""" + import anndata as ad + + return ad.AnnData( + X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + obs=pd.DataFrame( + {"cell_type": ["A", "B", "C"]}, + index=["cell1", "cell2", "cell3"], + ), + var=pd.DataFrame( + {"gene_name": ["g1", "g2", "g3"]}, + index=["gene1", "gene2", "gene3"], + ), + ) + + def test_adata_obs_is_dataframe_like(self, simple_adata): + """AnnData.obs should satisfy the DataFrameLike protocol.""" + assert isinstance(simple_adata.obs, DataFrameLike) + + def test_adata_var_is_dataframe_like(self, simple_adata): + """AnnData.var should satisfy the DataFrameLike protocol.""" + assert isinstance(simple_adata.var, DataFrameLike) + + def test_adata_obs_has_required_properties(self, simple_adata): + """Verify AnnData.obs has all required DataFrameLike properties.""" + obs = simple_adata.obs + assert hasattr(obs, "index") + assert hasattr(obs, "columns") + assert hasattr(obs, "shape") + assert hasattr(obs, "iloc") + assert hasattr(obs, "reindex") + + def test_adata_obs_iloc_subsetting(self, simple_adata): + """Verify iloc subsetting works on AnnData.obs.""" + obs = simple_adata.obs + subset = obs.iloc[0:2] + assert isinstance(subset, DataFrameLike) + assert subset.shape[0] == 2 + + def test_adata_subset_preserves_dataframe_like(self, simple_adata): + """Verify subsetting AnnData preserves DataFrameLike for obs/var.""" + adata_subset = simple_adata[0:2, 0:2] + assert isinstance(adata_subset.obs, DataFrameLike) + assert isinstance(adata_subset.var, DataFrameLike) + assert adata_subset.obs.shape[0] == 2 + assert adata_subset.var.shape[0] == 2 + + def test_adata_copy_preserves_dataframe_like(self, simple_adata): + """Verify copying AnnData preserves DataFrameLike for obs/var.""" + adata_copy = simple_adata.copy() + assert isinstance(adata_copy.obs, DataFrameLike) + assert isinstance(adata_copy.var, DataFrameLike) + + def test_set_obs_with_dataframe(self, simple_adata): + """Verify setting obs with pd.DataFrame works.""" + new_obs = pd.DataFrame( + {"new_col": [1, 2, 3]}, + index=["cell1", "cell2", "cell3"], + ) + simple_adata.obs = new_obs + assert isinstance(simple_adata.obs, DataFrameLike) + assert "new_col" in simple_adata.obs.columns + + def test_set_var_with_dataframe(self, simple_adata): + """Verify setting var with pd.DataFrame works.""" + new_var = pd.DataFrame( + {"new_col": [1, 2, 3]}, + index=["gene1", "gene2", "gene3"], + ) + simple_adata.var = new_var + assert isinstance(simple_adata.var, DataFrameLike) + assert "new_col" in simple_adata.var.columns + + +class TestCustomDataFrameLikeWithAnnData: + """Test that custom DataFrameLike implementations work with AnnData.""" + + def test_init_adata_with_custom_dataframe_like_obs(self): + """Verify AnnData can be initialized with a custom DataFrameLike obs.""" + import anndata as ad + + # Create a custom DataFrameLike object + obs_df = pd.DataFrame( + {"cell_type": ["A", "B", "C"]}, + index=["cell1", "cell2", "cell3"], + ) + mock_obs = MockDataFrame(obs_df) + + # Verify MockDataFrame satisfies the protocol + assert isinstance(mock_obs, DataFrameLike) + + # Create AnnData with custom DataFrameLike + adata = ad.AnnData( + X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + obs=mock_obs, + var=pd.DataFrame( + {"gene_name": ["g1", "g2", "g3"]}, + index=["gene1", "gene2", "gene3"], + ), + ) + + # Verify obs is the MockDataFrame (unchanged) + assert isinstance(adata.obs, DataFrameLike) + assert adata.obs.shape[0] == 3 + + def test_init_adata_with_custom_dataframe_like_var(self): + """Verify AnnData can be initialized with a custom DataFrameLike var.""" + import anndata as ad + + # Create a custom DataFrameLike object + var_df = pd.DataFrame( + {"gene_name": ["g1", "g2", "g3"]}, + index=["gene1", "gene2", "gene3"], + ) + mock_var = MockDataFrame(var_df) + + # Verify MockDataFrame satisfies the protocol + assert isinstance(mock_var, DataFrameLike) + + # Create AnnData with custom DataFrameLike var + adata = ad.AnnData( + X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + obs=pd.DataFrame( + {"cell_type": ["A", "B", "C"]}, + index=["cell1", "cell2", "cell3"], + ), + var=mock_var, + ) + + # Verify var is the MockDataFrame (unchanged) + assert isinstance(adata.var, DataFrameLike) + assert adata.var.shape[0] == 3 + + def test_custom_dataframe_like_length_validation(self): + """Verify length validation works for custom DataFrameLike.""" + import anndata as ad + + # Create a custom DataFrameLike with wrong length + obs_df = pd.DataFrame( + {"cell_type": ["A", "B"]}, # Only 2 rows, but X has 3 + index=["cell1", "cell2"], + ) + mock_obs = MockDataFrame(obs_df) + + # Should raise ValueError due to length mismatch + with pytest.raises(ValueError, match="must have as many rows"): + ad.AnnData( + X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), + obs=mock_obs, + var=pd.DataFrame( + {"gene_name": ["g1", "g2", "g3"]}, + index=["gene1", "gene2", "gene3"], + ), + ) + + +@pytest.mark.usefixtures("xr_available") +class TestDataset2DWithAnnData: + """Test that Dataset2D works correctly as obs/var in AnnData.""" + + @pytest.fixture + def xr_available(self): + """Skip tests if xarray is not available.""" + pytest.importorskip("xarray") + + @pytest.fixture + def adata_with_dataset2d_obs(self): + """Create an AnnData with Dataset2D obs.""" + import anndata as ad + from anndata._core.xarray import Dataset2D + from anndata.compat import XDataset + + X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + # Create Dataset2D for obs + obs_ds = XDataset( + {"cell_type": (["idx"], ["A", "B", "C"])}, + coords={"idx": ["cell1", "cell2", "cell3"]}, + ) + obs = Dataset2D(obs_ds) + + var = pd.DataFrame( + {"gene_name": ["g1", "g2", "g3"]}, + index=["gene1", "gene2", "gene3"], + ) + + return ad.AnnData(X=X, obs=obs, var=var) + + def test_adata_with_dataset2d_obs_is_dataframe_like(self, adata_with_dataset2d_obs): + """AnnData with Dataset2D obs should satisfy DataFrameLike.""" + assert isinstance(adata_with_dataset2d_obs.obs, DataFrameLike) + + def test_adata_with_dataset2d_subset(self, adata_with_dataset2d_obs): + """Subsetting AnnData with Dataset2D obs should work.""" + adata_subset = adata_with_dataset2d_obs[0:2] + assert isinstance(adata_subset.obs, DataFrameLike) + assert adata_subset.obs.shape[0] == 2 + + def test_adata_with_dataset2d_obs_index(self, adata_with_dataset2d_obs): + """Dataset2D obs should have correct index.""" + obs = adata_with_dataset2d_obs.obs + pd.testing.assert_index_equal( + obs.index, pd.Index(["cell1", "cell2", "cell3"]), check_names=False + ) From 232d1baf4cf3f8aab9e2ccd37e223da228969697 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 16:48:11 +0100 Subject: [PATCH 02/11] fix: iloc --- src/anndata/_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index e57b842b0..900276d53 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -65,7 +65,7 @@ class DataFrameLikeIlocIndexer(Protocol): True """ - def __getitem__(self, idx: Any) -> Self: ... + def __getitem__(self, idx: Any) -> Any: ... @runtime_checkable From 3a7882ea878e84e43f3f95a0fc168983653cdb54 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:03:43 +0100 Subject: [PATCH 03/11] fix: maybe make `reindex` weaker? --- src/anndata/_types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 900276d53..1a3c2fe76 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -121,9 +121,11 @@ def iloc(self) -> DataFrameLikeIlocIndexer: def reindex( self, + *, index: pd.Index | None = None, - axis: Literal[0] = 0, + axis: Literal[0, 1] | None = 0, fill_value: Any = ..., + **kwargs, ) -> Self: """Reindex the DataFrame-like object to match a new index. From 287a6160587e544d04ad25009669100c94b0f5db Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:08:40 +0100 Subject: [PATCH 04/11] fix: setter --- src/anndata/_types.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 1a3c2fe76..74c6c2df3 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -109,6 +109,11 @@ def columns(self) -> pd.Index: """Column labels of the DataFrame-like object.""" ... + @columns.setter + def _(self, v: Any) -> None: + """Setter for columns""" + ... + @property def shape(self) -> tuple[int, int]: """Shape of the DataFrame-like object as (n_rows, n_columns).""" From 1fb50d6d64525da8010ddf422ecd19b55008b838 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:29:33 +0100 Subject: [PATCH 05/11] fix: put in `types.py` --- src/anndata/_core/aligned_df.py | 2 +- src/anndata/_core/aligned_mapping.py | 2 +- src/anndata/_core/anndata.py | 2 +- src/anndata/_core/index.py | 4 +- src/anndata/_core/merge.py | 2 +- src/anndata/_core/storage.py | 2 +- src/anndata/_types.py | 111 +-------------------------- src/anndata/types.py | 107 ++++++++++++++++++++++++++ 8 files changed, 117 insertions(+), 115 deletions(-) diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 82621f542..b35706122 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -7,9 +7,9 @@ import pandas as pd from pandas.api.types import is_string_dtype -from .._types import DataFrameLike from .._warnings import ImplicitModificationWarning from ..compat import XDataset, pandas_as_str +from ..types import DataFrameLike from ..utils import warn from .xarray import Dataset2D diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 32827cb94..70621b433 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -9,9 +9,9 @@ import numpy as np import pandas as pd -from .._types import DataFrameLike from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset +from ..types import DataFrameLike from ..utils import ( axis_len, convert_to_dict, diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 95f79557e..8a9b8f1af 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -61,7 +61,7 @@ from zarr.storage import StoreLike - from .._types import DataFrameLike + from ..types import DataFrameLike from ..typing import Index1D, _Index1DNorm, _XDataType from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView from .index import Index diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 1b90b4ba7..d1fb4cdd4 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -10,12 +10,12 @@ import pandas as pd from scipy.sparse import issparse -from .._types import DataFrameLike from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray if TYPE_CHECKING: from numpy.typing import NDArray + from ..types import DataFrameLike from ..typing import Index, Index1D, _Index1DNorm @@ -175,6 +175,8 @@ def _subset( subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm], ): # Check for DataFrameLike objects (pd.DataFrame, Dataset2D, etc.) + from ..types import DataFrameLike + if isinstance(a, DataFrameLike): return a.iloc[subset_idx] # Select as combination of indexes, not coordinates diff --git a/src/anndata/_core/merge.py b/src/anndata/_core/merge.py index 649ab3e6f..4d477ef38 100644 --- a/src/anndata/_core/merge.py +++ b/src/anndata/_core/merge.py @@ -20,7 +20,6 @@ from anndata._core.file_backing import to_memory from anndata._warnings import ExperimentalFeatureWarning -from .._types import DataFrameLike from ..compat import ( AwkArray, CSArray, @@ -30,6 +29,7 @@ CupySparseMatrix, DaskArray, ) +from ..types import DataFrameLike from ..utils import asarray, axis_len, warn, warn_once from .anndata import AnnData from .index import _subset, make_slice diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 22afcebb9..05942fdea 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -8,9 +8,9 @@ from anndata.compat import CSArray, CSMatrix -from .._types import DataFrameLike from .._warnings import ImplicitModificationWarning from ..compat import XDataset +from ..types import DataFrameLike from ..utils import ( ensure_df_homogeneous, get_union_members, diff --git a/src/anndata/_types.py b/src/anndata/_types.py index 74c6c2df3..e278e4041 100644 --- a/src/anndata/_types.py +++ b/src/anndata/_types.py @@ -4,16 +4,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Literal, Protocol, runtime_checkable +from typing import TYPE_CHECKING, Literal, Protocol from .compat import H5Array, H5Group, ZarrArray, ZarrGroup from .utils import set_module if TYPE_CHECKING: from collections.abc import Mapping - from typing import Any, Self, TypeAlias - - import pandas as pd + from typing import Any, TypeAlias from anndata._core.xarray import Dataset2D @@ -31,8 +29,6 @@ __all__ = [ - "DataFrameLike", - "DataFrameLikeIlocIndexer", "StorageType", "_ArrayStorageType", "_GroupStorageType", @@ -47,109 +43,6 @@ type StorageType = _ArrayStorageType | _GroupStorageType -@runtime_checkable -class DataFrameLikeIlocIndexer(Protocol): - """Protocol for iloc-style indexers on DataFrame-like objects. - - This protocol defines the minimal interface for positional-based indexing - that AnnData requires. Both :class:`pandas.DataFrame` and - :class:`~anndata.experimental.backed.Dataset2D` provide compatible - ``iloc`` accessors. - - Examples - -------- - >>> import pandas as pd - >>> from anndata._types import DataFrameLikeIlocIndexer - >>> df = pd.DataFrame({"a": [1, 2, 3]}) - >>> isinstance(df.iloc, DataFrameLikeIlocIndexer) - True - """ - - def __getitem__(self, idx: Any) -> Any: ... - - -@runtime_checkable -class DataFrameLike(Protocol): - """Protocol for DataFrame-like objects usable in AnnData. - - This runtime-checkable protocol defines the minimal DataFrame API that - AnnData uses internally for ``obs``, ``var``, and similar dataframe-like - data containers. Any class implementing this protocol can be used as a - drop-in replacement for :class:`pandas.DataFrame` in these contexts. - - The required interface includes: - - - :attr:`index`: Row labels as a :class:`pandas.Index` - - :attr:`columns`: Column labels as a :class:`pandas.Index` - - :attr:`shape`: Tuple of (n_rows, n_columns) - - :attr:`iloc`: Positional indexer returning a :class:`DataFrameLikeIlocIndexer` - - :meth:`reindex`: Method to reindex rows - - Examples - -------- - >>> import pandas as pd - >>> from anndata._types import DataFrameLike - >>> df = pd.DataFrame({"a": [1, 2, 3]}) - >>> isinstance(df, DataFrameLike) - True - - See Also - -------- - :class:`~anndata.experimental.backed.Dataset2D` - An xarray-based implementation of this protocol. - """ - - @property - def index(self) -> pd.Index: - """Row labels of the DataFrame-like object.""" - ... - - @property - def columns(self) -> pd.Index: - """Column labels of the DataFrame-like object.""" - ... - - @columns.setter - def _(self, v: Any) -> None: - """Setter for columns""" - ... - - @property - def shape(self) -> tuple[int, int]: - """Shape of the DataFrame-like object as (n_rows, n_columns).""" - ... - - @property - def iloc(self) -> DataFrameLikeIlocIndexer: - """Positional indexer for the DataFrame-like object.""" - ... - - def reindex( - self, - *, - index: pd.Index | None = None, - axis: Literal[0, 1] | None = 0, - fill_value: Any = ..., - **kwargs, - ) -> Self: - """Reindex the DataFrame-like object to match a new index. - - Parameters - ---------- - index - New index to conform to. - axis - Axis to reindex along (only 0 is supported). - fill_value - Value to use for missing values. - - Returns - ------- - Reindexed DataFrame-like object. - """ - ... - - @set_module("anndata.experimental") class Dataset2DIlocIndexer(Protocol): def __getitem__(self, idx: Any) -> Dataset2D: ... diff --git a/src/anndata/types.py b/src/anndata/types.py index ed3a293bd..49b75ac80 100644 --- a/src/anndata/types.py +++ b/src/anndata/types.py @@ -3,6 +3,10 @@ from typing import TYPE_CHECKING, Protocol, runtime_checkable if TYPE_CHECKING: + from typing import Any, Literal, Self + + from pandas import Index + from ._core.anndata import AnnData @@ -20,3 +24,106 @@ def __init__(self, adata: AnnData) -> None: """ Used to enforce the correct signature for extension namespaces. """ + + +@runtime_checkable +class DataFrameLikeIlocIndexer(Protocol): + """Protocol for iloc-style indexers on DataFrame-like objects. + + This protocol defines the minimal interface for positional-based indexing + that AnnData requires. Both :class:`pandas.DataFrame` and + :class:`~anndata.experimental.backed.Dataset2D` provide compatible + ``iloc`` accessors. + + Examples + -------- + >>> import pandas as pd + >>> from anndata._types import DataFrameLikeIlocIndexer + >>> df = pd.DataFrame({"a": [1, 2, 3]}) + >>> isinstance(df.iloc, DataFrameLikeIlocIndexer) + True + """ + + def __getitem__(self, idx: Any) -> Any: ... + + +@runtime_checkable +class DataFrameLike(Protocol): + """Protocol for DataFrame-like objects usable in AnnData. + + This runtime-checkable protocol defines the minimal DataFrame API that + AnnData uses internally for ``obs``, ``var``, and similar dataframe-like + data containers. Any class implementing this protocol can be used as a + drop-in replacement for :class:`pandas.DataFrame` in these contexts. + + The required interface includes: + + - :attr:`index`: Row labels as a :class:`pandas.Index` + - :attr:`columns`: Column labels as a :class:`pandas.Index` + - :attr:`shape`: Tuple of (n_rows, n_columns) + - :attr:`iloc`: Positional indexer returning a :class:`DataFrameLikeIlocIndexer` + - :meth:`reindex`: Method to reindex rows + + Examples + -------- + >>> import pandas as pd + >>> from anndata._types import DataFrameLike + >>> df = pd.DataFrame({"a": [1, 2, 3]}) + >>> isinstance(df, DataFrameLike) + True + + See Also + -------- + :class:`~anndata.experimental.backed.Dataset2D` + An xarray-based implementation of this protocol. + """ + + @property + def index(self) -> Index: + """Row labels of the DataFrame-like object.""" + ... + + @property + def columns(self) -> Index: + """Column labels of the DataFrame-like object.""" + ... + + @columns.setter + def columns(self, v: Any) -> None: + """Setter for columns""" + ... + + @property + def shape(self) -> tuple[int, int]: + """Shape of the DataFrame-like object as (n_rows, n_columns).""" + ... + + @property + def iloc(self) -> DataFrameLikeIlocIndexer: + """Positional indexer for the DataFrame-like object.""" + ... + + def reindex( + self, + *, + index: Index | None = None, + axis: Literal[0, 1] | None = 0, + fill_value: Any = ..., + **kwargs, + ) -> Self: + """Reindex the DataFrame-like object to match a new index. + + Parameters + ---------- + index + New index to conform to. + axis + Axis to reindex along (only 0 is supported). + fill_value + Value to use for missing values. + + Returns + ------- + Reindexed DataFrame-like object. + """ + ... From 86013ca2c2ff7bcffdce044f956a995c29c5f004 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:32:31 +0100 Subject: [PATCH 06/11] fix: AI test with this was a failure --- tests/test_dataframe_protocol.py | 407 ------------------------------- 1 file changed, 407 deletions(-) delete mode 100644 tests/test_dataframe_protocol.py diff --git a/tests/test_dataframe_protocol.py b/tests/test_dataframe_protocol.py deleted file mode 100644 index 62b9371b5..000000000 --- a/tests/test_dataframe_protocol.py +++ /dev/null @@ -1,407 +0,0 @@ -"""Tests for the DataFrameLike protocol.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd -import pytest - -from anndata._types import DataFrameLike, DataFrameLikeIlocIndexer - -if TYPE_CHECKING: - from typing import Any, Literal, Self - - -class MockIlocIndexer: - """Mock iloc indexer for testing.""" - - def __init__(self, data: pd.DataFrame): - self._data = data - - def __getitem__(self, idx: Any) -> MockDataFrame: - result = self._data.iloc[idx] - if isinstance(result, pd.DataFrame): - return MockDataFrame(result) - # For single row selection, wrap in DataFrame - return MockDataFrame(pd.DataFrame([result])) - - -class MockDataFrame: - """A minimal DataFrame-like class for testing the protocol.""" - - def __init__(self, data: pd.DataFrame): - self._data = data - - @property - def index(self) -> pd.Index: - return self._data.index - - @property - def columns(self) -> pd.Index: - return self._data.columns - - @property - def shape(self) -> tuple[int, int]: - return self._data.shape - - @property - def iloc(self) -> MockIlocIndexer: - return MockIlocIndexer(self._data) - - def reindex( - self, - index: pd.Index | None = None, - axis: Literal[0] = 0, - fill_value: Any = np.nan, - ) -> Self: - # axis=0 is the default; don't pass it when index is specified - # since pandas doesn't allow both keyword arguments together - return MockDataFrame(self._data.reindex(index=index, fill_value=fill_value)) - - -class TestDataFrameLikeProtocol: - """Test the DataFrameLike protocol with different implementations.""" - - @pytest.fixture - def sample_df(self) -> pd.DataFrame: - """Create a sample pandas DataFrame for testing.""" - return pd.DataFrame( - {"a": [1, 2, 3], "b": [4.0, 5.0, 6.0], "c": ["x", "y", "z"]}, - index=["row1", "row2", "row3"], - ) - - def test_pandas_dataframe_is_dataframe_like(self, sample_df: pd.DataFrame): - """pd.DataFrame should satisfy the DataFrameLike protocol.""" - assert isinstance(sample_df, DataFrameLike) - - def test_pandas_iloc_is_iloc_indexer(self, sample_df: pd.DataFrame): - """pd.DataFrame.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" - assert isinstance(sample_df.iloc, DataFrameLikeIlocIndexer) - - def test_mock_dataframe_is_dataframe_like(self, sample_df: pd.DataFrame): - """MockDataFrame should satisfy the DataFrameLike protocol.""" - mock_df = MockDataFrame(sample_df) - assert isinstance(mock_df, DataFrameLike) - - def test_mock_iloc_is_iloc_indexer(self, sample_df: pd.DataFrame): - """MockDataFrame.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" - mock_df = MockDataFrame(sample_df) - assert isinstance(mock_df.iloc, DataFrameLikeIlocIndexer) - - def test_dataframe_like_has_required_properties(self, sample_df: pd.DataFrame): - """Verify DataFrameLike objects have the required properties.""" - for df in [sample_df, MockDataFrame(sample_df)]: - assert hasattr(df, "index") - assert hasattr(df, "columns") - assert hasattr(df, "shape") - assert hasattr(df, "iloc") - assert hasattr(df, "reindex") - - def test_dataframe_like_index(self, sample_df: pd.DataFrame): - """Verify index property returns a pd.Index.""" - mock_df = MockDataFrame(sample_df) - assert isinstance(mock_df.index, pd.Index) - pd.testing.assert_index_equal(mock_df.index, sample_df.index) - - def test_dataframe_like_columns(self, sample_df: pd.DataFrame): - """Verify columns property returns a pd.Index.""" - mock_df = MockDataFrame(sample_df) - assert isinstance(mock_df.columns, pd.Index) - pd.testing.assert_index_equal(mock_df.columns, sample_df.columns) - - def test_dataframe_like_shape(self, sample_df: pd.DataFrame): - """Verify shape property returns correct tuple.""" - mock_df = MockDataFrame(sample_df) - assert mock_df.shape == sample_df.shape - assert mock_df.shape == (3, 3) - - def test_dataframe_like_iloc(self, sample_df: pd.DataFrame): - """Verify iloc indexer works correctly.""" - mock_df = MockDataFrame(sample_df) - - # Test single row selection - result = mock_df.iloc[0] - assert isinstance(result, DataFrameLike) - - # Test slice selection - result = mock_df.iloc[0:2] - assert isinstance(result, DataFrameLike) - assert result.shape[0] == 2 - - def test_dataframe_like_reindex(self, sample_df: pd.DataFrame): - """Verify reindex method works correctly.""" - mock_df = MockDataFrame(sample_df) - new_index = pd.Index(["row1", "row2", "row4"]) - - result = mock_df.reindex(index=new_index, fill_value=-1) - assert isinstance(result, DataFrameLike) - pd.testing.assert_index_equal(result.index, new_index) - - def test_non_dataframe_is_not_dataframe_like(self): - """Objects that don't implement the protocol should not match.""" - assert not isinstance([], DataFrameLike) - assert not isinstance({}, DataFrameLike) - assert not isinstance("string", DataFrameLike) - assert not isinstance(42, DataFrameLike) - assert not isinstance(np.array([1, 2, 3]), DataFrameLike) - - -@pytest.mark.usefixtures("xr_available") -class TestDataset2DIsDataFrameLike: - """Test that Dataset2D satisfies the DataFrameLike protocol.""" - - @pytest.fixture - def xr_available(self): - """Skip tests if xarray is not available.""" - pytest.importorskip("xarray") - - @pytest.fixture - def sample_dataset2d(self): - """Create a sample Dataset2D for testing.""" - from anndata._core.xarray import Dataset2D - from anndata.compat import XDataset - - ds = XDataset( - { - "a": (["idx"], [1, 2, 3]), - "b": (["idx"], [4.0, 5.0, 6.0]), - }, - coords={"idx": ["row1", "row2", "row3"]}, - ) - return Dataset2D(ds) - - def test_dataset2d_is_dataframe_like(self, sample_dataset2d): - """Dataset2D should satisfy the DataFrameLike protocol.""" - assert isinstance(sample_dataset2d, DataFrameLike) - - def test_dataset2d_iloc_is_iloc_indexer(self, sample_dataset2d): - """Dataset2D.iloc should satisfy the DataFrameLikeIlocIndexer protocol.""" - assert isinstance(sample_dataset2d.iloc, DataFrameLikeIlocIndexer) - - def test_dataset2d_has_required_properties(self, sample_dataset2d): - """Verify Dataset2D has the required properties.""" - assert hasattr(sample_dataset2d, "index") - assert hasattr(sample_dataset2d, "columns") - assert hasattr(sample_dataset2d, "shape") - assert hasattr(sample_dataset2d, "iloc") - assert hasattr(sample_dataset2d, "reindex") - - def test_dataset2d_properties_return_correct_types(self, sample_dataset2d): - """Verify Dataset2D properties return correct types.""" - assert isinstance(sample_dataset2d.index, pd.Index) - assert isinstance(sample_dataset2d.columns, pd.Index) - assert isinstance(sample_dataset2d.shape, tuple) - assert len(sample_dataset2d.shape) == 2 - - -class TestDataFrameLikeWithAnnData: - """Test that DataFrameLike protocol works correctly with AnnData objects.""" - - @pytest.fixture - def simple_adata(self): - """Create a simple AnnData object for testing.""" - import anndata as ad - - return ad.AnnData( - X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - obs=pd.DataFrame( - {"cell_type": ["A", "B", "C"]}, - index=["cell1", "cell2", "cell3"], - ), - var=pd.DataFrame( - {"gene_name": ["g1", "g2", "g3"]}, - index=["gene1", "gene2", "gene3"], - ), - ) - - def test_adata_obs_is_dataframe_like(self, simple_adata): - """AnnData.obs should satisfy the DataFrameLike protocol.""" - assert isinstance(simple_adata.obs, DataFrameLike) - - def test_adata_var_is_dataframe_like(self, simple_adata): - """AnnData.var should satisfy the DataFrameLike protocol.""" - assert isinstance(simple_adata.var, DataFrameLike) - - def test_adata_obs_has_required_properties(self, simple_adata): - """Verify AnnData.obs has all required DataFrameLike properties.""" - obs = simple_adata.obs - assert hasattr(obs, "index") - assert hasattr(obs, "columns") - assert hasattr(obs, "shape") - assert hasattr(obs, "iloc") - assert hasattr(obs, "reindex") - - def test_adata_obs_iloc_subsetting(self, simple_adata): - """Verify iloc subsetting works on AnnData.obs.""" - obs = simple_adata.obs - subset = obs.iloc[0:2] - assert isinstance(subset, DataFrameLike) - assert subset.shape[0] == 2 - - def test_adata_subset_preserves_dataframe_like(self, simple_adata): - """Verify subsetting AnnData preserves DataFrameLike for obs/var.""" - adata_subset = simple_adata[0:2, 0:2] - assert isinstance(adata_subset.obs, DataFrameLike) - assert isinstance(adata_subset.var, DataFrameLike) - assert adata_subset.obs.shape[0] == 2 - assert adata_subset.var.shape[0] == 2 - - def test_adata_copy_preserves_dataframe_like(self, simple_adata): - """Verify copying AnnData preserves DataFrameLike for obs/var.""" - adata_copy = simple_adata.copy() - assert isinstance(adata_copy.obs, DataFrameLike) - assert isinstance(adata_copy.var, DataFrameLike) - - def test_set_obs_with_dataframe(self, simple_adata): - """Verify setting obs with pd.DataFrame works.""" - new_obs = pd.DataFrame( - {"new_col": [1, 2, 3]}, - index=["cell1", "cell2", "cell3"], - ) - simple_adata.obs = new_obs - assert isinstance(simple_adata.obs, DataFrameLike) - assert "new_col" in simple_adata.obs.columns - - def test_set_var_with_dataframe(self, simple_adata): - """Verify setting var with pd.DataFrame works.""" - new_var = pd.DataFrame( - {"new_col": [1, 2, 3]}, - index=["gene1", "gene2", "gene3"], - ) - simple_adata.var = new_var - assert isinstance(simple_adata.var, DataFrameLike) - assert "new_col" in simple_adata.var.columns - - -class TestCustomDataFrameLikeWithAnnData: - """Test that custom DataFrameLike implementations work with AnnData.""" - - def test_init_adata_with_custom_dataframe_like_obs(self): - """Verify AnnData can be initialized with a custom DataFrameLike obs.""" - import anndata as ad - - # Create a custom DataFrameLike object - obs_df = pd.DataFrame( - {"cell_type": ["A", "B", "C"]}, - index=["cell1", "cell2", "cell3"], - ) - mock_obs = MockDataFrame(obs_df) - - # Verify MockDataFrame satisfies the protocol - assert isinstance(mock_obs, DataFrameLike) - - # Create AnnData with custom DataFrameLike - adata = ad.AnnData( - X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - obs=mock_obs, - var=pd.DataFrame( - {"gene_name": ["g1", "g2", "g3"]}, - index=["gene1", "gene2", "gene3"], - ), - ) - - # Verify obs is the MockDataFrame (unchanged) - assert isinstance(adata.obs, DataFrameLike) - assert adata.obs.shape[0] == 3 - - def test_init_adata_with_custom_dataframe_like_var(self): - """Verify AnnData can be initialized with a custom DataFrameLike var.""" - import anndata as ad - - # Create a custom DataFrameLike object - var_df = pd.DataFrame( - {"gene_name": ["g1", "g2", "g3"]}, - index=["gene1", "gene2", "gene3"], - ) - mock_var = MockDataFrame(var_df) - - # Verify MockDataFrame satisfies the protocol - assert isinstance(mock_var, DataFrameLike) - - # Create AnnData with custom DataFrameLike var - adata = ad.AnnData( - X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - obs=pd.DataFrame( - {"cell_type": ["A", "B", "C"]}, - index=["cell1", "cell2", "cell3"], - ), - var=mock_var, - ) - - # Verify var is the MockDataFrame (unchanged) - assert isinstance(adata.var, DataFrameLike) - assert adata.var.shape[0] == 3 - - def test_custom_dataframe_like_length_validation(self): - """Verify length validation works for custom DataFrameLike.""" - import anndata as ad - - # Create a custom DataFrameLike with wrong length - obs_df = pd.DataFrame( - {"cell_type": ["A", "B"]}, # Only 2 rows, but X has 3 - index=["cell1", "cell2"], - ) - mock_obs = MockDataFrame(obs_df) - - # Should raise ValueError due to length mismatch - with pytest.raises(ValueError, match="must have as many rows"): - ad.AnnData( - X=np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), - obs=mock_obs, - var=pd.DataFrame( - {"gene_name": ["g1", "g2", "g3"]}, - index=["gene1", "gene2", "gene3"], - ), - ) - - -@pytest.mark.usefixtures("xr_available") -class TestDataset2DWithAnnData: - """Test that Dataset2D works correctly as obs/var in AnnData.""" - - @pytest.fixture - def xr_available(self): - """Skip tests if xarray is not available.""" - pytest.importorskip("xarray") - - @pytest.fixture - def adata_with_dataset2d_obs(self): - """Create an AnnData with Dataset2D obs.""" - import anndata as ad - from anndata._core.xarray import Dataset2D - from anndata.compat import XDataset - - X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - - # Create Dataset2D for obs - obs_ds = XDataset( - {"cell_type": (["idx"], ["A", "B", "C"])}, - coords={"idx": ["cell1", "cell2", "cell3"]}, - ) - obs = Dataset2D(obs_ds) - - var = pd.DataFrame( - {"gene_name": ["g1", "g2", "g3"]}, - index=["gene1", "gene2", "gene3"], - ) - - return ad.AnnData(X=X, obs=obs, var=var) - - def test_adata_with_dataset2d_obs_is_dataframe_like(self, adata_with_dataset2d_obs): - """AnnData with Dataset2D obs should satisfy DataFrameLike.""" - assert isinstance(adata_with_dataset2d_obs.obs, DataFrameLike) - - def test_adata_with_dataset2d_subset(self, adata_with_dataset2d_obs): - """Subsetting AnnData with Dataset2D obs should work.""" - adata_subset = adata_with_dataset2d_obs[0:2] - assert isinstance(adata_subset.obs, DataFrameLike) - assert adata_subset.obs.shape[0] == 2 - - def test_adata_with_dataset2d_obs_index(self, adata_with_dataset2d_obs): - """Dataset2D obs should have correct index.""" - obs = adata_with_dataset2d_obs.obs - pd.testing.assert_index_equal( - obs.index, pd.Index(["cell1", "cell2", "cell3"]), check_names=False - ) From e3491b7c1f8f64a26043496a093c575515e789f5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:43:46 +0100 Subject: [PATCH 07/11] fix: more ai BS --- src/anndata/types.py | 34 +--------------------------------- 1 file changed, 1 insertion(+), 33 deletions(-) diff --git a/src/anndata/types.py b/src/anndata/types.py index 49b75ac80..dd678cc5d 100644 --- a/src/anndata/types.py +++ b/src/anndata/types.py @@ -30,18 +30,7 @@ def __init__(self, adata: AnnData) -> None: class DataFrameLikeIlocIndexer(Protocol): """Protocol for iloc-style indexers on DataFrame-like objects. - This protocol defines the minimal interface for positional-based indexing - that AnnData requires. Both :class:`pandas.DataFrame` and - :class:`~anndata.experimental.backed.Dataset2D` provide compatible - ``iloc`` accessors. - - Examples - -------- - >>> import pandas as pd - >>> from anndata._types import DataFrameLikeIlocIndexer - >>> df = pd.DataFrame({"a": [1, 2, 3]}) - >>> isinstance(df.iloc, DataFrameLikeIlocIndexer) - True + Only requires `__getitem__`. """ def __getitem__(self, idx: Any) -> Any: ... @@ -51,27 +40,6 @@ def __getitem__(self, idx: Any) -> Any: ... class DataFrameLike(Protocol): """Protocol for DataFrame-like objects usable in AnnData. - This runtime-checkable protocol defines the minimal DataFrame API that - AnnData uses internally for ``obs``, ``var``, and similar dataframe-like - data containers. Any class implementing this protocol can be used as a - drop-in replacement for :class:`pandas.DataFrame` in these contexts. - - The required interface includes: - - - :attr:`index`: Row labels as a :class:`pandas.Index` - - :attr:`columns`: Column labels as a :class:`pandas.Index` - - :attr:`shape`: Tuple of (n_rows, n_columns) - - :attr:`iloc`: Positional indexer returning a :class:`DataFrameLikeIlocIndexer` - - :meth:`reindex`: Method to reindex rows - - Examples - -------- - >>> import pandas as pd - >>> from anndata._types import DataFrameLike - >>> df = pd.DataFrame({"a": [1, 2, 3]}) - >>> isinstance(df, DataFrameLike) - True - See Also -------- :class:`~anndata.experimental.backed.Dataset2D` From cb21896323dd85aa2448e99572f5b8ce03272129 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 17:54:00 +0100 Subject: [PATCH 08/11] fix: make `Dataset2D` public --- src/anndata/_core/aligned_df.py | 14 +------------- src/anndata/_core/aligned_mapping.py | 5 +---- src/anndata/_core/storage.py | 3 --- src/anndata/_core/xarray.py | 4 +--- src/anndata/experimental/__init__.py | 3 +++ 5 files changed, 6 insertions(+), 23 deletions(-) diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index b35706122..4d7789eaa 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -8,7 +8,7 @@ from pandas.api.types import is_string_dtype from .._warnings import ImplicitModificationWarning -from ..compat import XDataset, pandas_as_str +from ..compat import pandas_as_str from ..types import DataFrameLike from ..utils import warn from .xarray import Dataset2D @@ -148,15 +148,3 @@ def _gen_dataframe_xr( length: int | None = None, ): return anno - - -@_gen_dataframe.register(XDataset) -def _gen_dataframe_xdataset( - anno: XDataset, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return Dataset2D(anno) diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 70621b433..bfbc2a05b 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -10,7 +10,7 @@ import pandas as pd from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning -from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset +from ..compat import AwkArray, CSArray, CSMatrix, CupyArray from ..types import DataFrameLike from ..utils import ( axis_len, @@ -25,7 +25,6 @@ from .index import _subset from .storage import coerce_array from .views import as_view, view_update -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Callable, Iterable, Iterator, Mapping @@ -75,8 +74,6 @@ def _validate_value(self, val: Value, key: str) -> Value: warn_once(msg, ExperimentalFeatureWarning) elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1: val = val.reshape((val.shape[0], 1)) - elif isinstance(val, XDataset): - val = Dataset2D(val) for i, axis in enumerate(self.axes): if self.parent.shape[axis] == axis_len(val, i): continue diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 05942fdea..4a5503d3f 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -9,7 +9,6 @@ from anndata.compat import CSArray, CSMatrix from .._warnings import ImplicitModificationWarning -from ..compat import XDataset from ..types import DataFrameLike from ..utils import ( ensure_df_homogeneous, @@ -39,8 +38,6 @@ def coerce_array( return value # If value is one of the allowed types, return it array_data_structure_types = get_union_members(_ArrayDataStructureTypes) - if isinstance(value, XDataset): - value = Dataset2D(value) if isinstance(value, (*array_data_structure_types, Dataset2D)): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 0e75d604a..67460bf11 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -40,9 +40,7 @@ class Dataset2D(Mapping[Hashable, XDataArray | Self]): are respected, namely that there is only one 1d dim and coord with the same name i.e., like a :class:`pandas.DataFrame`. - You should not have to initiate this class yourself. Setting an :class:`xarray.Dataset` - into a relevant part of the :class:`~anndata.AnnData` object will attempt to wrap that - object in this object, trying to enforce the "dataframe-invariants." + You will need to wrap :class:`xarray.Dataset` inside this class if you wish to set :attr:`~anndata.AnnData.obs` or :attr:`~anndata.AnnData.var` with that. Because xarray requires :attr:`xarray.Dataset.coords` to be in-memory, this class provides handling for an out-of-memory index via :attr:`~anndata.experimental.backed.Dataset2D.true_index`. diff --git a/src/anndata/experimental/__init__.py b/src/anndata/experimental/__init__.py index 1271ac9b5..bb0480af8 100644 --- a/src/anndata/experimental/__init__.py +++ b/src/anndata/experimental/__init__.py @@ -12,6 +12,7 @@ Write, WriteCallback, ) +from ..types import DataFrameLike, DataFrameLikeIlocIndexer from ..utils import module_get_attr_redirect from ._dispatch_io import read_dispatched, write_dispatched from .backed import read_lazy @@ -53,6 +54,8 @@ def __getattr__(attr_name: str) -> Any: __all__ = [ "AnnCollection", "AnnLoader", + "DataFrameLike", + "DataFrameLikeIlocIndexer", "Dataset2DIlocIndexer", "IOSpec", "Read", From 44a1fe3b1b3baf40e4a37dfe47942a3bfa0eda08 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 18:12:47 +0100 Subject: [PATCH 09/11] fix: remove internal uses of `Dataset2D` + `pd.DataFrame` --- src/anndata/_core/aligned_df.py | 23 ++--------------------- src/anndata/_core/storage.py | 3 +-- src/anndata/_core/views.py | 6 +++--- src/anndata/_core/xarray.py | 18 ++++++++++++------ src/anndata/tests/helpers.py | 12 ++++++------ src/anndata/types.py | 4 ++++ 6 files changed, 28 insertions(+), 38 deletions(-) diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 4d7789eaa..dd6e82bbd 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -11,7 +11,6 @@ from ..compat import pandas_as_str from ..types import DataFrameLike from ..utils import warn -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Iterable @@ -27,12 +26,6 @@ def _gen_dataframe( attr: Literal["obs", "var"], length: int | None = None, ) -> DataFrameLike: # pragma: no cover - # Check if anno satisfies the DataFrameLike protocol - # This allows any DataFrameLike-compliant object to be used as obs/var - if isinstance(anno, DataFrameLike): - if length is not None and anno.shape[0] != length: - raise _mk_df_error(source, attr, length, anno.shape[0]) - return anno msg = f"Cannot convert {type(anno)} to {attr} DataFrame" raise ValueError(msg) @@ -76,9 +69,9 @@ def mk_index(l: int) -> pd.Index: return df -@_gen_dataframe.register(pd.DataFrame) +@_gen_dataframe.register(DataFrameLike) def _gen_dataframe_df( - anno: pd.DataFrame, + anno: DataFrameLike, index_names: Iterable[str], *, source: Literal["X", "shape"], @@ -136,15 +129,3 @@ def _mk_df_error( f"({actual} {what}s instead of {expected})" ) return ValueError(msg) - - -@_gen_dataframe.register(Dataset2D) -def _gen_dataframe_xr( - anno: Dataset2D, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return anno diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 4a5503d3f..9e793a4ba 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -17,7 +17,6 @@ raise_value_error_if_multiindex_columns, warn, ) -from .xarray import Dataset2D if TYPE_CHECKING: from typing import Any @@ -38,7 +37,7 @@ def coerce_array( return value # If value is one of the allowed types, return it array_data_structure_types = get_union_members(_ArrayDataStructureTypes) - if isinstance(value, (*array_data_structure_types, Dataset2D)): + if isinstance(value, (*array_data_structure_types, DataFrameLike)): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warn(msg, ImplicitModificationWarning) diff --git a/src/anndata/_core/views.py b/src/anndata/_core/views.py index 95054139f..58e0e06a6 100644 --- a/src/anndata/_core/views.py +++ b/src/anndata/_core/views.py @@ -11,6 +11,7 @@ from scipy import sparse from anndata._warnings import ImplicitModificationWarning +from anndata.types import DataFrameLike from .._settings import settings from ..compat import ( @@ -23,7 +24,6 @@ ) from ..utils import warn from .access import ElementRef -from .xarray import Dataset2D if TYPE_CHECKING: from collections.abc import Callable, Iterable, KeysView, Sequence @@ -366,8 +366,8 @@ def as_view_cupy_csc(mtx, view_args): return CupySparseCSCView(mtx, view_args=view_args) -@as_view.register(Dataset2D) -def _(a: Dataset2D, view_args): +@as_view.register(DataFrameLike) +def _(a: DataFrameLike, view_args): return a diff --git a/src/anndata/_core/xarray.py b/src/anndata/_core/xarray.py index 67460bf11..0d74e1d05 100644 --- a/src/anndata/_core/xarray.py +++ b/src/anndata/_core/xarray.py @@ -1,10 +1,9 @@ from __future__ import annotations import warnings -from collections.abc import Hashable, Mapping from dataclasses import dataclass from functools import wraps -from typing import TYPE_CHECKING, Self, overload +from typing import TYPE_CHECKING, overload import numpy as np import pandas as pd @@ -14,7 +13,14 @@ from ..compat import XDataArray, XDataset, XVariable, pandas_as_str if TYPE_CHECKING: - from collections.abc import Callable, Collection, Iterable, Iterator + from collections.abc import ( + Callable, + Collection, + Hashable, + Iterable, + Iterator, + Mapping, + ) from typing import Any, Literal from .._types import Dataset2DIlocIndexer @@ -33,7 +39,7 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: return wrapper -class Dataset2D(Mapping[Hashable, XDataArray | Self]): +class Dataset2D: r""" A wrapper class meant to enable working with lazy dataframe data according to :class:`~anndata.AnnData`'s internal API. This class ensures that "dataframe-invariants" @@ -189,7 +195,7 @@ def shape(self) -> tuple[int, int]: ------- The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`. """ - return (self.ds.sizes[self.index_dim], len(self.ds)) + return (len(self), len(self.ds)) @property def iloc(self) -> Dataset2DIlocIndexer: @@ -361,7 +367,7 @@ def __iter__(self) -> Iterator[Hashable]: return iter(self.ds) def __len__(self) -> int: - return len(self.ds) + return self.ds.sizes[self.index_dim] @property def dtypes(self) -> Mapping[Hashable, np.dtype]: diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index 3ff344a16..8079d567f 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -337,9 +337,9 @@ def gen_adata( # noqa: PLR0913 if has_xr := find_spec("xarray"): if obs_xdataset: - obs = XDataset.from_dataframe(obs) + obs = Dataset2D(XDataset.from_dataframe(obs)) if var_xdataset: - var = XDataset.from_dataframe(var) + var = Dataset2D(XDataset.from_dataframe(var)) if X_type is None: X = None @@ -361,11 +361,11 @@ def gen_adata( # noqa: PLR0913 da=da.random.random((N, 50)), ) if has_xr: - obsm["xdataset"] = XDataset.from_dataframe( - gen_typed_df(M, obs_names, dtypes=obs_dtypes) + obsm["xdataset"] = Dataset2D( + XDataset.from_dataframe(gen_typed_df(M, obs_names, dtypes=obs_dtypes)) ) - varm["xdataset"] = XDataset.from_dataframe( - gen_typed_df(N, var_names, dtypes=var_dtypes) + varm["xdataset"] = Dataset2D( + XDataset.from_dataframe(gen_typed_df(N, var_names, dtypes=var_dtypes)) ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} obsm = maybe_add_sparse_array( diff --git a/src/anndata/types.py b/src/anndata/types.py index dd678cc5d..a2f6f4953 100644 --- a/src/anndata/types.py +++ b/src/anndata/types.py @@ -46,6 +46,10 @@ class DataFrameLike(Protocol): An xarray-based implementation of this protocol. """ + def __len__(self) -> int: + """Number of rows in this object""" + ... + @property def index(self) -> Index: """Row labels of the DataFrame-like object.""" From a69d374aceeb30a580439e4777ee698681b07f54 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 18:45:38 +0100 Subject: [PATCH 10/11] fix: more ai bs --- src/anndata/_core/storage.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 9e793a4ba..ac87a6e91 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -3,13 +3,12 @@ from typing import TYPE_CHECKING import numpy as np -import pandas as pd from scipy import sparse from anndata.compat import CSArray, CSMatrix +from anndata.types import DataFrameLike from .._warnings import ImplicitModificationWarning -from ..types import DataFrameLike from ..utils import ( ensure_df_homogeneous, get_union_members, @@ -37,7 +36,7 @@ def coerce_array( return value # If value is one of the allowed types, return it array_data_structure_types = get_union_members(_ArrayDataStructureTypes) - if isinstance(value, (*array_data_structure_types, DataFrameLike)): + if isinstance(value, array_data_structure_types): if isinstance(value, np.matrix): msg = f"{name} should not be a np.matrix, use np.ndarray instead." warn(msg, ImplicitModificationWarning) @@ -53,19 +52,10 @@ def coerce_array( if any(is_non_csc_r_array_or_matrix): msg = f"Only CSR and CSC {'matrices' if isinstance(value, sparse.spmatrix) else 'arrays'} are supported." raise ValueError(msg) - if isinstance(value, pd.DataFrame): + if isinstance(value, DataFrameLike): if allow_df: raise_value_error_if_multiindex_columns(value, name) return value if allow_df else ensure_df_homogeneous(value, name) - # Handle other DataFrameLike objects (not pd.DataFrame) - if isinstance(value, DataFrameLike): - if allow_df: - return value - # For non-DataFrames, we can't use ensure_df_homogeneous - # so we convert to array via iloc - msg = f"DataFrameLike object used for {name} will be converted to array." - warn(msg, ImplicitModificationWarning) - return np.array(value.iloc[:, :]) # if value is an array-like object, try to convert it e = None if allow_array_like: From fc13bf6fa3a49f878eef1ae73effd695c201a5fe Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 4 Feb 2026 18:58:47 +0100 Subject: [PATCH 11/11] fix: test --- tests/test_concatenate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_concatenate.py b/tests/test_concatenate.py index 62ac5f90f..8110020c6 100644 --- a/tests/test_concatenate.py +++ b/tests/test_concatenate.py @@ -269,8 +269,8 @@ def test_concatenate_roundtrip( if backwards_compat and use_xdataset: import xarray as xr - result.var = xr.Dataset.from_dataframe( - result.var + result.var = Dataset2D( + xr.Dataset.from_dataframe(result.var) ) # backwards compat always returns a dataframe # Correcting for known differences