Skip to content
34 changes: 5 additions & 29 deletions src/anndata/_core/aligned_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from pandas.api.types import is_string_dtype

from .._warnings import ImplicitModificationWarning
from ..compat import XDataset, pandas_as_str
from ..compat import pandas_as_str
from ..types import DataFrameLike
from ..utils import warn
from .xarray import Dataset2D

if TYPE_CHECKING:
from collections.abc import Iterable
Expand All @@ -25,7 +25,7 @@ def _gen_dataframe(
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
) -> pd.DataFrame: # pragma: no cover
) -> DataFrameLike: # pragma: no cover
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)

Expand Down Expand Up @@ -69,9 +69,9 @@ def mk_index(l: int) -> pd.Index:
return df


@_gen_dataframe.register(pd.DataFrame)
@_gen_dataframe.register(DataFrameLike)
def _gen_dataframe_df(
anno: pd.DataFrame,
anno: DataFrameLike,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
Expand Down Expand Up @@ -129,27 +129,3 @@ def _mk_df_error(
f"({actual} {what}s instead of {expected})"
)
return ValueError(msg)


@_gen_dataframe.register(Dataset2D)
def _gen_dataframe_xr(
anno: Dataset2D,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
return anno


@_gen_dataframe.register(XDataset)
def _gen_dataframe_xdataset(
anno: XDataset,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
return Dataset2D(anno)
10 changes: 4 additions & 6 deletions src/anndata/_core/aligned_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
import pandas as pd

from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning
from ..compat import AwkArray, CSArray, CSMatrix, CupyArray, XDataset
from ..compat import AwkArray, CSArray, CSMatrix, CupyArray
from ..types import DataFrameLike
from ..utils import (
axis_len,
convert_to_dict,
Expand All @@ -24,7 +25,6 @@
from .index import _subset
from .storage import coerce_array
from .views import as_view, view_update
from .xarray import Dataset2D

if TYPE_CHECKING:
from collections.abc import Callable, Iterable, Iterator, Mapping
Expand All @@ -36,8 +36,8 @@

OneDIdx = Sequence[int] | Sequence[bool] | slice
TwoDIdx = tuple[OneDIdx, OneDIdx]
# TODO: pd.DataFrame only allowed in AxisArrays?
Value = pd.DataFrame | CSMatrix | CSArray | np.ndarray
# DataFrameLike encompasses pd.DataFrame and Dataset2D
Value = DataFrameLike | CSMatrix | CSArray | np.ndarray


class AlignedMappingBase[I: OneDIdx](MutableMapping[str, Value], ABC):
Expand Down Expand Up @@ -74,8 +74,6 @@ def _validate_value(self, val: Value, key: str) -> Value:
warn_once(msg, ExperimentalFeatureWarning)
elif isinstance(val, np.ndarray | CupyArray) and len(val.shape) == 1:
val = val.reshape((val.shape[0], 1))
elif isinstance(val, XDataset):
val = Dataset2D(val)
for i, axis in enumerate(self.axes):
if self.parent.shape[axis] == axis_len(val, i):
continue
Expand Down
12 changes: 6 additions & 6 deletions src/anndata/_core/anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@

from zarr.storage import StoreLike

from ..compat import XDataset
from ..types import DataFrameLike
from ..typing import Index1D, _Index1DNorm, _XDataType
from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView
from .index import Index
Expand Down Expand Up @@ -772,7 +772,7 @@ def n_vars(self) -> int:
"""Number of variables/features."""
return len(self.var_names)

def _set_dim_df(self, value: pd.DataFrame | XDataset, attr: Literal["obs", "var"]):
def _set_dim_df(self, value: DataFrameLike, attr: Literal["obs", "var"]):
value = _gen_dataframe(
value,
[f"{attr}_names", f"{'row' if attr == 'obs' else 'col'}_names"],
Expand Down Expand Up @@ -836,12 +836,12 @@ def _set_dim_index(self, value: pd.Index, attr: str):
v.index = value

@property
def obs(self) -> pd.DataFrame | Dataset2D:
def obs(self) -> DataFrameLike:
"""One-dimensional annotation of observations (`pd.DataFrame`)."""
return self._obs

@obs.setter
def obs(self, value: pd.DataFrame | XDataset):
def obs(self, value: DataFrameLike):
self._set_dim_df(value, "obs")

@obs.deleter
Expand All @@ -859,12 +859,12 @@ def obs_names(self, names: Sequence[str]):
self._set_dim_index(names, "obs")

@property
def var(self) -> pd.DataFrame | Dataset2D:
def var(self) -> DataFrameLike:
"""One-dimensional annotation of variables/ features (`pd.DataFrame`)."""
return self._var

@var.setter
def var(self, value: pd.DataFrame | XDataset):
def var(self, value: DataFrameLike):
self._set_dim_df(value, "var")

@var.deleter
Expand Down
22 changes: 9 additions & 13 deletions src/anndata/_core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
from scipy.sparse import issparse

from ..compat import AwkArray, CSArray, CSMatrix, DaskArray, XDataArray
from .xarray import Dataset2D

if TYPE_CHECKING:
from numpy.typing import NDArray

from ..types import DataFrameLike
from ..typing import Index, Index1D, _Index1DNorm


Expand All @@ -36,7 +36,7 @@ def _normalize_index( # noqa: PLR0911, PLR0912
) -> _Index1DNorm | int | np.integer:
# TODO: why is this here? All tests pass without it and it seems at the minimum not strict enough.
if not isinstance(index, pd.RangeIndex) and index.dtype in (np.float64, np.int64):
msg = f"Dont call _normalize_index with non-categorical/string names and non-range index {index}"
msg = f"Don't call _normalize_index with non-categorical/string names and non-range index {index}"
raise TypeError(msg)

if isinstance(indexer, pd.Index | pd.Series):
Expand Down Expand Up @@ -92,7 +92,7 @@ def name_idx(i):
elif issubclass(indexer.dtype.type, np.bool_):
if indexer.shape != index.shape:
msg = (
f"Boolean index does not match AnnDatas shape along this "
f"Boolean index does not match AnnData's shape along this "
f"dimension. Boolean index has shape {indexer.shape} while "
f"AnnData index has shape {index.shape}."
)
Expand Down Expand Up @@ -171,9 +171,14 @@ def unpack_index(index: Index) -> tuple[Index1D, Index1D]:

@singledispatch
def _subset(
a: np.ndarray | pd.DataFrame,
a: np.ndarray | DataFrameLike,
subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm],
):
# Check for DataFrameLike objects (pd.DataFrame, Dataset2D, etc.)
from ..types import DataFrameLike

if isinstance(a, DataFrameLike):
return a.iloc[subset_idx]
# Select as combination of indexes, not coordinates
# Correcting for indexing behaviour of np.ndarray
if all(isinstance(x, Iterable) for x in subset_idx):
Expand Down Expand Up @@ -207,15 +212,6 @@ def _subset_sparse(
return a[subset_idx]


@_subset.register(pd.DataFrame)
@_subset.register(Dataset2D)
def _subset_df(
df: pd.DataFrame | Dataset2D,
subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm],
):
return df.iloc[subset_idx]


@_subset.register(AwkArray)
def _subset_awkarray(
a: AwkArray, subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm]
Expand Down
5 changes: 3 additions & 2 deletions src/anndata/_core/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
CupySparseMatrix,
DaskArray,
)
from ..types import DataFrameLike
from ..utils import asarray, axis_len, warn, warn_once
from .anndata import AnnData
from .index import _subset, make_slice
Expand Down Expand Up @@ -557,7 +558,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911
"""
if self.no_change and (axis_len(el, axis) == len(self.old_idx)):
return el
if isinstance(el, pd.DataFrame | Dataset2D):
if isinstance(el, DataFrameLike):
return self._apply_to_df_like(el, axis=axis, fill_value=fill_value)
elif isinstance(el, CSMatrix | CSArray | CupySparseMatrix):
return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
Expand All @@ -570,7 +571,7 @@ def apply(self, el, *, axis, fill_value=None): # noqa: PLR0911
else:
return self._apply_to_array(el, axis=axis, fill_value=fill_value)

def _apply_to_df_like(self, el: pd.DataFrame | Dataset2D, *, axis, fill_value=None):
def _apply_to_df_like(self, el: DataFrameLike, *, axis, fill_value=None):
if fill_value is None:
fill_value = np.nan
return el.reindex(self.new_idx, axis=axis, fill_value=fill_value)
Expand Down
10 changes: 3 additions & 7 deletions src/anndata/_core/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,19 @@
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from scipy import sparse

from anndata.compat import CSArray, CSMatrix
from anndata.types import DataFrameLike

from .._warnings import ImplicitModificationWarning
from ..compat import XDataset
from ..utils import (
ensure_df_homogeneous,
get_union_members,
join_english,
raise_value_error_if_multiindex_columns,
warn,
)
from .xarray import Dataset2D

if TYPE_CHECKING:
from typing import Any
Expand All @@ -38,9 +36,7 @@ def coerce_array(
return value
# If value is one of the allowed types, return it
array_data_structure_types = get_union_members(_ArrayDataStructureTypes)
if isinstance(value, XDataset):
value = Dataset2D(value)
if isinstance(value, (*array_data_structure_types, Dataset2D)):
if isinstance(value, array_data_structure_types):
if isinstance(value, np.matrix):
msg = f"{name} should not be a np.matrix, use np.ndarray instead."
warn(msg, ImplicitModificationWarning)
Expand All @@ -56,7 +52,7 @@ def coerce_array(
if any(is_non_csc_r_array_or_matrix):
msg = f"Only CSR and CSC {'matrices' if isinstance(value, sparse.spmatrix) else 'arrays'} are supported."
raise ValueError(msg)
if isinstance(value, pd.DataFrame):
if isinstance(value, DataFrameLike):
if allow_df:
raise_value_error_if_multiindex_columns(value, name)
return value if allow_df else ensure_df_homogeneous(value, name)
Expand Down
6 changes: 3 additions & 3 deletions src/anndata/_core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from scipy import sparse

from anndata._warnings import ImplicitModificationWarning
from anndata.types import DataFrameLike

from .._settings import settings
from ..compat import (
Expand All @@ -23,7 +24,6 @@
)
from ..utils import warn
from .access import ElementRef
from .xarray import Dataset2D

if TYPE_CHECKING:
from collections.abc import Callable, Iterable, KeysView, Sequence
Expand Down Expand Up @@ -366,8 +366,8 @@ def as_view_cupy_csc(mtx, view_args):
return CupySparseCSCView(mtx, view_args=view_args)


@as_view.register(Dataset2D)
def _(a: Dataset2D, view_args):
@as_view.register(DataFrameLike)
def _(a: DataFrameLike, view_args):
return a


Expand Down
22 changes: 13 additions & 9 deletions src/anndata/_core/xarray.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
from __future__ import annotations

import warnings
from collections.abc import Hashable, Mapping
from dataclasses import dataclass
from functools import wraps
from typing import TYPE_CHECKING, Self, overload
from typing import TYPE_CHECKING, overload

import numpy as np
import pandas as pd
Expand All @@ -14,7 +13,14 @@
from ..compat import XDataArray, XDataset, XVariable, pandas_as_str

if TYPE_CHECKING:
from collections.abc import Callable, Collection, Iterable, Iterator
from collections.abc import (
Callable,
Collection,
Hashable,
Iterable,
Iterator,
Mapping,
)
from typing import Any, Literal

from .._types import Dataset2DIlocIndexer
Expand All @@ -33,16 +39,14 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
return wrapper


class Dataset2D(Mapping[Hashable, XDataArray | Self]):
class Dataset2D:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adopt `__dataframe__` protocol in a meaningful way, ideally for writing, but potentially as a solution to the above two cases

So will Dataset2D eventually support the __dataframe__ protocol?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So will Dataset2D eventually support the `__dataframe__` protocol?

Mmmm, that depends on whether I actually implement its use here, unless you have a use-case (you probably do?). In theory, I don't see why not, but I haven't actually investigated what would go into that. There hasn't even been activity in the repo that hosts the protocols for about 2 years, but it is documented on the Arrow website: https://arrow.apache.org/docs/python/interchange_protocol.html

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like people in DataFrame land have been using PyCapsule for interchange as of late; here it is in pandas.

Specifically, pandas seems to be fully on board with the Arrow PyCapsule Interface.

For new development, we highly recommend using the Arrow C Data Interface alongside the Arrow PyCapsule Interface instead of the interchange protocol. From pandas 3.0 onwards, from_dataframe uses the PyCapsule Interface, only falling back to the interchange protocol if that fails.

From pandas 4.0 onwards, that fallback will no longer be available and only the PyCapsule Interface will be used.

I guess with this now, do you think __dataframe__ is still worth your time and effort, if you'll have to add some backwards compatibility or special handling come pandas 4?

data-apis/dataframe-api#363

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess with this now, do you think `__dataframe__` is still worth your time and effort, if you'll have to add some backwards compatibility or special handling come pandas 4?

I'm definitely not tied to the idea, so if that is the better interchange, we should use that instead. Thanks for pointing this out!

r"""
A wrapper class meant to enable working with lazy dataframe data according to
:class:`~anndata.AnnData`'s internal API. This class ensures that "dataframe-invariants"
are respected, namely that there is only one 1d dim and coord with the same name i.e.,
like a :class:`pandas.DataFrame`.

You should not have to initiate this class yourself. Setting an :class:`xarray.Dataset`
into a relevant part of the :class:`~anndata.AnnData` object will attempt to wrap that
object in this object, trying to enforce the "dataframe-invariants."
You will need to wrap :class:`xarray.Dataset` inside this class if you wish to set :attr:`~anndata.AnnData.obs` or :attr:`~anndata.AnnData.var` with that.

Because xarray requires :attr:`xarray.Dataset.coords` to be in-memory, this class provides
handling for an out-of-memory index via :attr:`~anndata.experimental.backed.Dataset2D.true_index`.
Expand Down Expand Up @@ -191,7 +195,7 @@ def shape(self) -> tuple[int, int]:
-------
The (2D) shape of the dataframe resolved from :attr:`~xarray.Dataset.sizes`.
"""
return (self.ds.sizes[self.index_dim], len(self.ds))
return (len(self), len(self.ds))

@property
def iloc(self) -> Dataset2DIlocIndexer:
Expand Down Expand Up @@ -363,7 +367,7 @@ def __iter__(self) -> Iterator[Hashable]:
return iter(self.ds)

def __len__(self) -> int:
return len(self.ds)
return self.ds.sizes[self.index_dim]

@property
def dtypes(self) -> Mapping[Hashable, np.dtype]:
Expand Down
3 changes: 1 addition & 2 deletions src/anndata/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

from typing import TYPE_CHECKING, Literal, Protocol

from . import typing
from .compat import H5Array, H5Group, ZarrArray, ZarrGroup
from .utils import set_module

Expand All @@ -16,6 +15,7 @@

from anndata._core.xarray import Dataset2D

from . import typing
from ._io.specs.registry import (
IOSpec,
LazyDataStructures,
Expand All @@ -40,7 +40,6 @@
# These two are not public, so we don’t make them `type`s
_ArrayStorageType: TypeAlias = ZarrArray | H5Array # noqa: UP040
_GroupStorageType: TypeAlias = ZarrGroup | H5Group # noqa: UP040

type StorageType = _ArrayStorageType | _GroupStorageType


Expand Down
Loading
Loading