Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def res(
zarrs=("https://zarrs-python.readthedocs.io/en/stable/", None),
annbatch=("https://annbatch.readthedocs.io/en/stable/", None),
mudata=("https://mudata.readthedocs.io/stable/", None),
narwhals=("https://narwhals-dev.github.io/narwhals/", None),
)

# Fix mis-documented types. Use `anndata.utils.set_module` for ours instead.
Expand Down Expand Up @@ -187,6 +188,10 @@ def res(
("py:obj", "typing.R"),
("py:class", "_M"),
("py:class", "anndata.utils.Default"),
("py:class", "anndata._core._dataframe_backend.DataFrameLike"),
]
nitpick_ignore_regex = [
(r"py:.*", r"narwhals\._utils\..*"),
]

# -- Social cards ---------------------------------------------------------
Expand Down
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ dependencies = [
"zarr >=3.1",
"typing-extensions; python_version<'3.13'",
"scverse-misc[settings]>=0.1.0",
# backend-agnostic obs/var; >=2.10.0 for the narwhals.plugins entry-point system (Dataset2D)
"narwhals>=2.10.0",
]
dynamic = [ "version" ]

Expand Down Expand Up @@ -129,6 +131,9 @@ dask = [
"dask[array]>=2024.5.1,!=2024.8.*,!=2024.9.*,!=2025.2.*,!=2025.3.*,!=2025.4.*,!=2025.5.*,!=2025.6.*,!=2025.7.*,!=2025.8.*",
]

[project.entry-points."narwhals.plugins"]
anndata = "anndata._core._dataframe_backend"

[tool.hatch.version]
source = "vcs"
raw-options.version_scheme = "release-branch-semver"
Expand Down
156 changes: 156 additions & 0 deletions src/anndata/_core/_dataframe_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
from __future__ import annotations

import re
from typing import TYPE_CHECKING, Protocol, cast, runtime_checkable

import narwhals as nw
from narwhals._utils import Implementation

if TYPE_CHECKING:
from typing import Any, Self

import pandas as pd
from narwhals._pandas_like.dataframe import PandasLikeDataFrame
from narwhals.typing import EagerAllowed, IntoBackend
from narwhals.utils import Version

from .xarray import Dataset2D


@runtime_checkable
class DataFrameLikeIlocIndexer(Protocol):
    """Structural type of the object handed back by ``df.iloc``.

    The only requirement is item access, i.e. ``df.iloc[...]``.
    """

    def __getitem__(self, idx: Any) -> Any:
        ...


@runtime_checkable
class DataFrameLike(Protocol):
    """Structural interface a frame must offer to be kept as ``obs``/``var`` unchanged.

    :class:`pandas.DataFrame` and :class:`~anndata._core.xarray.Dataset2D` satisfy it.
    Index-less frames (polars, pyarrow, cuDF, ...) do not and have to be brought in
    through :func:`from_backend`.

    NOTE(review): as a ``runtime_checkable`` protocol, ``isinstance`` only tests that
    the members below exist, so any object exposing all of them is accepted. Confirm
    how pandas-like frames from other libraries (e.g. modin, cuDF) are meant to be
    handled.
    """

    # Number of entries reported by ``len(frame)``.
    def __len__(self) -> int: ...

    # Row labels.
    @property
    def index(self) -> pd.Index: ...

    # Column labels; must also be assignable.
    @property
    def columns(self) -> pd.Index: ...

    @columns.setter
    def columns(self, value: Any) -> None: ...

    # ``(n_rows, n_columns)``.
    @property
    def shape(self) -> tuple[int, int]: ...

    # Positional indexer, used as ``frame.iloc[...]``.
    @property
    def iloc(self) -> DataFrameLikeIlocIndexer: ...

    # Keyword-only re-labelling that returns a frame of the same type.
    def reindex(self, *, index: Any = None, axis: Any = 0, **kwargs: Any) -> Self: ...


# Package name advertised by this module, which pyproject.toml registers under the
# "narwhals.plugins" entry point.
# NOTE(review): presumably read by narwhals' plugin discovery — confirm against the
# narwhals plugin documentation.
NATIVE_PACKAGE = "anndata"


def is_native(native_object: object, /) -> bool:
    """Tell whether ``native_object`` is an instance of :class:`Dataset2D`."""
    # At module level ``Dataset2D`` is imported for type checking only, so the
    # runtime class is fetched here, at call time.
    from .xarray import Dataset2D as _Dataset2D

    return isinstance(native_object, _Dataset2D)


class Dataset2DNamespace:
    """Compliant namespace that hands a :class:`Dataset2D` to narwhals as a pandas frame.

    Rather than reimplementing the compliant protocol, a ``Dataset2D`` is realised to
    pandas via :meth:`~Dataset2D.to_memory` and wrapped in narwhals' own
    ``PandasLikeDataFrame``. Wrapping therefore loads the frame into memory (eager
    only). Row labels remain the index and can be recovered with
    :func:`narwhals.maybe_get_index`.

    ``PandasLikeDataFrame.from_native`` reads ``_implementation`` and ``_version``
    from the ``context`` it is given; this namespace carries both attributes so it can
    be passed as that context.
    """

    # Frames produced here are plain pandas frames.
    _implementation: Implementation = Implementation.PANDAS

    def __init__(self, *, version: Version) -> None:
        # narwhals API version, read back through ``context`` in ``from_native``.
        self._version = version

    def from_native(self, native_object: Dataset2D, /) -> PandasLikeDataFrame:
        """Realise ``native_object`` in memory and wrap the resulting pandas frame."""
        from narwhals._pandas_like.dataframe import PandasLikeDataFrame

        in_memory = native_object.to_memory()
        return PandasLikeDataFrame.from_native(in_memory, context=self)


def __narwhals_namespace__(version: Version) -> Dataset2DNamespace:
    """Build the compliant namespace narwhals uses to wrap a :class:`Dataset2D`.

    ``version`` is forwarded unchanged to :class:`Dataset2DNamespace`.
    """
    namespace = Dataset2DNamespace(version=version)
    return namespace


# An unnamed pandas index shows up in the Arrow table built by ``to_backend`` as an
# "__index_level_<n>__" placeholder column; this pattern is matched against whole
# column names to find those placeholders.
_INDEX_PLACEHOLDER = re.compile(r"__index_level_\d+__")

# Backend names accepted by ``to_backend`` (also quoted in its error message).
_KNOWN_BACKENDS = ("pandas", "polars", "pyarrow", "modin", "cudf")
# The same backends resolved to narwhals ``Implementation`` members, for membership tests.
_EAGER_BACKENDS = frozenset(Implementation.from_backend(b) for b in _KNOWN_BACKENDS)


def to_backend(
    frame: Any, backend: str | IntoBackend, *, index_name: str = "index"
) -> Any:
    """Outgest: convert ``frame`` into a native DataFrame of ``backend``.

    Parameters
    ----------
    frame
        A ``DataFrameLike`` (:class:`pandas.DataFrame` or :class:`Dataset2D`); any other
        frame narwhals can wrap is accepted as well.
    backend
        The eager target backend: one of the strings ``"pandas"``, ``"polars"``,
        ``"pyarrow"``, ``"modin"``, ``"cudf"``, the backend module, or a
        :class:`narwhals.Implementation`.
    index_name
        Column name given to an unnamed index when it travels along as a regular
        column (non-pandas targets only).

    Returns
    -------
    The native frame of the requested backend. For index-less backends the result is
    not ``DataFrameLike``.

    Raises
    ------
    ValueError
        If ``backend`` does not resolve to one of the supported eager backends.
    """
    target = Implementation.from_backend(backend)
    if target not in _EAGER_BACKENDS:
        msg = f"Unsupported DataFrame backend {backend!r}; expected one of {_KNOWN_BACKENDS}."
        raise ValueError(msg)

    wrapped = nw.from_native(frame)
    if target is Implementation.PANDAS:
        # pandas keeps its index, so no Arrow round trip is needed.
        return wrapped.to_pandas()

    # Every other target goes through Arrow, where an unnamed index arrives as a
    # placeholder column that is renamed to ``index_name``.
    arrow_table = _name_unnamed_index(wrapped.to_arrow(), index_name)
    converted = nw.from_arrow(
        arrow_table, backend=cast("IntoBackend[EagerAllowed]", target)
    )
    return converted.to_native()


def from_backend(frame: Any, *, index_name: str | None = None) -> DataFrameLike:
    """Ingest: turn ``frame`` into a frame that can be stored as ``DataFrameLike``.

    A frame that already satisfies :class:`DataFrameLike` (:class:`pandas.DataFrame`,
    :class:`Dataset2D`) is returned as is. Anything else is wrapped with narwhals and
    converted to pandas. When ``index_name`` is given and names a column of the
    converted frame, that column is made the index (the inverse of :func:`to_backend`);
    otherwise the converted frame is returned with the index it came out with.
    """
    if isinstance(frame, DataFrameLike):
        return frame

    converted = nw.from_native(frame).to_pandas()
    if index_name is None or index_name not in converted.columns:
        return converted
    return converted.set_index(index_name)


def try_from_backend(
    frame: Any, *, index_name: str | None = None
) -> DataFrameLike | None:
    """Variant of :func:`from_backend` that yields ``None`` when ``frame`` isn't a frame.

    Meant for call sites where the input may not be a dataframe at all (obs/var
    construction, the writer's normalize-on-dispatch-miss path). An eager frame from
    another backend (polars, pyarrow, cuDF, ...) is turned into a pandas
    ``DataFrameLike``; a Series, ndarray, mapping, etc. gives ``None`` so the caller
    can deal with it.
    """
    wrapped = nw.from_native(frame, pass_through=True)
    # ``pass_through=True`` hands back the very same object when narwhals cannot wrap
    # it; anything wrapped as something other than an eager DataFrame is rejected too.
    is_eager_frame = wrapped is not frame and isinstance(wrapped, nw.DataFrame)
    if is_eager_frame:
        return from_backend(frame, index_name=index_name)
    return None


def _name_unnamed_index(table, index_name: str):
    """Give every ``__index_level_<n>__`` placeholder column the name ``index_name``.

    ``table`` must expose ``column_names`` and ``rename_columns``. It is returned
    untouched when no column name is a placeholder.
    """
    renamed = []
    for column in table.column_names:
        is_placeholder = _INDEX_PLACEHOLDER.fullmatch(column)
        renamed.append(index_name if is_placeholder else column)

    if renamed != table.column_names:
        return table.rename_columns(renamed)
    return table
114 changes: 47 additions & 67 deletions src/anndata/_core/aligned_df.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from collections.abc import Mapping
from functools import singledispatch
from typing import TYPE_CHECKING

import pandas as pd
Expand All @@ -11,41 +10,67 @@
from .._warnings import ImplicitModificationWarning
from ..compat import XDataset, pandas_as_str
from ..utils import warn
from ._dataframe_backend import DataFrameLike, try_from_backend
from .xarray import Dataset2D

if TYPE_CHECKING:
from collections.abc import Iterable
from typing import Any, Literal


@singledispatch
def _gen_dataframe(
anno: Any,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
) -> pd.DataFrame: # pragma: no cover
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)
) -> DataFrameLike:
"""Coerce ``anno`` to a stored ``obs``/``var`` frame (a :class:`DataFrameLike`).

Accepts ``None``/mappings (built into a pandas frame), an :class:`xarray.Dataset` /
:class:`Dataset2D`, any :class:`DataFrameLike` (pandas / ``Dataset2D``), or any other
narwhals-native eager frame (polars / pyarrow / cuDF / ...), brought in via
:func:`~anndata._core._dataframe_backend.from_backend`.
"""
index_names = list(index_names)
if isinstance(anno, DataFrameLike):
pass
elif isinstance(anno, XDataset):
anno = Dataset2D(anno)
elif anno is None or isinstance(anno, Mapping):
anno = _dataframe_from_mapping(anno, index_names, length=length)
else:
# frames from another backend are ingested via their canonical index column
# (obs_names/var_names); the deprecated row_names/col_names alias the mapping path
# accepts is not honored here.
coerced = try_from_backend(anno, index_name=index_names[0])
if coerced is None:
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)
anno = coerced
# The pandas MultiIndex restriction fires before the length check (preserving prior precedence).
if isinstance(anno, pd.DataFrame):
_reject_pandas_multiindex(anno)
# Uniform validation. shape[0] is the row count; len() is the column count on a
# Mapping-based Dataset2D.
if length is not None and length != anno.shape[0]:
raise _mk_df_error(source, attr, length, anno.shape[0])
# pandas-specific index/column hygiene (Dataset2D manages its own index).
if isinstance(anno, pd.DataFrame):
anno = _coerce_pandas_df(anno)
return anno


@_gen_dataframe.register(Mapping)
@_gen_dataframe.register(type(None))
def _gen_dataframe_mapping(
anno: Mapping[str, Any] | None,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
def _dataframe_from_mapping(
anno: Mapping[str, Any] | None, index_names: list[str], *, length: int | None
) -> pd.DataFrame:
"""Build a pandas DataFrame from a mapping (or ``None``), mirroring the constructor."""
if anno is None or len(anno) == 0:
anno = {}

def mk_index(l: int) -> pd.Index:
return pd.RangeIndex(0, l, name=None).astype(str)
def mk_index(length: int) -> pd.Index:
return pd.RangeIndex(0, length, name=None).astype(str)

for index_name in index_names:
if index_name not in anno:
Expand All @@ -65,29 +90,22 @@ def mk_index(l: int) -> pd.Index:

if length is None:
df.index = mk_index(len(df))
elif length != len(df):
raise _mk_df_error(source, attr, length, len(df))
return df


@_gen_dataframe.register(pd.DataFrame)
def _gen_dataframe_df(
anno: pd.DataFrame,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
def _reject_pandas_multiindex(anno: pd.DataFrame) -> None:
    """Refuse a pandas ``MultiIndex`` as obs/var row index at declaration time.

    Nothing happens for any other index type, nor when
    ``settings.restrict_index_types`` is switched off.

    Raises
    ------
    ValueError
        If ``anno.index`` is a ``pandas.MultiIndex`` while the restriction is active.
    """
    if not isinstance(anno.index, pd.MultiIndex):
        return
    if not settings.restrict_index_types:
        return
    msg = (
        "pandas.MultiIndex not supported as index for obs or var on declaration.\n"
        "You can set `obs_names` manually although most operations after will error or convert to str.\n"
        "You can also opt out of `settings.restrict_index_types` which will allow pandas.MultiIndex."
    )
    raise ValueError(msg)
if length is not None and length != len(anno):
raise _mk_df_error(source, attr, length, len(anno))


def _coerce_pandas_df(anno: pd.DataFrame) -> pd.DataFrame:
"""pandas-only index/column hygiene applied on obs/var declaration."""
anno = anno.copy(deep=False)
if (
settings.restrict_index_types
Expand All @@ -101,20 +119,6 @@ def _gen_dataframe_df(
return anno


@_gen_dataframe.register(pd.Series)
@_gen_dataframe.register(pd.Index)
def _gen_dataframe_1d(
anno: pd.Series | pd.Index,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
msg = f"Cannot convert {type(anno)} to {attr} DataFrame"
raise ValueError(msg)


def _mk_df_error(
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
Expand All @@ -133,27 +137,3 @@ def _mk_df_error(
f"({actual} {what}s instead of {expected})"
)
return ValueError(msg)


@_gen_dataframe.register(Dataset2D)
def _gen_dataframe_xr(
anno: Dataset2D,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
return anno


@_gen_dataframe.register(XDataset)
def _gen_dataframe_xdataset(
anno: XDataset,
index_names: Iterable[str],
*,
source: Literal["X", "shape"],
attr: Literal["obs", "var"],
length: int | None = None,
):
return Dataset2D(anno)
Loading
Loading