diff --git a/docs/conf.py b/docs/conf.py index 6e5f4e827..87166ebbd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -139,6 +139,7 @@ def res( zarrs=("https://zarrs-python.readthedocs.io/en/stable/", None), annbatch=("https://annbatch.readthedocs.io/en/stable/", None), mudata=("https://mudata.readthedocs.io/stable/", None), + narwhals=("https://narwhals-dev.github.io/narwhals/", None), ) # Fix mis-documented types. Use `anndata.utils.set_module` for ours instead. @@ -187,6 +188,10 @@ def res( ("py:obj", "typing.R"), ("py:class", "_M"), ("py:class", "anndata.utils.Default"), + ("py:class", "anndata._core._dataframe_backend.DataFrameLike"), +] +nitpick_ignore_regex = [ + (r"py:.*", r"narwhals\._utils\..*"), ] # -- Social cards --------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 792d1f034..6b35e1481 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,6 +47,8 @@ dependencies = [ "zarr >=3.1", "typing-extensions; python_version<'3.13'", "scverse-misc[settings]>=0.1.0", + # backend-agnostic obs/var; >=2.10.0 for the narwhals.plugins entry-point system (Dataset2D) + "narwhals>=2.10.0", ] dynamic = [ "version" ] @@ -129,6 +131,9 @@ dask = [ "dask[array]>=2024.5.1,!=2024.8.*,!=2024.9.*,!=2025.2.*,!=2025.3.*,!=2025.4.*,!=2025.5.*,!=2025.6.*,!=2025.7.*,!=2025.8.*", ] +[project.entry-points."narwhals.plugins"] +anndata = "anndata._core._dataframe_backend" + [tool.hatch.version] source = "vcs" raw-options.version_scheme = "release-branch-semver" diff --git a/src/anndata/_core/_dataframe_backend.py b/src/anndata/_core/_dataframe_backend.py new file mode 100644 index 000000000..2f87fe594 --- /dev/null +++ b/src/anndata/_core/_dataframe_backend.py @@ -0,0 +1,156 @@ +from __future__ import annotations + +import re +from typing import TYPE_CHECKING, Protocol, cast, runtime_checkable + +import narwhals as nw +from narwhals._utils import Implementation + +if TYPE_CHECKING: + from typing import Any, Self + + import pandas as pd + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + from narwhals.typing import EagerAllowed, IntoBackend + from narwhals.utils import Version + + from .xarray import Dataset2D + + +@runtime_checkable +class DataFrameLikeIlocIndexer(Protocol): + """Positional indexer, as in ``df.iloc[...]``.""" + + def __getitem__(self, idx: Any) -> Any: ... + + +@runtime_checkable +class DataFrameLike(Protocol): + """Structural contract an AnnData ``obs``/``var`` must satisfy to be stored as-is. + + :class:`pandas.DataFrame` and :class:`~anndata._core.xarray.Dataset2D` conform. Index-less + frames (polars, pyarrow, cuDF, ...) do not — bring them in with :func:`from_backend`. + """ + + def __len__(self) -> int: ... + @property + def index(self) -> pd.Index: ... + @property + def columns(self) -> pd.Index: ... + @columns.setter + def columns(self, value: Any) -> None: ... + @property + def shape(self) -> tuple[int, int]: ... + @property + def iloc(self) -> DataFrameLikeIlocIndexer: ... + def reindex(self, *, index: Any = None, axis: Any = 0, **kwargs: Any) -> Self: ... + + +NATIVE_PACKAGE = "anndata" + + +def is_native(native_object: object, /) -> bool: + """Return whether ``native_object`` is a :class:`Dataset2D`.""" + from .xarray import Dataset2D + + return isinstance(native_object, Dataset2D) + + +class Dataset2DNamespace: + """Routes a :class:`Dataset2D` to its pandas realisation as a compliant frame. + + A ``Dataset2D`` realises to pandas (:meth:`~Dataset2D.to_memory`), which narwhals already + supports, so we route through its ``PandasLikeDataFrame`` instead of reimplementing the + compliant protocol. Wrapping realises the frame in memory (eager only). Row labels stay the + index, recoverable via :func:`narwhals.maybe_get_index`. + + ``PandasLikeDataFrame.from_native`` reads ``_implementation`` and ``_version`` off its + ``context``, so this namespace doubles as that context. + """ + + _implementation: Implementation = Implementation.PANDAS + + def __init__(self, *, version: Version) -> None: + self._version = version + + def from_native(self, native_object: Dataset2D, /) -> PandasLikeDataFrame: + from narwhals._pandas_like.dataframe import PandasLikeDataFrame + + return PandasLikeDataFrame.from_native(native_object.to_memory(), context=self) + + +def __narwhals_namespace__(version: Version) -> Dataset2DNamespace: + """Return the compliant namespace narwhals uses to wrap a :class:`Dataset2D`.""" + return Dataset2DNamespace(version=version) + + +# pandas serialises an unnamed index as an "__index_level___" placeholder column. +_INDEX_PLACEHOLDER = re.compile(r"__index_level_\d+__") + +_KNOWN_BACKENDS = ("pandas", "polars", "pyarrow", "modin", "cudf") +_EAGER_BACKENDS = frozenset(Implementation.from_backend(b) for b in _KNOWN_BACKENDS) + + +def to_backend( + frame: Any, backend: str | IntoBackend, *, index_name: str = "index" +) -> Any: + """Outgest: return ``frame`` as a native DataFrame of ``backend``. + + ``frame`` is a ``DataFrameLike`` (:class:`pandas.DataFrame` or :class:`Dataset2D`); any other + narwhals-wrappable frame also works. ``backend`` is an eager backend: a string (``"pandas"``, + ``"polars"``, ``"pyarrow"``, ``"modin"``, ``"cudf"``), the backend module, or a + :class:`narwhals.Implementation`. When the index is unnamed it rides along as an ``index_name`` + column (non-pandas backends only). The result is not ``DataFrameLike`` for index-less backends. + """ + impl = Implementation.from_backend(backend) + if impl not in _EAGER_BACKENDS: + msg = f"Unsupported DataFrame backend {backend!r}; expected one of {_KNOWN_BACKENDS}." + raise ValueError(msg) + + nwf = nw.from_native(frame) + if impl is Implementation.PANDAS: + return nwf.to_pandas() + table = _name_unnamed_index(nwf.to_arrow(), index_name) + return nw.from_arrow( + table, backend=cast("IntoBackend[EagerAllowed]", impl) + ).to_native() + + +def from_backend(frame: Any, *, index_name: str | None = None) -> DataFrameLike: + """Ingest: normalize ``frame`` to a stored ``DataFrameLike``. + + Frames that already conform (:class:`pandas.DataFrame`, :class:`Dataset2D`) pass through. + A frame from another backend (polars, pyarrow, cuDF, ...) is converted to pandas; if + ``index_name`` names one of its columns, that column becomes the index (the inverse of + :func:`to_backend`). + """ + if isinstance(frame, DataFrameLike): + return frame + df = nw.from_native(frame).to_pandas() + if index_name is not None and index_name in df.columns: + df = df.set_index(index_name) + return df + + +def try_from_backend( + frame: Any, *, index_name: str | None = None +) -> DataFrameLike | None: + """Like :func:`from_backend`, but return ``None`` instead of raising when ``frame`` isn't a frame. + + Used where the input might not be a dataframe at all (obs/var construction, the writer's + normalize-on-dispatch-miss path): an eager frame from another backend (polars, pyarrow, + cuDF, ...) becomes a pandas ``DataFrameLike``; a Series, ndarray, mapping, etc. returns + ``None`` so the caller can handle it. + """ + wrapped = nw.from_native(frame, pass_through=True) + if wrapped is frame or not isinstance(wrapped, nw.DataFrame): + return None + return from_backend(frame, index_name=index_name) + + +def _name_unnamed_index(table, index_name: str): + """Rename a pandas ``__index_level___`` placeholder column to ``index_name``.""" + renamed = [ + index_name if _INDEX_PLACEHOLDER.fullmatch(c) else c for c in table.column_names + ] + return table.rename_columns(renamed) if renamed != table.column_names else table diff --git a/src/anndata/_core/aligned_df.py b/src/anndata/_core/aligned_df.py index 877f05928..be3a2e98e 100644 --- a/src/anndata/_core/aligned_df.py +++ b/src/anndata/_core/aligned_df.py @@ -1,7 +1,6 @@ from __future__ import annotations from collections.abc import Mapping -from functools import singledispatch from typing import TYPE_CHECKING import pandas as pd @@ -11,6 +10,7 @@ from .._warnings import ImplicitModificationWarning from ..compat import XDataset, pandas_as_str from ..utils import warn +from ._dataframe_backend import DataFrameLike, try_from_backend from .xarray import Dataset2D if TYPE_CHECKING: @@ -18,7 +18,6 @@ from typing import Any, Literal -@singledispatch def _gen_dataframe( anno: Any, index_names: Iterable[str], @@ -26,26 +25,52 @@ def _gen_dataframe( source: Literal["X", "shape"], attr: Literal["obs", "var"], length: int | None = None, -) -> pd.DataFrame: # pragma: no cover - msg = f"Cannot convert {type(anno)} to {attr} DataFrame" - raise ValueError(msg) +) -> DataFrameLike: + """Coerce ``anno`` to a stored ``obs``/``var`` frame (a :class:`DataFrameLike`). + + Accepts ``None``/mappings (built into a pandas frame), an :class:`xarray.Dataset` / + :class:`Dataset2D`, any :class:`DataFrameLike` (pandas / ``Dataset2D``), or any other + narwhals-native eager frame (polars / pyarrow / cuDF / ...), brought in via + :func:`~anndata._core._dataframe_backend.from_backend`. + """ + index_names = list(index_names) + if isinstance(anno, DataFrameLike): + pass + elif isinstance(anno, XDataset): + anno = Dataset2D(anno) + elif anno is None or isinstance(anno, Mapping): + anno = _dataframe_from_mapping(anno, index_names, length=length) + else: + # frames from another backend are ingested via their canonical index column + # (obs_names/var_names); the deprecated row_names/col_names alias the mapping path + # accepts is not honored here. + coerced = try_from_backend(anno, index_name=index_names[0]) + if coerced is None: + msg = f"Cannot convert {type(anno)} to {attr} DataFrame" + raise ValueError(msg) + anno = coerced + # The pandas MultiIndex restriction fires before the length check (preserving prior precedence). + if isinstance(anno, pd.DataFrame): + _reject_pandas_multiindex(anno) + # Uniform validation. shape[0] is the row count; len() is the column count on a + # Mapping-based Dataset2D. + if length is not None and length != anno.shape[0]: + raise _mk_df_error(source, attr, length, anno.shape[0]) + # pandas-specific index/column hygiene (Dataset2D manages its own index). + if isinstance(anno, pd.DataFrame): + anno = _coerce_pandas_df(anno) + return anno -@_gen_dataframe.register(Mapping) -@_gen_dataframe.register(type(None)) -def _gen_dataframe_mapping( - anno: Mapping[str, Any] | None, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, +def _dataframe_from_mapping( + anno: Mapping[str, Any] | None, index_names: list[str], *, length: int | None ) -> pd.DataFrame: + """Build a pandas DataFrame from a mapping (or ``None``), mirroring the constructor.""" if anno is None or len(anno) == 0: anno = {} - def mk_index(l: int) -> pd.Index: - return pd.RangeIndex(0, l, name=None).astype(str) + def mk_index(length: int) -> pd.Index: + return pd.RangeIndex(0, length, name=None).astype(str) for index_name in index_names: if index_name not in anno: @@ -65,20 +90,11 @@ def mk_index(l: int) -> pd.Index: if length is None: df.index = mk_index(len(df)) - elif length != len(df): - raise _mk_df_error(source, attr, length, len(df)) return df -@_gen_dataframe.register(pd.DataFrame) -def _gen_dataframe_df( - anno: pd.DataFrame, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): +def _reject_pandas_multiindex(anno: pd.DataFrame) -> None: + """Disallow a pandas ``MultiIndex`` row index on obs/var declaration (unless opted out).""" if isinstance(anno.index, pd.MultiIndex) and settings.restrict_index_types: msg = ( "pandas.MultiIndex not supported as index for obs or var on declaration.\n" @@ -86,8 +102,10 @@ def _gen_dataframe_df( "You can also opt out of `settings.restrict_index_types` which will allow pandas.MultiIndex." ) raise ValueError(msg) - if length is not None and length != len(anno): - raise _mk_df_error(source, attr, length, len(anno)) + + +def _coerce_pandas_df(anno: pd.DataFrame) -> pd.DataFrame: + """pandas-only index/column hygiene applied on obs/var declaration.""" anno = anno.copy(deep=False) if ( settings.restrict_index_types @@ -101,20 +119,6 @@ def _gen_dataframe_df( return anno -@_gen_dataframe.register(pd.Series) -@_gen_dataframe.register(pd.Index) -def _gen_dataframe_1d( - anno: pd.Series | pd.Index, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - msg = f"Cannot convert {type(anno)} to {attr} DataFrame" - raise ValueError(msg) - - def _mk_df_error( source: Literal["X", "shape"], attr: Literal["obs", "var"], @@ -133,27 +137,3 @@ def _mk_df_error( f"({actual} {what}s instead of {expected})" ) return ValueError(msg) - - -@_gen_dataframe.register(Dataset2D) -def _gen_dataframe_xr( - anno: Dataset2D, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return anno - - -@_gen_dataframe.register(XDataset) -def _gen_dataframe_xdataset( - anno: XDataset, - index_names: Iterable[str], - *, - source: Literal["X", "shape"], - attr: Literal["obs", "var"], - length: int | None = None, -): - return Dataset2D(anno) diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 486c17f31..c2a780549 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -64,6 +64,7 @@ from os import PathLike from typing import Any, ClassVar, Literal + from narwhals.typing import IntoBackend from scipy import sparse from zarr.storage import StoreLike @@ -74,6 +75,7 @@ from ..acc import AdRef, Array, MapAcc, RefAcc from ..compat import CSArray, CSMatrix from ..typing import AxisStorable, Index, Index1D, _Index1DNorm, _XDataType + from ._dataframe_backend import DataFrameLike from .aligned_mapping import AxisArraysView, LayersView, PairwiseArraysView @@ -735,7 +737,7 @@ def n_vars(self) -> int: """Number of variables/features.""" return len(self.var_names) - def _set_dim_df(self, value: pd.DataFrame | XDataset, attr: Literal["obs", "var"]): + def _set_dim_df(self, value: DataFrameLike | XDataset, attr: Literal["obs", "var"]): value = _gen_dataframe( value, [f"{attr}_names", f"{'row' if attr == 'obs' else 'col'}_names"], @@ -804,12 +806,12 @@ def _set_dim_index(self, value: pd.Index, attr: str): v.index = value @property - def obs(self) -> pd.DataFrame | Dataset2D: - """One-dimensional annotation of observations (`pd.DataFrame`).""" + def obs(self) -> DataFrameLike: + """One-dimensional annotation of observations (a :class:`~pandas.DataFrame`-like).""" return self._obs @obs.setter - def obs(self, value: pd.DataFrame | XDataset): + def obs(self, value: DataFrameLike | XDataset): self._set_dim_df(value, "obs") @obs.deleter @@ -827,12 +829,12 @@ def obs_names(self, names: Sequence[str]): self._set_dim_index(names, "obs") @property - def var(self) -> pd.DataFrame | Dataset2D: - """One-dimensional annotation of variables/ features (`pd.DataFrame`).""" + def var(self) -> DataFrameLike: + """One-dimensional annotation of variables/ features (a :class:`~pandas.DataFrame`-like).""" return self._var @var.setter - def var(self, value: pd.DataFrame | XDataset): + def var(self, value: DataFrameLike | XDataset): self._set_dim_df(value, "var") @var.deleter @@ -849,6 +851,50 @@ def var_names(self, names: Sequence[str]): names = self._prep_dim_index(names, "var") self._set_dim_index(names, "var") + def obs_as(self, backend: str | IntoBackend) -> Any: + """Return :attr:`obs` as a native DataFrame of another backend. + + Parameters + ---------- + backend + ``"pandas"``, ``"polars"``, ``"pyarrow"``, ``"modin"`` or ``"cudf"`` (a backend + module or :class:`narwhals.Implementation` also works). + + Returns + ------- + :attr:`obs` as a native frame of ``backend``. ``obs_names`` is preserved — the index for + pandas, or an ``obs_names`` column for index-less backends. + + Examples + -------- + >>> adata.obs_as("polars") # doctest: +SKIP + """ + from ._dataframe_backend import to_backend + + return to_backend(self.obs, backend, index_name="obs_names") + + def var_as(self, backend: str | IntoBackend) -> Any: + """Return :attr:`var` as a native DataFrame of another backend. + + Parameters + ---------- + backend + ``"pandas"``, ``"polars"``, ``"pyarrow"``, ``"modin"`` or ``"cudf"`` (a backend + module or :class:`narwhals.Implementation` also works). + + Returns + ------- + :attr:`var` as a native frame of ``backend``. ``var_names`` is preserved — the index for + pandas, or a ``var_names`` column for index-less backends. + + Examples + -------- + >>> adata.var_as("polars") # doctest: +SKIP + """ + from ._dataframe_backend import to_backend + + return to_backend(self.var, backend, index_name="var_names") + @property def uns(self) -> MutableMapping: """Unstructured annotation (ordered dictionary).""" diff --git a/src/anndata/_core/index.py b/src/anndata/_core/index.py index 92359ab70..23d628546 100644 --- a/src/anndata/_core/index.py +++ b/src/anndata/_core/index.py @@ -23,6 +23,9 @@ XDataArray, has_xp, ) +from ._dataframe_backend import ( + DataFrameLike, # runtime: used in the PEP 695 bound of _subset_df +) from .xarray import Dataset2D if TYPE_CHECKING: @@ -431,7 +434,7 @@ def _subset_sparse[T: CSMatrix | CSArray]( @_subset.register(pd.DataFrame) @_subset.register(Dataset2D) @_ensure_numpy_idx -def _subset_df[T: pd.DataFrame | Dataset2D]( +def _subset_df[T: DataFrameLike]( df: T, subset_idx: tuple[_Index1DNorm] | tuple[_Index1DNorm, _Index1DNorm] ) -> T: return df.iloc[subset_idx] diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 29ef3dda5..03668a6ed 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -345,7 +345,7 @@ def read_categorical( return CategoricalArray( codes=elem["codes"], categories=elem["categories"], - ordered=elem.attrs["ordered"], + ordered=bool(elem.attrs["ordered"]), base_path_or_zarr_group=base_path_or_zarr_group, elem_name=elem_name, ) diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index f2f2fb41b..34a3dfb6e 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -348,6 +348,24 @@ def find_write_func( # Raises IORegistryError return self.registry.get_write(dest_type, type(elem), modifiers, writer=self) + def _resolve_write_func( + self, dest_type: type, elem: Any, modifiers: frozenset[str] + ) -> tuple[Write, Any]: + """Find the writer for ``elem``, coercing a frame from another backend on a miss. + + On a dispatch miss, normalize a frame from another backend (polars/pyarrow/cuDF/...) to + pandas and retry. Returns ``(write_func, elem)`` -- ``elem`` may have been coerced. The + narwhals path runs only on the miss, so ordinary (array/mapping/...) writes are untouched. + """ + try: + return self.find_write_func(dest_type, elem, modifiers), elem + except IORegistryError: + from anndata._core._dataframe_backend import try_from_backend + + if (coerced := try_from_backend(elem)) is None: + raise + return self.find_write_func(dest_type, coerced, modifiers), coerced + @report_write_key_on_error def write_elem( self, @@ -404,7 +422,7 @@ def write_elem( # Normalize array-API (e.g., JAX/CuPy) even if not AnnData elem = normalize_nested(elem) - write_func = self.find_write_func(type(store), elem, modifiers) + write_func, elem = self._resolve_write_func(type(store), elem, modifiers) if self.callback is None: return write_func(store, k, elem, dataset_kwargs=dataset_kwargs) diff --git a/src/anndata/acc/__init__.py b/src/anndata/acc/__init__.py index 0844674ec..096382986 100644 --- a/src/anndata/acc/__init__.py +++ b/src/anndata/acc/__init__.py @@ -25,6 +25,7 @@ from collections.abc import Callable, Collection, Sequence from typing import Any, Literal, Self, TypeGuard + from .._core._dataframe_backend import DataFrameLike from .._core.aligned_mapping import AxisArrays from ..compat import XVariable from ..typing import InMemoryArray @@ -339,7 +340,7 @@ def idx_repr(self, k: str | None) -> str: def isin(self, data: MuData | AnnData, idx: str | None = None) -> bool: if idx is None: return True # obs and var index always exist - attr: pd.DataFrame | Dataset2D = getattr(data, self.dim) + attr: DataFrameLike = getattr(data, self.dim) return idx in attr def get( diff --git a/src/anndata/utils.py b/src/anndata/utils.py index 19d8172e2..6c7b7d1d2 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -23,7 +23,7 @@ from collections.abc import Callable, Generator, Iterable, Mapping, Sequence from typing import Any, LiteralString - from ._core.xarray import Dataset2D + from ._core._dataframe_backend import DataFrameLike from ._types import AnnDataElem from .typing import AxisStorable, _XDataType @@ -441,9 +441,7 @@ def module_get_attr_redirect( def iter_outer( adata, -) -> Generator[ - tuple[AnnDataElem, AxisStorable | _XDataType | Dataset2D | pd.DataFrame] -]: +) -> Generator[tuple[AnnDataElem, AxisStorable | _XDataType | DataFrameLike]]: """Iterate over key-value pairs of the parent "elems" like aw, obs, varp etc""" for attr_name in [ "obs", diff --git a/tests/test_dataframe_backend.py b/tests/test_dataframe_backend.py new file mode 100644 index 000000000..55cec8f71 --- /dev/null +++ b/tests/test_dataframe_backend.py @@ -0,0 +1,320 @@ +"""DataFrame-backend layer: narwhals plugin + obs/var backend conversion. + +Covers: +- the narwhals plugin (``nw.from_native(Dataset2D)``) +- the ``DataFrameLike`` contract + ``from_backend`` (ingest) +- ``to_backend`` / ``AnnData.obs_as`` / ``AnnData.var_as`` (outgest to any eager backend) +""" + +from __future__ import annotations + +import narwhals as nw +import numpy as np +import pandas as pd +import pytest + +import anndata as ad +from anndata._core._dataframe_backend import DataFrameLike, from_backend, to_backend +from anndata._core.xarray import Dataset2D +from anndata.compat import XDataset +from anndata.tests.helpers import assert_equal, gen_typed_df + +pytest.importorskip("xarray") + + +@pytest.fixture +def df(): + return gen_typed_df(10) + + +@pytest.fixture +def dataset2d(df): + return Dataset2D(XDataset.from_dataframe(df)) + + +@pytest.fixture +def named_obs(): + """A small obs with a categorical column and a *named* string index (obs_names).""" + return pd.DataFrame( + {"cell_type": pd.Categorical(["A", "B", "A"]), "n_genes": [10, 20, 30]}, + index=pd.Index(["AAAC", "AAAG", "AAAT"], name="obs_names"), + ) + + +def test_narwhals_from_native_roundtrip(df, dataset2d): + """``nw.from_native`` accepts a Dataset2D, yielding an eager frame matching to_memory().""" + frame = nw.from_native(dataset2d) + assert isinstance(frame, nw.DataFrame) + assert set(frame.columns) == set(df.columns) + assert_equal( + frame.to_native().sort_index(axis=1), dataset2d.to_memory().sort_index(axis=1) + ) + + +def test_narwhals_op_parity_with_pandas(df, dataset2d): + """A narwhals op on a Dataset2D matches the same op on the source pandas frame (numeric + columns, which ``to_memory`` doesn't recast — so this is a real transform, not self-comparison).""" + num = df.select_dtypes("number").columns.tolist() + predicate = nw.col(num[0]) >= df[num[0]].median() + from_ds = nw.from_native(dataset2d).filter(predicate).select(num).to_native() + from_df = ( + nw.from_native(df, eager_only=True).filter(predicate).select(num).to_native() + ) + assert_equal(from_ds.reset_index(drop=True), from_df.reset_index(drop=True)) + + +def test_narwhals_index_preserved(dataset2d): + """The row index survives wrapping, recoverable via maybe_get_index (narwhals has no index).""" + idx = nw.maybe_get_index(nw.from_native(dataset2d)) + assert_equal(idx, dataset2d.to_memory().index) + + +def test_narwhals_categorical_preserved(named_obs): + """Categorical columns (the load-bearing dtype) round-trip through the plugin.""" + ds = Dataset2D(XDataset.from_dataframe(named_obs)) + out = nw.from_native(ds).to_native() + assert isinstance(out["cell_type"].dtype, pd.CategoricalDtype) + assert out["cell_type"].tolist() == ["A", "B", "A"] + + +def test_narwhals_realises_backed_dataset2d(): + """``from_native`` realises a backed/lazy (dask) Dataset2D to eager pandas — no dask survives.""" + da = pytest.importorskip("dask.array") + n = 50 + ds = Dataset2D( + XDataset( + { + "val": ("idx", da.from_array(np.arange(n, dtype="float64"), chunks=20)), + "grp": ("idx", pd.array(["a", "b"] * (n // 2), dtype="category")), + }, + coords={"idx": [f"c{i}" for i in range(n)]}, + ) + ) + frame = nw.from_native(ds) + assert isinstance(frame, nw.DataFrame) # eager, not a LazyFrame + native = frame.to_native() + assert type(native).__module__.split(".")[0] == "pandas" + assert "dask" not in type(native["val"].values).__module__ # realised + assert native["val"].tolist() == list(range(n)) + assert isinstance(native["grp"].dtype, pd.CategoricalDtype) + + +@pytest.mark.parametrize("backend", ["pandas", "pyarrow", "polars"]) +def test_to_backend_from_dataset2d(dataset2d, backend): + """to_backend fans a Dataset2D out to any eager backend (values compared at the Arrow level).""" + pytest.importorskip(backend) + + native = to_backend(dataset2d, backend) + assert type(native).__module__.split(".")[0] == backend + got = nw.from_native(native, eager_only=True).to_arrow() + src = nw.from_native(dataset2d).to_arrow() + assert set(dataset2d.columns) <= set(got.column_names) + for name in dataset2d.columns: + assert got.column(name).to_pylist() == src.column(name).to_pylist() + + +def test_to_backend_named_index_identity(named_obs): + """A named index (obs_names) rides into index-less backends as a column; pandas keeps it as the index.""" + ds = Dataset2D(XDataset.from_dataframe(named_obs)) + for backend in ("polars", "pyarrow"): + pytest.importorskip(backend) + got = nw.from_native(to_backend(ds, backend), eager_only=True).to_arrow() + assert got.column("obs_names").to_pylist() == ["AAAC", "AAAG", "AAAT"] + assert got.column("n_genes").to_pylist() == [10, 20, 30] + pandas_frame = to_backend(ds, "pandas") + assert pandas_frame.index.name == "obs_names" + assert "obs_names" not in pandas_frame.columns + + +def test_obs_var_as_in_memory(named_obs): + """AnnData.obs_as / var_as on an in-memory (pandas-backed) AnnData.""" + pytest.importorskip("polars") + adata = ad.AnnData(X=np.zeros((3, 2), "f4"), obs=named_obs) + + obs_pl = adata.obs_as("polars") + assert type(obs_pl).__module__.split(".")[0] == "polars" + assert nw.from_native(obs_pl, eager_only=True)["cell_type"].to_list() == [ + "A", + "B", + "A", + ] + + obs_pd = adata.obs_as("pandas") + assert isinstance(obs_pd, pd.DataFrame) + assert obs_pd.index.name == "obs_names" + + var_pl = adata.var_as("polars") + assert type(var_pl).__module__.split(".")[0] == "polars" + + +def test_obs_as_read_lazy(tmp_path, diskfmt, named_obs): + """obs_as works the same when obs is a Dataset2D (read lazily) as when it is pandas.""" + pytest.importorskip("polars") + p = tmp_path / f"a.{diskfmt}" + getattr(ad.AnnData(X=np.zeros((3, 2), "f4"), obs=named_obs), f"write_{diskfmt}")(p) + lazy = ad.experimental.read_lazy(p) + assert isinstance(lazy.obs, Dataset2D) + obs_pl = lazy.obs_as("polars") + assert type(obs_pl).__module__.split(".")[0] == "polars" + assert "obs_names" in obs_pl.columns + + +def test_var_as_read_lazy(tmp_path, diskfmt): + """var_as on a backed Dataset2D (axis 1) carries var_names + categorical values.""" + pytest.importorskip("polars") + var = pd.DataFrame( + {"gene_type": pd.Categorical(["tf", "rp", "tf"])}, + index=pd.Index(["G1", "G2", "G3"], name="var_names"), + ) + p = tmp_path / f"a.{diskfmt}" + getattr(ad.AnnData(X=np.zeros((2, 3), "f4"), var=var), f"write_{diskfmt}")(p) + lazy = ad.experimental.read_lazy(p) + assert isinstance(lazy.var, Dataset2D) + var_pl = lazy.var_as("polars") + assert var_pl["var_names"].to_list() == ["G1", "G2", "G3"] + assert var_pl["gene_type"].to_list() == ["tf", "rp", "tf"] + + +def test_to_backend_unnamed_index_named(named_obs): + """A default (unnamed) index surfaces as obs_names/var_names, not ``__index_level_0__``.""" + pytest.importorskip("polars") + obs = pd.DataFrame({"g": [1, 2, 3]}, index=["c0", "c1", "c2"]) # unnamed index + adata = ad.AnnData(X=np.zeros((3, 2), "f4"), obs=obs) + assert adata.obs.index.name is None + cols = adata.obs_as("polars").columns + assert "obs_names" in cols + assert not any(c.startswith("__index_level_") for c in cols) + assert "var_names" in adata.var_as("polars").columns + + +def test_to_backend_unknown_backend_raises(dataset2d): + """An unrecognised backend name is a clear ValueError, not a cryptic AssertionError.""" + with pytest.raises(ValueError, match=r"Unsupported DataFrame backend 'polrs'"): + to_backend(dataset2d, "polrs") + + +def test_to_backend_lazy_backend_raises(dataset2d): + """A *lazy* backend (duckdb/dask/...) is rejected with our clear error — ``to_backend`` + is eager-only — instead of failing deep inside narwhals' ``from_arrow``.""" + with pytest.raises(ValueError, match=r"Unsupported DataFrame backend 'duckdb'"): + to_backend(dataset2d, "duckdb") + + +def test_narwhals_plugin_entry_point_registered(): + """The narwhals.plugins entry point is declared and exposes the plugin surface.""" + from importlib.metadata import entry_points + + eps = {e.name: e for e in entry_points(group="narwhals.plugins")} + assert "anndata" in eps + mod = eps["anndata"].load() + assert mod.NATIVE_PACKAGE == "anndata" + assert callable(mod.is_native) + assert callable(mod.__narwhals_namespace__) + + +@pytest.mark.parametrize("backend", ["cudf", "modin"]) +def test_pandas_like_backends_route_via_arrow(backend): + """cuDF/modin are recognised and (being non-pandas) route through Arrow — without needing + the backend installed.""" + impl = nw.Implementation.from_backend(backend) + assert impl is not nw.Implementation.UNKNOWN + assert ( + impl is not nw.Implementation.PANDAS + ) # takes the Arrow (identity-as-column) branch + + +def test_to_backend_empty_obs(): + """Empty obs shapes (0 columns; 0 rows) convert cleanly to a backend.""" + pytest.importorskip("polars") + + # 0 columns, with a named index + zero_col = Dataset2D( + XDataset.from_dataframe( + pd.DataFrame(index=pd.Index(["c0", "c1", "c2"], name="obs_names")) + ) + ) + out = to_backend(zero_col, "polars", index_name="obs_names") + assert out.shape == (3, 1) + assert out.columns == ["obs_names"] + + # 0 rows, with columns + zero_row = Dataset2D( + XDataset.from_dataframe( + pd.DataFrame({"g": pd.Series([], dtype="int64")}).rename_axis("obs_names") + ) + ) + out0 = to_backend(zero_row, "polars", index_name="obs_names") + assert out0.shape[0] == 0 + assert set(out0.columns) >= {"g"} + + +def test_dataframe_like_isinstance(named_obs, dataset2d): + """pandas + Dataset2D conform to DataFrameLike; index-less backends do not.""" + assert isinstance(named_obs, DataFrameLike) # pandas + assert isinstance(dataset2d, DataFrameLike) # Dataset2D + pl = pytest.importorskip("polars") + pa = pytest.importorskip("pyarrow") + assert not isinstance(pl.DataFrame({"a": [1]}), DataFrameLike) + assert not isinstance(pa.table({"a": [1]}), DataFrameLike) + + +def test_from_backend_passthrough(named_obs, dataset2d): + """Already-conforming frames (pandas, Dataset2D) ingest unchanged (same object).""" + assert from_backend(named_obs) is named_obs + assert from_backend(dataset2d) is dataset2d + + +def test_from_backend_foreign_restores_index(named_obs): + """A foreign backend ingests to a DataFrameLike (pandas), restoring obs_names to the index; + round-trips with to_backend.""" + pytest.importorskip("polars") + + pl_frame = to_backend( + named_obs, "polars", index_name="obs_names" + ) # obs_names is a column + back = from_backend(pl_frame, index_name="obs_names") + assert isinstance(back, DataFrameLike) + assert back.index.name == "obs_names" + assert list(back.index) == ["AAAC", "AAAG", "AAAT"] + assert "obs_names" not in back.columns + assert back["cell_type"].tolist() == ["A", "B", "A"] + + +def test_write_elem_foreign_backend(tmp_path, diskfmt): + """write_elem normalizes a foreign frame (polars) to pandas before writing — the + dispatch-miss path.""" + pytest.importorskip("polars") + import polars as pl + + from anndata.io import read_elem, write_elem + + if diskfmt == "zarr": + import zarr + + g = zarr.open_group(str(tmp_path / "t.zarr"), mode="w") + else: + import h5py + + g = h5py.File(tmp_path / "t.h5ad", "w") + write_elem(g, "obs", pl.DataFrame({"obs_names": ["c0", "c1"], "ct": ["T", "B"]})) + back = read_elem(g["obs"]) + assert isinstance(back, pd.DataFrame) + assert back["ct"].tolist() == ["T", "B"] + assert "obs_names" in back.columns # carried as a column (no AnnData axis context) + + +def test_assign_foreign_backend_to_obs(): + """Ingest: assigning a foreign frame to obs stores pandas with obs_names as the index.""" + pytest.importorskip("polars") + import polars as pl + + adata = ad.AnnData(np.zeros((3, 2), "f4")) + adata.obs = pl.DataFrame({ + "obs_names": ["c0", "c1", "c2"], + "ct": ["T", "B", "T"], + "n": [1, 2, 3], + }) + assert isinstance(adata.obs, pd.DataFrame) + assert list(adata.obs_names) == ["c0", "c1", "c2"] + assert adata.obs["ct"].tolist() == ["T", "B", "T"] + assert "obs_names" not in adata.obs.columns