From 7b6483dd190d616782e2593320cb1a0da479b82a Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Mon, 5 Jan 2026 17:44:16 +0100 Subject: [PATCH 1/3] fix: actually allow missing mask in lazy MaskedArray --- .../experimental/backed/_lazy_arrays.py | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index cb3a926ff..831705665 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -1,7 +1,7 @@ from __future__ import annotations from functools import cached_property -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import numpy as np import pandas as pd @@ -23,8 +23,9 @@ from pathlib import Path from typing import Literal + from numpy.typing import NDArray from pandas._libs.missing import NAType - from pandas.core.dtypes.base import ExtensionDtype + from pandas.core.dtypes.dtypes import BaseMaskedDtype from anndata.compat import ZarrGroup @@ -143,7 +144,7 @@ class MaskedArray[K: (H5Array, ZarrArray)](XBackendArray): by :class:`xarray.backends.BackendArray`. """ - _mask: ZarrOrHDF5Wrapper[K] + _mask: ZarrOrHDF5Wrapper[K] | None _values: ZarrOrHDF5Wrapper[K] _dtype_str: Literal["nullable-integer", "nullable-boolean", "nullable-string-array"] shape: tuple[int, ...] 
@@ -152,15 +153,15 @@ class MaskedArray[K: (H5Array, ZarrArray)](XBackendArray): def __init__( self, - values: ZarrArray | H5Array, + values: K, dtype_str: Literal[ "nullable-integer", "nullable-boolean", "nullable-string-array" ], - mask: ZarrArray | H5Array, + mask: K | None, base_path_or_zarr_group: Path | ZarrGroup, elem_name: str, ): - self._mask = ZarrOrHDF5Wrapper(mask) + self._mask = None if mask is None else ZarrOrHDF5Wrapper(mask) self._values = ZarrOrHDF5Wrapper(values) self._dtype_str = dtype_str self.shape = self._values.shape @@ -168,28 +169,33 @@ def __init__( - self.file_format = "zarr" if isinstance(mask, ZarrArray) else "h5" + self.file_format = "zarr" if isinstance(values, ZarrArray) else "h5" self.elem_name = elem_name - def __getitem__(self, key: ExplicitIndexer) -> PandasExtensionArray | np.ndarray: - from xarray.core.extension_array import PandasExtensionArray - + def __getitem__( + self, key: ExplicitIndexer + ) -> PandasExtensionArray | NDArray[np.str_]: values = self._values[key] - mask = self._mask[key] - if self._dtype_str == "nullable-integer": - # numpy does not support nan ints - extension_array = pd.arrays.IntegerArray(values, mask=mask) - elif self._dtype_str == "nullable-boolean": - extension_array = pd.arrays.BooleanArray(values, mask=mask) - elif self._dtype_str == "nullable-string-array": + mask = None if self._mask is None else self._mask[key] + + # numpy does not support nan ints + if self._dtype_str in {"nullable-integer", "nullable-boolean"}: + from xarray.core.extension_array import PandasExtensionArray + + if mask is None: + mask = np.zeros(len(values), dtype=bool) # no mask stored: nothing is NA + cls = cast("BaseMaskedDtype", self.dtype).construct_array_type() + return PandasExtensionArray(cls(values, mask=mask)) + + if self._dtype_str == "nullable-string-array": # https://github.com/pydata/xarray/issues/10419 values = values.astype(self.dtype) - values[mask] = pd.NA + if mask is not None: + values[mask] = pd.NA return values - else: - msg = f"Invalid dtype_str {self._dtype_str}" - raise RuntimeError(msg) - return 
PandasExtensionArray(extension_array) + + msg = f"Invalid dtype_str {self._dtype_str}" + raise RuntimeError(msg) @cached_property - def dtype(self) -> np.dtypes.StringDType[NAType] | ExtensionDtype: + def dtype(self) -> np.dtypes.StringDType[NAType] | BaseMaskedDtype: if self._dtype_str == "nullable-integer": return pd.array( [], From 3253b88c610adfa03d51be7595be8e8856740e14 Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Thu, 8 Jan 2026 09:52:22 +0100 Subject: [PATCH 2/3] fix docs --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index e041a37db..ff2590145 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -174,6 +174,7 @@ def res( "numpy.dtypes.StringDType": ("py:attr", "numpy.dtypes.StringDType"), "pandas.DataFrame.iloc": ("py:attr", "pandas.DataFrame.iloc"), "pandas.DataFrame.loc": ("py:attr", "pandas.DataFrame.loc"), + "pandas.core.dtypes.dtypes.BaseMaskedDtype": "pandas.api.extensions.ExtensionDtype", # should be fixed soon: https://github.com/tox-dev/sphinx-autodoc-typehints/pull/516 "types.EllipsisType": ("py:data", "types.EllipsisType"), "pathlib._local.Path": "pathlib.Path", From 81a54350c65242143a41dc56602e358954a5103b Mon Sep 17 00:00:00 2001 From: "Philipp A." 
Date: Thu, 8 Jan 2026 10:51:51 +0100 Subject: [PATCH 3/3] allow missing mask in eager too --- docs/fileformat-prose.md | 23 ++++----- src/anndata/_io/specs/methods.py | 48 +++++++++---------- src/anndata/_io/utils.py | 15 ++++++ .../experimental/backed/_lazy_arrays.py | 8 ++-- 4 files changed, 54 insertions(+), 40 deletions(-) diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index c22fa467b..53422452e 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -543,31 +543,32 @@ nullable_integer/values ``` (nullable-integer)= -### Nullable integer specifications (v0.1.0) +### Nullable integer specifications (v0.1.1) * Nullable integers MUST be stored as a group -* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0"` +* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0" | "0.1.1"` * The group MUST contain an integer valued array under the key `"values"` -* The group MUST contain an boolean valued array under the key `"mask"` +* The group MAY contain a boolean valued array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`) +* The `"values"` and `"mask"` arrays MUST be the same shape if both exist (nullable-boolean)= -### Nullable boolean specifications (v0.1.0) +### Nullable boolean specifications (v0.1.1) * Nullable booleans MUST be stored as a group -* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0"` +* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0" | "0.1.1"` * The group MUST contain an boolean valued array under the key `"values"` -* The group MUST contain an boolean valued array under the key `"mask"` -* The `"values"` and `"mask"` arrays MUST be the same shape +* The group MAY contain a boolean valued 
array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`) +* The `"values"` and `"mask"` arrays MUST be the same shape if both exist (nullable-string-array)= -### Nullable string specifications (v0.1.0) +### Nullable string specifications (v0.1.1) * Nullable strings MUST be stored as a group -* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-string-array"`, `"encoding-version": "0.1.0"` +* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-string-array"`, `"encoding-version": "0.1.0" | "0.1.1"` * The group’s attributes MAY contain `"na-value"` as an indicator for missing value semantics with the possible value `"NA"` or `"NaN"` described in [](#missing-value-semantics), and the default being `"NA"` * The group MUST contain a string valued array under the key `"values"` -* The group MUST contain a boolean valued array under the key `"mask"` -* The `"values"` and `"mask"` arrays MUST be the same shape +* The group MAY contain a boolean valued array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`) +* The `"values"` and `"mask"` arrays MUST be the same shape if both exist (missing-value-semantics)= ### Missing value semantics diff --git a/src/anndata/_io/specs/methods.py b/src/anndata/_io/specs/methods.py index 5babeb118..43dc4a51f 100644 --- a/src/anndata/_io/specs/methods.py +++ b/src/anndata/_io/specs/methods.py @@ -20,7 +20,7 @@ from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_dataset -from anndata._io.utils import check_key, zero_dim_array_as_scalar +from anndata._io.utils import check_key, pandas_nullable_dtype, zero_dim_array_as_scalar from anndata._warnings import OldFormatWarning from anndata.compat import ( AwkArray, @@ -45,12 +45,11 @@ from .registry import _REGISTRY, IOSpec, read_elem, 
read_elem_partial if TYPE_CHECKING: - from collections.abc import Callable, Iterator + from collections.abc import Iterator from os import PathLike from typing import Any, Literal from numpy import typing as npt - from numpy.typing import NDArray from anndata._types import ArrayStorageType, GroupStorageType from anndata.compat import CSArray, CSMatrix @@ -1214,39 +1213,39 @@ def _read_nullable( elem: GroupStorageType, *, _reader: Reader, - # BaseMaskedArray - array_type: Callable[ - [NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray - ], + array_type: type[pd.arrays.IntegerArray | pd.arrays.BooleanArray], ) -> pd.api.extensions.ExtensionArray: - return array_type( - _reader.read_elem(elem["values"]), - mask=_reader.read_elem(elem["mask"]), - ) + values = _reader.read_elem(elem["values"]) + if "mask" in elem: + mask = _reader.read_elem(elem["mask"]) + return array_type(values, mask=mask) + return pd.array(values, dtype=pandas_nullable_dtype(array_type, values.dtype)) -_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))( - read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray) -) -_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))( - read_nullable_integer -) -_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))( - read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray) -) -_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))( - read_nullable_boolean +read_nullable = dict( + integer=partial(_read_nullable, array_type=pd.arrays.IntegerArray), + boolean=partial(_read_nullable, array_type=pd.arrays.BooleanArray), ) +for dtype, store_type, _version in product( + ("integer", "boolean"), (H5Group, ZarrGroup), ("0.1.0", "0.1.1") +): + _REGISTRY.register_read(store_type, IOSpec(f"nullable-{dtype}", _version))( + read_nullable[dtype] + ) + + @_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0")) 
+@_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.1")) @_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0")) +@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.1")) def _read_nullable_string( elem: GroupStorageType, *, _reader: Reader ) -> pd.api.extensions.ExtensionArray: values = _reader.read_elem(elem["values"]) - mask = _reader.read_elem(elem["mask"]) + mask = _reader.read_elem(elem["mask"]) if "mask" in elem else None dtype = ( pd.StringDtype( na_value=np.nan @@ -1261,7 +1260,8 @@ def _read_nullable_string( values.astype(np.dtypes.StringDType(na_object=dtype.na_value)), # TODO: why? dtype=dtype, ) - arr[mask] = pd.NA + if mask is not None: + arr[mask] = pd.NA return arr diff --git a/src/anndata/_io/utils.py b/src/anndata/_io/utils.py index 25bd5ee2e..db10c001b 100644 --- a/src/anndata/_io/utils.py +++ b/src/anndata/_io/utils.py @@ -5,6 +5,8 @@ from itertools import pairwise from typing import TYPE_CHECKING, Literal, cast +import numpy as np + from .._core.sparse_dataset import BaseCompressedSparseDataset from ..utils import warn @@ -12,6 +14,9 @@ from collections.abc import Callable, Mapping from typing import Any, Literal + from pandas.core.arrays.masked import BaseMaskedArray + from pandas.core.dtypes.dtypes import BaseMaskedDtype + from .._types import StorageType, _WriteInternal from ..compat import H5Group, ZarrGroup from ..typing import RWAble @@ -119,6 +124,16 @@ def check_key(key): raise TypeError(msg) +def pandas_nullable_dtype( + array_type: type[BaseMaskedArray], dtype: np.dtype +) -> BaseMaskedDtype: + """Infer nullable dtype from numpy dtype. + + There is no public pandas API for this, so this is the cleanest way. 
+ """ + return array_type(np.ones(1, dtype), np.ones(1, bool)).dtype + + # ------------------------------------------------------------------------------- # Generic functions # ------------------------------------------------------------------------------- diff --git a/src/anndata/experimental/backed/_lazy_arrays.py b/src/anndata/experimental/backed/_lazy_arrays.py index 831705665..2f042fb80 100644 --- a/src/anndata/experimental/backed/_lazy_arrays.py +++ b/src/anndata/experimental/backed/_lazy_arrays.py @@ -10,6 +10,7 @@ from anndata._core.views import as_view from anndata._io.specs.lazy_methods import get_chunksize +from ..._io.utils import pandas_nullable_dtype from ..._settings import settings from ...compat import ( H5Array, @@ -160,7 +161,7 @@ def __init__( mask: K | None, base_path_or_zarr_group: Path | ZarrGroup, elem_name: str, - ): + ) -> None: self._mask = None if mask is None else ZarrOrHDF5Wrapper(mask) self._values = ZarrOrHDF5Wrapper(values) self._dtype_str = dtype_str @@ -197,10 +198,7 @@ def __getitem__( @cached_property def dtype(self) -> np.dtypes.StringDType[NAType] | BaseMaskedDtype: if self._dtype_str == "nullable-integer": - return pd.array( - [], - dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(), - ).dtype + return pandas_nullable_dtype(pd.arrays.IntegerArray, self._values.dtype) elif self._dtype_str == "nullable-boolean": return pd.BooleanDtype() elif self._dtype_str == "nullable-string-array":