Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ def res(
"numpy.dtypes.StringDType": ("py:attr", "numpy.dtypes.StringDType"),
"pandas.DataFrame.iloc": ("py:attr", "pandas.DataFrame.iloc"),
"pandas.DataFrame.loc": ("py:attr", "pandas.DataFrame.loc"),
"pandas.core.dtypes.dtypes.BaseMaskedDtype": "pandas.api.extensions.ExtensionDtype",
# should be fixed soon: https://github.com/tox-dev/sphinx-autodoc-typehints/pull/516
"types.EllipsisType": ("py:data", "types.EllipsisType"),
"pathlib._local.Path": "pathlib.Path",
Expand Down
23 changes: 12 additions & 11 deletions docs/fileformat-prose.md
Original file line number Diff line number Diff line change
Expand Up @@ -543,31 +543,32 @@ nullable_integer/values <zarr.core.Array '/nullable_integer/values' (4,) int64>
```

(nullable-integer)=
### Nullable integer specifications (v0.1.0)
### Nullable integer specifications (v0.1.1)

* Nullable integers MUST be stored as a group
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0"`
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-integer"`, `"encoding-version": "0.1.0" | "0.1.1"`
* The group MUST contain an integer valued array under the key `"values"`
* The group MUST contain a boolean valued array under the key `"mask"`
* The group MAY contain a boolean valued array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`)
* The `"values"` and `"mask"` arrays MUST be the same shape if both exist

(nullable-boolean)=
### Nullable boolean specifications (v0.1.0)
### Nullable boolean specifications (v0.1.1)

* Nullable booleans MUST be stored as a group
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0"`
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-boolean"`, `"encoding-version": "0.1.0" | "0.1.1"`
* The group MUST contain a boolean valued array under the key `"values"`
* The group MUST contain a boolean valued array under the key `"mask"`
* The `"values"` and `"mask"` arrays MUST be the same shape
* The group MAY contain a boolean valued array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`)
* The `"values"` and `"mask"` arrays MUST be the same shape if both exist

(nullable-string-array)=
### Nullable string specifications (v0.1.0)
### Nullable string specifications (v0.1.1)

* Nullable strings MUST be stored as a group
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-string-array"`, `"encoding-version": "0.1.0"`
* The group’s attributes MUST contain the encoding metadata `"encoding-type": "nullable-string-array"`, `"encoding-version": "0.1.0" | "0.1.1"`
* The group’s attributes MAY contain `"na-value"` as an indicator for missing value semantics with the possible value `"NA"` or `"NaN"` described in [](#missing-value-semantics), and the default being `"NA"`
* The group MUST contain a string valued array under the key `"values"`
* The group MUST contain a boolean valued array under the key `"mask"`
* The `"values"` and `"mask"` arrays MUST be the same shape
* The group MAY contain a boolean valued array under the key `"mask"` (it MUST contain `"mask"` for `"encoding-version": "0.1.0"`)
* The `"values"` and `"mask"` arrays MUST be the same shape if both exist

(missing-value-semantics)=
### Missing value semantics
Expand Down
48 changes: 24 additions & 24 deletions src/anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from anndata._core.index import _normalize_indices
from anndata._core.merge import intersect_keys
from anndata._core.sparse_dataset import _CSCDataset, _CSRDataset, sparse_dataset
from anndata._io.utils import check_key, zero_dim_array_as_scalar
from anndata._io.utils import check_key, pandas_nullable_dtype, zero_dim_array_as_scalar
from anndata._warnings import OldFormatWarning
from anndata.compat import (
AwkArray,
Expand All @@ -45,12 +45,11 @@
from .registry import _REGISTRY, IOSpec, read_elem, read_elem_partial

if TYPE_CHECKING:
from collections.abc import Callable, Iterator
from collections.abc import Iterator
from os import PathLike
from typing import Any, Literal

from numpy import typing as npt
from numpy.typing import NDArray

from anndata._types import ArrayStorageType, GroupStorageType
from anndata.compat import CSArray, CSMatrix
Expand Down Expand Up @@ -1214,39 +1213,39 @@ def _read_nullable(
elem: GroupStorageType,
*,
_reader: Reader,
# BaseMaskedArray
array_type: Callable[
[NDArray[np.number], NDArray[np.bool_]], pd.api.extensions.ExtensionArray
],
array_type: type[pd.arrays.IntegerArray | pd.arrays.BooleanArray],
) -> pd.api.extensions.ExtensionArray:
return array_type(
_reader.read_elem(elem["values"]),
mask=_reader.read_elem(elem["mask"]),
)
values = _reader.read_elem(elem["values"])
if "mask" in elem:
mask = _reader.read_elem(elem["mask"])
return array_type(values, mask=mask)

return pd.array(values, dtype=pandas_nullable_dtype(array_type, values.dtype))

_REGISTRY.register_read(H5Group, IOSpec("nullable-integer", "0.1.0"))(
read_nullable_integer := partial(_read_nullable, array_type=pd.arrays.IntegerArray)
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-integer", "0.1.0"))(
read_nullable_integer
)

_REGISTRY.register_read(H5Group, IOSpec("nullable-boolean", "0.1.0"))(
read_nullable_boolean := partial(_read_nullable, array_type=pd.arrays.BooleanArray)
)
_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-boolean", "0.1.0"))(
read_nullable_boolean
read_nullable = dict(
integer=partial(_read_nullable, array_type=pd.arrays.IntegerArray),
boolean=partial(_read_nullable, array_type=pd.arrays.BooleanArray),
)


for dtype, store_type, _version in product(
("integer", "boolean"), (H5Group, ZarrGroup), ("0.1.0", "0.1.1")
):
_REGISTRY.register_read(store_type, IOSpec(f"nullable-{dtype}", _version))(
read_nullable[dtype]
)


@_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.0"))
@_REGISTRY.register_read(H5Group, IOSpec("nullable-string-array", "0.1.1"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.0"))
@_REGISTRY.register_read(ZarrGroup, IOSpec("nullable-string-array", "0.1.1"))
def _read_nullable_string(
elem: GroupStorageType, *, _reader: Reader
) -> pd.api.extensions.ExtensionArray:
values = _reader.read_elem(elem["values"])
mask = _reader.read_elem(elem["mask"])
mask = _reader.read_elem(elem["mask"]) if "mask" in elem else None
dtype = (
pd.StringDtype(
na_value=np.nan
Expand All @@ -1261,7 +1260,8 @@ def _read_nullable_string(
values.astype(np.dtypes.StringDType(na_object=dtype.na_value)), # TODO: why?
dtype=dtype,
)
arr[mask] = pd.NA
if mask is not None:
arr[mask] = pd.NA
return arr


Expand Down
15 changes: 15 additions & 0 deletions src/anndata/_io/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,18 @@
from itertools import pairwise
from typing import TYPE_CHECKING, Literal, cast

import numpy as np

from .._core.sparse_dataset import BaseCompressedSparseDataset
from ..utils import warn

if TYPE_CHECKING:
from collections.abc import Callable, Mapping
from typing import Any, Literal

from pandas.core.arrays.masked import BaseMaskedArray
from pandas.core.dtypes.dtypes import BaseMaskedDtype

from .._types import StorageType, _WriteInternal
from ..compat import H5Group, ZarrGroup
from ..typing import RWAble
Expand Down Expand Up @@ -119,6 +124,16 @@ def check_key(key):
raise TypeError(msg)


def pandas_nullable_dtype(
    array_type: type[BaseMaskedArray], dtype: np.dtype
) -> BaseMaskedDtype:
    """Return the nullable pandas dtype that ``array_type`` pairs with ``dtype``.

    A throwaway one-element masked array is built from values of the given
    numpy ``dtype`` and its ``.dtype`` attribute is returned.  The original
    note on this helper says pandas offers no public API for this mapping,
    which is why the dtype is read off a constructed array.

    Parameters
    ----------
    array_type
        Masked extension array class to instantiate.
    dtype
        Numpy dtype of the underlying values.

    Returns
    -------
    The extension dtype reported by the constructed array.
    """
    placeholder_values = np.ones(1, dtype)
    placeholder_mask = np.ones(1, bool)
    placeholder = array_type(placeholder_values, placeholder_mask)
    return placeholder.dtype


# -------------------------------------------------------------------------------
# Generic functions
# -------------------------------------------------------------------------------
Expand Down
58 changes: 31 additions & 27 deletions src/anndata/experimental/backed/_lazy_arrays.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from functools import cached_property
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, cast

import numpy as np
import pandas as pd
Expand All @@ -10,6 +10,7 @@
from anndata._core.views import as_view
from anndata._io.specs.lazy_methods import get_chunksize

from ..._io.utils import pandas_nullable_dtype
from ..._settings import settings
from ...compat import (
H5Array,
Expand All @@ -23,8 +24,9 @@
from pathlib import Path
from typing import Literal

from numpy.typing import NDArray
from pandas._libs.missing import NAType
from pandas.core.dtypes.base import ExtensionDtype
from pandas.core.dtypes.dtypes import BaseMaskedDtype

from anndata.compat import ZarrGroup

Expand Down Expand Up @@ -143,7 +145,7 @@ class MaskedArray[K: (H5Array, ZarrArray)](XBackendArray):
by :class:`xarray.backends.BackendArray`.
"""

_mask: ZarrOrHDF5Wrapper[K]
_mask: ZarrOrHDF5Wrapper[K] | None
_values: ZarrOrHDF5Wrapper[K]
_dtype_str: Literal["nullable-integer", "nullable-boolean", "nullable-string-array"]
shape: tuple[int, ...]
Expand All @@ -152,49 +154,51 @@ class MaskedArray[K: (H5Array, ZarrArray)](XBackendArray):

def __init__(
    self,
    values: K,
    dtype_str: Literal[
        "nullable-integer", "nullable-boolean", "nullable-string-array"
    ],
    mask: K | None,
    base_path_or_zarr_group: Path | ZarrGroup,
    elem_name: str,
) -> None:
    """Wrap the on-disk ``values`` (and optional ``mask``) of a nullable array.

    Parameters
    ----------
    values
        On-disk array holding the stored values.
    dtype_str
        Encoding type of the element; selects how items are materialised.
    mask
        On-disk boolean array of the same store type as ``values``, or
        ``None`` when the element was written without a mask.
    base_path_or_zarr_group
        Stored unchanged on the instance.
    elem_name
        Stored unchanged on the instance.
    """
    self._mask = None if mask is None else ZarrOrHDF5Wrapper(mask)
    self._values = ZarrOrHDF5Wrapper(values)
    self._dtype_str = dtype_str
    self.shape = self._values.shape
    self.base_path_or_zarr_group = base_path_or_zarr_group
    # `mask` may be None, so the backend is detected from `values`, which is
    # always present; checking `mask` would label mask-less zarr data as "h5".
    self.file_format = "zarr" if isinstance(values, ZarrArray) else "h5"
    self.elem_name = elem_name

def __getitem__(
    self, key: ExplicitIndexer
) -> PandasExtensionArray | NDArray[np.str_]:
    """Materialise the selection ``key`` as an in-memory nullable array.

    Parameters
    ----------
    key
        Indexer forwarded to the wrapped ``values`` (and ``mask``) arrays.

    Returns
    -------
    A ``PandasExtensionArray`` wrapping a pandas masked array for
    ``"nullable-integer"`` and ``"nullable-boolean"``; for
    ``"nullable-string-array"`` the values cast to ``self.dtype`` with
    masked positions set to ``pd.NA``.

    Raises
    ------
    RuntimeError
        If ``self._dtype_str`` is none of the three supported encodings.
    """
    values = self._values[key]
    mask = None if self._mask is None else self._mask[key]

    # numpy does not support nan ints
    if self._dtype_str in {"nullable-integer", "nullable-boolean"}:
        from xarray.core.extension_array import PandasExtensionArray

        if mask is None:
            # No stored mask means no entry is missing.  In pandas masked
            # arrays ``True`` marks a missing value, so the stand-in mask
            # must be all-False; all-True would turn every value into NA.
            mask = np.zeros(len(values), dtype=bool)
        cls = cast("BaseMaskedDtype", self.dtype).construct_array_type()
        return PandasExtensionArray(cls(values, mask=mask))

    if self._dtype_str == "nullable-string-array":
        # https://github.com/pydata/xarray/issues/10419
        values = values.astype(self.dtype)
        if mask is not None:
            values[mask] = pd.NA
        return values

    msg = f"Invalid dtype_str {self._dtype_str}"
    raise RuntimeError(msg)

@cached_property
def dtype(self) -> np.dtypes.StringDType[NAType] | ExtensionDtype:
def dtype(self) -> np.dtypes.StringDType[NAType] | BaseMaskedDtype:
if self._dtype_str == "nullable-integer":
return pd.array(
[],
dtype=str(pd.api.types.pandas_dtype(self._values.dtype)).capitalize(),
).dtype
return pandas_nullable_dtype(pd.arrays.IntegerArray, self._values.dtype)
elif self._dtype_str == "nullable-boolean":
return pd.BooleanDtype()
elif self._dtype_str == "nullable-string-array":
Expand Down
Loading