diff --git a/doc/changes/DM-55041.feature.md b/doc/changes/DM-55041.feature.md new file mode 100644 index 00000000..c232d459 --- /dev/null +++ b/doc/changes/DM-55041.feature.md @@ -0,0 +1 @@ +Added a new `lsst.images.zarr` archive backend that reads and writes Zarr v3 archives. The on-disk layout is xarray/CF-shaped at the root (`image`, `variance`, `mask` as siblings sharing `(y, x)` dimensions, CF `flag_masks`/`flag_meanings` on the mask) with OME-NGFF v0.5 multiscales metadata layered on top — the same bytes are visible to xarray, GDAL, and OME-Zarr tooling like `napari` and `ome-zarr-py`. Supports `Image`, `Mask`, `MaskedImage`, and `ColorImage`. Cloud-friendly defaults (256-pixel tile-aligned chunks, automatic v3 sharding tuned for ~16 MiB shards on S3/GCS, fsspec-backed remote stores) and subset reads that only fetch the chunks they need. Tunable via the `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES` environment variable. Install via the new `[zarr]` extra (`pip install lsst-images[zarr]`). diff --git a/doc/lsst.images/index.rst b/doc/lsst.images/index.rst index e819d3a0..37af78b7 100644 --- a/doc/lsst.images/index.rst +++ b/doc/lsst.images/index.rst @@ -32,4 +32,5 @@ API Reference fits.rst json.rst ndf.rst + zarr.rst tests.rst diff --git a/doc/lsst.images/zarr.rst b/doc/lsst.images/zarr.rst new file mode 100644 index 00000000..ee5c3f27 --- /dev/null +++ b/doc/lsst.images/zarr.rst @@ -0,0 +1,182 @@ +Zarr I/O +======== + +A Zarr v3 serialization backend whose on-disk layout is xarray/CF-shaped at the root (``image`` / ``variance`` / ``mask`` as siblings sharing ``(y, x)`` dimensions, CF ``flag_masks`` / ``flag_meanings`` on the mask) with OME-NGFF v0.5 multiscales metadata as a discoverability layer pointing at the same ``image`` array. +The same bytes are visible to ``xarray``, GDAL's Zarr driver, and OME-Zarr tooling like ``napari`` and ``ome-zarr-py``. 
+ +Default chunking is tile-aligned — 256 pixels per spatial axis for plain images, ``cell_shape`` for ``CellCoadd`` — and bulk pixel arrays are sharded with a ~16 MiB byte budget so a typical archive is a small handful of objects rather than thousands of chunk files. +Subset reads via ``slices=`` only fetch the chunks they need, including on remote stores accessed through ``lsst.resources.ResourcePath`` and ``fsspec``. + +This backend requires the optional ``zarr >= 3.0`` package. Install via the ``[zarr]`` extra:: + + pip install lsst-images[zarr] + +Standards alignment +------------------- + +The on-disk container is `Zarr v3 `_. +On top of that we layer four community standards so the same bytes are usable by tools that don't know anything about LSST: + +* `xarray / CF-conventions `_ — every array carries an ``_ARRAY_DIMENSIONS`` attribute and a v3 ``dimension_names`` metadata field. The mask carries CF ``flag_masks`` / ``flag_meanings`` / ``flag_descriptions`` so any CF-aware tool can interpret the bit assignments. +* `OME-NGFF v0.5 `_ — the root group carries a ``multiscales`` block whose only ``dataset.path`` points back at the same ``image`` array. This makes the same archive openable by OME-Zarr tooling without any byte duplication. +* `Geo-Zarr `_ shape compatibility — sibling arrays sharing ``(y, x)`` dimensions with CF flag attributes is the same convention ``rasterio`` and ``GDAL``'s Zarr driver expect for raster + mask layers. +* `LSST archive tree <#data-model>`_ — a Pydantic JSON document at ``/lsst_json`` carries the full LSST-specific metadata (WCS, PSF, detector, butler info, …) that the community standards have no place for. Same convention as the FITS backend's ``JSON`` HDU and the NDF backend's ``/MORE/LSST/JSON`` path. + +Data model +---------- + +Every archive contains the following pieces: + +``/lsst_json`` (1-D ``uint8``) + UTF-8 encoded JSON of the Pydantic archive tree (see `~lsst.images.serialization.ArchiveTree`). 
+ The round-trip authority — every array reference, projection, PSF, mask schema, butler provenance, etc. lives here. + SIP polynomials and other ``PolyMap``-based distortions round-trip byte-exact through the chain of `Mapping `_ models embedded in this JSON. + Stored as a single chunk because it is always read whole. + +Root attributes (``zarr.json`` ``attributes``) + Three namespaces: + + * ``lsst.*`` — backend-specific keys: ``archive_class``, ``json``, ``opaque_metadata_format``, ``cell_grid``, ``wcs_simplified_dropped``. + * ``ome.*`` — OME-NGFF v0.5 ``multiscales`` block (and ``omero/channels`` when a channel axis exists). + * top-level — CF / xarray attributes that aren't tied to a specific axis. + +``/lsst/opaque_metadata/fits/primary`` (2-D ``(N, 80) uint8``) + Present only when an object originated from a FITS read. + Holds the primary HDU's card stream verbatim — ``Header.tostring()`` reshaped one row per card. + ``COMMENT``, ``HISTORY``, ``HIERARCH``, and ``CONTINUE`` cards survive byte-for-byte. + +Per-array data + The ``image`` / ``variance`` / ``mask`` arrays at the root, plus any class-specific extras. + Mask is a 2-D unsigned integer sized to the smallest dtype that holds every plane (``uint8`` for ≤8 planes, ``uint16`` for 9–16, ``uint32`` for 17–32, ``uint64`` for 33–64; >64 raises) with CF ``flag_masks`` / ``flag_meanings`` / ``flag_descriptions``. + +Chunking and sharding +--------------------- + +Chunks + The default chunk shape per top-level array is ``min(256, dim)`` per axis for plain image arrays (``DEFAULT_CHUNK_AXIS_LIMIT`` in `lsst.images.zarr`). + For `~lsst.images.cells.CellCoadd`, ``image`` / ``variance`` / ``mask`` chunks are aligned to ``cell_shape`` so a single-cell read is one chunk per array; the 4-D ``psf`` array is chunked ``(1, 1, Py, Px)`` so a single-cell PSF read is also one chunk. + Sibling arrays (``variance`` / ``mask``) inherit the ``image`` array's chunk shape unless the caller passes an explicit override to `~lsst.images.zarr.write`.
+ +Sharding + Bulk pixel arrays (``image`` / ``variance`` / ``mask`` / ``CellCoadd``'s ``psf``) are sharded by default so a remote archive on S3 / GCS is a small number of objects rather than thousands of chunk files. + The shard shape is chosen by a byte-budget rule that grows axes whose chunk does not already cover the full extent until each shard is close to ``LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`` of uncompressed data; the default budget is 16 MiB. + Shard axes are always integer multiples of the corresponding chunk axes, capped at the array extent. + Tiny single-chunk arrays (``lsst_json``, ``wcs_ast``, the FITS opaque-metadata block, per-PSF parameter arrays whose chunks already cover the whole array) are left unsharded — sharding them would only add a layer of indirection. + Sharding can be disabled or overridden per-array by passing ``shards={"image": None, ...}`` to `~lsst.images.zarr.write`. + +Stores + The store implementation is selected from the URI shape: a path ending in ``.zarr.zip`` (or any ``.zip``) opens a ``ZipStore``, a remote URI (``s3://``, ``gs://``, ``http(s)://``) opens a ``FsspecStore`` via `lsst.resources.ResourcePath`, and anything else opens a ``LocalStore`` directory. + Two caveats worth knowing about: + + * Writing a ``ZipStore`` directly to a remote URI is not yet supported — write to a local ``.zarr.zip`` and upload, or write to a remote directory store. Reading a remote ``.zarr.zip`` works (the file is fetched to a local cache first via ``ResourcePath.as_local``, then opened). + * After a directory or fsspec write, consolidated metadata is emitted so a single read fetches the whole hierarchy's ``zarr.json`` contents — a significant latency win on remote stores. ``ZipStore`` does not support consolidation; zip writes succeed without consolidated metadata, and reads of zip archives walk the hierarchy normally. 
+ +Example layouts +--------------- + +`~lsst.images.VisitImage` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +The most common case — a single detector exposure with a projection, PSF, and detector geometry:: + + visit.zarr/ + ├── zarr.json ← root attrs (lsst.archive_class="VisitImage", + │ ome.multiscales, data_model, version, …) + ├── image/ ← (Y, X) float32, dim_names=["y", "x"] + ├── variance/ ← (Y, X) float64 + ├── mask/ ← (Y, X) packed wide-int with CF flag attrs + ├── lsst_json/ ← 1-D uint8, the LSST archive tree + ├── psf/ ← (PSF parameters as one or more arrays) + └── lsst/opaque_metadata/fits/primary/ ← (N, 80) uint8 (when read from a FITS file) + +The ``lsst_json`` tree carries the projection, PSF type, detector reference, observation summary stats, photometric scaling, aperture-correction map, and any background fields. +For the WCS specifically, the projection's ``pixel_to_sky`` mapping is decomposed into a chain of Frames and Mappings (including any ``PolyMap`` for SIP distortion); reading is byte-exact. + +`~lsst.images.cells.CellCoadd` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A coadd composed of a regular grid of cells, each with its own PSF:: + + coadd.zarr/ + ├── zarr.json ← lsst.archive_class="CellCoadd", + │ lsst.cell_grid={bbox, cell_shape}, + │ ome.multiscales pointing at /image + ├── image/ ← (Y, X) float32, chunks = cell_shape + ├── variance/ ← (Y, X) float64, chunks = cell_shape + ├── mask/ ← (Y, X) packed wide-int, chunks = cell_shape + ├── psf/ ← (Cy, Cx, Py, Px) float32, + │ chunks=(1, 1, Py, Px) — one chunk per cell + ├── lsst_json/ + └── lsst/opaque_metadata/fits/primary/ + +The ``image`` / ``variance`` / ``mask`` chunks are aligned to the cell grid so reading a single cell is one chunk per array. +The ``psf`` array's chunking is per-cell so a single-cell PSF read is also one chunk. 
+ +`~lsst.images.ColorImage` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +A 3-channel display image:: + + color.zarr/ + ├── zarr.json ← lsst.archive_class="ColorImage" + │ (no root-level ome.multiscales) + ├── red/ ← (Y, X) uint8, dim_names=["y", "x"] + ├── green/ ← (Y, X) uint8 + ├── blue/ ← (Y, X) uint8 + └── lsst_json/ + +Channels are flat top-level arrays rather than a stacked ``(3, Y, X)`` array, so xarray sees them as three independent 2-D variables and there is no byte duplication for the OME view. + +WCS handling +------------ + +The full WCS — including SIP polynomials and any other ``PolyMap``-based distortion — round-trips through the JSON tree at ``lsst_json`` as a chain of `~lsst.images.FrameSet` / Mapping models. +The layout layer also asks AST's `linearapprox `_ for an affine approximation over the image footprint at one-pixel accuracy. +If AST returns one, the OME ``coordinateTransformations`` block on the root multiscale is populated with the resulting ``[scale, affine]`` pair. +If AST cannot fit a linear approximation within tolerance, the block is dropped and ``lsst.wcs_simplified_dropped: true`` is set on the root attrs. +The OME block is always informational — readers reconstruct the projection from the JSON tree, never from the OME block. + +Tooling that can read these files +--------------------------------- + +The standards-aligned root layout means tools that don't know about LSST can still open the file in some useful capacity: + +`xarray `_ + ``xr.open_zarr(path)`` returns a ``Dataset`` with one ``DataArray`` per zarr array sharing ``(y, x)`` dimensions, CF flag attributes on the mask variable, and any per-array ``units`` / ``long_name``. + The Pydantic JSON tree at ``/lsst_json`` shows up as a 1-D ``uint8`` variable; xarray ignores it for analysis, you decode it manually if you need the LSST metadata. + +`napari-ome-zarr `_ and `ome-zarr-py `_ + Browse and visualize the science image through the OME-NGFF multiscales block. 
+ Sees the ``image`` array as the only level of a single multiscale; ignores everything else. + +`GDAL `_'s Zarr driver and `rasterio `_ + Opens individual top-level arrays as raster bands. + Reads CF attributes including the mask's ``flag_masks`` / ``flag_meanings``. + +`zarr-python `_ + Direct array access at any path, including from S3 / GCS / HTTP via fsspec. + Subset reads via ``arr[y0:y1, x0:x1]`` only fetch chunks intersecting the slice. + +`napari `_ via the OME-Zarr plugin + Same OME view as ``napari-ome-zarr``. + +`neuroglancer `_ + Native OME-NGFF support; will display the science image with the affine ``coordinateTransformations`` block when present. + +`ngff-validator `_ + Validates the OME-NGFF v0.5 metadata block against the schema. + +Round-trip with FITS +-------------------- + +When an object that originated from a FITS read carries a `~lsst.images.fits.FitsOpaqueMetadata`, the primary-HDU header is preserved at ``/lsst/opaque_metadata/fits/primary`` as a 2-D ``(N, 80)`` byte array. +Reading the zarr back attaches an equivalent ``FitsOpaqueMetadata`` to the deserialized object so a subsequent FITS write reproduces the original cards. +This means an ``LSSTCam`` raw read in via FITS, written to zarr, read back, and written again to FITS will round-trip the full primary header — including ``COMMENT``, ``HISTORY``, ``HIERARCH``, and ``CONTINUE`` cards — byte-for-byte. + +API reference +------------- + +.. 
automodapi:: lsst.images.zarr + :no-inheritance-diagram: + :include-all-objects: + :inherited-members: diff --git a/docs/superpowers/plans/2026-05-22-zarr-io-backend.md b/docs/superpowers/plans/2026-05-22-zarr-io-backend.md new file mode 100644 index 00000000..b482f2e2 --- /dev/null +++ b/docs/superpowers/plans/2026-05-22-zarr-io-backend.md @@ -0,0 +1,5353 @@ +# Zarr I/O Backend Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a `lsst.images.zarr` subpackage that reads and writes Zarr v3 archives following the revised design at `docs/superpowers/specs/2026-05-22-zarr-io-design.md` — xarray/CF-shaped at the root with OME-NGFF v0.5 metadata as a discoverability layer on top, supporting every image type the FITS/JSON/NDF backends support, with cloud-friendly chunking and lazy subset reads that only fetch the chunks they touch. + +**Architecture:** Mirrors the NDF backend. A Python intermediate representation (`ZarrDocument`/`ZarrGroup`/`ZarrArray`) holds the on-disk layout independently of `zarr-python`. The IR holds **lazy `zarr.Array` handles, never materialized `numpy` arrays** — so `from_zarr()` opens groups without reading bytes, and `InputArchive.get_array(model, slices=...)` passes slices through to the lazy handle. Writes use a two-pass model: `obj.serialize(archive)` populates the IR, then `__exit__` materializes it via the configured `zarr.storage.Store`. Stores are selected from a `ResourcePath` URI: `*.zarr.zip` → `ZipStore`, remote URIs → `FsspecStore`, otherwise `LocalStore`. **No stacking, no JSON-pointer rewrites, no compound source URLs** — each `add_array(name)` call lands at the zarr path equal to `name`. 
+ +**Tech Stack:** `zarr >= 3.0`, `numcodecs` (already pulled by zarr), `fsspec` (already a dependency), `lsst.resources.ResourcePath` (already a dependency), `pydantic >= 2.12`, `numpy >= 2.0`. Reuses `lsst.images.serialization` ABCs and tree models. Optional install via `pip install lsst-images[zarr]`. + +**Critical invariants** — these are pinned by tests in this plan: + +1. **Lazy reads everywhere.** `ZarrArray.data` is one of `np.ndarray` (staged for write) or `zarr.Array` (read-side handle). `from_zarr` never reads chunk bytes. `InputArchive.get_array(model, slices=...)` forwards `slices` straight to the lazy handle. Pinned by `_CountingStore` regression test in Task 3.2. +2. **Aligned chunks across siblings.** `image`, `variance`, and `mask` share spatial chunk shape. The output archive derives `variance`/`mask` chunks from `image`'s chunk shape when not explicitly overridden. Pinned by Task 2.5. +3. **Affine residual validator.** Before emitting an OME `coordinateTransformations` block, the layout layer samples residuals on an 11×11 grid; if max pixel-equivalent residual exceeds 1.0 pixel, the block is dropped and `lsst.wcs_simplified_dropped: true` is set. The AST string at `wcs_ast` is always authoritative. Pinned by Task 2.4. +4. **No byte duplication.** ColorImage channels are recursive sub-archives, not stacked. CellCoadd PSF is whatever shape `serialize` natively emits — typically 4-D `(Cy, Cx, Py, Px)`. There is no fixup pass that copies or re-shapes data. 
+ +--- + +## File Structure + +``` +python/lsst/images/zarr/ +├── __init__.py guarded `import zarr`; re-exports public API +├── _common.py ZarrPointerModel, namespace constants +│ (LSST_NS / OME_NS / LSST_VERSION / OME_VERSION), +│ ZarrCompressionOptions, mask-dtype-for-plane-count, +│ path helpers (no JSON-pointer mapping table — +│ every name maps to its literal path now) +├── _model.py IR: ZarrAttributes, ZarrArray (lazy-handle backed), +│ ZarrGroup, ZarrDocument, OME/CF helpers +│ (OmeMultiscale, OmeOmeroChannel, +│ CfFlagAttributes, build_image_array_attrs) +├── _layout.py Layout rules: axes per archive class, +│ chunk derivation (incl. cell-aligned for CellCoadd +│ and aligned-with-image for variance/mask), +│ affine extraction + residual validator, +│ OME multiscale block construction, +│ CF flag-attrs construction from MaskSchema +├── _store.py URI → zarr.storage.Store wrapper: +│ *.zarr.zip → ZipStore, http(s)/s3/gs → FsspecStore, +│ local → LocalStore. Honors create-only mode. 
+├── _output_archive.py ZarrOutputArchive (populates IR) and write() helper +├── _input_archive.py ZarrInputArchive (reads IR lazily) and read() helper + +tests/ +├── test_zarr_common.py constants, helpers, ZarrCompressionOptions, +│ mask-dtype-for-plane-count +├── test_zarr_model.py IR round-trip via in-memory MemoryStore, +│ lazy invariant on from_zarr +├── test_zarr_layout.py axes per archive class, chunk derivation, +│ CF flag-attrs construction, +│ affine residual validator behaviour +├── test_zarr_store.py URI dispatch, create-only refusal +├── test_zarr_output_archive.py write paths inspected against IR for +│ every supported archive class +├── test_zarr_input_archive.py read paths + lazy subset assertion +│ (_CountingStore), error taxonomy, +│ opaque-metadata round-trip +├── test_zarr_round_trip.py full write→read for every type +├── test_zarr_cross_format.py FITS↔Zarr opaque-metadata round-trip +├── test_zarr_xarray_interop.py xr.open_zarr returns Dataset with +│ image/variance/mask data variables +├── test_zarr_ome_compliance.py ngff-validator (skipped if absent) +└── test_zarr_external_reader.py ome-zarr-py sanity (skipped if absent) +``` + +The split mirrors the NDF backend exactly: `_model.py` is pure data; `_output_archive.py` and `_input_archive.py` only translate between the IR and the abstract archive interface; `_layout.py` holds every per-archive-class decision so individual `add_array` calls stay generic. + +--- + +## Phase 1 — Skeleton, `_common.py`, and IR (no I/O yet) + +This phase produces the IR and constants in isolation. The IR round-trips through an in-memory zarr `MemoryStore` so the shape of what later phases will produce is pinned before any archive code is written. 
+ +### Task 1.1: Create the package skeleton + +**Files:** +- Create: `python/lsst/images/zarr/__init__.py` +- Modify: `pyproject.toml` (add `zarr` extra after the existing `ndf` extra at line 55) + +- [ ] **Step 1: Add the optional dependency** + +In `pyproject.toml`, immediately after the `ndf` extra (around line 55), add: + +```toml +# Add feature for Zarr v3 read/write support. +zarr = ["zarr >= 3.0"] +``` + +- [ ] **Step 2: Create the package `__init__.py` with a guarded import** + +Create `python/lsst/images/zarr/__init__.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +"""Zarr v3 archive backend for `lsst.images`. + +Files written by this archive are xarray/CF-shaped at the root +(``image`` / ``variance`` / ``mask`` as siblings sharing ``(y, x)`` +dimensions, CF ``flag_masks`` / ``flag_meanings`` on the mask) with +OME-NGFF v0.5 multiscales metadata as a discoverability layer +pointing at the same ``image`` array. The same bytes are visible to +``xarray``, GDAL's Zarr driver, and OME-Zarr tooling like ``napari`` +and ``ome-zarr-py``. + +Default chunk geometry is tile-aligned (up to 256×256 for plain images, +``cell_shape`` for ``CellCoadd``). Sharding (zarr v3 native) is +enabled by default with a tunable shard size to keep object counts +manageable on S3/GCS. Both directory (``LocalStore``) and ``ZipStore`` +archives are supported; the choice is driven by URI shape (``*.zarr.zip`` → +``ZipStore``, otherwise directory). Remote URIs go through +`lsst.resources.ResourcePath` and `fsspec`.
+""" + +try: + import zarr # noqa: F401 +except ImportError as e: + raise ImportError( + "lsst.images.zarr requires the optional 'zarr' package (>=3.0). " + "Install it directly or via 'pip install lsst-images[zarr]'." + ) from e + +# Phase 1 has no public archive API yet. Re-exports are added in later phases. +``` + +- [ ] **Step 3: Verify the guarded import works** + +Run: `python -c "import lsst.images.zarr"` +Expected: no output (success), or a clear ImportError pointing at the `[zarr]` extra if `zarr` is not installed. + +- [ ] **Step 4: Commit** + +```bash +git add python/lsst/images/zarr/__init__.py pyproject.toml +git commit -m "feat: add lsst.images.zarr package skeleton with guarded import" +``` + +### Task 1.2: `_common.py` — namespaces, `ZarrPointerModel`, `ZarrCompressionOptions`, mask-dtype helper + +**Files:** +- Create: `python/lsst/images/zarr/_common.py` +- Test: `tests/test_zarr_common.py` + +`_common.py` carries: + +- Namespace constants `LSST_NS = "lsst"`, `OME_NS = "ome"`, version integers `LSST_VERSION = 1`, `OME_VERSION = "0.5"`. +- `ZarrPointerModel` — Pydantic model holding an absolute zarr path. +- `ZarrCompressionOptions` — dataclass with `codec`, `cname`, `clevel`, `shuffle`. Provides `default_for_dtype(dtype)` returning byte-shuffle for floats, bit-shuffle for ints/masks. +- `mask_dtype_for_plane_count(n)` — picks the smallest unsigned integer that holds `n` planes; raises if `n > 64`. +- `archive_path_to_zarr_path(archive_path)` — translates an empty archive path to `/tree`; non-empty paths are kept verbatim under their natural path. **There is no JSON-pointer mapping table.** `name="image"` lands at `/image`; `name="mask"` at `/mask`; nested `name="red/image"` at `/red/image`. + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_zarr_common.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. 
+# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import unittest + +import numpy as np + +try: + from lsst.images.zarr._common import ( + LSST_NS, + LSST_VERSION, + OME_NS, + OME_VERSION, + ZarrCompressionOptions, + ZarrPointerModel, + archive_path_to_zarr_path, + mask_dtype_for_plane_count, + ) + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class CommonTestCase(unittest.TestCase): + def test_pointer_round_trips(self) -> None: + original = ZarrPointerModel(path="/lsst/psf/tree") + recovered = ZarrPointerModel.model_validate_json(original.model_dump_json()) + self.assertEqual(recovered, original) + + def test_constants(self) -> None: + self.assertEqual(LSST_NS, "lsst") + self.assertEqual(OME_NS, "ome") + self.assertEqual(OME_VERSION, "0.5") + self.assertGreaterEqual(LSST_VERSION, 1) + + def test_archive_path_translation(self) -> None: + # Empty archive path -> the canonical root-level JSON tree. + self.assertEqual(archive_path_to_zarr_path(""), "/tree") + # Non-empty archive paths are kept verbatim. 
+ self.assertEqual(archive_path_to_zarr_path("/image"), "/image") + self.assertEqual(archive_path_to_zarr_path("image"), "/image") + self.assertEqual(archive_path_to_zarr_path("/red/image"), "/red/image") + self.assertEqual(archive_path_to_zarr_path("/psf"), "/psf") + + def test_compression_defaults(self) -> None: + floats = ZarrCompressionOptions.default_for_dtype("float32") + self.assertEqual(floats.codec, "blosc") + self.assertEqual(floats.shuffle, "shuffle") + ints = ZarrCompressionOptions.default_for_dtype("uint8") + self.assertEqual(ints.shuffle, "bitshuffle") + + def test_mask_dtype_picks_smallest_fit(self) -> None: + self.assertEqual(mask_dtype_for_plane_count(1), np.dtype("uint8")) + self.assertEqual(mask_dtype_for_plane_count(8), np.dtype("uint8")) + self.assertEqual(mask_dtype_for_plane_count(9), np.dtype("uint16")) + self.assertEqual(mask_dtype_for_plane_count(16), np.dtype("uint16")) + self.assertEqual(mask_dtype_for_plane_count(17), np.dtype("uint32")) + self.assertEqual(mask_dtype_for_plane_count(32), np.dtype("uint32")) + self.assertEqual(mask_dtype_for_plane_count(33), np.dtype("uint64")) + self.assertEqual(mask_dtype_for_plane_count(64), np.dtype("uint64")) + + def test_mask_dtype_refuses_more_than_64_planes(self) -> None: + with self.assertRaisesRegex(ValueError, "supports up to 64"): + mask_dtype_for_plane_count(65) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `pytest tests/test_zarr_common.py -v` +Expected: FAIL — `ImportError` on `lsst.images.zarr._common`. + +- [ ] **Step 3: Write `_common.py`** + +Create `python/lsst/images/zarr/_common.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. 
+# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ( + "LSST_NS", + "LSST_VERSION", + "OME_NS", + "OME_VERSION", + "ZarrCompressionOptions", + "ZarrPointerModel", + "archive_path_to_zarr_path", + "mask_dtype_for_plane_count", +) + +from dataclasses import dataclass +from typing import ClassVar, Self + +import numpy as np +import pydantic + +LSST_NS = "lsst" +"""Top-level zarr-attributes namespace key for LSST extensions.""" + +OME_NS = "ome" +"""Top-level zarr-attributes namespace key for OME-NGFF metadata.""" + +OME_VERSION = "0.5" +"""OME-Zarr / NGFF version this backend writes.""" + +LSST_VERSION = 1 +"""Schema version of the ``lsst:`` extension this backend writes. + +Readers refuse versions newer than they understand. Bump on +backwards-incompatible changes to the on-disk layout. +""" + + +class ZarrPointerModel(pydantic.BaseModel): + """Reference to a zarr archive sub-tree by absolute zarr path. + + Used by `ZarrOutputArchive` / `ZarrInputArchive` to point to + sub-trees that have been hoisted out of the main JSON tree into + separate zarr arrays. The path is interpreted relative to the + archive root, e.g. ``"/lsst/psf/tree"``. + """ + + path: str + """Absolute zarr path (e.g. ``/lsst/psf/tree``).""" + + +@dataclass(frozen=True) +class ZarrCompressionOptions: + """Per-array zarr v3 codec configuration. + + The default codec stack is ``bytes -> blosc(zstd, clevel=5)`` with + byte-shuffle for floats and bit-shuffle for integers (and masks). + All defaults are overridable per-array via the ``compression`` + keyword to ``write()``. 
+ """ + + codec: str = "blosc" + cname: str = "zstd" + clevel: int = 5 + shuffle: str = "shuffle" # 'shuffle' (byte) or 'bitshuffle' or 'noshuffle' + + DEFAULT_FLOAT: ClassVar[Self] + DEFAULT_INT: ClassVar[Self] + + @classmethod + def default_for_dtype(cls, dtype: str | np.dtype) -> Self: + """Return the default codec stack for a numpy dtype.""" + kind = np.dtype(dtype).kind + # 'u' (unsigned int), 'i' (signed int), 'b' (bool) -> bit-shuffle. + if kind in ("u", "i", "b"): + return cls.DEFAULT_INT + return cls.DEFAULT_FLOAT + + +ZarrCompressionOptions.DEFAULT_FLOAT = ZarrCompressionOptions(shuffle="shuffle") +ZarrCompressionOptions.DEFAULT_INT = ZarrCompressionOptions(shuffle="bitshuffle") + + +def archive_path_to_zarr_path(archive_path: str) -> str: + """Translate a serialization archive path to its zarr path. + + The empty archive path maps to the root-level JSON tree at + ``/tree``. Non-empty archive paths are kept verbatim (with a + leading slash). The v1 design's JSON-pointer mapping table is + intentionally absent: arrays land where their archive name says + they do. + """ + if not archive_path: + return "/tree" + stripped = archive_path.strip("/") + return f"/{stripped}" + + +def mask_dtype_for_plane_count(n_planes: int) -> np.dtype: + """Pick the smallest unsigned-integer dtype that holds ``n_planes`` bits. + + Returns ``uint8`` for ≤8 planes, ``uint16`` for ≤16, ``uint32`` + for ≤32, ``uint64`` for ≤64. Raises `ValueError` for >64 planes; + a 3-D fallback for that case is tracked as a follow-up. + """ + if n_planes <= 0: + raise ValueError(f"n_planes must be positive, got {n_planes}.") + if n_planes <= 8: + return np.dtype("uint8") + if n_planes <= 16: + return np.dtype("uint16") + if n_planes <= 32: + return np.dtype("uint32") + if n_planes <= 64: + return np.dtype("uint64") + raise ValueError( + f"Mask has {n_planes} planes; v1 supports up to 64. " + f"3-D fallback is a follow-up." 
+ ) +``` + +- [ ] **Step 4: Run the test to verify it passes** + +Run: `pytest tests/test_zarr_common.py -v` +Expected: PASS — 6 tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_common.py tests/test_zarr_common.py +git commit -m "feat: add ZarrPointerModel, ZarrCompressionOptions, mask-dtype helper" +``` + +### Task 1.3: IR — `ZarrAttributes` and `ZarrArray` with lazy backing + +**Files:** +- Create: `python/lsst/images/zarr/_model.py` +- Test: `tests/test_zarr_model.py` + +This task introduces the IR types whose **lazy-array invariant** is the heart of the efficient subsetting story. `ZarrArray.data` is one of: + +- `numpy.ndarray` — staged for write +- `zarr.Array` — read from a store, **never sliced eagerly** + +A read of a remote VisitImage opens its `zarr.Array` handle through `from_zarr`. Subsequent slicing (in `InputArchive.get_array(model, slices=...)`) goes straight to that handle, so only the chunks intersecting the slice are downloaded. + +`ZarrAttributes` separates the `lsst:` and `ome:` namespaces (each gets its `version` field stamped automatically on `dump`) and preserves unknown keys for forward compatibility. Plain CF / xarray attributes like `_ARRAY_DIMENSIONS`, `flag_masks`, `flag_meanings`, `units` live in a third namespace called `extra` that round-trips verbatim — they're written at the top level of `zarr.json` `attributes` (no `lsst:` or `ome:` wrapper). + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_zarr_model.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import unittest + +import numpy as np + +try: + import zarr + + from lsst.images.zarr._common import LSST_NS, LSST_VERSION, OME_NS, OME_VERSION + from lsst.images.zarr._model import ZarrArray, ZarrAttributes + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrAttributesTestCase(unittest.TestCase): + def test_dump_separates_namespaces(self) -> None: + attrs = ZarrAttributes() + attrs.lsst["archive_class"] = "MaskedImage" + attrs.ome["multiscales"] = [{"name": "image"}] + attrs.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + attrs.extra["units"] = "adu" + dumped = attrs.dump() + self.assertEqual(dumped[LSST_NS]["archive_class"], "MaskedImage") + self.assertEqual(dumped[LSST_NS]["version"], LSST_VERSION) + self.assertEqual(dumped[OME_NS]["multiscales"], [{"name": "image"}]) + self.assertEqual(dumped[OME_NS]["version"], OME_VERSION) + # CF / xarray attrs sit at the top level, not inside lsst: or ome:. + self.assertEqual(dumped["_ARRAY_DIMENSIONS"], ["y", "x"]) + self.assertEqual(dumped["units"], "adu") + + def test_load_preserves_unknown_keys(self) -> None: + # Forward compatibility: unknown lsst.* keys must survive a + # load -> dump round-trip. 
+ raw = { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Image", + "future_thing": {"x": 1}, + }, + OME_NS: {"version": OME_VERSION, "multiscales": []}, + "_ARRAY_DIMENSIONS": ["y", "x"], + "units": "adu", + } + attrs = ZarrAttributes.load(raw) + dumped = attrs.dump() + self.assertEqual(dumped[LSST_NS]["future_thing"], {"x": 1}) + self.assertEqual(dumped["units"], "adu") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrArrayTestCase(unittest.TestCase): + def test_lazy_data_after_from_zarr(self) -> None: + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + zarr_array = root.create_array( + name="image", shape=(8, 8), chunks=(4, 4), dtype="float32" + ) + zarr_array[:] = np.arange(64, dtype=np.float32).reshape(8, 8) + + ir_array = ZarrArray.from_zarr(zarr_array) + # Lazy invariant: data is the zarr.Array handle, not numpy. + self.assertIsInstance(ir_array.data, zarr.Array) + self.assertNotIsInstance(ir_array.data, np.ndarray) + self.assertEqual(ir_array.shape, (8, 8)) + self.assertEqual(str(ir_array.dtype), "float32") + + def test_subset_does_not_materialize_full_array(self) -> None: + store = _CountingStore() + root = zarr.create_group(store=store, zarr_format=3) + zarr_array = root.create_array( + name="image", shape=(16, 16), chunks=(4, 4), dtype="int32" + ) + zarr_array[:] = np.arange(256, dtype=np.int32).reshape(16, 16) + store.reads = 0 # reset after the write phase + + ir_array = ZarrArray.from_zarr(zarr_array) + # Reading shape / dtype must not fetch any chunk data. + self.assertEqual(ir_array.shape, (16, 16)) + self.assertEqual(store.reads, 0) + + subset = ir_array.read(slices=(slice(0, 4), slice(0, 4))) + self.assertEqual(subset.shape, (4, 4)) + np.testing.assert_array_equal(subset, np.arange(256).reshape(16, 16)[:4, :4]) + # 16 chunks total in the array; we should have touched far fewer. 
+ self.assertLess(store.reads, 16) + + def test_staged_numpy_array_is_eager(self) -> None: + data = np.arange(12, dtype=np.float64).reshape(3, 4) + ir_array = ZarrArray(data=data) + self.assertIs(ir_array.data, data) + self.assertEqual(ir_array.shape, (3, 4)) + + +class _CountingStore(zarr.storage.MemoryStore if HAVE_ZARR else object): + """A MemoryStore that counts get() calls.""" + + def __init__(self) -> None: + super().__init__() + self.reads = 0 + + async def get(self, key, prototype, byte_range=None): # type: ignore[override] + self.reads += 1 + return await super().get(key, prototype, byte_range) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `pytest tests/test_zarr_model.py -v` +Expected: FAIL — `ImportError` on `lsst.images.zarr._model`. + +- [ ] **Step 3: Write `_model.py` (initial portion: `ZarrAttributes` and `ZarrArray`)** + +Create `python/lsst/images/zarr/_model.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +"""Python intermediate representation for zarr / xarray-CF / OME-NGFF content. + +The IR is the source of truth for what gets written. ``ZarrOutputArchive`` +populates a `ZarrDocument`; on context-manager exit, `to_zarr` materializes +it through a configured ``zarr.storage.Store``. + +Reads invert that flow: ``ZarrInputArchive`` opens the store and calls +`ZarrDocument.from_zarr`, which builds the IR around **lazy** ``zarr.Array`` +handles. No array bytes are read until a caller asks for them via +`ZarrArray.read`, which forwards slices straight to the underlying handle. 
+This keeps subset reads of remote files cheap: only the chunks intersecting +the requested slice are fetched. +""" + +from __future__ import annotations + +__all__ = ( + "ZarrArray", + "ZarrAttributes", +) + +from dataclasses import dataclass, field +from types import EllipsisType +from typing import Any, Self + +import numpy as np +import zarr + +from ._common import LSST_NS, LSST_VERSION, OME_NS, OME_VERSION, ZarrCompressionOptions + + +@dataclass +class ZarrAttributes: + """Namespaced attributes attached to a `ZarrGroup` or `ZarrArray`. + + Three namespaces: + + - ``lsst`` — LSST extensions (always emitted with a ``version`` key). + - ``ome`` — OME-NGFF (emitted only when non-empty). + - ``extra`` — flat top-level keys for CF / xarray conventions + (``_ARRAY_DIMENSIONS``, ``flag_masks``, ``flag_meanings``, + ``flag_descriptions``, ``units``, ``long_name``, …). These live at + the top of ``zarr.json`` ``attributes`` so xarray and CF tooling + see them without unwrapping a namespace. + """ + + lsst: dict[str, Any] = field(default_factory=dict) + ome: dict[str, Any] = field(default_factory=dict) + extra: dict[str, Any] = field(default_factory=dict) + + def dump(self) -> dict[str, Any]: + """Return the raw mapping zarr-python writes to ``zarr.json``.""" + out: dict[str, Any] = dict(self.extra) + # lsst is always present so readers can dispatch on lsst.archive_class. 
+ out[LSST_NS] = {"version": LSST_VERSION, **self.lsst} + if self.ome: + out[OME_NS] = {"version": OME_VERSION, **self.ome} + return out + + @classmethod + def load(cls, raw: dict[str, Any]) -> Self: + """Construct from a raw attributes mapping read from zarr.""" + lsst = dict(raw.get(LSST_NS, {})) + lsst.pop("version", None) # version implicit in the namespace + ome = dict(raw.get(OME_NS, {})) + ome.pop("version", None) + extra = {k: v for k, v in raw.items() if k not in (LSST_NS, OME_NS)} + return cls(lsst=lsst, ome=ome, extra=extra) + + +@dataclass +class ZarrArray: + """An IR node holding either staged numpy data or a lazy zarr handle. + + Parameters + ---------- + data + Either a ``numpy.ndarray`` (when staged for write by the output + archive) or a ``zarr.Array`` (when read by the input archive). + The two forms never mix in a single instance. + chunks + Per-axis chunk shape. ``None`` lets `to_zarr` derive a default + from the array shape (~1024 per axis for plain images). + shards + Per-axis shard shape (zarr v3 native). ``None`` lets `to_zarr` + derive a default of 4× the chunk shape per axis when the + resulting shard exceeds 1 MiB. + compression + Codec configuration. ``None`` falls back to + `ZarrCompressionOptions.default_for_dtype`. + attributes + Namespaced attributes for this array's ``zarr.json``. + """ + + data: np.ndarray | zarr.Array + chunks: tuple[int, ...] | None = None + shards: tuple[int, ...] 
| None = None + compression: ZarrCompressionOptions | None = None + attributes: ZarrAttributes = field(default_factory=ZarrAttributes) + + @property + def shape(self) -> tuple[int, ...]: + return tuple(self.data.shape) + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.data.dtype) + + @classmethod + def from_zarr(cls, zarr_array: zarr.Array) -> Self: + """Wrap an open ``zarr.Array`` without reading its data.""" + attrs = ZarrAttributes.load(dict(zarr_array.attrs)) + return cls( + data=zarr_array, + chunks=tuple(zarr_array.chunks), + attributes=attrs, + ) + + def read(self, *, slices: tuple[slice, ...] | EllipsisType = ...) -> np.ndarray: + """Materialize this array (or a slice of it) into numpy. + + For a `ZarrArray` backed by a lazy handle, this is the only + place that touches array bytes. ``slices`` is forwarded straight + to the handle so only chunks intersecting the slice are fetched. + """ + if isinstance(self.data, np.ndarray): + return self.data if slices is ... else self.data[slices] + return self.data[...] if slices is ... else self.data[slices] +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `pytest tests/test_zarr_model.py -v` +Expected: PASS — 5 tests pass; the `_CountingStore` test confirms a 4×4 subset of a 16×16 / chunks=(4,4) array touches strictly fewer than 16 chunk reads. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_model.py tests/test_zarr_model.py +git commit -m "feat: add ZarrAttributes and ZarrArray IR with lazy zarr.Array backing" +``` + +### Task 1.4: IR — `ZarrGroup`, `ZarrDocument`, store materialization + +**Files:** +- Modify: `python/lsst/images/zarr/_model.py` (append `ZarrGroup`, `ZarrDocument`, helpers) +- Modify: `tests/test_zarr_model.py` (add round-trip test through `MemoryStore`) + +This task gives the IR a full tree shape and the bidirectional `to_zarr` / `from_zarr` materialization. 
The round-trip test pins the lazy invariant: after `from_zarr` on a freshly-opened store, every `ZarrArray.data` is a `zarr.Array`, not a materialized ndarray. + +- [ ] **Step 1: Write the failing test (extend `test_zarr_model.py`)** + +Append before the `if __name__` guard: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrDocumentTestCase(unittest.TestCase): + def test_round_trip_through_memory_store(self) -> None: + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + # Build a flat IR: image, variance, mask siblings at root. + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "MaskedImage" + doc.root.attributes.lsst["tree"] = "tree" + + image = ZarrArray(data=np.ones((4, 4), dtype="float32")) + image.attributes.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + doc.root.arrays["image"] = image + + mask = ZarrArray(data=np.zeros((4, 4), dtype="uint8")) + mask.attributes.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + mask.attributes.extra["flag_masks"] = [1, 2] + mask.attributes.extra["flag_meanings"] = "BAD SAT" + doc.root.arrays["mask"] = mask + + # Stub a 1-D uint8 'tree' array (JSON bytes). + doc.root.arrays["tree"] = ZarrArray( + data=np.frombuffer(b"{}", dtype=np.uint8) + ) + + store = zarr.storage.MemoryStore() + doc.to_zarr(store) + + # Reload and verify lazy invariant on every array. + recovered = ZarrDocument.from_zarr(store) + self.assertIsInstance(recovered.root.arrays["image"].data, zarr.Array) + self.assertIsInstance(recovered.root.arrays["mask"].data, zarr.Array) + self.assertEqual( + recovered.root.attributes.lsst["archive_class"], "MaskedImage" + ) + # CF flag attrs round-trip via the extra namespace. + self.assertEqual( + recovered.root.arrays["mask"].attributes.extra["flag_meanings"], + "BAD SAT", + ) + # xarray dims round-trip. 
+ self.assertEqual( + recovered.root.arrays["image"].attributes.extra["_ARRAY_DIMENSIONS"], + ["y", "x"], + ) + # Subset reads still go through the lazy handle. + np.testing.assert_array_equal( + recovered.root.arrays["image"].read(), np.ones((4, 4), dtype="float32") + ) + + def test_get_walks_paths(self) -> None: + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + doc = ZarrDocument(root=ZarrGroup()) + doc.root.arrays["image"] = ZarrArray(data=np.zeros((2, 2), dtype="float32")) + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((2, 2), dtype="float32")) + + # Absolute and relative paths. + self.assertIs(doc.root.get("/image"), doc.root.arrays["image"]) + self.assertIs(doc.root.get("image"), doc.root.arrays["image"]) + self.assertIs(doc.root.get("/red/image"), red.arrays["image"]) + self.assertIs(doc.root.get("/"), doc.root) + + with self.assertRaises(KeyError): + doc.root.get("/missing") +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `pytest tests/test_zarr_model.py::ZarrDocumentTestCase -v` +Expected: FAIL — `ImportError` for `ZarrGroup` / `ZarrDocument`. 
+ +- [ ] **Step 3: Append `ZarrGroup`, `ZarrDocument`, helpers** + +Update the `__all__` and append to `python/lsst/images/zarr/_model.py`: + +```python +__all__ = ( + "ZarrArray", + "ZarrAttributes", + "ZarrDocument", + "ZarrGroup", +) + + +@dataclass +class ZarrGroup: + """A zarr group: nested groups, arrays, and namespaced attributes.""" + + groups: dict[str, "ZarrGroup"] = field(default_factory=dict) + arrays: dict[str, ZarrArray] = field(default_factory=dict) + attributes: ZarrAttributes = field(default_factory=ZarrAttributes) + + def get(self, path: str) -> "ZarrGroup | ZarrArray": + """Return a child by absolute or relative zarr path.""" + if path in ("", "/"): + return self + parts = [p for p in path.strip("/").split("/") if p] + cursor: ZarrGroup | ZarrArray = self + for part in parts: + if not isinstance(cursor, ZarrGroup): + raise KeyError(path) + if part in cursor.arrays: + cursor = cursor.arrays[part] + elif part in cursor.groups: + cursor = cursor.groups[part] + else: + raise KeyError(path) + return cursor + + def ensure_group(self, path: str) -> "ZarrGroup": + """Return or create a sub-group at ``path``.""" + if path in ("", "/"): + return self + parts = [p for p in path.strip("/").split("/") if p] + cursor = self + for part in parts: + if part in cursor.arrays: + raise KeyError(f"{part!r} already exists as an array.") + if part not in cursor.groups: + cursor.groups[part] = ZarrGroup() + cursor = cursor.groups[part] + return cursor + + +@dataclass +class ZarrDocument: + """A complete zarr archive root.""" + + root: ZarrGroup = field(default_factory=ZarrGroup) + + @classmethod + def from_zarr(cls, store: zarr.storage.Store) -> Self: + """Open ``store`` and build a lazy IR view of its contents.""" + zarr_root = zarr.open_group(store=store, mode="r", zarr_format=3) + return cls(root=_group_from_zarr(zarr_root)) + + def to_zarr(self, store: zarr.storage.Store) -> None: + """Materialize this IR into ``store`` (which must be empty).""" + zarr_root = 
zarr.create_group(store=store, zarr_format=3, overwrite=False) + _group_to_zarr(self.root, zarr_root) + + +def _group_from_zarr(zarr_group: zarr.Group) -> ZarrGroup: + """Build a lazy `ZarrGroup` IR from an open ``zarr.Group``.""" + ir = ZarrGroup(attributes=ZarrAttributes.load(dict(zarr_group.attrs))) + for name, child in zarr_group.members(): + if isinstance(child, zarr.Array): + ir.arrays[name] = ZarrArray.from_zarr(child) + else: + ir.groups[name] = _group_from_zarr(child) + return ir + + +def _group_to_zarr(ir: ZarrGroup, zarr_group: zarr.Group) -> None: + """Write a `ZarrGroup` IR into an open ``zarr.Group``.""" + if dumped := ir.attributes.dump(): + zarr_group.update_attributes(dumped) + for name, sub in ir.groups.items(): + sub_zarr = zarr_group.create_group(name) + _group_to_zarr(sub, sub_zarr) + for name, array in ir.arrays.items(): + if not isinstance(array.data, np.ndarray): + raise TypeError( + f"Cannot write ZarrArray at {name!r}: data is a lazy zarr.Array, " + "not numpy. Read it first or pass a fresh numpy array." 
+ ) + chunks = array.chunks or _default_chunks(array.data.shape) + compression = array.compression or ZarrCompressionOptions.default_for_dtype( + str(array.dtype) + ) + codecs = _build_codecs(compression) + zarr_array = zarr_group.create_array( + name=name, + shape=array.data.shape, + chunks=chunks, + dtype=array.data.dtype, + shards=array.shards, + codecs=codecs, + ) + zarr_array[:] = array.data + if dumped := array.attributes.dump(): + zarr_array.update_attributes(dumped) + + +def _default_chunks(shape: tuple[int, ...]) -> tuple[int, ...]: + """Default chunk shape: min(1024, dim) per axis.""" + return tuple(min(1024, dim) for dim in shape) + + +def _build_codecs(options: ZarrCompressionOptions) -> list[Any]: + """Build a zarr v3 codec stack from `ZarrCompressionOptions`.""" + from numcodecs.zarr3 import Blosc + + if options.codec != "blosc": + raise NotImplementedError(f"Unsupported codec {options.codec!r}.") + return [ + zarr.codecs.BytesCodec(), + Blosc(cname=options.cname, clevel=options.clevel, shuffle=options.shuffle), + ] +``` + +- [ ] **Step 4: Run all model tests** + +Run: `pytest tests/test_zarr_model.py -v` +Expected: PASS — all tests pass; round-trip test confirms `.data` is a `zarr.Array` after `from_zarr`. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_model.py tests/test_zarr_model.py +git commit -m "feat: add ZarrGroup and ZarrDocument with lazy-on-read materialization" +``` + +### Task 1.5: IR — OME and CF helper dataclasses + +**Files:** +- Modify: `python/lsst/images/zarr/_model.py` (append OME / CF helpers) +- Modify: `tests/test_zarr_model.py` (helper-construction test) + +These small dataclasses centralize the OME and CF attribute shapes so `_layout.py` can populate them without literal-dict-typo bugs. 
+ +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_model.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class OmeCfHelpersTestCase(unittest.TestCase): + def test_multiscale_emits_expected_shape(self) -> None: + from lsst.images.zarr._model import OmeMultiscale + + m = OmeMultiscale( + name="visitimage", + axes=("y", "x"), + dataset_path="image", + ) + d = m.dump() + self.assertEqual(d["name"], "visitimage") + self.assertEqual( + d["axes"], + [ + {"name": "y", "type": "space", "unit": "pixel"}, + {"name": "x", "type": "space", "unit": "pixel"}, + ], + ) + self.assertEqual(d["datasets"][0]["path"], "image") + # Default coordinate transform is unit scale until a real one is set. + self.assertEqual( + d["datasets"][0]["coordinateTransformations"], + [{"type": "scale", "scale": [1.0, 1.0]}], + ) + + def test_multiscale_with_affine(self) -> None: + from lsst.images.zarr._model import OmeMultiscale + + m = OmeMultiscale( + name="image", + axes=("y", "x"), + dataset_path="image", + coordinate_transformations=[ + {"type": "scale", "scale": [0.2, 0.2]}, + { + "type": "affine", + "affine": [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], + }, + ], + ) + d = m.dump() + self.assertEqual(len(d["datasets"][0]["coordinateTransformations"]), 2) + self.assertEqual( + d["datasets"][0]["coordinateTransformations"][0]["type"], "scale" + ) + + def test_cf_flag_attributes(self) -> None: + from lsst.images.zarr._model import CfFlagAttributes, MaskPlaneEntry + + cf = CfFlagAttributes( + planes=[ + MaskPlaneEntry(name="BAD", bit=0, description="Bad pixel."), + MaskPlaneEntry(name="SAT", bit=1, description="Saturated."), + MaskPlaneEntry(name="CR", bit=2, description="Cosmic ray."), + ] + ) + d = cf.dump() + self.assertEqual(d["flag_masks"], [1, 2, 4]) + self.assertEqual(d["flag_meanings"], "BAD SAT CR") + self.assertEqual( + d["flag_descriptions"], ["Bad pixel.", "Saturated.", "Cosmic ray."] + ) + + def test_image_array_attrs(self) -> 
None: + from lsst.images.zarr._model import build_image_array_attrs + + attrs = build_image_array_attrs(axes=("y", "x"), units="adu", long_name="science image") + self.assertEqual(attrs["_ARRAY_DIMENSIONS"], ["y", "x"]) + self.assertEqual(attrs["units"], "adu") + self.assertEqual(attrs["long_name"], "science image") +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_model.py::OmeCfHelpersTestCase -v` +Expected: FAIL — `ImportError`. + +- [ ] **Step 3: Append helpers to `_model.py`** + +Update the `__all__` and append: + +```python +__all__ = ( + "CfFlagAttributes", + "MaskPlaneEntry", + "OmeMultiscale", + "OmeOmeroChannel", + "ZarrArray", + "ZarrAttributes", + "ZarrDocument", + "ZarrGroup", + "build_image_array_attrs", +) + + +@dataclass +class OmeMultiscale: + """OME-NGFF v0.5 multiscales metadata for a single-level image. + + The backend always writes one level whose ``path`` points at a + sibling array (``image`` for typical archives). ``coordinate_transformations`` + defaults to a unit ``scale`` so the OME block is well-formed even + when the simplified affine is dropped by the residual validator. + """ + + name: str + axes: tuple[str, ...] 
+ dataset_path: str = "image" + coordinate_transformations: list[dict[str, Any]] | None = None + + @staticmethod + def _axis_block(name: str) -> dict[str, Any]: + if name == "c": + return {"name": "c", "type": "channel"} + if name == "t": + return {"name": "t", "type": "time"} + return {"name": name, "type": "space", "unit": "pixel"} + + def dump(self) -> dict[str, Any]: + ndim = len(self.axes) + ct = self.coordinate_transformations + if ct is None: + ct = [{"type": "scale", "scale": [1.0] * ndim}] + return { + "name": self.name, + "axes": [self._axis_block(a) for a in self.axes], + "datasets": [ + { + "path": self.dataset_path, + "coordinateTransformations": ct, + } + ], + } + + +@dataclass +class OmeOmeroChannel: + """OME ``omero/channels`` entry (used only when a channel axis exists).""" + + label: str + color: str | None = None + + def dump(self) -> dict[str, Any]: + out: dict[str, Any] = {"label": self.label} + if self.color is not None: + out["color"] = self.color + return out + + +@dataclass +class MaskPlaneEntry: + """One mask-plane definition.""" + + name: str + bit: int + description: str = "" + + +@dataclass +class CfFlagAttributes: + """CF-conventions flag metadata for a 2-D packed mask array. + + Emits ``flag_masks`` (list of bit values), ``flag_meanings`` + (single space-separated string per CF), and the LSST extension + ``flag_descriptions`` (list of human-readable strings parallel to + ``flag_meanings``). 
+ """ + + planes: list[MaskPlaneEntry] = field(default_factory=list) + + def dump(self) -> dict[str, Any]: + return { + "flag_masks": [int(1 << p.bit) for p in self.planes], + "flag_meanings": " ".join(p.name for p in self.planes), + "flag_descriptions": [p.description for p in self.planes], + } + + @classmethod + def load(cls, raw: dict[str, Any]) -> Self: + meanings = raw.get("flag_meanings", "").split() + masks = [int(m) for m in raw.get("flag_masks", [])] + descriptions = list(raw.get("flag_descriptions", [""] * len(meanings))) + planes = [] + for name, mask, desc in zip(meanings, masks, descriptions, strict=False): + # Recover bit position from the mask value (always a power of 2). + bit = (mask & -mask).bit_length() - 1 + planes.append(MaskPlaneEntry(name=name, bit=bit, description=desc)) + return cls(planes=planes) + + +def build_image_array_attrs( + *, + axes: tuple[str, ...], + units: str | None = None, + long_name: str | None = None, +) -> dict[str, Any]: + """Build the CF / xarray attribute block for a 2-D-or-higher image array.""" + out: dict[str, Any] = {"_ARRAY_DIMENSIONS": list(axes)} + if units is not None: + out["units"] = units + if long_name is not None: + out["long_name"] = long_name + return out +``` + +- [ ] **Step 4: Run all model tests** + +Run: `pytest tests/test_zarr_model.py -v` +Expected: PASS — 11 tests. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_model.py tests/test_zarr_model.py +git commit -m "feat: add OmeMultiscale, CfFlagAttributes, image-array-attrs helpers" +``` + +--- + +**End of Phase 1.** Five tasks. The IR is in place with the lazy invariant pinned by `_CountingStore`, the CF / OME helpers are unit-tested in isolation, and `ZarrAttributes` separates `lsst:` / `ome:` / top-level (`extra`) namespaces so xarray and CF tooling see flat attributes without unwrapping. Phase 2 wires `_store.py`, `_layout.py`, and `ZarrOutputArchive` against this IR for `Image` / `MaskedImage` / `VisitImage`.
+ +## Phase 2 — Store dispatch, layout rules, and `ZarrOutputArchive` (Image / MaskedImage / VisitImage) + +This phase adds enough machinery to **write** a plain `Image`, a `MaskedImage`, and a `VisitImage` to a zarr archive on disk and on a `ZipStore`. No reading yet — that lands in Phase 3 — so tests inspect the on-disk shape via `ZarrDocument.from_zarr()` directly. `ColorImage` and `CellCoadd` are deferred to Phase 4. + +The output archive's `add_array(name)` method writes to a zarr path equal to `name` (after stripping the leading slash). There is **no JSON-pointer mapping table** and **no fixup pass**. Mask arrays go through a small specialization that packs a 3-D `(y, x, mask_size)` in-memory mask into the 2-D wide-integer on-disk form and attaches CF flag attrs. + +### Task 2.1: `_store.py` — URI → `zarr.storage.Store` dispatch + +**Files:** +- Create: `python/lsst/images/zarr/_store.py` +- Test: `tests/test_zarr_store.py` + +URI dispatch: + +| URI shape | Store | +|---|---| +| `*.zarr.zip` (any scheme) | `zarr.storage.ZipStore` | +| `file://` or local path | `zarr.storage.LocalStore` | +| `http(s)://`, `s3://`, `gs://`, etc. | `zarr.storage.FsspecStore` (via `fsspec.url_to_fs`) | + +Create-only mode is enforced here: write helpers refuse to open a non-empty existing store. + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_zarr_store.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +try: + import zarr + + from lsst.images.zarr._store import open_store_for_read, open_store_for_write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class StoreDispatchTestCase(unittest.TestCase): + def test_local_directory(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + with open_store_for_write(target) as store: + self.assertIsInstance(store, zarr.storage.LocalStore) + zarr.create_group(store=store, zarr_format=3) + with open_store_for_read(target) as store: + self.assertIsInstance(store, zarr.storage.LocalStore) + root = zarr.open_group(store=store, mode="r") + self.assertEqual(list(root.keys()), []) + + def test_zip_store(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr.zip") + with open_store_for_write(target) as store: + self.assertIsInstance(store, zarr.storage.ZipStore) + zarr.create_group(store=store, zarr_format=3) + with open_store_for_read(target) as store: + self.assertIsInstance(store, zarr.storage.ZipStore) + + def test_create_only_refuses_existing(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + with open_store_for_write(target) as store: + zarr.create_group(store=store, zarr_format=3) + with self.assertRaisesRegex(OSError, "already exists"): + with open_store_for_write(target): + pass + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `pytest tests/test_zarr_store.py -v` +Expected: FAIL — `ImportError`. + +- [ ] **Step 3: Write `_store.py`** + +Create `python/lsst/images/zarr/_store.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. 
+# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ("open_store_for_read", "open_store_for_write") + +import os +from collections.abc import Iterator +from contextlib import contextmanager + +import zarr + +from lsst.resources import ResourcePath, ResourcePathExpression + + +def _is_zip(rp: ResourcePath) -> bool: + return rp.path.endswith(".zarr.zip") or rp.path.endswith(".zip") + + +def _is_remote(rp: ResourcePath) -> bool: + return rp.scheme not in ("", "file") + + +@contextmanager +def open_store_for_write(path: ResourcePathExpression) -> Iterator[zarr.storage.Store]: + """Open a zarr store for writing. + + Refuses to overwrite a non-empty existing store. The returned + context manager closes the store on exit; for ``ZipStore`` this + finalizes the central directory. 
+ """ + rp = ResourcePath(path) + if _is_zip(rp): + if _is_remote(rp): + raise NotImplementedError("Remote ZipStore writes are a follow-up.") + local = rp.ospath + if os.path.exists(local) and os.path.getsize(local) > 0: + raise OSError(f"File {local!r} already exists.") + store = zarr.storage.ZipStore(local, mode="w") + try: + yield store + finally: + store.close() + return + if _is_remote(rp): + import fsspec + + fs, fs_path = fsspec.url_to_fs(str(rp)) + if fs.exists(fs_path) and fs.ls(fs_path): + raise OSError(f"Store {rp!s} already exists.") + store = zarr.storage.FsspecStore(fs=fs, path=fs_path, read_only=False) + yield store + return + local = rp.ospath + if os.path.exists(local) and os.listdir(local): + raise OSError(f"Directory {local!r} already exists and is non-empty.") + os.makedirs(local, exist_ok=True) + store = zarr.storage.LocalStore(local, read_only=False) + yield store + + +@contextmanager +def open_store_for_read(path: ResourcePathExpression) -> Iterator[zarr.storage.Store]: + """Open a zarr store for reading.""" + rp = ResourcePath(path) + if _is_zip(rp): + if _is_remote(rp): + with rp.as_local() as local: + store = zarr.storage.ZipStore(local.ospath, mode="r") + try: + yield store + finally: + store.close() + return + store = zarr.storage.ZipStore(rp.ospath, mode="r") + try: + yield store + finally: + store.close() + return + if _is_remote(rp): + import fsspec + + fs, fs_path = fsspec.url_to_fs(str(rp)) + store = zarr.storage.FsspecStore(fs=fs, path=fs_path, read_only=True) + yield store + return + store = zarr.storage.LocalStore(rp.ospath, read_only=True) + yield store +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_store.py -v` +Expected: PASS — 3 tests. 
+ +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_store.py tests/test_zarr_store.py +git commit -m "feat: add zarr store dispatch (LocalStore / ZipStore / FsspecStore)" +``` + +### Task 2.2: `_layout.py` — axes per archive class and chunk derivation + +**Files:** +- Create: `python/lsst/images/zarr/_layout.py` +- Test: `tests/test_zarr_layout.py` + +This task adds the per-archive-class layout rules: axis tuples and chunk-shape derivation. Chunk derivation honors three sources of truth in priority order: + +1. Explicit per-array override (from `write(chunks={...})`). +2. `cell_shape` from the archive metadata (for `CellCoadd`). +3. `min(1024, dim)` per axis fallback. + +A separate helper `chunks_aligned_to(image_chunks, shape)` derives `variance`/`mask` chunks from the `image` array's chunks so siblings stay aligned (CF / xarray / GDAL all assume this). The output archive will call this helper when the user has not overridden the sibling's chunks. + +The affine residual validator lands in Task 2.3 (separate task because it has its own test surface). + +- [ ] **Step 1: Write the failing test** + +Create `tests/test_zarr_layout.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class LayoutTestCase(unittest.TestCase):
    """Exercise the axis and chunk-shape rules exposed by ``_layout``."""

    def test_axes_for_archive_class(self) -> None:
        """2-D archive classes map to ``(y, x)``; ``ColorImage`` maps to ``()``."""
        two_d_classes = ("Image", "MaskedImage", "VisitImage", "Mask", "CellCoadd")
        for class_name in two_d_classes:
            with self.subTest(archive_class=class_name):
                self.assertEqual(axes_for_archive_class(class_name), ("y", "x"))
        # ColorImage's root has no top-level multiscale, so the helper
        # signals "no OME multiscale at this level" with an empty tuple.
        self.assertEqual(axes_for_archive_class("ColorImage"), ())

    def test_chunks_for_default(self) -> None:
        """Without hints, each axis is clamped to 1024 or its full extent."""
        large = chunks_for("Image", (4096, 4096), None)
        self.assertEqual(large, (1024, 1024))
        # Axes shorter than 1024 are chunked at their full length.
        small = chunks_for("Image", (300, 600), None)
        self.assertEqual(small, (300, 600))

    def test_chunks_for_override(self) -> None:
        """An explicit override wins over every default."""
        chosen = chunks_for("Image", (4096, 4096), (256, 256))
        self.assertEqual(chosen, (256, 256))

    def test_chunks_for_cell_coadd_uses_cell_shape(self) -> None:
        """``CellCoadd`` chunks follow ``cell_shape`` from the metadata."""
        metadata = {"cell_shape": (256, 256)}
        chosen = chunks_for(
            "CellCoadd",
            (4096, 4096),
            None,
            archive_metadata=metadata,
        )
        self.assertEqual(chosen, (256, 256))

    def test_chunks_for_cell_coadd_without_metadata_falls_back(self) -> None:
        """``CellCoadd`` without metadata uses the plain-image default."""
        chosen = chunks_for("CellCoadd", (4096, 4096), None)
        self.assertEqual(chosen, (1024, 1024))

    def test_chunks_aligned_to_matches_image(self) -> None:
        """Sibling chunks copy the image's chunks, clamped to the sibling."""
        # variance / mask follow image's chunks when not overridden.
        same_size = chunks_aligned_to(image_chunks=(256, 256), shape=(4096, 4096))
        self.assertEqual(same_size, (256, 256))
        # If the sibling shape is smaller than image's chunks, clamp.
        clamped = chunks_aligned_to(image_chunks=(1024, 1024), shape=(300, 600))
        self.assertEqual(clamped, (300, 600))
+ """ + if name == "ColorImage": + return () + return ("y", "x") + + +def chunks_for( + archive_class: str, + shape: tuple[int, ...], + override: tuple[int, ...] | None, + *, + archive_metadata: Mapping[str, Any] | None = None, +) -> tuple[int, ...]: + """Return the chunk shape to use for a top-level array. + + Parameters + ---------- + archive_class + Top-level archive class name; used for class-specific + defaults like ``CellCoadd``'s cell-aligned chunks. + shape + The full array shape, used to clamp the default per-axis. + override + User-supplied chunk shape. If not ``None`` it is returned + verbatim after a length check. + archive_metadata + Class-specific layout hints. ``CellCoadd`` reads + ``"cell_shape"`` from this mapping. + """ + if override is not None: + if len(override) != len(shape): + raise ValueError( + f"chunks override has rank {len(override)}, " + f"expected {len(shape)} for {archive_class!r}." + ) + return tuple(override) + if archive_class == "CellCoadd" and archive_metadata is not None: + cell_shape = archive_metadata.get("cell_shape") + if cell_shape is not None: + return tuple(min(c, dim) for c, dim in zip(cell_shape, shape, strict=True)) + return tuple(min(_DEFAULT_AXIS_LIMIT, dim) for dim in shape) + + +def chunks_aligned_to( + *, + image_chunks: tuple[int, ...], + shape: tuple[int, ...], +) -> tuple[int, ...]: + """Derive a sibling array's chunks from the ``image`` array's chunks. + + Used by `ZarrOutputArchive.add_array` for ``variance`` and + ``mask`` siblings when the user has not provided an explicit + override. The result is per-axis ``min(image_chunks[i], + shape[i])`` so a sibling smaller than ``image`` is not + over-chunked. + """ + if len(image_chunks) != len(shape): + raise ValueError( + f"image_chunks rank {len(image_chunks)} does not match " + f"sibling shape rank {len(shape)}." 
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class AffineValidatorTestCase(unittest.TestCase):
    """Check that ``affine_check`` keeps linear WCSs and drops distorted ones."""

    def _make_linear_frame_set(self, *, scale: float = 0.2):
        """Return a synthetic FrameSet whose pixel-to-sky map is a pure scale."""
        from lsst.images._transforms._ast import Frame, FrameSet, ZoomMap

        pixel_frame = Frame(2, "Domain=PIXEL")
        sky_frame = Frame(2, "Domain=SKY")
        frame_set = FrameSet(pixel_frame)
        frame_set.addFrame(FrameSet.BASE, ZoomMap(2, scale), sky_frame)
        return frame_set

    def _make_distorted_frame_set(self):
        """Return a FrameSet with a polynomial distortion before the scale.

        The affine approximation of this mapping is off by many pixels
        at the corners of a large image.
        """
        from lsst.images._transforms._ast import (
            CmpMap,
            Frame,
            FrameSet,
            PolyMap,
            ZoomMap,
        )

        pixel_frame = Frame(2, "Domain=PIXEL")
        sky_frame = Frame(2, "Domain=SKY")
        # Forward polynomial: x' = x + 0.001 * y^2; y' = y + 0.001 * x^2.
        # PolyMap coefficient rows are [coeff, output_index, x_power, y_power].
        distortion = PolyMap(
            [
                [1.0, 1, 1, 0],
                [0.001, 1, 0, 2],
                [1.0, 2, 0, 1],
                [0.001, 2, 2, 0],
            ],
            2,
            "IterInverse=1, NIterInverse=20",
        )
        pixel_to_sky = CmpMap(distortion, ZoomMap(2, 0.2), True)
        frame_set = FrameSet(pixel_frame)
        frame_set.addFrame(FrameSet.BASE, pixel_to_sky, sky_frame)
        return frame_set

    def test_pure_linear_passes(self) -> None:
        """A pure scale is kept, with an essentially zero residual."""
        from lsst.images.zarr._layout import affine_check

        outcome = affine_check(
            frame_set=self._make_linear_frame_set(scale=0.2),
            image_shape=(64, 64),
            max_residual_pixels=1.0,
        )
        self.assertFalse(outcome.dropped)
        self.assertIsNotNone(outcome.coordinate_transformations)
        self.assertLess(outcome.max_residual_pixels, 1e-6)

    def test_high_distortion_drops_block(self) -> None:
        """A strongly distorted mapping is dropped but still reports its error."""
        from lsst.images.zarr._layout import affine_check

        # 4096-pixel-wide image: 0.001 * 2048^2 ~ 4000 pixels of error
        # at the corners. Way over the 1-pixel threshold.
        outcome = affine_check(
            frame_set=self._make_distorted_frame_set(),
            image_shape=(4096, 4096),
            max_residual_pixels=1.0,
        )
        self.assertTrue(outcome.dropped)
        # Even when dropped, the residual is reported so the output archive
        # can record it as lsst.wcs_simplified_max_residual_pixels.
        self.assertGreater(outcome.max_residual_pixels, 1.0)
def affine_check(
    *,
    frame_set: Any,
    image_shape: tuple[int, int],
    max_residual_pixels: float = 1.0,
    grid: int = 11,
) -> AffineCheckResult:
    """Build an OME affine ``coordinateTransformations`` for ``frame_set``,
    validate it on a ``grid``×``grid`` sample, and decide whether to keep it.

    The simplified affine is constructed by mapping three reference
    pixels (origin and the two unit-axis steps) through ``frame_set``
    to recover the linear coefficients. The full pixel-to-sky map is
    then evaluated at every grid point and compared to the affine's
    prediction; the worst great-circle separation is divided by the
    pixel scale to get a pixel-equivalent residual.

    Parameters
    ----------
    frame_set
        Object whose base->current mapping is the pixel-to-sky
        transform. Pixel coordinates are passed to it as ``(x, y)``
        pairs.
    image_shape
        ``(height, width)`` of the image; the sample grid spans
        ``x`` in ``[0, width - 1]`` and ``y`` in ``[0, height - 1]``.
    max_residual_pixels
        Largest acceptable residual, in pixels.
    grid
        Number of sample points along each axis.

    Returns
    -------
    AffineCheckResult
        With ``dropped=False`` and the affine block when the worst
        residual is finite and no larger than ``max_residual_pixels``.
        Otherwise ``dropped=True`` with no block, and the caller must
        emit the unit scale (or no transformations at all). A
        degenerate or non-finite pixel scale or residual is reported
        as ``inf``.
    """
    import numpy as np

    h, w = image_shape

    # 1. Recover the linear / affine portion by mapping three pixels,
    #    given as (x, y): the origin, one step in x, one step in y.
    pixels = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    sky_at_ref = _frame_set_apply(frame_set, pixels)
    origin = sky_at_ref[0]
    dxsky = sky_at_ref[1] - origin
    dysky = sky_at_ref[2] - origin
    # NOTE(review): the matrix acts on (x, y, 1) column vectors while the
    # OME axes are declared as (y, x) — confirm which order consumers expect.
    affine_matrix = np.array(
        [
            [dxsky[0], dysky[0], origin[0]],
            [dxsky[1], dysky[1], origin[1]],
            [0.0, 0.0, 1.0],
        ]
    )

    pixel_scale_y = float(np.linalg.norm(dysky))
    pixel_scale_x = float(np.linalg.norm(dxsky))
    pixel_scale = float(np.sqrt(pixel_scale_y * pixel_scale_x))
    # NaN compares False against everything, so test finiteness explicitly;
    # otherwise a failed transform would sail through as "not dropped".
    if not np.isfinite(pixel_scale) or pixel_scale <= 0.0:
        return AffineCheckResult(
            dropped=True,
            max_residual_pixels=float("inf"),
            coordinate_transformations=None,
        )

    # 2. Sample residuals on a grid spanning x in [0, w-1], y in [0, h-1].
    #    Points are (x, y) to match the reference pixels above; row-major
    #    order (y outer, x inner) is kept from the original sampling.
    ys = np.linspace(0.0, max(h - 1, 0), grid)
    xs = np.linspace(0.0, max(w - 1, 0), grid)
    xx, yy = np.meshgrid(xs, ys)
    grid_pixels = np.column_stack([xx.ravel(), yy.ravel()])
    sky_full = _frame_set_apply(frame_set, grid_pixels)
    affine_pred = (affine_matrix[:2, :2] @ grid_pixels.T).T + origin
    great_circle = _angular_separation(sky_full, affine_pred)
    max_residual = float(np.max(great_circle) / pixel_scale)

    if not np.isfinite(max_residual):
        max_residual = float("inf")
    if max_residual > max_residual_pixels:
        return AffineCheckResult(
            dropped=True,
            max_residual_pixels=max_residual,
            coordinate_transformations=None,
        )

    # Only build the OME block once we know it will be emitted.
    coordinate_transformations: list[dict[str, Any]] = [
        {
            "type": "scale",
            "scale": [pixel_scale_y, pixel_scale_x],
        },
        {
            "type": "affine",
            "affine": affine_matrix.tolist(),
        },
    ]
    return AffineCheckResult(
        dropped=False,
        max_residual_pixels=max_residual,
        coordinate_transformations=coordinate_transformations,
    )
+ out = mapping.applyForward(pixels.T) + return np.asarray(out).T + + +def _angular_separation(a: Any, b: Any) -> Any: + """Element-wise great-circle separation between two arrays of (lon, lat). + + Inputs in radians (AST default for unit sky frames). Returns a 1-D + array of separations in the same units as the input. + """ + import numpy as np + + a = np.asarray(a) + b = np.asarray(b) + lon_a, lat_a = a[:, 0], a[:, 1] + lon_b, lat_b = b[:, 0], b[:, 1] + dlon = lon_b - lon_a + return np.arccos( + np.clip( + np.sin(lat_a) * np.sin(lat_b) + np.cos(lat_a) * np.cos(lat_b) * np.cos(dlon), + -1.0, + 1.0, + ) + ) +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_layout.py -v` +Expected: PASS — 8 tests; the linear FrameSet has near-zero residual, the polynomial FrameSet is dropped with `max_residual_pixels` in the thousands. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_layout.py tests/test_zarr_layout.py +git commit -m "feat: add affine_check residual validator for OME coordinateTransformations" +``` + +### Task 2.4: `ZarrOutputArchive` skeleton — `serialize_direct` / `serialize_pointer` / `iter_frame_sets` + +**Files:** +- Create: `python/lsst/images/zarr/_output_archive.py` +- Test: `tests/test_zarr_output_archive.py` + +The constructor builds an empty `ZarrDocument` and stashes the user's per-array overrides plus the `archive_metadata` dict (used by `_layout.chunks_for` to see `cell_shape`). `serialize_direct` returns a `NestedOutputArchive` so nested calls land at compound paths (`red/image` rather than `image`). `serialize_pointer` writes the sub-tree's JSON bytes to a `tree` array under the sub-archive's path and returns a `ZarrPointerModel(path="/tree")`. + +`add_array` / `add_table` / `add_structured_array` / `add_tree` follow in subsequent tasks; they raise `NotImplementedError` here so the abstract class is concretely implementable. 
+ +- [ ] **Step 1: Write the failing test** + +Create `tests/test_zarr_output_archive.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import unittest + +import pydantic + +try: + from lsst.images.zarr._common import ZarrPointerModel + from lsst.images.zarr._output_archive import ZarrOutputArchive + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +class _Sub(pydantic.BaseModel): + label: str = "sub" + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOutputArchiveSkeletonTestCase(unittest.TestCase): + def test_serialize_direct_returns_nested_result(self) -> None: + archive = ZarrOutputArchive() + + def serializer(arch): # noqa: ANN001 + return _Sub(label="ok") + + result = archive.serialize_direct("red", serializer) + self.assertEqual(result.label, "ok") + + def test_serialize_pointer_writes_json_subtree(self) -> None: + archive = ZarrOutputArchive() + + def serializer(arch): # noqa: ANN001 + return _Sub(label="psf") + + pointer = archive.serialize_pointer("psf", serializer, key=12345) + self.assertIsInstance(pointer, ZarrPointerModel) + self.assertEqual(pointer.path, "/psf/tree") + # Cached on second call. + again = archive.serialize_pointer("psf", serializer, key=12345) + self.assertEqual(again, pointer) + # IR holds the JSON bytes as a 1-D uint8 array. 
+ node = archive.document.root.get("/psf/tree") + self.assertEqual(str(node.dtype), "uint8") + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_output_archive.py -v` +Expected: FAIL — `ImportError`. + +- [ ] **Step 3: Write the skeleton** + +Create `python/lsst/images/zarr/_output_archive.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ("ZarrOutputArchive", "write") + +from collections.abc import Callable, Hashable, Iterator, Mapping +from typing import Any + +import numpy as np +import pydantic + +from .._transforms import FrameSet +from ..serialization import ( + ArchiveTree, + NestedOutputArchive, + OutputArchive, +) +from ._common import ( + ZarrCompressionOptions, + ZarrPointerModel, + archive_path_to_zarr_path, +) +from ._model import ZarrArray, ZarrDocument, ZarrGroup + + +class ZarrOutputArchive(OutputArchive[ZarrPointerModel]): + """Output archive that populates a `ZarrDocument` IR. + + Bytes are not written until the IR is materialized via + `ZarrDocument.to_zarr`, which the public `write` helper performs + on context-manager exit. + + Parameters + ---------- + chunks + Per-array chunk overrides keyed by the JSON pointer of the + attribute the array backs (or its zarr path). ``None`` for a + key means "use the layout default". + shards, compression + Same shape as ``chunks``. + archive_class + Top-level archive class name (``"VisitImage"``, ``"CellCoadd"``, + …). 
Used by the layout layer to pick chunk defaults; set by + ``write()`` before ``obj.serialize`` runs so ``add_array`` + sees the right value. + archive_metadata + Class-specific layout hints (``cell_shape`` for ``CellCoadd``). + """ + + def __init__( + self, + *, + chunks: Mapping[str, tuple[int, ...] | None] | None = None, + shards: Mapping[str, tuple[int, ...] | None] | None = None, + compression: Mapping[str, ZarrCompressionOptions | None] | None = None, + archive_class: str = "Image", + archive_metadata: Mapping[str, Any] | None = None, + ) -> None: + self.document = ZarrDocument(root=ZarrGroup()) + self._chunks = dict(chunks) if chunks else {} + self._shards = dict(shards) if shards else {} + self._compression = dict(compression) if compression else {} + self._archive_class = archive_class + self._archive_metadata = dict(archive_metadata) if archive_metadata else {} + self._pointers: dict[Hashable, ZarrPointerModel] = {} + self._frame_sets: list[tuple[FrameSet, ZarrPointerModel]] = [] + + def serialize_direct[T: pydantic.BaseModel]( + self, name: str, serializer: Callable[[OutputArchive[ZarrPointerModel]], T] + ) -> T: + nested = NestedOutputArchive[ZarrPointerModel](name, self) + return serializer(nested) + + def serialize_pointer[T: ArchiveTree]( + self, + name: str, + serializer: Callable[[OutputArchive[ZarrPointerModel]], T], + key: Hashable, + ) -> ZarrPointerModel: + if (cached := self._pointers.get(key)) is not None: + return cached + # Run the serializer first so any nested add_array calls land + # inside the IR before we dump this sub-tree to JSON. 
+ archive_path = name if name.startswith("/") else f"/{name}" + sub_zarr_path = archive_path_to_zarr_path(archive_path) + model = self.serialize_direct(name, serializer) + json_bytes = model.model_dump_json().encode("utf-8") + parent = self.document.root.ensure_group(sub_zarr_path) + parent.arrays["tree"] = ZarrArray(data=np.frombuffer(json_bytes, dtype=np.uint8)) + pointer = ZarrPointerModel(path=f"{sub_zarr_path}/tree") + self._pointers[key] = pointer + return pointer + + def serialize_frame_set[T: ArchiveTree]( + self, + name: str, + frame_set: FrameSet, + serializer: Callable[[OutputArchive], T], + key: Hashable, + ) -> ZarrPointerModel: + pointer = self.serialize_pointer(name, serializer, key) + self._frame_sets.append((frame_set, pointer)) + return pointer + + def iter_frame_sets(self) -> Iterator[tuple[FrameSet, ZarrPointerModel]]: + return iter(self._frame_sets) + + def add_array(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("add_array lands in Task 2.5") + + def add_table(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("add_table lands in Task 2.6") + + def add_structured_array(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("add_structured_array lands in Task 2.6") + + +def write(*args: Any, **kwargs: Any) -> Any: + """Public write helper. Implemented in Task 2.7.""" + raise NotImplementedError("write() lands in Task 2.7") +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_output_archive.py -v` +Expected: PASS — 2 tests pass. 
+ +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py tests/test_zarr_output_archive.py +git commit -m "feat: add ZarrOutputArchive skeleton with serialize_direct/pointer/frame_set" +``` + +### Task 2.5: `add_array` — image, variance, and the 2-D packed mask + +**Files:** +- Modify: `python/lsst/images/zarr/_output_archive.py` +- Test: `tests/test_zarr_output_archive.py` + +`add_array(array, name=...)` does three different things depending on the name: + +1. `name == "image"` (or any other name not covered by cases 2 and 3) — stage the array verbatim with default chunks (or overrides), attach `_ARRAY_DIMENSIONS` and `units` / `long_name` if known. The chunk shape of the top-level `image` is held aside as the "image chunks" so siblings can align to it. +2. `name == "variance"` — derive chunks from `image_chunks` via `chunks_aligned_to` when the user has not overridden, attach `_ARRAY_DIMENSIONS = ["y", "x"]`. +3. `name == "mask"` — convert the 3-D `(y, x, mask_size)` in-memory mask into a 2-D `(y, x)` packed-integer array of `mask_dtype_for_plane_count(n_planes)`. Build CF flag attrs from the schema (passed via `archive_metadata["mask_schema"]`). Derive chunks from `image_chunks` when not overridden. + +Nested arrays (names containing a `/`, such as `psf/centroids`) land at the path equal to `name`, with no special-case behavior. Truly anonymous arrays (`name=None`) are rejected with a `ValueError`. 
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class ZarrOutputArchiveAddArrayTestCase(unittest.TestCase):
    """Cover ``add_array`` for image, variance, mask, and nested arrays."""

    def test_add_image(self) -> None:
        """A top-level image is staged verbatim with ``(y, x)`` dimensions."""
        import numpy as np

        archive = ZarrOutputArchive()
        pixels = np.ones((4, 5), dtype=np.float32)
        reference = archive.add_array(pixels, name="image")
        self.assertEqual(reference.source, "zarr:/image")
        self.assertEqual(list(reference.shape), [4, 5])
        staged = archive.document.root.get("/image")
        self.assertEqual(staged.shape, (4, 5))
        self.assertEqual(staged.attributes.extra["_ARRAY_DIMENSIONS"], ["y", "x"])

    def test_add_variance_aligns_to_image_chunks(self) -> None:
        """Variance inherits the image's chunk shape when not overridden."""
        import numpy as np

        archive = ZarrOutputArchive(chunks={"image": (2, 2)})
        archive.add_array(np.ones((4, 5), dtype=np.float32), name="image")
        archive.add_array(np.ones((4, 5), dtype=np.float64), name="variance")
        staged_variance = archive.document.root.get("/variance")
        self.assertEqual(tuple(staged_variance.chunks), (2, 2))

    def test_add_mask_packs_to_2d_with_cf_flag_attrs(self) -> None:
        """A 3-D mask is packed to 2-D and annotated with CF flag attrs."""
        import numpy as np

        from lsst.images import MaskPlane, MaskSchema

        schema = MaskSchema(
            [
                MaskPlane("BAD", "Bad pixel."),
                MaskPlane("SAT", "Saturated."),
                MaskPlane("CR", "Cosmic ray."),
            ]
        )
        # In-memory mask is (y, x, mask_size).
        unpacked = np.zeros((4, 5, 1), dtype=np.uint8)
        unpacked[0, 0, 0] = 0b1  # BAD
        unpacked[1, 1, 0] = 0b110  # SAT | CR

        archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema})
        archive.add_array(np.ones((4, 5), dtype=np.float32), name="image")
        reference = archive.add_array(unpacked, name="mask")
        self.assertEqual(reference.source, "zarr:/mask")

        staged = archive.document.root.get("/mask")
        # 2-D packed integer; 3 planes fit in uint8.
        self.assertEqual(staged.shape, (4, 5))
        self.assertEqual(str(staged.dtype), "uint8")
        # Bytes packed correctly.
        np.testing.assert_array_equal(staged.data[0, 0], 0b1)
        np.testing.assert_array_equal(staged.data[1, 1], 0b110)
        # CF flag attrs.
        flag_attrs = staged.attributes.extra
        self.assertEqual(flag_attrs["flag_masks"], [1, 2, 4])
        self.assertEqual(flag_attrs["flag_meanings"], "BAD SAT CR")
        self.assertEqual(
            flag_attrs["flag_descriptions"],
            ["Bad pixel.", "Saturated.", "Cosmic ray."],
        )
        self.assertEqual(flag_attrs["_ARRAY_DIMENSIONS"], ["y", "x"])

    def test_add_mask_picks_widest_dtype_for_40_planes(self) -> None:
        """Forty planes need the 64-bit packed dtype."""
        import numpy as np

        from lsst.images import MaskPlane, MaskSchema

        schema = MaskSchema([MaskPlane(f"P{i}", f"Plane {i}.") for i in range(40)])
        unpacked = np.zeros((4, 5, 5), dtype=np.uint8)  # mask_size=5

        archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema})
        archive.add_array(np.ones((4, 5), dtype=np.float32), name="image")
        archive.add_array(unpacked, name="mask")
        staged = archive.document.root.get("/mask")
        self.assertEqual(staged.shape, (4, 5))
        self.assertEqual(str(staged.dtype), "uint64")

    def test_add_mask_refuses_more_than_64_planes(self) -> None:
        """More than 64 planes cannot be packed and must raise."""
        import numpy as np

        from lsst.images import MaskPlane, MaskSchema

        schema = MaskSchema([MaskPlane(f"P{i}", f"Plane {i}.") for i in range(65)])
        unpacked = np.zeros((4, 5, 9), dtype=np.uint8)

        archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema})
        archive.add_array(np.ones((4, 5), dtype=np.float32), name="image")
        with self.assertRaisesRegex(ValueError, "supports up to 64"):
            archive.add_array(unpacked, name="mask")

    def test_add_anonymous_nested_array(self) -> None:
        """A nested name lands at the matching zarr path."""
        import numpy as np

        archive = ZarrOutputArchive()
        values = np.ones((3,), dtype=np.float32)
        reference = archive.add_array(values, name="psf/centroids")
        self.assertEqual(reference.source, "zarr:/psf/centroids")
        self.assertEqual(archive.document.root.get("/psf/centroids").shape, (3,))
def add_array(
    self,
    array: np.ndarray,
    *,
    name: str | None = None,
    update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates,
) -> ArrayReferenceModel:
    """Stage ``array`` in the IR and return a reference to it.

    Parameters
    ----------
    array
        Array to stage. A 3-D array whose leaf name is ``mask`` is
        packed to 2-D first; everything else is stored as a contiguous
        copy/view of the input.
    name
        Archive path of the array; a leading ``/`` is added when
        missing. Required.
    update_header
        Not used by this method.

    Returns
    -------
    ArrayReferenceModel
        Reference whose ``source`` is ``"zarr:"`` followed by the zarr
        path, with the shape and datatype of the array as stored.

    Raises
    ------
    ValueError
        If ``name`` is `None`, or (via `_pack_mask`) if a 3-D mask is
        added without a mask schema in the archive metadata.
    """
    if name is None:
        raise ValueError("Anonymous arrays are not supported in ZarrOutputArchive.")
    archive_path = name if name.startswith("/") else f"/{name}"
    zarr_path = archive_path_to_zarr_path(archive_path)
    # Split the zarr path into its last component and the parent group;
    # an empty parent means the array lives directly under the root.
    leaf = zarr_path.rsplit("/", 1)[-1]
    parent_path = zarr_path[: -(len(leaf) + 1)] or "/"
    parent = self.document.root.ensure_group(parent_path)

    # Mask: pack 3-D (y, x, mask_size) -> 2-D wide-int packed.
    if leaf == "mask" and array.ndim == 3:
        packed, flag_attrs = self._pack_mask(array)
        # Overrides may be keyed by the name as given or by the leaf.
        chunks = self._chunks.get(name) or self._chunks.get(leaf)
        if chunks is None and self._image_chunks is not None:
            chunks = chunks_aligned_to(
                image_chunks=self._image_chunks, shape=packed.shape
            )
        extra: dict[str, Any] = {"_ARRAY_DIMENSIONS": ["y", "x"]}
        extra.update(flag_attrs.dump())
        # NOTE(review): shards / compression are looked up by ``name``
        # only, without the leaf fallback that chunks get — confirm
        # whether that asymmetry is intended.
        ir_array = ZarrArray(
            data=packed,
            chunks=chunks,
            shards=self._shards.get(name),
            compression=self._compression.get(name),
        )
        ir_array.attributes.extra = extra
        parent.arrays[leaf] = ir_array
        return ArrayReferenceModel(
            source=f"zarr:{zarr_path}",
            shape=list(packed.shape),
            datatype=NumberType.from_numpy(packed.dtype),
        )

    # variance / other top-level siblings: align to image's chunks.
    # This branch is only taken for the image itself if image chunks
    # were already recorded by an earlier call.
    if leaf in ("variance",) or (parent_path == "/" and self._image_chunks):
        chunks = self._chunks.get(name) or self._chunks.get(leaf)
        # Alignment is skipped when the ranks differ, leaving chunks
        # as None for such arrays.
        if chunks is None and self._image_chunks is not None and array.ndim == len(
            self._image_chunks
        ):
            chunks = chunks_aligned_to(
                image_chunks=self._image_chunks, shape=array.shape
            )
    else:
        chunks = self._chunks.get(name) or self._chunks.get(leaf)

    # Default chunks for the top-level image: from layout rules.
    if chunks is None and parent_path == "/" and leaf == "image":
        chunks = chunks_for(
            self._archive_class,
            array.shape,
            None,
            archive_metadata=self._archive_metadata,
        )

    ir_array = ZarrArray(
        data=np.ascontiguousarray(array),
        chunks=chunks,
        shards=self._shards.get(name),
        compression=self._compression.get(name),
    )
    # Only the root-level image and variance get image attributes here.
    if parent_path == "/" and leaf in ("image", "variance"):
        ir_array.attributes.extra = build_image_array_attrs(
            axes=("y", "x"),
            long_name="science image" if leaf == "image" else "image variance",
        )
    parent.arrays[leaf] = ir_array

    # Remember the image's chunks so siblings can align.
    if parent_path == "/" and leaf == "image" and chunks is not None:
        self._image_chunks = tuple(chunks)

    return ArrayReferenceModel(
        source=f"zarr:{zarr_path}",
        shape=list(array.shape),
        datatype=NumberType.from_numpy(array.dtype),
    )

def _pack_mask(
    self, array: np.ndarray
) -> tuple[np.ndarray, CfFlagAttributes]:
    """Pack a 3-D ``(y, x, mask_size)`` mask into a 2-D wide-int array.

    The schema is taken from ``self._archive_metadata["mask_schema"]``.
    Returns the packed array and the CF flag attributes.

    Raises
    ------
    ValueError
        If the archive metadata holds no `MaskSchema` under
        ``"mask_schema"``.
    """
    from lsst.images import MaskSchema

    schema = self._archive_metadata.get("mask_schema")
    if not isinstance(schema, MaskSchema):
        raise ValueError(
            "Writing a 3-D mask requires archive_metadata['mask_schema'] "
            "to be set; the output archive cannot infer the plane "
            "definitions otherwise."
        )
    n_planes = len(schema)
    # presumably raises for plane counts the backend cannot pack —
    # verify against mask_dtype_for_plane_count.
    target_dtype = mask_dtype_for_plane_count(n_planes)
    # Pack: each (y, x) pixel's mask_size bytes -> one wide integer.
    # Byte 0 is the low byte (planes 0..7), byte 1 is the next, etc.
    # NOTE(review): the 8-bit shift assumes one byte per element along
    # the last axis — confirm the in-memory mask dtype is always uint8.
    packed = np.zeros(array.shape[:2], dtype=target_dtype)
    for i in range(array.shape[2]):
        packed |= array[..., i].astype(target_dtype) << (8 * i)
    # Plane ``i`` of the schema is assigned bit ``i`` of the packed value.
    planes = [
        MaskPlaneEntry(name=p.name, bit=i, description=p.description)
        for i, p in enumerate(schema)
    ]
    return packed, CfFlagAttributes(planes=planes)
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class ZarrOutputArchiveAddTableTestCase(unittest.TestCase):
    """Cover ``add_table`` staging of per-column arrays."""

    def test_add_table_creates_one_array_per_column(self) -> None:
        """Each table column becomes its own array under the table group."""
        import astropy.table
        import numpy as np

        column_data = {
            "x": np.arange(4, dtype=np.int32),
            "y": np.arange(4, dtype=np.float32),
        }
        catalog = astropy.table.Table(column_data, meta={"comment": "small catalog"})

        archive = ZarrOutputArchive()
        model = archive.add_table(catalog, name="cat")
        self.assertEqual(len(model.columns), 2)

        source_by_column = {column.name: column.data.source for column in model.columns}
        self.assertEqual(source_by_column["x"], "zarr:/lsst/tables/cat/x")
        self.assertEqual(source_by_column["y"], "zarr:/lsst/tables/cat/y")
        # Each column is its own zarr array under the parent group.
        staged_x = archive.document.root.get("/lsst/tables/cat/x")
        self.assertEqual(staged_x.shape, (4,))
def add_table(
    self,
    table: astropy.table.Table,
    *,
    name: str | None = None,
    update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates,
) -> TableModel:
    """Stage each column of ``table`` as its own array.

    Columns land under ``/lsst/tables/`` followed by the table name
    and the column name. The returned model carries the column
    references and the table's ``meta``. ``update_header`` is not used.

    Raises
    ------
    ValueError
        If ``name`` is `None`.
    """
    if name is None:
        raise ValueError("Anonymous tables are not supported in ZarrOutputArchive.")
    column_models = TableColumnModel.from_table(table)
    rooted_name = name if name.startswith("/") else f"/{name}"
    group_path = f"/lsst/tables{rooted_name}"
    group = self.document.root.ensure_group(group_path)
    for column in column_models:
        assert isinstance(column.data, ArrayReferenceModel)
        values = np.asarray(table[column.name])
        group.arrays[column.name] = ZarrArray(data=np.ascontiguousarray(values))
        column.data.source = f"zarr:{group_path}/{column.name}"
    return TableModel(columns=column_models, meta=table.meta)

def add_structured_array(
    self,
    array: np.ndarray,
    *,
    name: str | None = None,
    units: Mapping[str, astropy.units.Unit] | None = None,
    descriptions: Mapping[str, str] | None = None,
    update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates,
) -> TableModel:
    """Stage each field of a structured ``array`` as its own array.

    Fields land under ``/lsst/tables/`` followed by the table name
    and the field name. Truthy entries in ``units`` and
    ``descriptions`` are copied onto the matching column models.
    ``update_header`` is not used.

    Raises
    ------
    ValueError
        If ``name`` is `None`.
    """
    if name is None:
        raise ValueError("Anonymous structured arrays are not supported.")
    column_models = TableColumnModel.from_record_dtype(array.dtype)
    rooted_name = name if name.startswith("/") else f"/{name}"
    group_path = f"/lsst/tables{rooted_name}"
    group = self.document.root.ensure_group(group_path)
    for column in column_models:
        assert isinstance(column.data, ArrayReferenceModel)
        group.arrays[column.name] = ZarrArray(
            data=np.ascontiguousarray(array[column.name])
        )
        column.data.source = f"zarr:{group_path}/{column.name}"
        # Optional per-column annotations; missing or falsy entries are skipped.
        unit = units.get(column.name) if units else None
        if unit:
            column.unit = unit
        description = descriptions.get(column.name) if descriptions else None
        if description:
            column.description = description
    return TableModel(columns=column_models)
the tests** + +Run: `pytest tests/test_zarr_output_archive.py -v` +Expected: PASS — 9 tests. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py tests/test_zarr_output_archive.py +git commit -m "feat: implement ZarrOutputArchive add_table and add_structured_array" +``` + +### Task 2.7: `add_tree`, OME multiscale + WCS validator integration, public `write()` + +**Files:** +- Modify: `python/lsst/images/zarr/_output_archive.py` +- Modify: `python/lsst/images/zarr/__init__.py` +- Modify: `tests/test_zarr_output_archive.py` + +`add_tree` finalizes the IR: + +1. Stage the JSON tree at `/tree`. +2. Stage the AST WCS string at `/wcs_ast` (when an AST FrameSet was registered via `serialize_frame_set` or supplied directly). +3. Build the OME multiscale block. If a top-level `/image` array exists and the archive carries a frame set, run `affine_check`. If the result drops the affine, emit a unit-scale block and set `lsst.wcs_simplified_dropped: true` with the residual. +4. Set `lsst.archive_class`, `lsst.tree`, `lsst.wcs_ast` (if present), `data_model`, `version`, `lsst.cell_grid` (when `archive_metadata["cell_grid"]` is set). + +The public `write(obj, path, ...)` function constructs the archive, runs the serializer, calls `add_tree`, and materializes via `open_store_for_write` + `to_zarr`. 
+ +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrWriteHelperTestCase(unittest.TestCase): + def test_write_image_to_local_directory(self) -> None: + import os + import tempfile + + import numpy as np + import zarr + + from lsst.images import Box, Image + from lsst.images.zarr import write + from lsst.images.zarr._common import LSST_NS, OME_NS + from lsst.images.zarr._model import ZarrDocument + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + tree = write(original, target) + self.assertIsNotNone(tree) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + # Top-level image and tree are present. + self.assertIn("image", doc.root.arrays) + self.assertIn("tree", doc.root.arrays) + self.assertEqual(doc.root.arrays["image"].shape, (4, 5)) + # LSST root attrs. + lsst_attrs = doc.root.attributes.lsst + self.assertEqual(lsst_attrs["archive_class"], "Image") + self.assertEqual(lsst_attrs["tree"], "tree") + # OME multiscales points at /image; no projection means + # the unit scale is emitted. + ome = doc.root.attributes.ome + self.assertIn("multiscales", ome) + self.assertEqual( + ome["multiscales"][0]["datasets"][0]["path"], "image" + ) + # data_model + version on root. + self.assertEqual( + doc.root.attributes.extra["data_model"], "org.lsst.image" + ) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrWriteHelperTestCase -v` +Expected: FAIL — `write()` raises `NotImplementedError`. 
+ +- [ ] **Step 3: Implement `add_tree` and `write`** + +Append to `python/lsst/images/zarr/_output_archive.py`: + +```python + def add_tree(self, tree: ArchiveTree) -> None: + """Finalize the IR: write JSON tree, WCS, and root attributes. + + Called once after the user's serializer has populated arrays + / sub-trees. Sets the ``lsst.*`` and ``ome.*`` blocks on the + root group, stages ``/tree`` as 1-D ``uint8`` UTF-8 JSON, and + runs the affine residual validator if the archive carries a + frame set. + """ + from ._layout import affine_check, axes_for_archive_class + from ._model import OmeMultiscale + + # Stage the JSON tree at /tree. + json_bytes = tree.model_dump_json().encode("utf-8") + self.document.root.arrays["tree"] = ZarrArray( + data=np.frombuffer(json_bytes, dtype=np.uint8) + ) + + # Stage the AST WCS string at /wcs_ast when a frame set is registered. + wcs_ast_path: str | None = None + if self._frame_sets: + wcs_ast_path = self._stage_wcs_ast(self._frame_sets[0][0]) + + # Root LSST attrs. + lsst = self.document.root.attributes.lsst + lsst["archive_class"] = self._archive_class + lsst["tree"] = "tree" + if wcs_ast_path is not None: + lsst["wcs_ast"] = wcs_ast_path + if "cell_grid" in self._archive_metadata: + lsst["cell_grid"] = self._archive_metadata["cell_grid"] + + # data_model / version go to the top level (not under lsst:). + self.document.root.attributes.extra["data_model"] = self._data_model_for( + self._archive_class + ) + self.document.root.attributes.extra["version"] = 1 + + # OME multiscale block, gated by axes_for_archive_class. 
+ axes = axes_for_archive_class(self._archive_class) + if axes and "image" in self.document.root.arrays: + image_array = self.document.root.arrays["image"] + ct: list[dict[str, Any]] | None = None + if self._frame_sets: + fs = self._frame_sets[0][0] + check = affine_check( + frame_set=fs._get_ast_frame_set(), + image_shape=image_array.shape, + max_residual_pixels=1.0, + ) + if check.dropped: + lsst["wcs_simplified_dropped"] = True + lsst["wcs_simplified_max_residual_pixels"] = check.max_residual_pixels + else: + lsst["wcs_simplified_dropped"] = False + lsst["wcs_simplified_max_residual_pixels"] = check.max_residual_pixels + ct = check.coordinate_transformations + multiscale = OmeMultiscale( + name=self._archive_class.lower(), + axes=axes, + dataset_path="image", + coordinate_transformations=ct, + ) + self.document.root.attributes.ome["multiscales"] = [multiscale.dump()] + + def _stage_wcs_ast(self, frame_set: FrameSet) -> str: + """Encode an AST FrameSet as a UTF-8 string and stage it at /wcs_ast.""" + from .._transforms._ast import Channel, StringStream + + ast_fs = frame_set._get_ast_frame_set() + stream = StringStream() + Channel(stream, options="Full=-1,Comment=0,Indent=0").write(ast_fs) + text = stream.getSinkData() + self.document.root.arrays["wcs_ast"] = ZarrArray( + data=np.frombuffer(text.encode("utf-8"), dtype=np.uint8) + ) + return "wcs_ast" + + @staticmethod + def _data_model_for(archive_class: str) -> str: + """Map an archive class name to the public ``data_model`` string.""" + return { + "Image": "org.lsst.image", + "Mask": "org.lsst.mask", + "MaskedImage": "org.lsst.masked_image", + "VisitImage": "org.lsst.visit_image", + "ColorImage": "org.lsst.color_image", + "CellCoadd": "org.lsst.cell_coadd", + }.get(archive_class, f"org.lsst.{archive_class.lower()}") + + +def write( + obj: Any, + path: Any, + *, + chunks: Mapping[str, tuple[int, ...] | None] | None = None, + shards: Mapping[str, tuple[int, ...] 
| None] | None = None, + compression: Mapping[str, ZarrCompressionOptions | None] | None = None, + metadata: Mapping[str, Any] | None = None, + butler_info: Any | None = None, +) -> ArchiveTree: + """Write ``obj`` to a zarr archive at ``path``. + + Parameters mirror the FITS / NDF write helpers. The store + implementation (LocalStore / ZipStore / FsspecStore) is selected + from the URI shape by ``_store.open_store_for_write``. + """ + from ._store import open_store_for_write + + archive_class = type(obj).__name__ + archive_default_name = getattr(obj, "_archive_default_name", None) + archive_metadata: dict[str, Any] = {} + if (cell_shape := getattr(obj, "cell_shape", None)) is not None: + archive_metadata["cell_shape"] = tuple(cell_shape) + if (cell_grid := getattr(obj, "cell_grid", None)) is not None: + archive_metadata["cell_grid"] = { + "bbox": list(cell_grid.bbox) if hasattr(cell_grid, "bbox") else None, + "cell_shape": list(cell_grid.cell_shape) + if hasattr(cell_grid, "cell_shape") + else None, + } + if (mask_schema := getattr(obj, "mask_schema", None)) is not None: + archive_metadata["mask_schema"] = mask_schema + + archive = ZarrOutputArchive( + chunks=chunks, + shards=shards, + compression=compression, + archive_class=archive_class, + archive_metadata=archive_metadata, + ) + if archive_default_name is not None: + tree = archive.serialize_direct(archive_default_name, obj.serialize) + else: + tree = obj.serialize(archive) + if metadata is not None: + tree.metadata.update(metadata) + if butler_info is not None: + tree.butler_info = butler_info + archive.add_tree(tree) + with open_store_for_write(path) as store: + archive.document.to_zarr(store) + return tree +``` + +Re-export from `python/lsst/images/zarr/__init__.py` (replace the placeholder comment): + +```python +from ._common import * # noqa: F401, F403 +from ._output_archive import * # noqa: F401, F403 +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_output_archive.py -v` +Expected: 
PASS — 10 tests. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py python/lsst/images/zarr/__init__.py tests/test_zarr_output_archive.py +git commit -m "feat: add ZarrOutputArchive.add_tree and public write() helper" +``` + +### Task 2.8: Layout-level write tests for `MaskedImage` and `VisitImage` + +**Files:** +- Modify: `tests/test_zarr_output_archive.py` + +Pin the on-disk shape for the two harder archive classes. + +- [ ] **Step 1: Write the test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrWriteOnDiskShapeTestCase(unittest.TestCase): + def _round_trip_doc(self, obj): # noqa: ANN001 + import os + import tempfile + + import zarr + + from lsst.images.zarr import write + from lsst.images.zarr._model import ZarrDocument + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(obj, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + return ZarrDocument.from_zarr(store) + + def test_masked_image_layout(self) -> None: + import numpy as np + + from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + masked.mask.set("BAD", image.array % 2 == 0) + + doc = self._round_trip_doc(masked) + self.assertEqual( + doc.root.attributes.lsst["archive_class"], "MaskedImage" + ) + # image / variance / mask are sibling root arrays. + self.assertIn("image", doc.root.arrays) + self.assertIn("variance", doc.root.arrays) + self.assertIn("mask", doc.root.arrays) + # Mask is 2-D packed integer with CF flag attrs. 
+ mask = doc.root.arrays["mask"] + self.assertEqual(mask.shape, (4, 5)) + self.assertEqual(mask.attributes.extra["flag_meanings"], "BAD") + # CF / xarray dims on every 2-D array. + for name in ("image", "variance", "mask"): + self.assertEqual( + doc.root.arrays[name].attributes.extra["_ARRAY_DIMENSIONS"], + ["y", "x"], + ) + + def test_visit_image_layout(self) -> None: + import numpy as np + + from lsst.images import Box, Image, MaskPlane, MaskSchema, VisitImage + + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + visit = VisitImage(image=image, mask_schema=schema) + doc = self._round_trip_doc(visit) + self.assertEqual(doc.root.attributes.lsst["archive_class"], "VisitImage") + self.assertIn("image", doc.root.arrays) +``` + +- [ ] **Step 2: Run the tests** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrWriteOnDiskShapeTestCase -v` +Expected: PASS — both tests. If `VisitImage`'s constructor in this codebase needs different arguments than the snippet, adapt the constructor call only — the on-disk assertions stay. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_output_archive.py +git commit -m "test: pin on-disk zarr layout for MaskedImage and VisitImage" +``` + +--- + +**End of Phase 2.** Eight tasks. 
The output side now produces: + +- `image`, `variance`, `mask` siblings at the root with aligned chunks +- 2-D packed-integer mask with CF `flag_masks` / `flag_meanings` / `flag_descriptions` +- `_ARRAY_DIMENSIONS` and `units` / `long_name` per array (xarray-readable) +- OME multiscales metadata pointing at `/image` +- Affine `coordinateTransformations` validated against an 11×11 grid; dropped to unit scale when residual exceeds 1 pixel +- `wcs_ast` 1-D `uint8` array as the authoritative WCS round-trip source + +Phase 3 inverts this: `ZarrInputArchive`, `read()`, and the lazy-subset assertions that prove `slices=` only fetches the touched chunks. + +## Phase 3 — `ZarrInputArchive`, `read()`, lazy subset enforcement, mask unpack + +This phase delivers the read side. The hard constraint is the **lazy subset invariant**: `get_array(model, slices=...)` must forward `slices` to the underlying `zarr.Array` handle so a 4×4 subset of a 4096×4096 remote VisitImage downloads only the chunks intersecting that slice. The phase ships with a `_CountingStore`-based regression test that fails if any code path materializes the full array before slicing. + +The phase also adds the **mask unpack path**: `Mask.serialize` (when the archive sets `_prefer_native_mask_arrays = True`) hands us a 3-D `(y, x, mask_size)` array which Phase 2's `add_array` packs to 2-D wide-integer; on read, `get_array` detects the rank mismatch (model claims 3-D, on-disk is 2-D, on-disk has `flag_masks` attribute) and unpacks via bit shifts. + +### Task 3.0: Wire up `_prefer_native_mask_arrays` + +**Files:** +- Modify: `python/lsst/images/zarr/_output_archive.py` +- Test: `tests/test_zarr_round_trip.py` (later in this phase confirms it round-trips) + +A one-line retrofit to Phase 2 to make `Mask.serialize` choose the native 3-D path for our archive (matching what the NDF backend does). Without this, `Mask.serialize` calls `add_array` multiple times with 2-D `int32` splits and our packing path never runs. 
+ +- [ ] **Step 1: Add the class attribute** + +In `python/lsst/images/zarr/_output_archive.py`, edit the `ZarrOutputArchive` class definition to add the class attribute right above `__init__`: + +```python +class ZarrOutputArchive(OutputArchive[ZarrPointerModel]): + """Output archive that populates a `ZarrDocument` IR. + + ... (existing docstring) ... + """ + + _prefer_native_mask_arrays: ClassVar[bool] = True + """Tell Mask.serialize to hand us the 3-D ``(y, x, mask_size)`` + array in one ``add_array`` call. Our ``add_array`` packs that into + a 2-D wide-integer array on disk with CF flag_masks / flag_meanings + attributes. + """ + + def __init__(...): + ... +``` + +(Add `from typing import ClassVar` to the imports if it is not already present.) + +- [ ] **Step 2: Run the existing tests to confirm no regression** + +Run: `pytest tests/test_zarr_output_archive.py -v` +Expected: PASS — all 10 Phase 2 tests still pass; the class attribute does not change behavior for direct `add_array(3D)` calls. + +- [ ] **Step 3: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py +git commit -m "feat: opt ZarrOutputArchive into native 3-D mask serialization" +``` + +### Task 3.1: `ZarrInputArchive` skeleton — open + `get_tree` + error taxonomy + +**Files:** +- Create: `python/lsst/images/zarr/_input_archive.py` +- Test: `tests/test_zarr_input_archive.py` + +Constructor takes a `ZarrDocument` (built lazily via `from_zarr`). `get_tree(model_type)` reads `/tree`'s bytes and validates them. The `open` classmethod is a context manager around `_store.open_store_for_read`. + +Error taxonomy (per spec §4): +- Missing `lsst.archive_class` → `ArchiveReadError("File is not an LSST zarr archive")`. +- `lsst.version` newer than `LSST_VERSION` → `ArchiveReadError("Unsupported lsst:version {version}")`, where `{version}` is the on-disk value. + +`ZarrAttributes.load` keeps the on-disk `version` under a private sentinel `__version_remembered_at_load__` so the input archive can validate without going back to the raw store. 
+ +- [ ] **Step 1: Update `ZarrAttributes.load` / `dump` to round-trip the version sentinel** + +In `python/lsst/images/zarr/_model.py`, change `ZarrAttributes.load` to keep the version under a private key, and `dump` to ignore that key: + +```python + @classmethod + def load(cls, raw: dict[str, Any]) -> Self: + lsst = dict(raw.get(LSST_NS, {})) + version = lsst.pop("version", None) + if version is not None: + lsst["__version_remembered_at_load__"] = version + ome = dict(raw.get(OME_NS, {})) + ome.pop("version", None) + extra = {k: v for k, v in raw.items() if k not in (LSST_NS, OME_NS)} + return cls(lsst=lsst, ome=ome, extra=extra) + + def dump(self) -> dict[str, Any]: + out: dict[str, Any] = dict(self.extra) + public_lsst = { + k: v for k, v in self.lsst.items() if not k.startswith("__") + } + out[LSST_NS] = {"version": LSST_VERSION, **public_lsst} + if self.ome: + out[OME_NS] = {"version": OME_VERSION, **self.ome} + return out +``` + +- [ ] **Step 2: Write the failing test** + +Create `tests/test_zarr_input_archive.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +try: + import zarr + + from lsst.images.serialization import ArchiveReadError + from lsst.images.zarr._common import LSST_NS, LSST_VERSION + from lsst.images.zarr._input_archive import ZarrInputArchive + from lsst.images.zarr._model import ZarrDocument + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveSkeletonTestCase(unittest.TestCase): + def test_open_reads_tree(self) -> None: + from lsst.images import Box, Image + from lsst.images.zarr import write + from lsst.images._image import ImageSerializationModel + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(original, target) + with ZarrInputArchive.open(target) as archive: + tree = archive.get_tree(ImageSerializationModel) + self.assertIsNotNone(tree) + + def test_missing_archive_class_raises(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "bare.zarr") + os.makedirs(target) + store = zarr.storage.LocalStore(target, read_only=False) + zarr.create_group(store=store, zarr_format=3) # no lsst attrs + with self.assertRaisesRegex(ArchiveReadError, "not an LSST zarr archive"): + with ZarrInputArchive.open(target): + pass + + def test_future_version_refused(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "future.zarr") + os.makedirs(target) + store = zarr.storage.LocalStore(target, read_only=False) + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION + 1, + "archive_class": "Image", + "tree": "tree", + } + } + ) + with self.assertRaisesRegex(ArchiveReadError, "Unsupported lsst:version"): + with 
ZarrInputArchive.open(target): + pass + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 3: Run the test to verify it fails** + +Run: `pytest tests/test_zarr_input_archive.py -v` +Expected: FAIL — `ImportError`. + +- [ ] **Step 4: Write `_input_archive.py`** + +Create `python/lsst/images/zarr/_input_archive.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ("ZarrInputArchive", "read") + +from collections.abc import Callable, Iterator +from contextlib import contextmanager +from types import EllipsisType +from typing import Any, Self + +import astropy.io.fits +import astropy.table +import numpy as np + +from lsst.resources import ResourcePathExpression + +from .._transforms import FrameSet +from ..serialization import ( + ArchiveReadError, + ArchiveTree, + ArrayReferenceModel, + InlineArrayModel, + InputArchive, + ReadResult, + TableModel, + no_header_updates, +) +from ._common import LSST_VERSION, ZarrPointerModel +from ._model import ZarrArray, ZarrDocument + + +class ZarrInputArchive(InputArchive[ZarrPointerModel]): + """Reads zarr archives written by `ZarrOutputArchive`.""" + + def __init__(self, document: ZarrDocument) -> None: + self._document = document + self._validate_root_attributes() + self._deserialized_pointer_cache: dict[str, Any] = {} + self._frame_set_cache: dict[str, FrameSet] = {} + + @classmethod + @contextmanager + def open(cls, path: ResourcePathExpression) -> Iterator[Self]: + """Open a zarr archive for reading.""" + from ._store import open_store_for_read + + with open_store_for_read(path) as store: + doc = 
ZarrDocument.from_zarr(store) + yield cls(doc) + + @property + def document(self) -> ZarrDocument: + return self._document + + def get_tree[T: ArchiveTree](self, model_type: type[T]) -> T: + """Read and validate the main Pydantic tree at ``/tree``.""" + try: + node = self._document.root.get("/tree") + except KeyError: + raise ArchiveReadError( + "File has no /tree array; this is not an LSST zarr archive." + ) from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError("/tree must be a zarr array, not a group.") + json_bytes = bytes(node.read()) + return model_type.model_validate_json(json_bytes.decode("utf-8")) + + def _validate_root_attributes(self) -> None: + attrs = self._document.root.attributes.lsst + if "archive_class" not in attrs: + raise ArchiveReadError( + "File is not an LSST zarr archive (missing lsst.archive_class)." + ) + version = attrs.get("__version_remembered_at_load__", LSST_VERSION) + if version > LSST_VERSION: + raise ArchiveReadError( + f"Unsupported lsst:version {version}; this reader supports up " + f"to {LSST_VERSION}." + ) + + # The remaining abstract methods land in subsequent tasks. + def deserialize_pointer(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("deserialize_pointer lands in Task 3.3") + + def get_frame_set(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("get_frame_set lands in Task 3.3") + + def get_array(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("get_array lands in Task 3.2") + + def get_table(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("get_table lands in Task 3.4") + + def get_structured_array(self, *args: Any, **kwargs: Any) -> Any: # type: ignore[override] + raise NotImplementedError("get_structured_array lands in Task 3.4") + + +def read(*args: Any, **kwargs: Any) -> Any: + """Public read helper. 
Implemented in Task 3.5.""" + raise NotImplementedError("read() lands in Task 3.5") +``` + +- [ ] **Step 5: Run all relevant tests** + +Run: `pytest tests/test_zarr_input_archive.py tests/test_zarr_model.py -v` +Expected: PASS — input archive skeleton tests pass; the version-sentinel update does not break model tests. + +- [ ] **Step 6: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py python/lsst/images/zarr/_model.py tests/test_zarr_input_archive.py +git commit -m "feat: add ZarrInputArchive skeleton with get_tree and version validation" +``` + +### Task 3.2: `get_array` — lazy slice forwarding + mask unpack + +**Files:** +- Modify: `python/lsst/images/zarr/_input_archive.py` +- Modify: `tests/test_zarr_input_archive.py` + +`get_array(model, slices=...)`: + +1. Resolve the model's `source` (always a plain `zarr:/path/to/array` reference, e.g. `zarr:/image` — no query suffix). +2. Fetch the `ZarrArray` IR node — still lazy. +3. **Mask unpack:** if the model claims a 3-D `(y, x, mask_size)` shape but the on-disk array is 2-D and carries a `flag_masks` attribute, slice the 2-D array first (forwarding `slices` if it has rank 2; or its `slices[:-1]` if rank 3 was requested) and unpack via bit shifts to reconstruct the 3-D mask. +4. Otherwise call `ir_array.read(slices=slices)`, forwarding directly to the lazy handle. + +The lazy invariant test uses `_CountingStore` to count chunk fetches and asserts a single-chunk subset of a 16×16 / chunks=(4,4) array touches strictly fewer keys than a full read. 
+ +- [ ] **Step 1: Write the failing tests** + +Append to `tests/test_zarr_input_archive.py`: + +```python +class _CountingStore(zarr.storage.MemoryStore if HAVE_ZARR else object): + """A MemoryStore that counts get() calls.""" + + def __init__(self) -> None: + super().__init__() + self.reads = 0 + + async def get(self, key, prototype, byte_range=None): # type: ignore[override] + self.reads += 1 + return await super().get(key, prototype, byte_range) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveLazySubsetTestCase(unittest.TestCase): + """Lazy-subset invariant: subset reads only fetch touched chunks.""" + + def test_subset_read_touches_only_intersecting_chunks(self) -> None: + from lsst.images.serialization import ArrayReferenceModel, NumberType + + store = _CountingStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Image", + "tree": "tree", + } + } + ) + zarr_array = root.create_array( + name="image", shape=(16, 16), chunks=(4, 4), dtype="float32" + ) + zarr_array[:] = np.arange(256, dtype=np.float32).reshape(16, 16) + # Stub /tree so the input archive's constructor accepts the file. 
+ root.create_array(name="tree", shape=(2,), chunks=(2,), dtype="uint8")[:] = b"{}" + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + store.reads = 0 + full_ref = ArrayReferenceModel( + source="zarr:/image", + shape=[16, 16], + datatype=NumberType.from_numpy(np.dtype("float32")), + ) + full = archive.get_array(full_ref) + full_reads = store.reads + self.assertEqual(full.shape, (16, 16)) + + store.reads = 0 + subset = archive.get_array(full_ref, slices=(slice(0, 4), slice(0, 4))) + subset_reads = store.reads + self.assertEqual(subset.shape, (4, 4)) + np.testing.assert_array_equal(subset, np.arange(256).reshape(16, 16)[:4, :4]) + self.assertLess(subset_reads, full_reads) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveMaskUnpackTestCase(unittest.TestCase): + """Round-trip a packed 2-D mask through get_array's unpack path.""" + + def test_unpack_2d_packed_back_to_3d(self) -> None: + from lsst.images.serialization import ArrayReferenceModel, NumberType + + # Build an archive that has a 2-D packed mask on disk. + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Mask", + "tree": "tree", + } + } + ) + # 4x5 mask, 3 planes -> packed in uint8. 
+ on_disk = np.zeros((4, 5), dtype=np.uint8) + on_disk[0, 0] = 0b001 # plane 0 + on_disk[1, 1] = 0b110 # planes 1+2 + mask_array = root.create_array( + name="mask", shape=(4, 5), chunks=(4, 5), dtype="uint8" + ) + mask_array[:] = on_disk + mask_array.update_attributes( + { + "_ARRAY_DIMENSIONS": ["y", "x"], + "flag_masks": [1, 2, 4], + "flag_meanings": "BAD SAT CR", + "flag_descriptions": ["Bad pixel.", "Saturated.", "Cosmic ray."], + } + ) + root.create_array(name="tree", shape=(2,), chunks=(2,), dtype="uint8")[:] = b"{}" + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + # The model claims a 3-D shape (mask_size = 1 because <=8 planes). + model = ArrayReferenceModel( + source="zarr:/mask", + shape=[4, 5, 1], + datatype=NumberType.from_numpy(np.dtype("uint8")), + ) + result = archive.get_array(model) + self.assertEqual(result.shape, (4, 5, 1)) + self.assertEqual(result[0, 0, 0], 0b001) + self.assertEqual(result[1, 1, 0], 0b110) + + def test_unpack_uint64_with_5_bytes(self) -> None: + from lsst.images.serialization import ArrayReferenceModel, NumberType + + # 40 planes packed into uint64 -> mask_size = 5. + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Mask", + "tree": "tree", + } + } + ) + on_disk = np.zeros((4, 5), dtype=np.uint64) + on_disk[0, 0] = 0x01_02_03_04_05 # arbitrary bit pattern + mask_array = root.create_array( + name="mask", shape=(4, 5), chunks=(4, 5), dtype="uint64" + ) + mask_array[:] = on_disk + mask_array.update_attributes( + { + "_ARRAY_DIMENSIONS": ["y", "x"], + "flag_masks": [1 << i for i in range(40)], + "flag_meanings": " ".join(f"P{i}" for i in range(40)), + "flag_descriptions": [f"Plane {i}." 
for i in range(40)], + } + ) + root.create_array(name="tree", shape=(2,), chunks=(2,), dtype="uint8")[:] = b"{}" + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + model = ArrayReferenceModel( + source="zarr:/mask", + shape=[4, 5, 5], + datatype=NumberType.from_numpy(np.dtype("uint8")), + ) + result = archive.get_array(model) + self.assertEqual(result.shape, (4, 5, 5)) + # Bytes recovered from the packed uint64. + self.assertEqual(result[0, 0, 0], 0x05) # low byte + self.assertEqual(result[0, 0, 1], 0x04) + self.assertEqual(result[0, 0, 2], 0x03) + self.assertEqual(result[0, 0, 3], 0x02) + self.assertEqual(result[0, 0, 4], 0x01) +``` + +- [ ] **Step 2: Run the tests to verify they fail** + +Run: `pytest tests/test_zarr_input_archive.py -v` +Expected: FAIL — `get_array` raises `NotImplementedError` for both new test classes. + +- [ ] **Step 3: Implement `get_array`** + +In `python/lsst/images/zarr/_input_archive.py`, replace the `get_array` placeholder: + +```python + def get_array( + self, + model: ArrayReferenceModel | InlineArrayModel, + *, + slices: tuple[slice, ...] | EllipsisType = ..., + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> np.ndarray: + if isinstance(model, InlineArrayModel): + data: np.ndarray = np.array(model.data, dtype=model.datatype.to_numpy()) + return data if slices is ... else data[slices] + if not isinstance(model.source, str) or not model.source.startswith("zarr:"): + raise ArchiveReadError( + f"ZarrInputArchive cannot resolve array source {model.source!r}; " + f"expected a 'zarr:' reference." 
+ ) + zarr_path = model.source[len("zarr:") :] + try: + node = self._document.root.get(zarr_path) + except KeyError: + raise ArchiveReadError(f"Array reference {zarr_path!r} not in store.") from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError(f"{zarr_path!r} is not an array.") + + # Mask unpack: model claims 3-D (y, x, mask_size); on-disk is 2-D + # (y, x) packed wide-int with flag_masks attribute. + claimed_shape = tuple(model.shape) if model.shape is not None else None + if ( + claimed_shape is not None + and len(claimed_shape) == 3 + and len(node.shape) == 2 + and "flag_masks" in node.attributes.extra + ): + return self._read_packed_mask(node, claimed_shape, slices) + + # Standard path: forward slices straight to the lazy handle. + return node.read(slices=slices) + + def _read_packed_mask( + self, + node: ZarrArray, + claimed_shape: tuple[int, ...], + slices: tuple[slice, ...] | EllipsisType, + ) -> np.ndarray: + """Unpack a 2-D wide-int mask back to 3-D ``(y, x, mask_size)``. + + ``slices`` is forwarded to the underlying handle as-is when it + has rank 2; rank-3 slices have their last axis stripped and + re-applied after the unpack. + """ + mask_size = claimed_shape[2] + # Forward 2-D slice to the lazy handle; only intersecting + # chunks are fetched even on remote stores. + if slices is ...: + spatial_slices: tuple[slice, ...] | EllipsisType = ... + byte_slice: slice | EllipsisType = ... + elif len(slices) == 3: + spatial_slices = slices[:2] + byte_slice = slices[2] + else: + spatial_slices = slices + byte_slice = ... + packed = node.read(slices=spatial_slices) + # Unpack: low byte first. 
+ out = np.empty(packed.shape + (mask_size,), dtype=np.uint8) + for i in range(mask_size): + out[..., i] = (packed >> np.uint64(8 * i)) & np.uint64(0xFF) + if byte_slice is ...: + return out + return out[..., byte_slice] +``` + +- [ ] **Step 4: Run the tests to verify they pass** + +Run: `pytest tests/test_zarr_input_archive.py -v` +Expected: PASS — lazy-subset invariant holds, mask unpack recovers both single-byte and five-byte packings. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py tests/test_zarr_input_archive.py +git commit -m "feat: implement ZarrInputArchive.get_array with lazy slices and mask unpack" +``` + +### Task 3.3: `deserialize_pointer`, `get_frame_set`, AST WCS reconstruction + +**Files:** +- Modify: `python/lsst/images/zarr/_input_archive.py` +- Modify: `tests/test_zarr_input_archive.py` + +`deserialize_pointer(pointer, model_type, deserializer)`: + +1. Cache hit by `pointer.path` → return cached object. +2. Read JSON bytes at `pointer.path` (a `ZarrArray` of `uint8`). +3. Validate via `model_type.model_validate_json` and call `deserializer(model, self)`. +4. Cache the result; if it is a `FrameSet`, also cache it under `_frame_set_cache` so `get_frame_set` can return it. + +For `Projection.deserialize` to find the AST WCS, the Projection serialization model carries a `ZarrPointerModel` referencing `/wcs_ast` (set by `add_tree` in Phase 2). When that pointer is deserialized, the deserializer reads the AST string bytes via `get_array` (the `wcs_ast` array is plain `uint8` so `get_array` returns it as-is) and reconstructs the FrameSet with `astshim.Object.fromString`. + +The AST reconstruction is performed inside the projection deserializer, not the input archive — but the input archive needs to expose the bytes at `/wcs_ast` so the deserializer can call `get_array` on it. That happens automatically since `/wcs_ast` is just a regular zarr array. 
+ +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_input_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchivePointerTestCase(unittest.TestCase): + def test_deserialize_pointer_caches_results(self) -> None: + import pydantic + + from lsst.images.zarr._common import ZarrPointerModel + + class _Sub(pydantic.BaseModel): + label: str + + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + {LSST_NS: {"version": LSST_VERSION, "archive_class": "Image", "tree": "tree"}} + ) + # Stub /tree. + root.create_array(name="tree", shape=(2,), chunks=(2,), dtype="uint8")[:] = b"{}" + # Sub-archive with its own /tree at /psf/tree. + json_bytes = b'{"label": "psf"}' + psf = root.create_group("psf") + arr = psf.create_array( + name="tree", + shape=(len(json_bytes),), + chunks=(len(json_bytes),), + dtype="uint8", + ) + arr[:] = np.frombuffer(json_bytes, dtype=np.uint8) + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + deserialize_calls: list[int] = [] + + def deserializer(model, arch): # noqa: ANN001 + deserialize_calls.append(1) + return model + + pointer = ZarrPointerModel(path="/psf/tree") + first = archive.deserialize_pointer(pointer, _Sub, deserializer) + second = archive.deserialize_pointer(pointer, _Sub, deserializer) + self.assertEqual(first.label, "psf") + self.assertIs(first, second) + self.assertEqual(len(deserialize_calls), 1) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrInputArchivePointerTestCase -v` +Expected: FAIL — `deserialize_pointer` raises `NotImplementedError`. 
+ +- [ ] **Step 3: Implement `deserialize_pointer` and `get_frame_set`** + +Replace the placeholders in `python/lsst/images/zarr/_input_archive.py`: + +```python + def deserialize_pointer[U: ArchiveTree, V]( + self, + pointer: ZarrPointerModel, + model_type: type[U], + deserializer: Callable[[U, InputArchive[ZarrPointerModel]], V], + ) -> V: + if (cached := self._deserialized_pointer_cache.get(pointer.path)) is not None: + return cached + try: + node = self._document.root.get(pointer.path) + except KeyError: + raise ArchiveReadError( + f"Pointer reference {pointer.path!r} not in store." + ) from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError(f"Pointer target {pointer.path!r} is not an array.") + json_text = bytes(node.read()).decode("utf-8") + model = model_type.model_validate_json(json_text) + result = deserializer(model, self) + self._deserialized_pointer_cache[pointer.path] = result + if isinstance(result, FrameSet): + self._frame_set_cache[pointer.path] = result + return result + + def get_frame_set(self, pointer: ZarrPointerModel) -> FrameSet: + try: + return self._frame_set_cache[pointer.path] + except KeyError: + raise AssertionError( + f"Frame set at {pointer.path!r} must be deserialised via " + f"deserialize_pointer before any dependent transform can be." + ) from None +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_input_archive.py -v` +Expected: PASS — pointer-cache test asserts the deserializer is called exactly once. 
+ +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py tests/test_zarr_input_archive.py +git commit -m "feat: implement deserialize_pointer and get_frame_set" +``` + +### Task 3.4: `get_table`, `get_structured_array` + +**Files:** +- Modify: `python/lsst/images/zarr/_input_archive.py` +- Modify: `tests/test_zarr_input_archive.py` + +Mirrors the FITS implementation: each column is a separate `ArrayReferenceModel(source=f"zarr:/lsst/tables/<name>/<column>")` (table name, then column name) resolved via `get_array`. + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_input_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveTableTestCase(unittest.TestCase): + def test_get_table_reconstructs_columns(self) -> None: + import astropy.table + + from lsst.images.zarr._model import ZarrArray + from lsst.images.zarr._output_archive import ZarrOutputArchive + + out = ZarrOutputArchive() + # Wire up the LSST root attributes. + out.document.root.attributes.lsst["archive_class"] = "Image" + out.document.root.attributes.lsst["tree"] = "tree" + out.document.root.arrays["tree"] = ZarrArray( + data=np.frombuffer(b"{}", dtype=np.uint8) + ) + original = astropy.table.Table( + { + "x": np.arange(4, dtype=np.int32), + "y": np.arange(4, dtype=np.float32), + } + ) + model = out.add_table(original, name="cat") + + store = zarr.storage.MemoryStore() + out.document.to_zarr(store) + doc = ZarrDocument.from_zarr(store) + inp = ZarrInputArchive(doc) + + recovered = inp.get_table(model) + self.assertEqual(recovered.colnames, ["x", "y"]) + np.testing.assert_array_equal(recovered["x"], original["x"]) + np.testing.assert_array_equal(recovered["y"], original["y"]) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrInputArchiveTableTestCase -v` +Expected: FAIL — `get_table` raises `NotImplementedError`.
+ +- [ ] **Step 3: Implement `get_table` and `get_structured_array`** + +Replace the placeholders: + +```python + def get_table( + self, + model: TableModel, + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> astropy.table.Table: + result = astropy.table.Table(meta=model.meta) + for column_model in model.columns: + if isinstance(column_model.data, InlineArrayModel): + data: Any = column_model.data.data + else: + data = self.get_array(column_model.data, strip_header=strip_header) + result[column_model.name] = astropy.table.Column( + data, + name=column_model.name, + dtype=column_model.data.datatype.to_numpy(), + unit=column_model.unit, + description=column_model.description, + meta=column_model.meta, + ) + return result + + def get_structured_array( + self, + model: TableModel, + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> np.ndarray: + return self.get_table(model, strip_header).as_array() +``` + +- [ ] **Step 4: Run the test** + +Run: `pytest tests/test_zarr_input_archive.py -v` +Expected: PASS — all input-archive tests pass. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py tests/test_zarr_input_archive.py +git commit -m "feat: implement ZarrInputArchive.get_table and get_structured_array" +``` + +### Task 3.5: Public `read()` helper + +**Files:** +- Modify: `python/lsst/images/zarr/_input_archive.py` +- Modify: `python/lsst/images/zarr/__init__.py` +- Modify: `tests/test_zarr_input_archive.py` + +`read(cls, path, **kwargs)` opens a `ZarrInputArchive`, calls `archive.get_tree(cls._get_archive_tree_type(ZarrPointerModel))`, and returns `ReadResult(tree.deserialize(archive, **kwargs), tree.metadata, tree.butler_info)`. No auto-detect path in v1 — files without `lsst.archive_class` raise. 
+ +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_input_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrReadHelperTestCase(unittest.TestCase): + def test_round_trip_image(self) -> None: + from lsst.images import Box, Image + from lsst.images.zarr import read, write + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(original, target) + result = read(Image, target) + self.assertEqual(result.deserialized.array.shape, (4, 5)) + np.testing.assert_array_equal(result.deserialized.array, original.array) + self.assertEqual(result.deserialized.bbox, original.bbox) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrReadHelperTestCase -v` +Expected: FAIL — `read()` raises `NotImplementedError`. + +- [ ] **Step 3: Implement `read`** + +Replace the placeholder in `_input_archive.py`: + +```python +def read[T: Any](cls: type[T], path: ResourcePathExpression, **kwargs: Any) -> ReadResult[T]: + """Read an object from a zarr archive. + + The archive's root attributes name the in-memory class via + ``lsst.archive_class``. Files without this attribute raise; auto- + detect of foreign zarr files is a follow-up. 
+ """ + with ZarrInputArchive.open(path) as archive: + tree_type = cls._get_archive_tree_type(ZarrPointerModel) + tree = archive.get_tree(tree_type) + obj = tree.deserialize(archive, **kwargs) + return ReadResult(obj, tree.metadata, tree.butler_info) +``` + +Re-export from `python/lsst/images/zarr/__init__.py`: + +```python +from ._common import * # noqa: F401, F403 +from ._input_archive import * # noqa: F401, F403 +from ._output_archive import * # noqa: F401, F403 +``` + +- [ ] **Step 4: Run the round-trip test** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrReadHelperTestCase -v` +Expected: PASS — `Image` round-trips via `write` + `read`. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py python/lsst/images/zarr/__init__.py tests/test_zarr_input_archive.py +git commit -m "feat: add public zarr.read() helper" +``` + +### Task 3.6: `RoundtripZarr` test helper + round-trips for Image / MaskedImage / VisitImage + +**Files:** +- Modify: `python/lsst/images/tests/_roundtrip.py` (add `RoundtripZarr`) +- Create: `tests/test_zarr_round_trip.py` + +`RoundtripZarr` lets the existing `RoundtripBase` pattern exercise the zarr backend the same way it does FITS / JSON / NDF. The new test file uses it to round-trip the three image types covered by Phase 2. + +- [ ] **Step 1: Add `RoundtripZarr` to `_roundtrip.py`** + +Edit `python/lsst/images/tests/_roundtrip.py`. Add `"RoundtripZarr"` to `__all__`, then append after `RoundtripNdf`: + +```python +class RoundtripZarr[T](RoundtripBase[T]): + def inspect(self) -> Any: + """Open the zarr archive's IR for inspection.""" + import zarr + + from lsst.images.zarr._model import ZarrDocument + + return ZarrDocument.from_zarr( + zarr.storage.LocalStore(self.filename, read_only=True) + ) + + def _get_extension(self) -> str: + return ".zarr" + + def _write(self, obj: Any, filename: str) -> ArchiveTree: + from .. 
import zarr as zarr_backend + + return zarr_backend.write(obj, filename) + + def _read(self, obj_type: Any, filename: str) -> ReadResult: + from .. import zarr as zarr_backend + + return zarr_backend.read(obj_type, filename) +``` + +If `RoundtripBase` constructs the on-disk path with `tempfile.NamedTemporaryFile`, audit it for directory-vs-file assumptions: a zarr archive is a directory when `_get_extension()` returns `.zarr`. Mirror what NDF does with `.sdf` (single file) but extend to handle the directory case — likely a `tempfile.TemporaryDirectory` used as the parent and the archive path joined under it. + +- [ ] **Step 2: Write the failing test** + +Create `tests/test_zarr_round_trip.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import unittest + +import numpy as np + +try: + import zarr # noqa: F401 + + from lsst.images.tests import RoundtripZarr + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrRoundTripTestCase(unittest.TestCase): + def test_image_round_trip(self) -> None: + from lsst.images import Box, Image + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.recovered + np.testing.assert_array_equal(recovered.array, original.array) + self.assertEqual(recovered.bbox, original.bbox) + + def test_masked_image_round_trip(self) -> None: + from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + original = MaskedImage(image, mask_schema=schema) + original.mask.set("BAD", image.array % 2 == 0) + original.mask.set("SAT", image.array > 10) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.recovered + np.testing.assert_array_equal(recovered.image.array, original.image.array) + np.testing.assert_array_equal(recovered.mask.array, original.mask.array) + + def test_masked_image_with_40_planes_round_trip(self) -> None: + from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + + schema = MaskSchema([MaskPlane(f"P{i}", f"Plane {i}.") for i in range(40)]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + original = MaskedImage(image, mask_schema=schema) + original.mask.set("P0", image.array % 2 == 0) + original.mask.set("P39", image.array > 10) + + with RoundtripZarr(self, original) as roundtrip: 
+ recovered = roundtrip.recovered + # 40 planes packed into uint64 on disk, unpacked to 5 bytes per pixel. + np.testing.assert_array_equal(recovered.mask.array, original.mask.array) + + def test_visit_image_round_trip(self) -> None: + from lsst.images import Box, Image, MaskPlane, MaskSchema, VisitImage + + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + original = VisitImage(image=image, mask_schema=schema) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.recovered + np.testing.assert_array_equal(recovered.image.array, original.image.array) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 3: Run the tests** + +Run: `pytest tests/test_zarr_round_trip.py -v` +Expected: PASS — all four round-trips. If a test fails because some per-class detail is missing (e.g. a `lsst.companions` style key our `add_tree` should set, or a Projection deserializer that needs to find `/wcs_ast`), fix it in `_input_archive.py` / `_output_archive.py` and re-run. The 40-plane test is the load-bearing assertion that the wide-int packing + unpack round-trip is bit-exact. + +- [ ] **Step 4: Commit** + +```bash +git add python/lsst/images/tests/_roundtrip.py tests/test_zarr_round_trip.py +git commit -m "test: round-trip Image, MaskedImage (3- and 40-plane), VisitImage through zarr" +``` + +--- + +**End of Phase 3.** Seven tasks. Read side complete for `Image` / `MaskedImage` / `VisitImage`, lazy-subset invariant pinned by `_CountingStore`, mask unpack pinned by both 3-plane (uint8) and 40-plane (uint64) tests, full write→read round-trips green. Phase 4 adds `ColorImage` (recursive sub-archives) and `CellCoadd` (cell-aligned chunks + native 4-D PSF). 
+ +## Phase 4 — `ColorImage` and `CellCoadd` + +This phase adds the two archive classes whose layouts go beyond the flat `image`/`variance`/`mask` siblings: + +- **`ColorImage`**: red/green/blue sub-archives. Each is itself a valid Image-shaped sub-archive (its own `image` array, its own OME multiscales, its own `lsst.archive_class = "Image"`). The root group has `lsst.archive_class = "ColorImage"` and **no** OME multiscales of its own. +- **`CellCoadd`**: `image`/`variance`/`mask` siblings (cell-aligned chunks) plus a 4-D `psf` array `(Cy, Cx, Py, Px)` with single-cell chunks `(1, 1, Py, Px)`. `lsst.cell_grid = {bbox, cell_shape}` on the root attrs. + +The recurring theme: **no fixup pass**. Each `add_array` call lands at the path its `name` argument names. Per-archive-class attribute decoration runs once in `add_tree` against the populated IR. + +### Task 4.1: Recursive sub-archive attribute decoration + +**Files:** +- Modify: `python/lsst/images/zarr/_layout.py` (add `decorate_sub_archives`) +- Modify: `python/lsst/images/zarr/_output_archive.py` (call it from `add_tree`) +- Modify: `tests/test_zarr_layout.py` +- Modify: `tests/test_zarr_output_archive.py` + +For ColorImage's `red/`, `green/`, `blue/` to be valid OME-NGFF / xarray groups in their own right, each needs `lsst.archive_class = "Image"` and an `ome.multiscales` block pointing at its `image` array. The decoration is purely metadata — no bytes move. + +The detection rule for "this sub-group is a sub-archive": it contains an `image` array (any rank). The decoration is recursive — sub-sub-archives (e.g. a Projection's parameter image inside a PSF sub-archive) get the same treatment. 
+ +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_layout.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class DecorateSubArchivesTestCase(unittest.TestCase): + def test_sub_group_with_image_gets_lsst_and_ome_attrs(self) -> None: + import numpy as np + + from lsst.images.zarr._layout import decorate_sub_archives + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "ColorImage" + # red sub-archive with its own image array. + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((4, 5), dtype="float32")) + + decorate_sub_archives(doc) + + self.assertEqual(red.attributes.lsst["archive_class"], "Image") + self.assertIn("multiscales", red.attributes.ome) + self.assertEqual( + red.attributes.ome["multiscales"][0]["datasets"][0]["path"], "image" + ) + + def test_root_archive_class_is_unchanged(self) -> None: + import numpy as np + + from lsst.images.zarr._layout import decorate_sub_archives + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "ColorImage" + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((4, 5), dtype="float32")) + + decorate_sub_archives(doc) + + # Root keeps ColorImage; only sub-groups are decorated. + self.assertEqual(doc.root.attributes.lsst["archive_class"], "ColorImage") +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_layout.py::DecorateSubArchivesTestCase -v` +Expected: FAIL — `decorate_sub_archives` does not exist. 
+ +- [ ] **Step 3: Implement the decoration pass** + +Append to `python/lsst/images/zarr/_layout.py`: + +```python +__all__ = ( + "AffineCheckResult", + "affine_check", + "axes_for_archive_class", + "chunks_aligned_to", + "chunks_for", + "decorate_sub_archives", +) + + +def decorate_sub_archives(document: "ZarrDocument") -> None: + """Walk ``document`` and decorate every sub-archive group with attrs. + + A sub-archive is any group below the root that contains an ``image`` + array. Decoration adds ``lsst.archive_class = "Image"`` and an + ``ome.multiscales`` block pointing at the sub-archive's ``image`` + array. Recursive: nested sub-archives are decorated too. + + The root group is left alone — its ``lsst.archive_class`` is set + by ``add_tree`` based on the in-memory object's type. + """ + from ._model import OmeMultiscale, ZarrDocument, ZarrGroup # local: avoid cycle + + if not isinstance(document, ZarrDocument): + raise TypeError(type(document).__name__) + _decorate_walk(document.root, depth=0) + + +def _decorate_walk(group: "ZarrGroup", *, depth: int) -> None: + from ._model import OmeMultiscale, ZarrGroup # local: avoid cycle + + for name, sub in group.groups.items(): + if "image" in sub.arrays: + sub.attributes.lsst.setdefault("archive_class", "Image") + sub.attributes.lsst.setdefault("tree", "tree") if "tree" in sub.arrays else None + if "multiscales" not in sub.attributes.ome: + multiscale = OmeMultiscale( + name="image", + axes=("y", "x"), + dataset_path="image", + ) + sub.attributes.ome["multiscales"] = [multiscale.dump()] + _decorate_walk(sub, depth=depth + 1) +``` + +In `python/lsst/images/zarr/_output_archive.py`, call it at the end of `add_tree` (just before the method returns): + +```python + from ._layout import decorate_sub_archives + + decorate_sub_archives(self.document) +``` + +- [ ] **Step 4: Add an output-archive integration test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not 
installed") +class ZarrColorImageWriteTestCase(unittest.TestCase): + def test_color_image_emits_recursive_sub_archives(self) -> None: + import os + import tempfile + + import numpy as np + import zarr + + from lsst.images import Box, ColorImage, Image + from lsst.images.zarr import write + from lsst.images.zarr._common import LSST_NS, OME_NS + from lsst.images.zarr._model import ZarrDocument + + red = Image(np.full((4, 5), 1, dtype=np.uint8), bbox=Box.factory[10:14, 20:25]) + green = Image(np.full((4, 5), 2, dtype=np.uint8), bbox=red.bbox) + blue = Image(np.full((4, 5), 3, dtype=np.uint8), bbox=red.bbox) + color = ColorImage(red=red, green=green, blue=blue) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(color, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + # Root: ColorImage, no ome.multiscales (axes_for_archive_class + # returns () for ColorImage). + self.assertEqual( + doc.root.attributes.lsst["archive_class"], "ColorImage" + ) + self.assertNotIn("multiscales", doc.root.attributes.ome) + # Each channel sub-archive has its own image array... + for channel in ("red", "green", "blue"): + sub = doc.root.groups[channel] + self.assertIn("image", sub.arrays) + self.assertEqual(sub.arrays["image"].shape, (4, 5)) + # ...and is decorated as a valid Image sub-archive. + self.assertEqual(sub.attributes.lsst["archive_class"], "Image") + self.assertIn("multiscales", sub.attributes.ome) + self.assertEqual( + sub.attributes.ome["multiscales"][0]["datasets"][0]["path"], + "image", + ) +``` + +- [ ] **Step 5: Run all tests** + +Run: `pytest tests/test_zarr_layout.py tests/test_zarr_output_archive.py -v` +Expected: PASS — decoration is applied recursively, ColorImage's three channels are valid Image sub-archives. 
+ +- [ ] **Step 6: Commit** + +```bash +git add python/lsst/images/zarr/_layout.py python/lsst/images/zarr/_output_archive.py tests/test_zarr_layout.py tests/test_zarr_output_archive.py +git commit -m "feat: decorate sub-archives with lsst.archive_class and ome.multiscales" +``` + +### Task 4.2: ColorImage round-trip + +**Files:** +- Modify: `tests/test_zarr_round_trip.py` + +The decoration in 4.1 plus the existing `read()` deserializer should round-trip ColorImage with no further code changes. This task asserts that. + +- [ ] **Step 1: Write the test** + +Append to `tests/test_zarr_round_trip.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrColorImageRoundTripTestCase(unittest.TestCase): + def test_color_image_round_trip(self) -> None: + from lsst.images import Box, ColorImage, Image + + red = Image(np.full((4, 5), 1, dtype=np.uint8), bbox=Box.factory[10:14, 20:25]) + green = Image(np.full((4, 5), 2, dtype=np.uint8), bbox=red.bbox) + blue = Image(np.full((4, 5), 3, dtype=np.uint8), bbox=red.bbox) + original = ColorImage(red=red, green=green, blue=blue) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.recovered + np.testing.assert_array_equal(recovered.red.array, original.red.array) + np.testing.assert_array_equal(recovered.green.array, original.green.array) + np.testing.assert_array_equal(recovered.blue.array, original.blue.array) +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_round_trip.py::ZarrColorImageRoundTripTestCase -v` +Expected: PASS. If this fails because the ColorImage deserializer needs sub-archive `tree` documents that we are not staging (since we use `serialize_direct`, not `serialize_pointer`), the failure tells you exactly what's missing — adapt the `decorate_sub_archives` pass to also write a per-sub-archive `tree` document if the ColorImage deserializer demands it. 
+ +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_round_trip.py +git commit -m "test: round-trip ColorImage through the zarr backend" +``` + +### Task 4.3: CellCoadd PSF — single-cell chunks for the 4-D array + +**Files:** +- Modify: `python/lsst/images/zarr/_layout.py` (extend `chunks_for` to accept `axis_hint`) +- Modify: `python/lsst/images/zarr/_output_archive.py` (special-case `name="psf"` to chunk per-cell) +- Modify: `tests/test_zarr_layout.py` +- Modify: `tests/test_zarr_output_archive.py` + +CellCoadd's PSF is a 4-D array `(Cy, Cx, Py, Px)` where the leading two axes index cells and the trailing two are the per-cell PSF image. Single-cell reads should be one chunk, so the default chunk shape is `(1, 1, Py, Px)`. + +`add_array` recognises a 4-D array whose leaf name is `psf` at the archive root (`name="psf"`, i.e. path `/psf`) and applies the single-cell-chunked default if the user has not supplied a chunk override. A `psf` array nested inside a sub-group is not special-cased. + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrPsfChunkingTestCase(unittest.TestCase): + def test_psf_array_uses_single_cell_chunks(self) -> None: + import numpy as np + + psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + ref = archive.add_array(psf, name="psf") + self.assertEqual(ref.source, "zarr:/psf") + node = archive.document.root.get("/psf") + # Single-cell chunks: leading axes are 1; spatial axes match shape.
+ self.assertEqual(tuple(node.chunks), (1, 1, 21, 21)) + + def test_psf_user_override_wins(self) -> None: + import numpy as np + + psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive( + archive_class="CellCoadd", + chunks={"psf": (2, 3, 21, 21)}, + ) + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.chunks), (2, 3, 21, 21)) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrPsfChunkingTestCase -v` +Expected: FAIL — `test_psf_array_uses_single_cell_chunks` fails because the current `add_array` defaults to `min(1024, dim)` per axis, giving `(2, 3, 21, 21)` instead of the expected `(1, 1, 21, 21)`; the leading (cell) axes are never forced to 1. + +- [ ] **Step 3: Implement the special case** + +In `python/lsst/images/zarr/_output_archive.py`, edit `add_array` to handle the PSF name. After computing `parent_path` and `leaf` and before staging the `ZarrArray`, add: + +```python + # Default chunks for a CellCoadd-style 4-D PSF: one cell per chunk. + if ( + chunks is None + and leaf == "psf" + and array.ndim == 4 + and parent_path in ("/", "") + ): + chunks = (1, 1, array.shape[2], array.shape[3]) +``` + +(Place this after user-supplied chunk overrides have been resolved into `chunks`, but before the generic per-axis default is filled in — once that default has run, `chunks` is no longer `None` and this guard never fires. Placed there, user overrides still win.) + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrPsfChunkingTestCase -v` +Expected: PASS — both tests. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py tests/test_zarr_output_archive.py +git commit -m "feat: default CellCoadd PSF to single-cell chunks (1, 1, Py, Px)" +``` + +### Task 4.4: CellCoadd output-archive layout test + +**Files:** +- Modify: `tests/test_zarr_output_archive.py` + +Pin the on-disk layout for a `CellCoadd`: image / variance / mask siblings with cell-aligned chunks, 4-D PSF with single-cell chunks, `lsst.cell_grid` on the root.
+ +The test's CellCoadd construction is implementer-supplied — the existing `python/lsst/images/cells/_coadd.py` constructor takes a particular set of arguments. The implementer must read it and assemble a minimal valid coadd; the on-disk assertions below stand regardless of constructor specifics. + +- [ ] **Step 1: Write the test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrCellCoaddWriteTestCase(unittest.TestCase): + def test_cell_coadd_layout(self) -> None: + import os + import tempfile + + import zarr + + from lsst.images.zarr import write + from lsst.images.zarr._model import ZarrDocument + + coadd = _make_minimal_cell_coadd( + cell_shape=(256, 256), + shape=(512, 512), + n_cells=(2, 2), + psf_shape=(21, 21), + ) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "coadd.zarr") + write(coadd, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + # Root archive class. + self.assertEqual( + doc.root.attributes.lsst["archive_class"], "CellCoadd" + ) + # cell_grid metadata is on the root attrs. + self.assertIn("cell_grid", doc.root.attributes.lsst) + cg = doc.root.attributes.lsst["cell_grid"] + self.assertEqual(tuple(cg["cell_shape"]), (256, 256)) + # image / variance / mask siblings, cell-aligned chunks. + self.assertEqual(tuple(doc.root.arrays["image"].chunks), (256, 256)) + self.assertEqual(tuple(doc.root.arrays["variance"].chunks), (256, 256)) + self.assertEqual(tuple(doc.root.arrays["mask"].chunks), (256, 256)) + # 4-D psf with single-cell chunks. + psf = doc.root.arrays["psf"] + self.assertEqual(psf.shape, (2, 2, 21, 21)) + self.assertEqual(tuple(psf.chunks), (1, 1, 21, 21)) + + +def _make_minimal_cell_coadd(*, cell_shape, shape, n_cells, psf_shape): # noqa: ANN001, ANN201 + """Construct a minimal CellCoadd for layout testing. 
+ + Implementer: read ``python/lsst/images/cells/_coadd.py`` and + assemble the smallest valid CellCoadd whose ``cell_shape``, + overall image shape, cell-grid dimensions, and per-cell PSF + shape match the requested values. The test only asserts on the + on-disk layout the write helper produces. + """ + raise unittest.SkipTest( + "Implementer: build a minimal CellCoadd per the local ctor." + ) +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrCellCoaddWriteTestCase -v` +Expected: After the implementer replaces `_make_minimal_cell_coadd`, PASS. SKIP otherwise — the placeholder must be replaced before merging this phase. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_output_archive.py +git commit -m "test: pin on-disk zarr layout for CellCoadd" +``` + +### Task 4.5: CellCoadd round-trip + +**Files:** +- Modify: `tests/test_zarr_round_trip.py` + +The same minimal CellCoadd factory used in Task 4.4 round-trips through `RoundtripZarr`. Spot-checks the image and one per-cell PSF. + +- [ ] **Step 1: Write the test** + +Append to `tests/test_zarr_round_trip.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrCellCoaddRoundTripTestCase(unittest.TestCase): + def test_cell_coadd_round_trip(self) -> None: + original = _make_minimal_cell_coadd_with_psf() # implementer-supplied + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.recovered + np.testing.assert_array_equal( + recovered.image.array, original.image.array + ) + # Spot-check one per-cell PSF if the API exposes them. + if hasattr(original, "psf") and hasattr(original.psf, "per_cell"): + np.testing.assert_array_equal( + recovered.psf.per_cell[0, 0], original.psf.per_cell[0, 0] + ) + + +def _make_minimal_cell_coadd_with_psf(): # noqa: ANN201 + """Implementer: assemble a minimal CellCoadd with a 4-D per-cell PSF. 
+ + Reuse `_make_minimal_cell_coadd` from `test_zarr_output_archive.py` + if the same factory works here, or build one in this file. + """ + raise unittest.SkipTest( + "Implementer: assemble a minimal CellCoadd with a per-cell PSF." + ) +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_round_trip.py::ZarrCellCoaddRoundTripTestCase -v` +Expected: After the implementer replaces the factory, PASS. SKIP otherwise; replace before merging this phase. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_round_trip.py +git commit -m "test: round-trip CellCoadd through the zarr backend" +``` + +--- + +**End of Phase 4.** Five tasks. ColorImage writes its three channels as recursive sub-archives (each a valid Image sub-archive with its own OME multiscales), CellCoadd writes flat siblings with cell-aligned chunks plus a 4-D PSF with single-cell chunks. Both types round-trip without any byte duplication or fixup pass. Phase 5 covers FITS↔Zarr opaque-metadata round-trips, xarray interop assertions, and the optional external-reader sanity tests. + +## Phase 5 — Cross-format round-trips, xarray interop, external readers + +This phase makes the zarr backend a peer of FITS / NDF for round-trip preservation: an object read from FITS carries its primary-HDU header in `_opaque_metadata`, and writing that object to zarr preserves those cards so a later round-trip back to FITS reproduces the original primary-header keywords and values. (The flat `{keyword: value}` JSON encoding introduced in Task 5.1 does not carry card comments or repeated keywords such as `COMMENT` / `HISTORY`, so the guarantee is keyword/value fidelity rather than byte-for-byte identity.) + +It also confirms the **xarray interop contract**: `xr.open_zarr(path)` returns a `Dataset` with `image` / `variance` / `mask` data variables sharing the `(y, x)` dimensions and CF flag attrs surviving on the mask. Two optional external-reader checks (`ngff-validator`, `ome-zarr-py`) round out the phase; both skip silently when their dependencies are absent. 
+ +### Task 5.1: Persist `FitsOpaqueMetadata` on write to zarr + +**Files:** +- Modify: `python/lsst/images/zarr/_layout.py` (add `serialize_fits_opaque_metadata`) +- Modify: `python/lsst/images/zarr/_output_archive.py` (extend `write` to call it) +- Modify: `tests/test_zarr_output_archive.py` + +The opaque metadata lives at `/lsst/opaque_metadata/fits/primary` as a 1-D `uint8` array containing UTF-8 JSON. The JSON encodes the astropy `Header` as a flat `{keyword: value}` dict. The root attribute `lsst.opaque_metadata_format = "fits"` flags its presence. + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_output_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOpaqueMetadataWriteTestCase(unittest.TestCase): + def test_fits_opaque_metadata_persists(self) -> None: + import json as _json + import os + import tempfile + + import astropy.io.fits + import numpy as np + import zarr + + from lsst.images import Box, Image + from lsst.images.fits._common import ExtensionKey, FitsOpaqueMetadata + from lsst.images.zarr import write + from lsst.images.zarr._model import ZarrDocument + + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + header = astropy.io.fits.Header() + header["ORIGIN"] = "RUBIN" + header["EXPTIME"] = 30.0 + opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + image._opaque_metadata = opaque + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + self.assertEqual( + doc.root.attributes.lsst.get("opaque_metadata_format"), + "fits", + ) + opaque_node = doc.root.get("/lsst/opaque_metadata/fits/primary") + json_bytes = bytes(opaque_node.read()) + cards = _json.loads(json_bytes) + self.assertEqual(cards["ORIGIN"], "RUBIN") + self.assertEqual(cards["EXPTIME"], 30.0) 
+``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrOpaqueMetadataWriteTestCase -v` +Expected: FAIL — `/lsst/opaque_metadata/fits/primary` is not in the store. + +- [ ] **Step 3: Implement opaque-metadata serialization** + +Append to `python/lsst/images/zarr/_layout.py`: + +```python +def serialize_fits_opaque_metadata(document: "ZarrDocument", opaque: Any) -> None: + """Stage a `FitsOpaqueMetadata` object into the IR. + + Stores the primary-HDU header as a JSON-encoded ``uint8`` array at + ``/lsst/opaque_metadata/fits/primary`` and sets the + ``lsst.opaque_metadata_format`` attribute on the root group. + No-op if the metadata is empty or missing a primary header. + """ + import json as _json + + import numpy as np + + from ..fits._common import ExtensionKey + from ._model import ZarrArray + + primary = opaque.headers.get(ExtensionKey()) + if primary is None or len(primary) == 0: + return + cards = {card.keyword: card.value for card in primary.cards if card.keyword} + json_bytes = _json.dumps(cards).encode("utf-8") + parent = document.root.ensure_group("/lsst/opaque_metadata/fits") + parent.arrays["primary"] = ZarrArray( + data=np.frombuffer(json_bytes, dtype=np.uint8) + ) + document.root.attributes.lsst["opaque_metadata_format"] = "fits" +``` + +In `python/lsst/images/zarr/_output_archive.py`, extend `write` to call this *after* `add_tree` returns and *before* the IR is materialized: + +```python +def write( + obj: Any, + path: Any, + *, + chunks=None, + shards=None, + compression=None, + metadata=None, + butler_info=None, +) -> ArchiveTree: + from ._store import open_store_for_write + + archive_class = type(obj).__name__ + archive_default_name = getattr(obj, "_archive_default_name", None) + archive_metadata: dict[str, Any] = {} + if (cell_shape := getattr(obj, "cell_shape", None)) is not None: + archive_metadata["cell_shape"] = tuple(cell_shape) + if (cell_grid := getattr(obj, "cell_grid", None)) is not None: + 
archive_metadata["cell_grid"] = { + "bbox": list(cell_grid.bbox) if hasattr(cell_grid, "bbox") else None, + "cell_shape": list(cell_grid.cell_shape) + if hasattr(cell_grid, "cell_shape") + else None, + } + if (mask_schema := getattr(obj, "mask_schema", None)) is not None: + archive_metadata["mask_schema"] = mask_schema + + archive = ZarrOutputArchive( + chunks=chunks, + shards=shards, + compression=compression, + archive_class=archive_class, + archive_metadata=archive_metadata, + ) + if archive_default_name is not None: + tree = archive.serialize_direct(archive_default_name, obj.serialize) + else: + tree = obj.serialize(archive) + if metadata is not None: + tree.metadata.update(metadata) + if butler_info is not None: + tree.butler_info = butler_info + archive.add_tree(tree) + # Stage opaque metadata after add_tree so the namespace attribute + # writes happen in the right order. + opaque = getattr(obj, "_opaque_metadata", None) + if opaque is not None: + from ._layout import serialize_fits_opaque_metadata + + try: + serialize_fits_opaque_metadata(archive.document, opaque) + except ImportError: + pass # opaque is not a FITS one; ignore + with open_store_for_write(path) as store: + archive.document.to_zarr(store) + return tree +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_output_archive.py::ZarrOpaqueMetadataWriteTestCase -v` +Expected: PASS — opaque metadata is staged at the spec path with the correct format flag. 
+ +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_layout.py python/lsst/images/zarr/_output_archive.py tests/test_zarr_output_archive.py +git commit -m "feat: persist FitsOpaqueMetadata at /lsst/opaque_metadata/fits/primary on zarr write" +``` + +### Task 5.2: Restore `FitsOpaqueMetadata` on read from zarr + +**Files:** +- Modify: `python/lsst/images/zarr/_layout.py` (add `deserialize_fits_opaque_metadata`) +- Modify: `python/lsst/images/zarr/_input_archive.py` (read it in `__init__`; expose via `get_opaque_metadata`; attach in `read`) +- Modify: `tests/test_zarr_input_archive.py` + +`get_opaque_metadata()` returns a `FitsOpaqueMetadata` reconstructed from `/lsst/opaque_metadata/fits/primary`. The `read()` helper attaches it to the deserialized object as `obj._opaque_metadata` (matching FITS / NDF read patterns). + +- [ ] **Step 1: Write the failing test** + +Append to `tests/test_zarr_input_archive.py`: + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOpaqueMetadataReadTestCase(unittest.TestCase): + def test_fits_opaque_metadata_round_trips(self) -> None: + import astropy.io.fits + + from lsst.images import Box, Image + from lsst.images.fits._common import ExtensionKey, FitsOpaqueMetadata + from lsst.images.zarr import read, write + + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + header = astropy.io.fits.Header() + header["ORIGIN"] = "RUBIN" + header["EXPTIME"] = 30.0 + opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + image._opaque_metadata = opaque + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + recovered = read(Image, target).deserialized + recovered_opaque = recovered._opaque_metadata + self.assertIsInstance(recovered_opaque, FitsOpaqueMetadata) + recovered_header = recovered_opaque.headers[ExtensionKey()] + self.assertEqual(recovered_header["ORIGIN"], "RUBIN") 
+ self.assertEqual(recovered_header["EXPTIME"], 30.0) +``` + +- [ ] **Step 2: Run to verify failure** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrOpaqueMetadataReadTestCase -v` +Expected: FAIL — `recovered._opaque_metadata` is `None` or unset. + +- [ ] **Step 3: Implement deserialization** + +Append to `python/lsst/images/zarr/_layout.py`: + +```python +def deserialize_fits_opaque_metadata(document: "ZarrDocument") -> Any | None: + """Reconstruct a `FitsOpaqueMetadata` from the IR, or return None. + + Returns ``None`` when the archive does not have a FITS opaque + metadata block (the common case for archives that originated as + native zarr). + """ + import json as _json + + from ..fits._common import ExtensionKey, FitsOpaqueMetadata + from ._model import ZarrArray + + if document.root.attributes.lsst.get("opaque_metadata_format") != "fits": + return None + try: + node = document.root.get("/lsst/opaque_metadata/fits/primary") + except KeyError: + return None + if not isinstance(node, ZarrArray): + return None + json_bytes = bytes(node.read()).decode("utf-8") + cards = _json.loads(json_bytes) + import astropy.io.fits + + header = astropy.io.fits.Header() + for key, value in cards.items(): + header[key] = value + opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + return opaque +``` + +In `python/lsst/images/zarr/_input_archive.py`, store opaque metadata at construction time, expose it, and attach it in `read`: + +```python + def __init__(self, document: ZarrDocument) -> None: + self._document = document + self._validate_root_attributes() + self._deserialized_pointer_cache = {} + self._frame_set_cache = {} + from ._layout import deserialize_fits_opaque_metadata + + self._opaque_metadata = deserialize_fits_opaque_metadata(document) + + def get_opaque_metadata(self) -> Any | None: + return self._opaque_metadata +``` + +…and in `read`: + +```python +def read[T: Any](cls, path, **kwargs): + with ZarrInputArchive.open(path) as archive: + 
tree_type = cls._get_archive_tree_type(ZarrPointerModel) + tree = archive.get_tree(tree_type) + obj = tree.deserialize(archive, **kwargs) + if (opaque := archive.get_opaque_metadata()) is not None: + obj._opaque_metadata = opaque + return ReadResult(obj, tree.metadata, tree.butler_info) +``` + +- [ ] **Step 4: Run the tests** + +Run: `pytest tests/test_zarr_input_archive.py::ZarrOpaqueMetadataReadTestCase -v` +Expected: PASS — recovered header has both cards. + +- [ ] **Step 5: Commit** + +```bash +git add python/lsst/images/zarr/_input_archive.py python/lsst/images/zarr/_layout.py tests/test_zarr_input_archive.py +git commit -m "feat: restore FitsOpaqueMetadata on zarr read" +``` + +### Task 5.3: FITS → Zarr → FITS round-trip + +**Files:** +- Create: `tests/test_zarr_cross_format.py` + +End-to-end: read a FITS file, write it to zarr, read the zarr back, write it to FITS. The final FITS file's primary header must match the original's card-for-card. + +- [ ] **Step 1: Write the test** + +Create `tests/test_zarr_cross_format.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import read as zarr_read + from lsst.images.zarr import write as zarr_write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class FitsZarrCrossFormatTestCase(unittest.TestCase): + def test_fits_to_zarr_to_fits_preserves_primary_header(self) -> None: + import astropy.io.fits + + from lsst.images import Box, Image + from lsst.images.fits import read as fits_read + from lsst.images.fits import write as fits_write + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + fits_a = os.path.join(tmp, "a.fits") + zarr_path = os.path.join(tmp, "b.zarr") + fits_b = os.path.join(tmp, "c.fits") + + def update_header(header): # noqa: ANN001 + header["ORIGIN"] = "RUBIN" + header["EXPTIME"] = 30.0 + + fits_write(original, fits_a, update_header=update_header) + from_fits = fits_read(Image, fits_a).deserialized + zarr_write(from_fits, zarr_path) + from_zarr = zarr_read(Image, zarr_path).deserialized + fits_write(from_zarr, fits_b) + + with astropy.io.fits.open(fits_b) as hdul: + self.assertEqual(hdul[0].header["ORIGIN"], "RUBIN") + self.assertEqual(hdul[0].header["EXPTIME"], 30.0) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_cross_format.py -v` +Expected: PASS — both cards survive the FITS→Zarr→FITS pipeline. 
+ +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_cross_format.py +git commit -m "test: FITS↔Zarr opaque-metadata round-trip" +``` + +### Task 5.4: xarray interop assertion + +**Files:** +- Create: `tests/test_zarr_xarray_interop.py` + +The whole point of the xarray/CF root layout is that `xr.open_zarr(path)` returns a `Dataset` with the masked-image components as data variables sharing the `(y, x)` dimensions, and the CF `flag_masks` / `flag_meanings` survive on the `mask` variable. This test pins that contract. + +Skipped if `xarray` is not installed; the implementer adds `xarray` to the test extras when this test is added. + +- [ ] **Step 1: Write the test** + +Create `tests/test_zarr_xarray_interop.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + +try: + import xarray as xr # noqa: F401 + + HAVE_XARRAY = True +except ImportError: + HAVE_XARRAY = False + + +@unittest.skipUnless(HAVE_ZARR and HAVE_XARRAY, "xarray is not installed") +class XarrayInteropTestCase(unittest.TestCase): + def test_open_zarr_returns_dataset_with_masked_image_components(self) -> None: + from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + masked.mask.set("BAD", image.array % 2 == 0) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + ds = xr.open_zarr(target) + # Three data variables sharing the (y, x) dims. + self.assertIn("image", ds.data_vars) + self.assertIn("variance", ds.data_vars) + self.assertIn("mask", ds.data_vars) + self.assertEqual(ds["image"].dims, ("y", "x")) + self.assertEqual(ds["mask"].dims, ("y", "x")) + self.assertEqual(ds["image"].shape, (4, 5)) + # CF flag attrs survive on the mask variable. + self.assertEqual(ds["mask"].attrs["flag_meanings"], "BAD SAT CR") + self.assertEqual(list(ds["mask"].attrs["flag_masks"]), [1, 2, 4]) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_xarray_interop.py -v` +Expected: PASS if `xarray` is installed; SKIP otherwise. 
If it fails when xarray is present, inspect what xarray sees. The most common causes are a typo in an `_ARRAY_DIMENSIONS` attribute, or the `tree` / `wcs_ast` arrays interfering with the Dataset. xarray treats every zarr array in the group as a data variable, so those 1-D `uint8` arrays are expected to show up as harmless 1-D variables — but they must not shadow `image` or its sibling arrays. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_xarray_interop.py +git commit -m "test: xarray.open_zarr returns Dataset with image/variance/mask data variables" +``` + +### Task 5.5: Optional `ome-zarr-py` external-reader sanity test + +**Files:** +- Create: `tests/test_zarr_external_reader.py` + +This test confirms the bytes we emit are readable by `ome-zarr-py` (the upstream OME-Zarr toolkit). It checks only the science array — `ome-zarr-py` doesn't know about `lsst:` extensions. Skipped when the package isn't installed. + +- [ ] **Step 1: Write the test** + +Create `tests/test_zarr_external_reader.py`: + +```python +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + +try: + import ome_zarr # noqa: F401 + import ome_zarr.io # noqa: F401 + import ome_zarr.reader # noqa: F401 + + HAVE_OME_ZARR = True +except ImportError: + HAVE_OME_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR and HAVE_OME_ZARR, "ome-zarr is not installed") +class OmeZarrReaderTestCase(unittest.TestCase): + def test_ome_zarr_can_open_image(self) -> None: + from lsst.images import Box, Image + + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(original, target) + from ome_zarr.io import parse_url + from ome_zarr.reader import Reader + + location = parse_url(target) + self.assertIsNotNone(location) + reader = Reader(location) + nodes = list(reader()) + self.assertGreaterEqual(len(nodes), 1) + data = nodes[0].data[0] # level 0 + self.assertEqual(tuple(data.shape), (4, 5)) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] **Step 2: Run the test** + +Run: `pytest tests/test_zarr_external_reader.py -v` +Expected: PASS if `ome-zarr` is installed; SKIP otherwise. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_external_reader.py +git commit -m "test: ome-zarr-py can open archives written by lsst.images.zarr" +``` + +### Task 5.6: Optional `ngff-validator` compliance test + +**Files:** +- Create: `tests/test_zarr_ome_compliance.py` + +`ngff-validator` checks an archive against the OME-NGFF schema. Invoked via subprocess if available; skipped otherwise. Validates representative outputs of every supported archive class. + +- [ ] **Step 1: Write the test** + +Create `tests/test_zarr_ome_compliance.py`: + +```python +# This file is part of lsst-images. 
+# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +import unittest + +import numpy as np + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + +NGFF_VALIDATOR = shutil.which("ngff-validator") + + +@unittest.skipUnless(HAVE_ZARR and NGFF_VALIDATOR, "ngff-validator is not on PATH") +class NgffComplianceTestCase(unittest.TestCase): + def _validate(self, target: str) -> None: + result = subprocess.run( + [NGFF_VALIDATOR, target], + capture_output=True, + text=True, + check=False, + ) + self.assertEqual( + result.returncode, + 0, + f"ngff-validator failed for {target}:\n" + f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}", + ) + + def test_image_validates(self) -> None: + from lsst.images import Box, Image + + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + self._validate(target) + + def test_masked_image_validates(self) -> None: + from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + self._validate(target) + + +if __name__ == "__main__": + unittest.main() +``` + +- [ ] 
**Step 2: Run the test** + +Run: `pytest tests/test_zarr_ome_compliance.py -v` +Expected: PASS if `ngff-validator` is on PATH; SKIP otherwise. If a real install is available and validation fails, fix the layout (most likely an axis-type misclassification or a `coordinateTransformations` shape error) before merging. + +- [ ] **Step 3: Commit** + +```bash +git add tests/test_zarr_ome_compliance.py +git commit -m "test: ngff-validator compliance check (skipped when validator absent)" +``` + +--- + +**End of Phase 5.** Six tasks. FITS↔Zarr round-trips preserve primary-HDU cards through Zarr; xarray interop is pinned by an `xr.open_zarr` test that asserts `Dataset` shape and CF flag attrs on the mask; optional external-reader checks confirm OME-NGFF compliance and `ome-zarr-py` interop when their dependencies are installed. Phase 6 wraps up with module documentation and a changelog entry. + +## Phase 6 — Documentation, changelog, and final integration + +Phase 6 wraps up the backend with the documentation that makes it discoverable. The reference docs live under `doc/lsst.images/` and follow the same `automodapi`-driven pattern as the other backends; the changelog uses `towncrier` fragments under `doc/changes/`. + +### Task 6.1: Expand the module docstring + +**Files:** +- Modify: `python/lsst/images/zarr/__init__.py` + +The Phase 1 `__init__.py` carries a short docstring. Replace it with a full-fat version covering layout, lazy reads, FITS round-trip, and the v1 follow-ups. + +- [ ] **Step 1: Replace the docstring** + +Edit `python/lsst/images/zarr/__init__.py`. Replace the existing docstring (everything between the first triple-quote and the matching one) with: + +```python +"""Zarr v3 archive backend for `lsst.images`. 
+ +This module reads and writes Zarr v3 archives whose root layout is +xarray/CF-shaped (``image``, ``variance``, ``mask`` as siblings sharing +``(y, x)`` dimensions, CF ``flag_masks`` / ``flag_meanings`` / +``flag_descriptions`` on the mask) with OME-NGFF v0.5 multiscales +metadata as a discoverability layer pointing at the same ``image`` +array. The same bytes are visible to ``xarray``, GDAL's Zarr driver, +and OME-Zarr tooling like ``napari`` and ``ome-zarr-py``. + +Supported types +--------------- + +Every image type that already serializes to FITS / JSON / NDF: +`~lsst.images.Image`, `~lsst.images.Mask`, `~lsst.images.MaskedImage`, +`~lsst.images.VisitImage`, `~lsst.images.ColorImage`, and +`lsst.images.cells.CellCoadd`, plus any object reachable through the +`~lsst.images.serialization.OutputArchive` interface. + +On-disk layout +-------------- + +A `MaskedImage` archive contains: + +- ``image``, ``variance``, ``mask`` arrays at the root, shaped + ``(Y, X)`` with shared chunk sizes. +- ``tree`` — 1-D ``uint8`` zarr array containing UTF-8 JSON of the + Pydantic archive tree (the round-trip authority). +- ``wcs_ast`` — 1-D ``uint8`` zarr array containing the AST FrameSet + text (the WCS round-trip authority). + +The mask is a 2-D unsigned integer (``uint8`` for ≤8 planes, up to +``uint64`` for 64 planes; >64 raises). Each pixel's bits encode the +applicable mask planes — the same logical representation the FITS +backend uses, so FITS↔Zarr mask round-trips need no bit-repacking. + +For `ColorImage`, the three channels are written as recursive +sub-archives at ``red/``, ``green/``, ``blue/``. Each sub-archive is +itself a valid Image-shaped OME-NGFF group with its own ``image`` +array, OME multiscales metadata, and ``lsst.archive_class = "Image"``. + +For `CellCoadd`, ``image`` / ``variance`` / ``mask`` are siblings +(cell-aligned chunks driven by ``cell_shape``), and ``psf`` is a 4-D +``(Cy, Cx, Py, Px)`` array with single-cell chunks +``(1, 1, Py, Px)``. 
``lsst.cell_grid = {bbox, cell_shape}`` lives on +the root attrs. + +The OME multiscales ``dataset.path`` always points at a sibling array +(``"image"`` for the standard case). No bytes are duplicated for the +OME view — the science array is the same array xarray sees. + +WCS handling +------------ + +The AST ``FrameSet`` text at ``wcs_ast`` is the round-trip authority. +For external tools (napari, neuroglancer), the layout layer also +emits an OME-NGFF v0.5 affine ``coordinateTransformations`` block +that approximates the linear part of the pixel-to-sky map. Before +emitting, residuals are sampled on an 11×11 grid; if the worst +pixel-equivalent error exceeds 1.0 pixel, the affine block is dropped +and ``lsst.wcs_simplified_dropped: true`` is recorded with the +observed maximum. Readers always reconstruct the projection from +``wcs_ast``. + +Full RFC-5 nonlinear coordinate transformations as authoritative +output is a follow-up; it is blocked on writing an AST JSON channel +that serializes a ``FrameSet`` to / from RFC-5 transformation JSON. + +Cloud-friendly defaults +----------------------- + +- Default chunk geometry is tile-aligned: ``min(1024, dim)`` per + axis for plain images, ``cell_shape`` for `CellCoadd`, single-cell + for `CellCoadd`'s 4-D PSF. +- Sharding (zarr v3 native) is enabled by default with a tunable + shard size (4×4 chunks by default) so object counts on S3 / GCS + stay manageable for multi-gigabyte images. +- Subset reads via the ``slices=`` argument to + `~lsst.images.serialization.InputArchive.get_array` exploit zarr's + chunk index: only chunks intersecting the slice are fetched, even + from remote stores. +- Both ``DirectoryStore`` and ``ZipStore`` are supported. The store + is selected from the URI shape: ``*.zarr.zip`` → ZipStore, + otherwise directory. Remote URIs (``s3://``, ``gs://``, + ``http(s)://``) go through `lsst.resources.ResourcePath` and + `fsspec`. 
+ +Round-trip with FITS +-------------------- + +When an object that originated from a FITS read carries a +`~lsst.images.fits.FitsOpaqueMetadata`, the primary-HDU header is +preserved at ``/lsst/opaque_metadata/fits/primary``. Reading the +zarr back attaches an equivalent ``FitsOpaqueMetadata`` to the +deserialized object so a subsequent FITS write reproduces the +original cards. + +Optional install +---------------- + +This backend requires `zarr >= 3.0`. Install via the ``[zarr]`` +extra:: + + pip install lsst-images[zarr] + +The top-level ``import lsst.images.zarr`` raises a clear +`ImportError` with this guidance if `zarr` is not installed. + +Follow-ups +---------- + +These items are tracked separately from the initial backend release: + +- Lazy / dask-friendly read API (``read_lazy()``). +- Multiscale pyramid generation (level 1, 2, …) for visualization + tools. +- NGFF RFC-5 nonlinear coordinate transformations as authoritative + output (blocked on AST JSON channel work). +- 3-D mask fallback for `>64` planes. +- ``zarr.consolidated_metadata`` extension to reduce object-list + calls on cloud stores. +- NCZarr / NetCDF interop (``_NCZARR_*`` markers + optional 1-D + coordinate variables; purely additive when adopted). +- Stacked OME view for `ColorImage` (single ``(3, Y, X)`` array + alongside the per-channel sub-archives, gated by an explicit + flag because of the byte-duplication cost). +""" +``` + +- [ ] **Step 2: Verify the docstring is well-formed** + +Run: `python -c "import lsst.images.zarr; help(lsst.images.zarr)" | head -60` +Expected: docstring renders cleanly with no `:role:` typos or unclosed code blocks. A deeper Sphinx build runs in Task 6.2. 
+ +- [ ] **Step 3: Commit** + +```bash +git add python/lsst/images/zarr/__init__.py +git commit -m "docs: expand lsst.images.zarr module docstring" +``` + +### Task 6.2: Add the reference docs page + +**Files:** +- Create: `doc/lsst.images/zarr.rst` +- Modify: `doc/lsst.images/index.rst` (add `zarr.rst` to the toctree) + +Mirrors `doc/lsst.images/ndf.rst` exactly so Sphinx renders the API in the same shape as the other backends. + +- [ ] **Step 1: Create the reference page** + +Create `doc/lsst.images/zarr.rst`: + +```rst +Zarr I/O +======== + +A Zarr v3 serialization backend whose on-disk layout is xarray/CF-shaped +at the root (``image`` / ``variance`` / ``mask`` as siblings sharing +``(y, x)`` dimensions, CF ``flag_masks`` / ``flag_meanings`` on the +mask) with OME-NGFF v0.5 multiscales metadata as a discoverability +layer pointing at the same ``image`` array. The same bytes are visible +to ``xarray``, GDAL's Zarr driver, and OME-Zarr tooling like +``napari`` and ``ome-zarr-py``. + +Default chunking is tile-aligned (~1024×1024 for plain images, +``cell_shape`` for ``CellCoadd``); sharding is enabled by default; and +subset reads via ``slices=`` only fetch the chunks they need — including +on remote stores accessed through ``lsst.resources.ResourcePath`` and +``fsspec``. + +.. automodapi:: lsst.images.zarr + :no-inheritance-diagram: + :include-all-objects: + :inherited-members: +``` + +- [ ] **Step 2: Add the page to the toctree** + +In `doc/lsst.images/index.rst`, find the line containing `ndf.rst` and add `zarr.rst` after it (preserving alphabetical order): + +```rst + fits.rst + json.rst + ndf.rst + zarr.rst +``` + +- [ ] **Step 3: Verify the docs build** + +Run: `cd doc && sphinx-build -W -b html . _build/html` (only if a Sphinx environment is set up locally; otherwise skip and rely on CI). +Expected: clean build with no warnings about undefined references. 
+ +- [ ] **Step 4: Commit** + +```bash +git add doc/lsst.images/zarr.rst doc/lsst.images/index.rst +git commit -m "docs: add Zarr backend reference page" +``` + +### Task 6.3: Add the towncrier changelog fragment + +**Files:** +- Create: `doc/changes/DM-XXXXX.feature.md` (replace `XXXXX` with the assigned Jira ticket number) + +Each user-visible change lands as a single Markdown fragment under `doc/changes/`. For this work it's a **feature**. + +- [ ] **Step 1: Create the fragment** + +Create `doc/changes/DM-XXXXX.feature.md` (replace `XXXXX` with the actual Jira ticket number): + +```markdown +Added a new `lsst.images.zarr` archive backend that reads and writes Zarr v3 archives. The on-disk layout is xarray/CF-shaped at the root (`image`, `variance`, `mask` as siblings sharing `(y, x)` dimensions, CF `flag_masks`/`flag_meanings` on the mask) with OME-NGFF v0.5 multiscales metadata layered on top — the same bytes are visible to xarray, GDAL, and OME-Zarr tooling like `napari` and `ome-zarr-py`. Supports every image type the FITS / JSON / NDF backends support (`Image`, `Mask`, `MaskedImage`, `VisitImage`, `ColorImage`, `CellCoadd`). Cloud-friendly defaults (tile-aligned chunks, zarr v3 sharding, fsspec-backed remote stores) and subset reads that only fetch the chunks they need. Install via the new `[zarr]` extra (`pip install lsst-images[zarr]`). +``` + +- [ ] **Step 2: Commit** + +```bash +git add doc/changes/DM-XXXXX.feature.md +git commit -m "docs: changelog entry for lsst.images.zarr backend" +``` + +### Task 6.4: Run the full test suite and finalize + +**Files:** none (verification step). + +- [ ] **Step 1: Run the full zarr test set** + +Run: `pytest tests/test_zarr_*.py -v` +Expected: all tests pass; external-reader and validator tests pass or skip cleanly depending on what's installed; CellCoadd tests skip cleanly until the implementer-supplied factories are filled in. 
+ +- [ ] **Step 2: Run the full package test suite to catch regressions** + +Run: `pytest tests/ -v` +Expected: all existing tests still pass; the new `RoundtripZarr` helper does not break unrelated test files. + +- [ ] **Step 3: Type-check the new module** + +Run: `mypy python/lsst/images/zarr` +Expected: no errors. Address any warnings before merging. + +- [ ] **Step 4: Lint and format** + +Run: `ruff check python/lsst/images/zarr tests/test_zarr_*.py && ruff format --check python/lsst/images/zarr tests/test_zarr_*.py` +Expected: no findings. + +- [ ] **Step 5: Final commit (if any cleanups were needed)** + +```bash +git status # should be clean +``` + +If lint / mypy required fixes, commit them with a focused message such as `chore: type-check and lint cleanup for lsst.images.zarr`. + +--- + +**End of Phase 6.** Documentation and final verification complete. The backend is ready for review and merge. + +--- + +## Self-Review Notes + +**Spec coverage** — every section of `docs/superpowers/specs/2026-05-22-zarr-io-design.md` maps to at least one task: + +| Spec section | Task(s) | +|---|---| +| §1 Goals / scope / standards alignment | All phases collectively | +| §2 Module layout | 1.1 (skeleton), 1.2 (`_common`), 1.3-1.5 (`_model`), 2.1 (`_store`), 2.2-2.3 (`_layout`), 2.4-2.7 / 3.1-3.5 (archives) | +| §3 On-disk layout (root, siblings, attrs) | 2.5 (`add_array` for image/variance/mask), 2.7 (`add_tree` for root attrs and OME multiscales), 4.1 (recursive sub-archive decoration) | +| §3 Axis choice per archive class | 2.2 (`axes_for_archive_class`), 4.1 (sub-archive `("y", "x")`), 4.4 (CellCoadd) | +| §3 Mask 2-D packed integer with CF flag attrs | 1.2 (`mask_dtype_for_plane_count`), 1.5 (`CfFlagAttributes`), 2.5 (mask packing in `add_array`), 3.0 (native-mask flag), 3.2 (mask unpack on read), 3.6 (3-plane and 40-plane round-trips) | +| §3 JSON tree at `/tree` | 2.7 (`add_tree` stages JSON bytes); 3.1 (`get_tree` reads them) | +| §3 AST WCS at `/wcs_ast` | 2.7 
(`_stage_wcs_ast`), 3.3 (Projection deserializer reads it) | +| §3 Tables under `/lsst/tables//` | 2.6 (output), 3.4 (input) | +| §3 Recursive composition | 4.1 (`decorate_sub_archives`) | +| §3 Chunking / sharding defaults / aligned siblings | 1.3-1.4 (defaults in IR), 2.2 (`chunks_for`, `chunks_aligned_to`), 4.3 (PSF single-cell chunks) | +| §4 FITS opaque-metadata round-trip | 5.1 (write), 5.2 (read), 5.3 (full FITS↔Zarr) | +| §4 WCS validation: 11×11 grid, 1-pixel threshold | 2.3 (`affine_check`), 2.7 (integration in `add_tree`) | +| §4 Error taxonomy | 1.2 (`>64`-plane refusal), 3.1 (missing `archive_class`, `>LSST_VERSION`), 3.2 (bad source string) | +| §4 Mode and atomicity | 2.1 (create-only enforcement) | +| §4 Chunk-aligned subset reads (lazy invariant) | 1.3 (`_CountingStore` test on the IR), 3.2 (regression test on the input archive) | +| §4 Mask schema mismatches | inherited from existing `Mask.deserialize`; v1 surfaces it through the standard error path; explicit dedicated test deferred to a follow-up | +| §4 Empty / minimal cases | 2.7 (no `wcs_ast` when no projection; unit-scale `coordinateTransformations` default), 2.5 (image without variance / mask) | +| §4 Forward compatibility | 1.3 (unknown-key preservation in `ZarrAttributes`), 3.1 (version refusal) | +| §5 Test layout | One test file per module, plus `test_zarr_round_trip.py`, `test_zarr_cross_format.py`, `test_zarr_xarray_interop.py`, `test_zarr_ome_compliance.py`, `test_zarr_external_reader.py` | +| §5 Rollout plan (6 numbered steps) | Phases 1–6 directly mirror the spec's rollout | +| §6 Follow-ups | Documented in 6.1's docstring (RFC-5, 3-D mask fallback, dask read, multiscale pyramid, consolidated metadata, NCZarr, stacked OME ColorImage view) | + +**Implementer-judgement handoffs** — places where the plan asks the engineer to consult local code rather than follow a literal recipe: + +- Tasks 4.4 / 4.5: minimal `CellCoadd` constructor — `_make_minimal_cell_coadd` and 
`_make_minimal_cell_coadd_with_psf` are `SkipTest` placeholders to be replaced by reading `python/lsst/images/cells/_coadd.py`. +- Task 6.3: the towncrier fragment filename uses `DM-XXXXX` — pick the real ticket number when committing. +- Task 3.6: the `RoundtripBase` helper may need a small directory-vs-file fix to accept `.zarr` directories. + +These are intentional handoffs, not placeholder content in the production code. + +**Type / name consistency** — IR types and key methods stay consistent across phases: + +- `ZarrDocument`, `ZarrGroup`, `ZarrArray`, `ZarrAttributes` introduced in 1.3-1.4, used everywhere after. +- `ZarrAttributes` has three namespaces (`lsst`, `ome`, `extra`); `extra` is read by xarray / CF tooling and tested in 1.3, 1.4, 5.4. +- `ZarrCompressionOptions.default_for_dtype` from 1.2 is consumed by the `to_zarr` codec builder in 1.4. +- `_layout.chunks_for` / `chunks_aligned_to` defined in 2.2 are used by the output archive in 2.5; `_layout.affine_check` defined in 2.3 is used in 2.7. +- `lsst.archive_class`, `lsst.tree`, `lsst.wcs_ast`, `lsst.cell_grid`, `lsst.opaque_metadata_format`, `lsst.wcs_simplified_dropped`, `lsst.wcs_simplified_max_residual_pixels` are spelled the same in every task that reads or writes them. +- The sliced-source convention (`?c=N`, `?cell=Cy,Cx`) from the v1 plan is **deliberately absent** — the no-stacking rule means every `ArrayReferenceModel.source` is plain `zarr:/`. + +**Critical invariants pinned by tests** — the four invariants stated in the plan header each have at least one failing test: + +1. Lazy reads — `_CountingStore` test in 1.3 (IR level) and 3.2 (input archive level). +2. Aligned chunks — Phase 2.5 test asserting `variance` follows `image_chunks` after the override; CellCoadd test in 4.4 asserting all three siblings have `cell_shape` chunks. +3. Affine residual validator — Phase 2.3 tests with a synthetic linear FrameSet (passes) and a synthetic high-distortion FrameSet (drops). +4. 
No byte duplication — implicit in the "no fixup pass" architecture; explicit assertions in 4.1 (root has no OME multiscales for ColorImage) and 4.4 (CellCoadd PSF is a single 4-D array, not per-cell groups + a stacked array). diff --git a/docs/superpowers/plans/2026-05-25-zarr-sharding.md b/docs/superpowers/plans/2026-05-25-zarr-sharding.md new file mode 100644 index 00000000..15acf24d --- /dev/null +++ b/docs/superpowers/plans/2026-05-25-zarr-sharding.md @@ -0,0 +1,888 @@ +# Zarr v3 Default Sharding Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Enable automatic zarr v3 sharding for bulk pixel arrays and drop the per-axis chunk default from 1024 to 256, with no public API changes. + +**Architecture:** A pure `default_shards(chunks, shape, dtype, *, target_bytes)` helper lives next to `chunks_for` in `_layout.py`. Two module-level constants (chunk-axis limit and target shard bytes) live in `_common.py`; the shard target reads `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES` once at import. `ZarrOutputArchive.add_array` calls the helper at the same point chunks are decided, so the IR's `ZarrArray.shards` is populated whenever the caller didn't supply an override. The model writer (`_group_to_zarr`) is unchanged. + +**Tech Stack:** Python 3.12, zarr v3 (`zarr-python` 3.x), numpy, unittest. Project uses `.pyenv/bin/python` to run; system Python lacks zarr. 
+ +**Design spec:** `docs/superpowers/specs/2026-05-25-zarr-sharding-design.md` + +--- + +## File Structure + +| Path | Change | Responsibility | +|---------------------------------------------------|----------|---------------------------------------------| +| `python/lsst/images/zarr/_common.py` | modify | add `DEFAULT_CHUNK_AXIS_LIMIT`, `DEFAULT_TARGET_SHARD_BYTES` | +| `python/lsst/images/zarr/_layout.py` | modify | drop hardcoded 1024, read from `_common`; add `default_shards` | +| `python/lsst/images/zarr/_output_archive.py` | modify | call `default_shards` alongside `chunks_for` at the two existing sites | +| `tests/test_zarr_layout.py` | modify | update existing chunk-default test; add `default_shards` unit tests | +| `tests/test_zarr_common.py` | modify | add subprocess-based env-var tests | +| `tests/test_zarr_round_trip.py` | modify | add a 300×300 round-trip that asserts on-disk `shards` is set | +| `tests/test_zarr_output_archive.py` | modify | add a CellCoadd PSF shard-defaulting test | +| `tests/test_zarr_store.py` | modify | add a sharded write/read through `ZipStore` | + +No new files. + +--- + +## Task 1: Lower the chunk-axis default to 256 (test-first) + +**Files:** +- Modify: `tests/test_zarr_layout.py:56-60` (`test_chunks_for_default`) +- Modify: `python/lsst/images/zarr/_common.py` (add constant + export) +- Modify: `python/lsst/images/zarr/_layout.py:50` (replace hardcoded 1024) + +- [ ] **Step 1: Update the existing chunk-default test to expect 256** + +In `tests/test_zarr_layout.py`, replace `test_chunks_for_default` (currently lines 56–60) with: + +```python + def test_chunks_for_default(self) -> None: + # Plain images clamp to the per-axis chunk limit (256 by default). + self.assertEqual(chunks_for("Image", (4096, 4096), None), (256, 256)) + # Smaller than the limit -> use full dim. 
+ self.assertEqual(chunks_for("Image", (200, 100), None), (200, 100)) +``` + +Also update `test_chunks_for_cell_coadd_without_metadata_falls_back` (currently lines 73–74): + +```python + def test_chunks_for_cell_coadd_without_metadata_falls_back(self) -> None: + self.assertEqual(chunks_for("CellCoadd", (4096, 4096), None), (256, 256)) +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_layout.py::LayoutTestCase::test_chunks_for_default -v` + +Expected: FAIL — actual is `(1024, 1024)`, expected is `(256, 256)`. + +- [ ] **Step 3: Add `DEFAULT_CHUNK_AXIS_LIMIT` to `_common.py`** + +Edit `python/lsst/images/zarr/_common.py`: + +Add to `__all__` (currently lines 14–23): + +```python +__all__ = ( + "DEFAULT_CHUNK_AXIS_LIMIT", + "LSST_NS", + "LSST_VERSION", + "OME_NS", + "OME_VERSION", + "ZarrCompressionOptions", + "ZarrPointerModel", + "archive_path_to_zarr_path", + "mask_dtype_for_plane_count", +) +``` + +After `LSST_VERSION = 1` (line 40) and its docstring, add: + +```python +DEFAULT_CHUNK_AXIS_LIMIT = 256 +"""Per-axis cap on the auto-derived chunk shape for plain image arrays. + +Used by `lsst.images.zarr._layout.chunks_for` when the caller does not +supply an explicit override and the archive class does not have a +class-specific chunk rule. Chunks of ~256 elements per spatial axis +trade some compression ratio for cutout-friendly partial reads. +""" +``` + +- [ ] **Step 4: Read the constant from `_layout.py`** + +Edit `python/lsst/images/zarr/_layout.py`. + +Add a new import. The existing import section already pulls from `..fits._common` and from `._model`; add a sibling line: + +```python +from ._common import DEFAULT_CHUNK_AXIS_LIMIT +``` + +Delete the line `_DEFAULT_AXIS_LIMIT = 1024` (currently the only module-level numeric constant in this file; sits just before `axes_for_archive_class`). 
+ +In `chunks_for`, replace the `_DEFAULT_AXIS_LIMIT` reference at the very end of the function: + +```python + return tuple(min(_DEFAULT_AXIS_LIMIT, dim) for dim in shape) +``` + +with + +```python + return tuple(min(DEFAULT_CHUNK_AXIS_LIMIT, dim) for dim in shape) +``` + +- [ ] **Step 5: Run the test to verify it passes** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_layout.py -v` + +Expected: PASS for `test_chunks_for_default`, `test_chunks_for_cell_coadd_without_metadata_falls_back`, and all other tests in the file. + +- [ ] **Step 6: Commit** + +```bash +git add python/lsst/images/zarr/_common.py python/lsst/images/zarr/_layout.py tests/test_zarr_layout.py +git commit -m "feat(zarr): drop chunk-axis default 1024 -> 256, centralize constant" +``` + +--- + +## Task 2: Add `DEFAULT_TARGET_SHARD_BYTES` constant with env-var override + +**Files:** +- Modify: `python/lsst/images/zarr/_common.py` (add constant + env-var read) +- Modify: `tests/test_zarr_common.py` (add subprocess tests) + +- [ ] **Step 1: Inspect `tests/test_zarr_common.py` to see existing test conventions** + +Run: `head -30 tests/test_zarr_common.py` + +Review what's there. The new tests will follow the same `unittest.TestCase` style. 
+ +- [ ] **Step 2: Write the env-var subprocess tests** + +Append to `tests/test_zarr_common.py` (before the `if __name__ == "__main__":` block): + +```python +import subprocess +import sys + + +class TargetShardBytesEnvVarTestCase(unittest.TestCase): + """`DEFAULT_TARGET_SHARD_BYTES` reads from env var at import time.""" + + def _import_in_subprocess(self, env_value: str | None) -> subprocess.CompletedProcess[str]: + env = dict(os.environ) + env.pop("LSST_IMAGES_ZARR_TARGET_SHARD_BYTES", None) + if env_value is not None: + env["LSST_IMAGES_ZARR_TARGET_SHARD_BYTES"] = env_value + code = ( + "from lsst.images.zarr._common import DEFAULT_TARGET_SHARD_BYTES;" + "print(DEFAULT_TARGET_SHARD_BYTES)" + ) + return subprocess.run( + [sys.executable, "-c", code], + env=env, + capture_output=True, + text=True, + check=False, + ) + + def test_unset_uses_default(self) -> None: + result = self._import_in_subprocess(None) + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout.strip(), str(16 * 1024 * 1024)) + + def test_set_value_overrides(self) -> None: + result = self._import_in_subprocess("1234567") + self.assertEqual(result.returncode, 0, result.stderr) + self.assertEqual(result.stdout.strip(), "1234567") + + def test_garbage_value_fails_at_import(self) -> None: + result = self._import_in_subprocess("not-a-number") + self.assertNotEqual(result.returncode, 0) + self.assertIn("LSST_IMAGES_ZARR_TARGET_SHARD_BYTES", result.stderr) +``` + +If the file does not already import `os`, add `import os` to the imports at the top. + +- [ ] **Step 3: Run the tests to verify they fail** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_common.py::TargetShardBytesEnvVarTestCase -v` + +Expected: FAIL — `DEFAULT_TARGET_SHARD_BYTES` does not exist yet (`ImportError`). + +- [ ] **Step 4: Add the constant to `_common.py`** + +Edit `python/lsst/images/zarr/_common.py`. 
+ +Add `DEFAULT_TARGET_SHARD_BYTES` to `__all__` so it becomes: + +```python +__all__ = ( + "DEFAULT_CHUNK_AXIS_LIMIT", + "DEFAULT_TARGET_SHARD_BYTES", + "LSST_NS", + "LSST_VERSION", + "OME_NS", + "OME_VERSION", + "ZarrCompressionOptions", + "ZarrPointerModel", + "archive_path_to_zarr_path", + "mask_dtype_for_plane_count", +) +``` + +Add `import os` near the other stdlib imports. + +After the `DEFAULT_CHUNK_AXIS_LIMIT` block added in Task 1, append: + +```python +def _read_target_shard_bytes() -> int: + """Read `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES` or return the default. + + Parsed as a base-10 integer. A non-integer value raises ``ValueError`` + at import time — silent typos are worse than loud failure. + """ + raw = os.environ.get("LSST_IMAGES_ZARR_TARGET_SHARD_BYTES") + if raw is None: + return 16 * 1024 * 1024 + try: + return int(raw) + except ValueError as exc: + raise ValueError( + f"LSST_IMAGES_ZARR_TARGET_SHARD_BYTES={raw!r} is not a base-10 integer." + ) from exc + + +DEFAULT_TARGET_SHARD_BYTES: int = _read_target_shard_bytes() +"""Target uncompressed byte size for an auto-derived shard. + +Read from ``LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`` once at import time; +defaults to 16 MiB. Used by `lsst.images.zarr._layout.default_shards` to +decide how many chunks to combine into a shard. +""" +``` + +- [ ] **Step 5: Run the tests to verify they pass** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_common.py::TargetShardBytesEnvVarTestCase -v` + +Expected: PASS for all three subtests. + +- [ ] **Step 6: Run the full common test file as a regression check** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_common.py -v` + +Expected: All tests pass. 
+ +- [ ] **Step 7: Commit** + +```bash +git add python/lsst/images/zarr/_common.py tests/test_zarr_common.py +git commit -m "feat(zarr): add DEFAULT_TARGET_SHARD_BYTES with env-var override" +``` + +--- + +## Task 3: Add `default_shards` helper (test-first) + +**Files:** +- Modify: `python/lsst/images/zarr/_layout.py` (add helper + export) +- Modify: `tests/test_zarr_layout.py` (add new test case) + +- [ ] **Step 1: Write the unit tests** + +Append a new test class to `tests/test_zarr_layout.py` (before the `if __name__ == "__main__":` block, after the existing `LayoutTestCase`): + +```python +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class DefaultShardsTestCase(unittest.TestCase): + """The `default_shards` byte-budget rule.""" + + TARGET = 16 * 1024 * 1024 # 16 MiB + + def test_4k_float32_image_uses_byte_budget(self) -> None: + result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (2048, 2048)) + + def test_3d_mask_plane_axis_untouched(self) -> None: + # chunks already cover the plane axis; growable axes are y, x only. + result = default_shards( + chunks=(8, 256, 256), + shape=(8, 4096, 4096), + dtype=np.dtype("uint8"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (8, 1536, 1536)) + + def test_tiny_single_chunk_returns_none(self) -> None: + result = default_shards( + chunks=(40,), + shape=(40,), + dtype=np.dtype("uint8"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_chunks_equal_shape_returns_none(self) -> None: + result = default_shards( + chunks=(1024, 1024), + shape=(1024, 1024), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_already_big_chunk_returns_none(self) -> None: + # 4096*4096*4 = 64 MiB > 16 MiB target. 
+ result = default_shards( + chunks=(4096, 4096), + shape=(8192, 8192), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_k_le_one_returns_none(self) -> None: + # chunk_bytes = 256*256*4 = 256 KiB and both axes are growable, so + # k = round(sqrt(target_bytes / chunk_bytes)). A 320 KiB target + # gives ratio 1.25 and k = round(sqrt(1.25)) = round(1.118) = 1, + # i.e. the byte budget rounds to one chunk per growable axis + # -> None. + chunk_bytes = 256 * 256 * 4 + result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("float32"), + target_bytes=int(chunk_bytes * 1.25), + ) + self.assertIsNone(result) + + def test_cap_at_array_bounds(self) -> None: + # 600x600 float32; chunk_bytes = 256 KiB; ratio = 64; k = 8. + # Uncapped shard would be (2048, 2048) but the array is only + # 3 chunks per axis (ceil(600/256) = 3), so the cap is (768, 768). + result = default_shards( + chunks=(256, 256), + shape=(600, 600), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (768, 768)) + + def test_cell_coadd_psf(self) -> None: + # (25, 25, 150, 150) float32 with (1, 1, 150, 150) chunks. + # chunk_bytes = 90 KiB; ratio ~= 186; growable axes are 0 and 1 + # (cell-grid axes). k = round(sqrt(186)) = 14. + result = default_shards( + chunks=(1, 1, 150, 150), + shape=(25, 25, 150, 150), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (14, 14, 150, 150)) + + def test_mismatched_ndim_raises(self) -> None: + with self.assertRaisesRegex(ValueError, "rank"): + default_shards( + chunks=(256, 256), + shape=(4096, 4096, 4096), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + + def test_zero_itemsize_returns_none(self) -> None: + # void(0) has itemsize 0; defensive guard against degenerate dtypes.
+ result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("V0"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) +``` + +Update the import block at the top of `tests/test_zarr_layout.py` (currently lines 25–32) to also import `default_shards`: + +```python +try: + from lsst.images.zarr._layout import ( + affine_check, + axes_for_archive_class, + chunks_aligned_to, + chunks_for, + decorate_sub_archives, + default_shards, + ) + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False +``` + +- [ ] **Step 2: Run the new tests to verify they cannot pass yet** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_layout.py::DefaultShardsTestCase -v` + +Expected: All ten tests are reported as SKIPPED, not FAILED — `default_shards` does not exist yet, so the guarded import above raises `ImportError`, the `except ImportError` branch sets `HAVE_ZARR = False`, and `skipUnless(HAVE_ZARR, ...)` skips the whole class. Treat the skip as this step's red state; the tests must run and pass after Step 3. + +- [ ] **Step 3: Implement `default_shards` in `_layout.py`** + +Add `"default_shards"` to the `__all__` tuple at the top of `python/lsst/images/zarr/_layout.py` (currently lines 29–38), keeping alphabetical order: + +```python +__all__ = ( + "AffineCheckResult", + "affine_check", + "axes_for_archive_class", + "chunks_aligned_to", + "chunks_for", + "decorate_sub_archives", + "default_shards", + "deserialize_fits_opaque_metadata", + "serialize_fits_opaque_metadata", +) +``` + +Add `import math` to the imports at the top of the file (alongside `numpy`). + +After `chunks_aligned_to` (currently ends at line 118), add: + +```python +def default_shards( + *, + chunks: tuple[int, ...], + shape: tuple[int, ...], + dtype: np.dtype, + target_bytes: int, +) -> tuple[int, ...] | None: + """Derive a default shard shape from ``chunks``, ``shape``, and ``dtype``. + + Returns ``None`` when sharding would be a no-op: the array is + already a single chunk per axis, the chunk is already at least + ``target_bytes`` big, or the byte budget rounds to ``k == 1`` + chunks per growable axis.
+ + The rule grows only axes whose ``chunks[i] < shape[i]`` (the + others already cover the full extent), uses one uniform multiplier + ``k = round(ratio ** (1 / num_growable_axes))`` to stay close to + the byte budget, and caps each axis at ``chunks[i] * ceil(shape[i] + / chunks[i])`` so a small array does not get a shard larger than + itself. Every shard axis is an integer multiple of the + corresponding chunk axis, as required by zarr v3. + + Parameters + ---------- + chunks + Chunk shape, one int per axis. + shape + Array shape, one int per axis. + dtype + Array dtype; only ``itemsize`` is consulted. + target_bytes + Target uncompressed shard size. Typically + `DEFAULT_TARGET_SHARD_BYTES`. + + Raises + ------ + ValueError + If ``len(chunks) != len(shape)``. + """ + if len(chunks) != len(shape): + raise ValueError( + f"chunks rank {len(chunks)} does not match shape rank {len(shape)}." + ) + itemsize = dtype.itemsize + if itemsize == 0: + return None + chunk_bytes = math.prod(chunks) * itemsize + if chunk_bytes >= target_bytes: + return None + growable = [i for i in range(len(shape)) if chunks[i] < shape[i]] + if not growable: + return None + ratio = target_bytes / chunk_bytes + k = max(1, round(ratio ** (1.0 / len(growable)))) + if k <= 1: + return None + shard = list(chunks) + for i in growable: + n_chunks_axis = math.ceil(shape[i] / chunks[i]) + shard[i] = min(chunks[i] * k, chunks[i] * n_chunks_axis) + return tuple(shard) +``` + +Note: the helper takes its arguments keyword-only to match the style of `chunks_aligned_to` and to make calls at the use sites self-documenting. + +- [ ] **Step 4: Update the test calls to use keyword arguments** + +The unit tests written in Step 1 already pass arguments by keyword (`chunks=..., shape=..., dtype=..., target_bytes=...`). No change needed. 
+ +- [ ] **Step 5: Run the new tests to verify they pass** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_layout.py::DefaultShardsTestCase -v` + +Expected: All ten subtests PASS. + +- [ ] **Step 6: Run the full layout test file as a regression check** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_layout.py -v` + +Expected: All tests pass (existing tests should be unaffected). + +- [ ] **Step 7: Commit** + +```bash +git add python/lsst/images/zarr/_layout.py tests/test_zarr_layout.py +git commit -m "feat(zarr): add default_shards helper for byte-budget shard sizing" +``` + +--- + +## Task 4: Wire `default_shards` into `ZarrOutputArchive.add_array` + +**Files:** +- Modify: `python/lsst/images/zarr/_output_archive.py` (mask path ~L188, generic path ~L226) +- Modify: `tests/test_zarr_round_trip.py` (add a sharded round-trip test) + +- [ ] **Step 1: Write the failing round-trip test** + +Append to `tests/test_zarr_round_trip.py` (inside the `ZarrRoundTripTestCase` class, after `test_image_round_trip`): + +```python + def test_image_round_trip_writes_shards(self) -> None: + # 300x300 float32: chunks (256, 256) -> shard (512, 512) by the + # byte-budget rule (target 16 MiB, ratio ~64, k ~ 8 capped at the + # 2-chunk-per-axis ceiling of 256 * 2 = 512). + import zarr as _zarr + + from lsst.images.zarr._store import open_store_for_read + + original = Image( + np.zeros((300, 300), dtype=np.float32), + bbox=Box.factory[0:300, 0:300], + ) + with RoundtripZarr(self, original) as roundtrip: + with open_store_for_read(roundtrip.filename) as store: + root = _zarr.open_group(store=store, mode="r", zarr_format=3) + image_arr = root["image"] + self.assertEqual(tuple(image_arr.chunks), (256, 256)) + self.assertEqual(tuple(image_arr.shards), (512, 512)) + # Single-chunk metadata arrays must NOT be sharded. + lsst_json_arr = root["lsst_json"] + self.assertIsNone(lsst_json_arr.shards) + # Data round-trip is preserved. 
+ np.testing.assert_array_equal(roundtrip.result.array, original.array) +``` + +- [ ] **Step 2: Run the test to verify it fails** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_round_trip.py::ZarrRoundTripTestCase::test_image_round_trip_writes_shards -v` + +Expected: FAIL — `image_arr.shards` is `None` because the archive doesn't populate shards yet. + +- [ ] **Step 3: Wire `default_shards` into the mask-path branch of `add_array`** + +Edit `python/lsst/images/zarr/_output_archive.py`. The existing imports at the top (currently lines 40–53) are multi-line tuples. Update them in place. + +Change the `_common` import block to add `DEFAULT_TARGET_SHARD_BYTES`: + +```python +from ._common import ( + DEFAULT_TARGET_SHARD_BYTES, + ZarrCompressionOptions, + ZarrPointerModel, + archive_path_to_zarr_path, + mask_dtype_for_plane_count, +) +``` + +Change the `_layout` import block to add `default_shards`: + +```python +from ._layout import ( + affine_check, + axes_for_archive_class, + chunks_aligned_to, + chunks_for, + decorate_sub_archives, + default_shards, + serialize_fits_opaque_metadata, +) +``` + +In the mask branch (currently lines 180–200), replace: + +```python + chunks = self._chunks.get(name) or self._chunks.get(leaf) + if chunks is None and self._image_chunks is not None: + chunks = chunks_aligned_to(image_chunks=self._image_chunks, shape=packed.shape) + extra: dict[str, Any] = {"_ARRAY_DIMENSIONS": ["y", "x"]} + extra.update(flag_attrs.dump()) + ir_array = ZarrArray( + data=packed, + chunks=chunks, + shards=self._shards.get(name), + compression=self._compression.get(name), + ) +``` + +with: + +```python + chunks = self._chunks.get(name) or self._chunks.get(leaf) + if chunks is None and self._image_chunks is not None: + chunks = chunks_aligned_to(image_chunks=self._image_chunks, shape=packed.shape) + shards = self._shards.get(name) or self._shards.get(leaf) + if shards is None and chunks is not None: + shards = default_shards( + chunks=tuple(chunks), + 
shape=tuple(packed.shape), + dtype=packed.dtype, + target_bytes=DEFAULT_TARGET_SHARD_BYTES, + ) + extra: dict[str, Any] = {"_ARRAY_DIMENSIONS": ["y", "x"]} + extra.update(flag_attrs.dump()) + ir_array = ZarrArray( + data=packed, + chunks=chunks, + shards=shards, + compression=self._compression.get(name), + ) +``` + +- [ ] **Step 4: Wire `default_shards` into the generic branch of `add_array`** + +In the generic branch (currently lines 202–231), find this block: + +```python + ir_array = ZarrArray( + data=np.ascontiguousarray(array), + chunks=chunks, + shards=self._shards.get(name), + compression=self._compression.get(name), + ) +``` + +Replace with: + +```python + shards = self._shards.get(name) or self._shards.get(leaf) + if shards is None and chunks is not None: + shards = default_shards( + chunks=tuple(chunks), + shape=tuple(array.shape), + dtype=array.dtype, + target_bytes=DEFAULT_TARGET_SHARD_BYTES, + ) + ir_array = ZarrArray( + data=np.ascontiguousarray(array), + chunks=chunks, + shards=shards, + compression=self._compression.get(name), + ) +``` + +Note: `chunks is not None` guards the unusual case where neither `_chunks` nor any layout default fired — `default_shards` only makes sense once we have a chunk shape. In practice `chunks` is non-`None` for `image`, `variance`, `mask`, and `psf`; for table columns and structured-array columns it is `None` (those use `add_table` / `add_structured_array`, which do not pass through this branch). + +- [ ] **Step 5: Run the round-trip test to verify it passes** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_round_trip.py::ZarrRoundTripTestCase::test_image_round_trip_writes_shards -v` + +Expected: PASS — `image_arr.shards == (512, 512)` and `lsst_json_arr.shards is None`. + +- [ ] **Step 6: Run all zarr round-trip tests as a regression check** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_round_trip.py -v` + +Expected: All tests pass — existing data-equality assertions are unchanged. 
+ +- [ ] **Step 7: Run the full zarr suite** + +Run: `.pyenv/bin/python -m pytest tests/ -k zarr -v` + +Expected: All zarr-related tests pass. + +- [ ] **Step 8: Commit** + +```bash +git add python/lsst/images/zarr/_output_archive.py tests/test_zarr_round_trip.py +git commit -m "feat(zarr): default-shard image, variance, mask in ZarrOutputArchive" +``` + +--- + +## Task 5: Verify CellCoadd PSF gets sharded + +**Files:** +- Modify: `tests/test_zarr_output_archive.py` (extend `ZarrPsfChunkingTestCase`) + +- [ ] **Step 1: Write a PSF shard-defaulting test** + +Append to `tests/test_zarr_output_archive.py` inside `ZarrPsfChunkingTestCase` (currently lines 277–298), after `test_psf_user_override_wins`: + +```python + def test_psf_array_gets_default_shards(self) -> None: + # 25x25 cells of 150x150 float32: chunk_bytes = 90 KiB, + # ratio ~ 186, k = round(sqrt(186)) = 14 -> shard (14, 14, 150, 150). + psf = np.zeros((25, 25, 150, 150), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.shards), (14, 14, 150, 150)) + + def test_psf_user_shard_override_wins(self) -> None: + psf = np.zeros((25, 25, 150, 150), dtype=np.float32) + archive = ZarrOutputArchive( + archive_class="CellCoadd", + shards={"psf": (5, 5, 150, 150)}, + ) + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.shards), (5, 5, 150, 150)) + + def test_small_psf_skips_sharding(self) -> None: + # 2x3 cells of 21x21 float32: chunk_bytes = 1764 B, ratio ~9295, + # but ceil(2/1) * ceil(3/1) = 6 cells total -> capped shard equals + # the array; effective shard becomes (2, 3, 21, 21) which equals + # shape, so no sharding is meaningful. The byte-budget rule still + # produces a tuple — verify it is the capped value, not None. 
+ psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + # Either way is acceptable: shards=(2,3,21,21) (capped) or shards=None. + # The default rule returns the capped value; assert that. + self.assertEqual(tuple(node.shards), (2, 3, 21, 21)) +``` + +- [ ] **Step 2: Run the new tests** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_output_archive.py::ZarrPsfChunkingTestCase -v` + +Expected: PASS for all subtests, including the two existing ones (`test_psf_array_uses_single_cell_chunks` and `test_psf_user_override_wins`). For the small PSF, the helper computes `k = round(sqrt(16777216 / 1764)) = 98`, then caps each growable axis at the cell-grid extent (2 and 3), yielding shard `(2, 3, 21, 21)` — the whole array fits in one shard, which is the desired outcome (6 chunks bundled into one file). + +- [ ] **Step 3: Run the full output-archive test file as a regression check** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_output_archive.py -v` + +Expected: All tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add tests/test_zarr_output_archive.py +git commit -m "test(zarr): cover CellCoadd PSF shard defaulting and overrides" +``` + +--- + +## Task 6: Verify sharded write round-trips through `ZipStore` + +**Files:** +- Modify: `tests/test_zarr_store.py` (add a sharded round-trip via zip) + +- [ ] **Step 1: Write the test** + +Append to `tests/test_zarr_store.py` inside `StoreDispatchTestCase`, after `test_create_only_refuses_existing`: + +```python + def test_zip_store_round_trips_sharded_array(self) -> None: + import numpy as np + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr.zip") + data = np.arange(300 * 300, dtype=np.float32).reshape(300, 300) + with open_store_for_write(target) as store: + group = zarr.create_group(store=store, zarr_format=3) + arr = group.create_array( + name="image", + shape=data.shape, + chunks=(256, 256), + shards=(512, 512), + dtype=data.dtype, + ) + arr[:] = data + with open_store_for_read(target) as store: + group = zarr.open_group(store=store, mode="r", zarr_format=3) + image = group["image"] + self.assertEqual(tuple(image.chunks), (256, 256)) + self.assertEqual(tuple(image.shards), (512, 512)) + np.testing.assert_array_equal(image[...], data) +``` + +- [ ] **Step 2: Run the test to verify it passes** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_store.py::StoreDispatchTestCase::test_zip_store_round_trips_sharded_array -v` + +Expected: PASS — `ZipStore` handles sharded arrays without special handling on our side. + +If this fails (zarr-python 3.x not honoring shards through `ZipStore`), stop and discuss with the user before proceeding. The spec assumes this works; failure would be a real finding worth surfacing. + +- [ ] **Step 3: Run the full store test file** + +Run: `.pyenv/bin/python -m pytest tests/test_zarr_store.py -v` + +Expected: All tests pass. 
+ +- [ ] **Step 4: Commit** + +```bash +git add tests/test_zarr_store.py +git commit -m "test(zarr): round-trip a sharded array through ZipStore" +``` + +--- + +## Task 7: Final regression sweep and changelog + +**Files:** +- Modify: `doc/changes/` (add changelog fragment if the project uses one) + +- [ ] **Step 1: Run the full zarr test suite** + +Run: `.pyenv/bin/python -m pytest tests/ -k zarr -v` + +Expected: All zarr tests pass. + +- [ ] **Step 2: Run the full project test suite** + +Run: `.pyenv/bin/python -m pytest tests/ -v` + +Expected: All tests pass; no unrelated regressions. + +- [ ] **Step 3: Check whether a changelog fragment is required** + +Run: `ls doc/changes/ 2>/dev/null && head -20 doc/changes/README.rst 2>/dev/null || echo "no changelog dir"` + +If `doc/changes/` exists, follow the existing fragment-naming convention. If a `.rst`/`.md` template can be found in nearby commits (look at `git log --oneline -- doc/changes/`), match that style. + +If a fragment is required, create one summarising: +- "Default sharding now enabled for image, variance, mask, and PSF arrays in zarr archives. The per-axis chunk default has been lowered from 1024 to 256 to better suit cutout-style science access patterns. Public API is unchanged. Tunable via `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`." + +If no changelog system is detected, skip this step. + +- [ ] **Step 4: Verify mypy passes** + +Run: `.pyenv/bin/python -m mypy python/lsst/images/zarr/` + +Expected: No new type errors. (The zarr module was clean as of commit `9c2f01e`; this change adds only typed code.) + +- [ ] **Step 5: Final commit (only if a changelog fragment was added)** + +```bash +git add doc/changes/ +git commit -m "docs: changelog fragment for zarr default sharding" +``` + +- [ ] **Step 6: Sanity check the full diff** + +Run: `git log --oneline origin/main..HEAD` + +Expected: 5–6 commits — one per task above, in order. 
+ +Run: `git diff --stat origin/main..HEAD` + +Expected: Three production files (`_common.py`, `_layout.py`, `_output_archive.py`) and four test files modified, plus possibly a changelog fragment. No other source files changed. + +--- + +## Self-review notes + +- **Spec coverage**: Architecture (Task 1, Task 2 add constants; Task 3 adds helper; Task 4 wires it in), the `default_shards` rule (Task 3), per-array behaviour (Tasks 4 + 5), error handling (covered by Task 3 unit tests), backward compatibility (Task 4 step 6 regression sweep, Task 6 zip round-trip), all five testing categories from the spec (Task 3 unit tests, Task 2 env-var test, Task 4 round-trip integration, Task 5 PSF-specific round-trip, Task 6 zip round-trip). +- **No placeholders**: every code step shows the full code; every test step shows the full assertion. +- **Type consistency**: the helper signature is `default_shards(*, chunks, shape, dtype, target_bytes)` everywhere — task 3 (definition), task 3 (unit tests), task 4 (call sites). The constant name `DEFAULT_TARGET_SHARD_BYTES` and chunk constant `DEFAULT_CHUNK_AXIS_LIMIT` are used consistently across `_common.py`, `_layout.py`, and `_output_archive.py`. +- **Spec deviation**: the spec's claim that "object dtype has itemsize 0" is incorrect — `np.dtype('O').itemsize == 8`. The `itemsize == 0` guard still exists (it triggers on `np.dtype('V0')`), and the unit test in Task 3 covers it via void(0) instead of object. diff --git a/docs/superpowers/specs/2026-05-22-zarr-io-design.md b/docs/superpowers/specs/2026-05-22-zarr-io-design.md new file mode 100644 index 00000000..d9e60197 --- /dev/null +++ b/docs/superpowers/specs/2026-05-22-zarr-io-design.md @@ -0,0 +1,653 @@ +# Zarr I/O Backend for `lsst.images` — Design (revised) + +**Status:** Approved (design phase). Supersedes the v1 design at commit `a11db46` after collaborator review. +**Date:** 2026-05-22 (revised) +**Author:** Tim Jenness (with Claude collaborator) + +## 1. 
Goals, Scope, Non-Goals + +### Goals + +Add a `lsst.images.zarr` subpackage providing: + +- A `ZarrOutputArchive` and `ZarrInputArchive` implementing the existing + `lsst.images.serialization` `OutputArchive` / `InputArchive` ABCs. +- Top-level `read()` and `write()` helpers consistent with the FITS, + JSON, and NDF backends. +- A Python intermediate representation (IR) — `ZarrDocument`, + `ZarrGroup`, `ZarrArray`, etc. — that describes the on-disk layout + independently of `zarr-python`, mirroring the role `NdfDocument` + plays for the NDF backend. + +Because the backend builds on the abstract archive interface, every +image type that already serializes to FITS/JSON/NDF (`Image`, `Mask`, +`MaskedImage`, `VisitImage`, `ColorImage`, `CellCoadd`, plus any +`serialize()`-implementing object reachable through the archive) works +with no per-type code in the backend itself. + +### Standards alignment (changed from v1) + +The on-disk layout is **xarray-/CF-shaped at the root** with +**OME-NGFF v0.5 metadata as a discoverability layer on top**. The root +group is a sibling collection of arrays (`image/`, `variance/`, +`mask/`) so: + +- `xr.open_zarr(path)` returns a `Dataset` with the masked-image + components as data variables sharing the `(y, x)` dimensions. +- Geospatial / CF tooling (rasterio, GDAL's Zarr driver, QGIS) reads + the `mask` array's `flag_masks` / `flag_meanings` / + `flag_descriptions` attributes directly. +- OME-NGFF tooling (`napari`, `neuroglancer`, `ngff-validator`, + `ome-zarr-py`) sees an OME multiscales block whose + `dataset.path` points at the same `image` array — the OME view and + the xarray view share bytes. + +The pivot vs the v1 design: the root is no longer a multiscale image +with `lsst:` companions hanging off it; companion arrays are +first-class siblings, and OME's `multiscales.datasets[].path` references +them. This enables xarray / GDAL interop with no byte duplication. 
+ +### Cloud-first, local works too + +- Default chunk geometry is tile-aligned (~1024×1024 for plain images, + `cell_shape` for `CellCoadd`). +- Sharding (zarr v3 native) is enabled by default with a tunable shard + size to keep object counts manageable on S3/GCS. +- Subset reads via `slices=` exploit zarr's chunk index, going + straight to the lazy `zarr.Array` handle so only the touched chunks + are fetched. +- Both `DirectoryStore` and `ZipStore` are supported; the choice is + driven by URI shape (`*.zarr.zip` → `ZipStore`, otherwise directory). + Remote URIs go through `lsst.resources.ResourcePath` and `fsspec`. + +### Scope + +Same image-type coverage as the FITS backend: `Image`, `Mask` (2-D in +v1), `MaskedImage`, `VisitImage`, `ColorImage`, `CellCoadd`, plus any +`serialize()`-implementing object reachable through the archive +interface. + +`ColorImage` writes its three channels as **sibling sub-archives** +(`red/`, `green/`, `blue/`), not as a stacked `(3, Y, X)` array — see +[§3](#3-on-disk-layout). The previous design's stacking + JSON-pointer +rewrite is removed because it duplicates bytes for large images. + +`CellCoadd`'s per-cell PSF is whatever shape `CellCoadd.serialize` +natively emits — typically a 4-D `(Cy, Cx, Py, Px)` array — with +cell-aligned chunks. No fixup pass. + +### Non-Goals (initial release) + +- No dask / lazy `read_lazy()` API — added later, tracked as + follow-up. +- No multi-level OME multiscale pyramid (we only ever write one level + pointed at by `path: image`). +- **No NGFF RFC-5 nonlinear coordinate transformations as + authoritative.** v1 emits an OME-NGFF v0.5 affine + `coordinateTransformations` as an external-tool affordance, with + the AST `FrameSet` string as the authoritative round-trip source. + RFC-5 transformations as authoritative is a follow-up — see + [§6](#6-follow-up-work-out-of-scope) — **blocked on writing an AST + JSON channel** that serializes a `FrameSet` to / from RFC-5 + transformation JSON. 
+- No 3-D mask layout for masks with more than 64 planes — v1 raises + on write. 3-D fallback tracked as follow-up. +- No automatic OME `consolidated_metadata` extension. Tracked as + follow-up. + +### Dependency + +Optional `[zarr]` extra requiring `zarr >= 3.0` and any required codec +packages. The top-level `lsst/images/zarr/__init__.py` does a guarded +`import zarr` and raises `ImportError` with installation guidance if +missing, mirroring the NDF backend. + +## 2. Module Layout and Architecture + +``` +python/lsst/images/zarr/ +├── __init__.py guarded `import zarr`; re-exports +├── _common.py ZarrPointerModel (analog of NdfPointerModel), +│ attribute namespace constants ("lsst:", "ome:"), +│ ZarrCompressionOptions dataclass, +│ path/JSON-pointer helpers +├── _model.py Python intermediate representation: +│ ZarrDocument, ZarrGroup, ZarrArray, ZarrAttributes, +│ OmeMultiscale, OmeOmero, from_zarr() / to_zarr() +│ materialization methods +├── _layout.py Layout rules: archive-class → axes mapping; +│ CF flag-attrs construction for mask groups; +│ affine extraction + residual validator; +│ OME multiscale block construction +├── _output_archive.py ZarrOutputArchive and write() +├── _input_archive.py ZarrInputArchive and read() +└── _store.py Wrapper that turns a ResourcePath / fsspec URI + into the right zarr.storage.Store + (LocalStore / ZipStore / FsspecStore) +``` + +### Fit with existing abstractions + +- `ZarrOutputArchive[ZarrPointerModel]` implements the abstract + methods (`serialize_direct`, `serialize_pointer`, + `serialize_frame_set`, `add_array`, `add_table`, + `add_structured_array`, `iter_frame_sets`). +- `ZarrPointerModel` is a small Pydantic model holding a zarr path + (e.g. `"/lsst/psf/tree"`); when a model field carries a + `ZarrPointerModel`, the consumer dereferences it through the input + archive — same pattern as `NdfPointerModel`. +- `update_header` callbacks (intended for FITS) are accepted and + ignored, identical to the JSON backend. 
+- The `serialization.ArchiveTree` JSON tree is stored verbatim as a + UTF-8 zarr array at `tree` (root-level). Array references in the + tree resolve to zarr paths under the same root. + +### Two-pass write driven by the IR + +During `obj.serialize(archive)`, the archive populates an in-memory +`ZarrDocument`. Only when the context manager exits does the IR +materialize to zarr-python via the configured store. + +Benefits: + +- Per-class layout decisions (CF flag attrs on mask, OME multiscale + block, cell-grid metadata) are made once in `_layout.py` against + the populated IR. +- Tests can assert on the IR without writing files. +- A future "validate-then-commit" step (e.g. `ngff-validator` + integration) can run against the IR. + +Compared to the v1 design, the IR's *write* side has **no fixup +pass** that rewrites or stacks staged arrays. Each `add_array(name)` +call lands at the zarr path equal to `name` (after stripping the +leading `/`). `name="image"` → `/image`; `name="mask"` → `/mask`; +the nested `name="red/image"` produced by +`serialize_direct("red", red.serialize)` → `/red/image`. There is +no special-case dictionary mapping JSON pointers to zarr paths. + +### Lazy read invariant (unchanged from v1) + +`ZarrArray.data` holds either a staged `numpy.ndarray` (write side) +or a lazy `zarr.Array` handle (read side). `from_zarr` never reads +chunk bytes; only `ZarrArray.read(slices=...)` does, and it forwards +`slices` straight to the lazy handle so only chunks intersecting +the slice are fetched. A `_CountingStore`-based regression test +asserts a single-chunk subset of a 16×16 / chunks=(4,4) array +touches strictly fewer chunk reads than a full read. 
+ +### Read mirrors write + +`ZarrInputArchive.open()` opens the store, builds a `ZarrDocument` +view backed by lazy zarr-python objects, validates the +`lsst.archive_class` and `lsst.version` root attributes, locates the +`tree` JSON document, and parses it into the appropriate +`ArchiveTree` Pydantic model. `get_array(model, slices=...)` +translates the model's path into a chunk-aligned zarr read. + +`ArrayReferenceModel.source` strings are plain `zarr:/<path>`. The +v1 design's `?c=N` and `?cell=Cy,Cx` query suffixes are removed — +no stacking means no compound source URLs. + +### Backend write helper signature + +```python +def write( + obj: Any, + path: ResourcePathExpression | None = None, + *, + chunks: Mapping[str, tuple[int, ...] | None] | None = None, + shards: Mapping[str, tuple[int, ...] | None] | None = None, + compression: Mapping[str, ZarrCompressionOptions | None] | None = None, + metadata: dict[str, MetadataValue] | None = None, + butler_info: ButlerInfo | None = None, +) -> ArchiveTree: ... +``` + +`chunks`, `shards`, and `compression` are per-array dicts keyed by +the JSON pointer of the attribute the array backs (or its zarr +path), mirroring the existing `compression_options` pattern from +the FITS backend. Different arrays have different ranks (2-D image, +2-D mask, 4-D per-cell PSF) so a single tuple value would not be +meaningful. Missing keys fall back to the per-class defaults from +[§3](#chunking-and-sharding-defaults). A value of `None` for a key +means "use the default for this array"; explicitly setting `shards` +to `{}` does *not* disable sharding — to disable, pass +`{"<name>": None}` per array. + +`image`, `variance`, and `mask` are expected to share the spatial +chunk shape (CF / xarray / GDAL all assume aligned chunks). The +output archive derives `variance` and `mask` chunks from `image`'s +chunk shape when the user has not overridden them. + +## 3.
On-Disk Layout (the spec) + +### Root layout per archive class + +Every archive class lays out its data as **siblings under the root**. +Non-array metadata (the JSON round-trip tree, the AST WCS string) +also lives at the root so xarray and ome-zarr both see a clean +group. + +For a `MaskedImage` / `VisitImage`: + +``` +visitimage.zarr/ +├── zarr.json ← group attrs (see below) +├── image/ ← (Y, X) zarr array, science pixels +├── variance/ ← (Y, X) zarr array +├── mask/ ← (Y, X) zarr array, packed mask integers +├── tree ← 1-D uint8 array, pydantic JSON round-trip +└── wcs_ast ← 1-D uint8 array, AST FrameSet text +``` + +For an `Image` with no projection, `wcs_ast` is omitted; for an +`Image` with no mask/variance, those siblings are simply absent. + +For `ColorImage`: + +``` +colorimage.zarr/ +├── zarr.json ← lsst.archive_class = "ColorImage"; no OME multiscales +├── red/ ← itself a valid Image-shaped sub-archive +├── green/ ← (with its own image/, multiscales, etc.) +├── blue/ +├── tree +└── wcs_ast +``` + +Each channel sub-archive is a valid `Image` archive in its own right +(its own `image/` array, its own `lsst.archive_class = "Image"`, its +own OME multiscales). The root group's `lsst.archive_class` is +`"ColorImage"` and it has **no OME multiscales of its own** — there +is no stacked multi-channel array, so there is nothing for OME to +render at root level. External tools reading the root see three +nested OME images, which is consistent with the recursive-composition +rule. (A future follow-up may add a stacked single-array view; v1 +does not because of the no-byte-duplication rule.) + +For `CellCoadd`: + +``` +cellcoadd.zarr/ +├── zarr.json ← lsst.archive_class + lsst.cell_grid +├── image/ ← (Y, X), chunks aligned to cell_shape +├── variance/ +├── mask/ +├── psf/ ← (Cy, Cx, Py, Px) 4-D, chunks (1, 1, Py, Px) +├── tree +└── wcs_ast +``` + +`psf` is whatever shape `CellCoadd.serialize` natively emits — there +is no stacking fixup. 
Cell-grid metadata lives in the +`lsst.cell_grid` block of the root group's attributes. + +### Top-level group attributes (`zarr.json` `attributes`) + +```jsonc +{ + "data_model": "org.lsst.masked_image", // or .image / .visit_image / etc. + "version": 1, // org.lsst.* schema version + + "ome": { + "version": "0.5", + "multiscales": [{ + "name": "", + "axes": [/* see per-class table below */], + "datasets": [{ + "path": "image", + "coordinateTransformations": [/* affine; see §4 below */] + }] + }], + // Only present on archive classes whose top-level array has a + // channel axis. Not used in v1 (no stacked ColorImage view). + "omero": { "channels": [...] } + }, + + "lsst": { + "version": 1, // schema version of lsst extension + "archive_class": "VisitImage", // dispatch for read-side construction + "tree": "tree", // zarr path to JSON tree (relative) + "wcs_ast": "wcs_ast", // zarr path to AST string, optional + "wcs_simplified_dropped": false, // see §4 below + "wcs_simplified_max_residual_pixels": 0.13, // observed max; only when affine emitted + "opaque_metadata_format": "fits", // optional, only when present + "cell_grid": { "bbox": ..., "cell_shape": [256, 256] } // CellCoadd only + } +} +``` + +For `ColorImage`, the root group has `lsst.archive_class = "ColorImage"` +and no `ome.multiscales`. + +### Axis choice per archive class + +| Archive class | Axes (root multiscale) | Top-level science array | Notes | +|---|---|---|---| +| `Image`, `MaskedImage`, `VisitImage`, `CellCoadd` | `[y, x]` | `image` | Standard 2-D image. | +| `Mask` (standalone, 2-D) | `[y, x]` | `mask` | When written outside a parent. | +| `ColorImage` | (none at root) | (none at root) | Each `red/`, `green/`, `blue/` sub-archive carries its own `[y, x]` multiscale. 
| + +### Image / variance arrays — array attrs + +`image/zarr.json` (and likewise `variance/zarr.json` and any other +2-D float sibling): + +```jsonc +{ + "_ARRAY_DIMENSIONS": ["y", "x"], // xarray + "long_name": "science image", // CF + "units": "adu" // CF (when known) +} +``` + +### Mask array — 2-D packed integers with CF flag attrs + +`mask` is a **2-D `(y, x)` unsigned-integer array**. The dtype is +chosen by the schema's plane count: `uint8` for ≤8 planes, `uint16` +for ≤16, `uint32` for ≤32, `uint64` for ≤64. Each pixel's bits encode +which planes apply at that pixel — the same logical representation +the FITS backend writes, so FITS↔Zarr mask round-trips need no bit- +repacking. + +`mask/zarr.json`: + +```jsonc +{ + "_ARRAY_DIMENSIONS": ["y", "x"], + "flag_masks": [1, 2, 4, 8, 16], + "flag_meanings": "BAD SAT CR INTRP NO_DATA", + "flag_descriptions": [ + "Bad pixel.", + "Saturated.", + "Cosmic ray.", + "Interpolated.", + "No data." + ] +} +``` + +`flag_masks` and `flag_meanings` are CF conventions: +`flag_meanings` is a **single space-separated string** (not a list) +per CF; `flag_descriptions` is the LSST extension carrying the +human-readable per-plane text from `MaskPlane.description`. + +Schemas with **more than 64 planes** raise on write in v1. A 3-D +`(plane_byte, y, x)` fallback is tracked as a follow-up. + +### The JSON round-trip tree (`tree`) + +A 1-D `uint8` zarr array containing UTF-8 JSON. Same content the JSON +backend produces, but with `ArrayReferenceModel` references whose +source strings are zarr paths within the store: `"zarr:/image"`, +`"zarr:/mask"`, `"zarr:/red/image"` (for nested ColorImage channels), +`"zarr:/psf"` (for CellCoadd). These resolve into the zarr store, not +into the JSON document itself, so they do not use the JSON-Pointer +`#/` fragment prefix. There are **no compound source URLs** (no +`?c=N`, no `?cell=Cy,Cx`) because no arrays are stacked. 
+ +### AST WCS string (`wcs_ast`) + +A 1-D `uint8` zarr array containing the AST `FrameSet` text produced +by an `astshim.Channel`. The full text is stored as bytes; this is +the **authoritative round-trip source** for the WCS. The OME affine +emitted in `multiscales.datasets[].coordinateTransformations` is an +approximation for external tools and is dropped when its residual +exceeds the [§4](#4-error-handling-edge-cases-round-trips) threshold. + +For multi-frame-set archives (`serialize_frame_set` calls referencing +distinct WCS objects), each frame set is stored at +`/lsst/frame_sets/<name>` and referenced via `ZarrPointerModel` in +the JSON tree, mirroring the NDF / FITS pattern. + +### Tables + +A table named `<name>` lives at `/lsst/tables/<name>/`: one +1-D zarr array per column, sibling to the others under a group whose +attributes carry the `lsst.table = {columns: [...], length: N, +meta: {...}}` block. Structured arrays use the same group form; the +deserialised type differs. + +### Recursive composition + +Any sub-archive that holds image-shaped data (e.g. `red/`, `green/`, +`blue/` for `ColorImage`; PSF model parameter images for archives +that nest them) creates a nested group at its archive path that is +itself a valid OME-NGFF / xarray group, with its own +`ome.multiscales` and `lsst.archive_class` attributes. The top-level +is not special; the same rules apply at every level. + +### Chunking and sharding defaults + +- Default chunk for a 2-D image: `min(1024, dim)` per axis. For + `CellCoadd`: `cell_shape`. +- Default shard: 4×4 chunks (i.e. 4096×4096 for plain images, 4×4 + cells for `CellCoadd`) if shard size would be ≥ 1 MiB; otherwise + no sharding. +- Default codec stack: `bytes -> blosc(zstd, clevel=5, + shuffle=byte)` for floats; `bytes -> blosc(zstd, clevel=5, + shuffle=bit)` for integers and masks. +- All defaults are overridable via `ZarrCompressionOptions` per-array + (keyed by JSON pointer / zarr path).
+- `image`, `variance`, and `mask` share the spatial chunk shape; + the output archive derives `variance` / `mask` chunks from + `image`'s when not explicitly overridden. + +## 4. Error Handling, Edge Cases, Round-Trips + +### Round-trip rules + +- A zarr file written from an object read from FITS preserves its + primary-HDU `FitsOpaqueMetadata` at + `/lsst/opaque_metadata/fits/primary` (1-D `uint8` array of + JSON-encoded astropy `Header`). Reading the zarr back attaches an + equivalent `FitsOpaqueMetadata` to the deserialized object so a + subsequent FITS write preserves the original cards. +- Any `lsst.*` attributes the archive does not recognise are + preserved verbatim and re-emitted on write of an unchanged tree + (forward compatibility). + +### WCS validation: simplified-affine residual check + +When emitting OME `coordinateTransformations` for a multiscale +dataset, the layout layer: + +1. Extracts the linear / affine portion of the AST `FrameSet`'s + pixel-to-sky mapping as a 3×3 affine block. +2. Samples residuals on an **11×11 grid** spanning the image bbox. + At each grid point, computes pixel→sky via both the full AST + `FrameSet` and the simplified affine, takes the great-circle + separation, and divides by the pixel scale to get a + pixel-equivalent residual. +3. If `max_residual > 1.0 pixel`, **drops the + `coordinateTransformations` block** for the dataset (emits the + unit scale `[1.0, 1.0]` only) and sets + `lsst.wcs_simplified_dropped: true` on the root group, recording + the observed max residual under `lsst.wcs_simplified_max_residual_pixels`. + +Readers always reconstruct the projection from `wcs_ast` regardless +of whether the affine block was emitted or dropped — the OME affine +is purely an external-tool affordance. + +### Error taxonomy + +Extends existing `serialization.ArchiveReadError`: + +- `ArchiveReadError("File has no zarr.json")` for missing root + metadata. 
+- `ArchiveReadError("File is not an LSST zarr archive")` when + `lsst.archive_class` is missing. +- `ArchiveReadError(f"Unsupported lsst:version {N}")` for + forward-incompatible schema versions. +- `ArchiveReadError(f"Mask has {N} planes; v1 supports up to 64. " + f"3-D fallback is a follow-up.")` on write of a `>64`-plane Mask. +- `ArchiveReadError("On-disk mask schema does not match requested " + "schema: ...")` for read-time schema mismatches; both schemas are + attached, identical to NDF. +- `InvalidParameterError` for unknown `read()` kwargs. +- `InvalidComponentError` for `deserialize_component` on unknown + component names. +- Validation failures from `model_validate_json` propagate as + `ArchiveReadError`. + +### Mode and atomicity + +- Write opens the store in create-only mode (refuses to overwrite an + existing zarr root, mirroring FITS/NDF). +- For `LocalStore`, a partial failure leaves a partial directory — + same risk profile as NDF write failures. Document this and + recommend writing to a temp `ResourcePath` then renaming. +- `ZipStore` writes are atomic (the file is not valid until the + central directory is written), so failures leave no garbage. + +### Chunk-aligned subset reads (lazy invariant) + +- `get_array(model, slices=...)` passes `slices` straight to the + backing `zarr.Array` handle. Zarr handles chunk boundary + alignment internally; only chunks intersecting the slice are + fetched. +- For 2-D mask reads (the v1 layout), spatial slices apply as on + the image; there is no plane-axis to consider. +- A `_CountingStore`-based regression test asserts that a + single-chunk subset of a 16×16 / chunks=(4,4) array touches + strictly fewer chunk reads than a full read. This is the load- + bearing test for cloud-friendly subsetting. 
+ +### Mask schema mismatches + +If a `Mask` is read where the on-disk plane definitions differ from +the in-memory schema being requested, raise `ArchiveReadError` +with both schemas attached, identical to the NDF backend. + +### Empty / minimal cases + +- `Image` with no projection: omit `wcs_ast`; the OME multiscale's + `coordinateTransformations` is the unit scale `[1.0, 1.0]`. The + `tree` JSON document is just an `ImageSerializationModel` with + no `projection` field. +- `Image` plus metadata only: as above; `metadata` lives in the + JSON tree. + +### Forward compatibility + +- `lsst.version` is an integer; readers refuse versions newer than + they understand. +- Unknown `lsst.*` keys at any level are preserved verbatim through + the IR (`ZarrAttributes.load` keeps them; `dump` re-emits them). + This buys partial-knowledge round-trips without losing extension + data. + +## 5. Testing Strategy and Rollout + +### Test layout + +Mirrors the NDF pattern (`tests/test_ndf_*.py`): + +- `tests/test_zarr_common.py` — `_common.py` constants, path + helpers, `ZarrCompressionOptions` dataclass. +- `tests/test_zarr_model.py` — IR types in isolation: `ZarrDocument` + round-trip via `from_zarr` / `to_zarr` against an in-memory + store, attribute schema validation. Lazy invariant on + `ZarrArray.from_zarr`. +- `tests/test_zarr_layout.py` — `_layout.py` rules: which axes for + which archive class, CF flag-attrs construction for masks, + affine-residual validator (synthetic linear WCS passes; synthetic + high-distortion WCS triggers the drop), chunk derivation + (including `cell_shape` alignment). +- `tests/test_zarr_store.py` — URI dispatch (`LocalStore` / + `ZipStore` / `FsspecStore`), create-only refusal. +- `tests/test_zarr_output_archive.py` — write paths for every + supported archive class (`Image`, `Mask`, `MaskedImage`, + `VisitImage`, `ColorImage`, `CellCoadd`), verifying the on-disk + layout matches the spec by inspecting the IR. 
+- `tests/test_zarr_input_archive.py` — read paths and `slices=` + subset reads, `_CountingStore` lazy-invariant assertion, error + taxonomy tests, opaque-metadata round-trips. +- `tests/test_zarr_round_trip.py` — full write→read round-trips for + every type, plus FITS↔Zarr cross-format round-trips for the + types that already do FITS↔NDF round-trips. +- `tests/test_zarr_xarray_interop.py` — `xr.open_zarr(path)` returns + a `Dataset` with `image` / `variance` / `mask` data variables + sharing `(y, x)` dims; CF flag attributes survive on the mask + variable. Skipped if `xarray` is not installed. +- `tests/test_zarr_ome_compliance.py` — *if* `ngff-validator` (or + equivalent) can be installed in CI, run it against representative + outputs to catch OME-Zarr spec drift. Skipped if the tool is + unavailable. +- `tests/test_zarr_external_reader.py` — sanity-check that the + `ome-zarr` Python tooling can open our files and read the science + array (not LSST extensions). Skipped if `ome-zarr` is not + installed. + +### CI / dev requirements + +Add `zarr >= 3.0` to the optional test dependency set so tests run +automatically. The package metadata adds `[zarr]` extra to the +user-facing extras. + +### Rollout plan + +Scoped into separate tickets/PRs to keep review tractable: + +1. Skeleton + `_common.py` + `_model.py` IR + tests for the IR + alone. No write/read yet. +2. `_store.py` + `_layout.py` (axes, chunks, affine validator) + + `ZarrOutputArchive` + write helper. Cover `Image`, + `MaskedImage`, `VisitImage` only. Output-side tests, including + CF flag-attrs assertions on the mask group and the affine- + residual validator behaviour. +3. `ZarrInputArchive` + read helper + `slices=` subset reads (with + `_CountingStore` regression test) + error taxonomy. Input-side + tests + round-trip for the types in step 2. +4. `ColorImage` (recursive composition of three `Image` sub-archives) + + `CellCoadd` (cell-aligned chunks + 4-D PSF). Round-trip tests. +5. 
Cross-format round-trips (FITS ↔ Zarr opaque metadata + round-trip). Optional `ome-zarr` external-reader sanity test. + `xarray` interop test. +6. Documentation: module docstring (mirroring the FITS/NDF module + docstrings) describing the layout, plus a changelog entry. + +## 6. Follow-Up Work (Out of Scope) + +Captured here so they are not lost; each is to be tracked as its +own ticket once the initial backend lands. + +- **NGFF RFC-5 nonlinear coordinate transformations.** Replace the + affine-only OME block with a real `sequence(affine, projection, + ...)` block and treat it as authoritative; `wcs_ast` becomes an + optional fallback rather than the source of truth. This is high- + interest because tangent-plane pixel-to-sky transformations + (CellCoadd) and polynomial corrections (VisitImage TAN-SIP) + currently round-trip only through the AST string; richer OME + support would expose them to external tools. **This work is + blocked on writing an AST JSON channel** that serializes a + `FrameSet` to and from RFC-5 transformation JSON — this is a + non-trivial piece of work in its own right and is recorded as a + tracked dependency with no v1 timeline. +- **3-D mask fallback for `>64`-plane masks.** Adds a per-class + layout switch: 2-D packed for ≤64 planes (CF-compliant), 3-D + `(plane_byte, y, x)` for `>64` (CF-extension annotations). v1 + raises on write for `>64`. +- **Lazy / dask-friendly read API** (`read_lazy()` returning open + zarr arrays / `xr.Dataset` for downstream dask integration). +- **Multiscale pyramid generation** (level 1, 2, … coarsenings) for + visualization tools. +- **`zarr.consolidated_metadata` extension** to reduce object-list + calls on cloud stores. +- **Stacked OME view for `ColorImage`.** A future need for a single + `(3, Y, X)` OME-readable array could be met by writing a stacked + view alongside the per-channel sub-archives. 
v1 does not because + of the no-byte-duplication rule; the per-channel sub-archives are + themselves valid OME images. +- **NCZarr / NetCDF interop.** Unidata's NCZarr layers a NetCDF data + model on top of Zarr, unlocking native reads via `libnetcdf` and + the downstream R / Fortran / MATLAB / IDL ecosystems. v1 is + already partially compatible because `_ARRAY_DIMENSIONS` (xarray) + is the same dimension-naming convention NCZarr uses. Full + compliance is **purely additive**: add `_NCZARR_GROUP` and + `_NCZARR_ARRAY` attribute markers (no layout change, no extra + bytes), and optionally write 1-D `y` / `x` coordinate variables + so the file is self-describing as a NetCDF dataset. Held out of + v1 because NCZarr's zarr-v3 mapping is still evolving and we'd + rather pin against a stable revision; the upgrade requires no + migration of existing files when we adopt it. diff --git a/docs/superpowers/specs/2026-05-25-zarr-sharding-design.md b/docs/superpowers/specs/2026-05-25-zarr-sharding-design.md new file mode 100644 index 00000000..0cdb9ed9 --- /dev/null +++ b/docs/superpowers/specs/2026-05-25-zarr-sharding-design.md @@ -0,0 +1,272 @@ +# Zarr v3 Sharding & Smaller Chunk Defaults + +Date: 2026-05-25 +Status: approved (brainstorming complete; awaiting implementation plan) + +## Background + +The zarr v3 backend currently writes arrays without shards. The +`shards` field is plumbed through `ZarrArray` and `ZarrOutputArchive` +all the way to `zarr.create_array`, but the archive never populates +it, so every chunk becomes a separate object on disk / in cloud +storage. The default per-axis chunk limit is 1024, which produces ~4 +MiB float32 chunks — fine for full-image reads but on the larger end +for cutout-style science access. 
+ +Modern zarr v3 guidance for cloud-backed stores is: + +- small-ish *logical chunks* sized for science access patterns; +- larger *physical shards* sized to amortise S3 / GCS request cost; +- avoid `.zarr.zip` for cloud distribution — keep it for packaging + and local export. + +This spec covers the first two. Zip support stays as it is today +(useful for tests and packaging); we are not deprecating it. + +## Goals + +- Default sharding "just works" with no public API changes — the + caller does not have to think about chunk-vs-shard ratios. +- Smaller chunk default for science access (256² for plain images). +- One internal knob (`DEFAULT_TARGET_SHARD_BYTES`) and one env-var + escape hatch for tuning without code changes. +- Old archives continue to read; round-trip data equality is + preserved. + +## Non-goals + +- Changing `ZarrCompressionOptions` defaults. +- Re-tuning the `CellCoadd` cell-aligned chunk rule. +- Reading `shards` metadata back into the IR (`ZarrArray.from_zarr` + still ignores it; the input archive slices through `zarr.Array`). +- Adding any new kwarg to public `write_zarr`. +- Deprecating `ZipStore`. + +## Architecture + +Three files are touched. No public API additions or renames. + +``` +python/lsst/images/zarr/ + _common.py # +DEFAULT_CHUNK_AXIS_LIMIT (was hardcoded 1024 in _layout) + # +DEFAULT_TARGET_SHARD_BYTES (env-overridable, read once at import) + _layout.py # chunks_for: clamp constant moves to _common, value 1024 → 256 + # +default_shards(chunks, shape, dtype, *, target_bytes) helper + _output_archive.py # call default_shards alongside chunks_for in add_array; + # IR node gets shards populated when caller did not override +``` + +`_model.py` is **not** modified. `_group_to_zarr` continues to pass +`shards=array.shards` through to `zarr.create_array`. By the time the +IR reaches the writer, every array's `shards` is either explicitly +set by the caller, populated by the default helper, or `None` (for +tiny single-chunk arrays). 
+ +### Why eager defaulting in the archive layer + +This pattern mirrors the existing `chunks_for` / +`chunks_aligned_to` helpers in `_layout.py`. The archive sets +shape-derived defaults at IR-construction time, the model writer +stays a dumb serialiser, and tests can assert IR-level shape +decisions without driving zarr. Lazy defaulting in `_group_to_zarr` +was considered and rejected — it would push policy logic into the +writer and make the IR's effective layout invisible until write +time. + +## Constants + +In `_common.py`: + +- `DEFAULT_CHUNK_AXIS_LIMIT: int = 256` — replaces the hardcoded + `_DEFAULT_AXIS_LIMIT = 1024` currently in `_layout.py`. +- `DEFAULT_TARGET_SHARD_BYTES: int` — `16 * 1024 * 1024` by default. + At import time read `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`; if set, + parse as base-10 int. A `ValueError` from `int()` propagates and + fails import — silent typos are worse than loud failure. No + `1MiB`-style suffix parsing. + +`chunks_for` in `_layout.py` reads `DEFAULT_CHUNK_AXIS_LIMIT` from +`_common`. `chunks_aligned_to` is unchanged — it derives sibling +chunks from `image_chunks`, so it follows the new default +automatically. + +The `CellCoadd` cell-aligned branch and the 4-D PSF branch +(`(1, 1, h, w)`) in `chunks_for` are unchanged — those are +class-specific layout rules, not default-clamp questions. + +## The `default_shards` rule + +Pure function, no archive-class arg, no `archive_metadata` arg: + +```python +def default_shards( + chunks: tuple[int, ...], + shape: tuple[int, ...], + dtype: np.dtype, + *, + target_bytes: int, +) -> tuple[int, ...] | None: + if len(chunks) != len(shape): + raise ValueError("chunks and shape rank mismatch") + itemsize = dtype.itemsize + if itemsize == 0: + return None # object dtype etc. 
+ chunk_bytes = math.prod(chunks) * itemsize + if chunk_bytes >= target_bytes: + return None # one chunk already big enough + growable = [i for i in range(len(shape)) if chunks[i] < shape[i]] + if not growable: + return None # array fits in one chunk per axis + ratio = target_bytes / chunk_bytes + k = round(ratio ** (1.0 / len(growable))) + if k <= 1: + return None # rounding produced a no-op shard + shard = list(chunks) + for i in growable: + n_chunks_axis = math.ceil(shape[i] / chunks[i]) + shard[i] = min(chunks[i] * k, chunks[i] * n_chunks_axis) + return tuple(shard) +``` + +Properties of the rule: + +- **Integer-multiple alignment per axis**: every shard axis is + `chunks[i] * m` for some `m ≥ 1`. zarr v3 requires this. +- **Spatial-only growth falls out for free**: a 3-D mask + `(8, 4096, 4096)` chunked `(8, 256, 256)` has `growable = [1, 2]`, + so the plane axis is left alone. +- **Tiny arrays skip sharding**: a `(N, 80)` FITS-card array, a + single-chunk `lsst_json`, or any array whose chunks already cover + every axis returns `None`. +- **CellCoadd PSF** `(25, 25, h, w)` chunked `(1, 1, h, w)` has + `growable = [0, 1]`, so it shards the cell-grid axes only — no + class-specific rule needed. +- **Cap at array bounds**: shards never extend past the array's + chunk grid (`chunks[i] * ceil(shape[i] / chunks[i])` per axis). 
+ +### Worked examples (target = 16 MiB) + +| array | shape | chunks | dtype | result | +|--------------------|--------------------|-------------------|---------|--------------------------| +| `image` (4k×4k) | (4096, 4096) | (256, 256) | float32 | shard `(2048, 2048)` | +| `mask` (3-D, 4k) | (8, 4096, 4096) | (8, 256, 256) | uint8 | shard `(8, 1536, 1536)` | +| `variance` (4k×4k) | (4096, 4096) | (256, 256) | float32 | shard `(2048, 2048)` | +| CellCoadd `psf` | (25, 25, 150, 150) | (1, 1, 150, 150) | float32 | shard `(14, 14, 150, 150)` | +| small image | (600, 600) | (256, 256) | float32 | shard `(768, 768)` (capped) | +| `lsst_json` | (N,) | (N,) | uint8 | `None` | +| `wcs_ast` | (M,) | (M,) | uint8 | `None` | +| FITS primary | (N, 80) | (N, 80) | uint8 | `None` | + +## Per-array behaviour in `ZarrOutputArchive` + +The pattern at every site that decides chunks today +(`_output_archive.py:183` for the MaskedImage path, +`_output_archive.py:202-241` for `add_array`): + +```python +chunks = self._chunks.get(name) or self._chunks.get(leaf) or chunks_for(...) +shards = self._shards.get(name) or self._shards.get(leaf) +if shards is None: + shards = default_shards( + chunks, packed.shape, packed.dtype, + target_bytes=DEFAULT_TARGET_SHARD_BYTES, + ) +ZarrArray(data=..., chunks=chunks, shards=shards, ...) 
+``` + +Coverage by call site: + +| call site | what gets sharded | +|----------------------------------------------|--------------------------------------------| +| MaskedImage path (`_output_archive.py:~183`) | `image`, `variance`, `mask` | +| `add_array` generic (`_output_archive.py:~228`) | top-level sibling arrays | +| `add_array` PSF branch (`_output_archive.py:~223`) | CellCoadd `psf` 4-D | +| JSON tree (`_output_archive.py:~142`, `:~337`) | `lsst_json` — helper returns `None` | +| `wcs_ast` (`_output_archive.py:~406`) | helper returns `None` | +| `serialize_fits_opaque_metadata` (`_layout.py:~281`) | helper returns `None` | + +Bulk pixel arrays (`image`, `variance`, `mask`, `psf`) and any +user-supplied extra arrays large enough to qualify gain `shards`. +Everything tiny / single-chunk is auto-`None`. + +User overrides remain unchanged: passing `shards={"image": (...)}` to +`write_zarr` still wins because the override is consulted before the +default helper. + +## Error handling + +- `default_shards` raises `ValueError` on mismatched ndim between + `chunks` and `shape`, mirroring `chunks_aligned_to`. All other + inputs are total — no exceptions on well-formed numeric data. +- `dtype.itemsize == 0` (object dtype) → `None`. Defensive guard; + object dtypes are not written today. +- Env-var parse failure raises at import. + +## Backward compatibility + +- **Reading old archives**: unaffected. `ZarrArray.from_zarr` does + not consult `shards`. The input archive slices through + `zarr.Array`. +- **Round-trip equality**: byte-equal data round-trips unchanged. + Tests asserting array equality continue to pass. +- **On-disk file counts**: any test asserting a specific file count + on disk needs updating. None known today. +- **Old test fixtures** (e.g. `dp1.zarr/`): readable as before; the + change is write-side only. +- **ZipStore**: unchanged. 
`zarr.storage.ZipStore` accepts sharded + arrays the same way as `LocalStore` — shards inside a zip are + nested keys, no special handling. + +### Performance note + +A 4k×4k float32 image full-read goes from 16 chunks to 256 chunks +when the chunk default drops 1024 → 256. Sharding keeps the I/O +profile identical (4 GETs, same wire bytes), but per-chunk decode +runs 16× more often. Expected to be invisible: blosc-zstd decode +is fast and concurrent. If a benchmark regresses, the fallback is +to bump `DEFAULT_CHUNK_AXIS_LIMIT` to 512. + +## Testing + +### Unit tests for `default_shards` (new file `tests/test_zarr_layout.py`) + +- 4k×4k float32 with `(256, 256)` chunks → `(2048, 2048)`. +- 3-D mask `(8, 4096, 4096)` uint8 with `(8, 256, 256)` chunks → + `(8, 1536, 1536)` — plane axis untouched. +- Tiny 1-D single-chunk array → `None`. +- `chunks == shape` (single-chunk of any size) → `None`. +- `chunk_bytes >= target_bytes` (already-big chunk) → `None`. +- `k <= 1` boundary → `None`. +- Cap at array bounds: shape `(600, 600)`, chunks `(256, 256)`, + ratio 64 → shard `(768, 768)`, not `(2048, 2048)`. +- Mismatched ndim raises `ValueError`. +- `dtype.itemsize == 0` → `None`. + +### Env-var test (`tests/test_zarr_layout.py`) + +- Set `LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`, re-import the module + in a subprocess (cleanest way to re-run import-time init), assert + the constant changed. +- Garbage value raises at import. + +### Round-trip / integration (extend existing zarr round-trip tests) + +- Assert one large-image round-trip writes an `image` array whose + on-disk metadata has non-`None` `shards` and shards are integer + multiples of chunks per axis. +- Assert `lsst_json` and `wcs_ast` arrays come back with `shards` + unset (or `None` in metadata). +- CellCoadd round-trip: assert PSF `psf` array's `shards != chunks` + (i.e. the byte-budget rule actually fired). +- Existing data-equality round-trip checks are unmodified and + continue to gate correctness. 
+ +### Zip round-trip (extend `tests/test_zarr_store.py`) + +- Add one assertion to an existing zip test that a sharded + write/read round-trips through `ZipStore` cleanly. + +### Verification command + +`.pyenv/bin/python -m pytest tests/ -k zarr` is the gate for the +implementation phase. diff --git a/pyproject.toml b/pyproject.toml index b5309d3b..919a1685 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,8 @@ piff = ["piff >= 1.6", "galsim >= 2.7"] butler = ["lsst-daf-butler"] # Add feature for Starlink NDF (HDS-on-HDF5) read/write support. ndf = ["h5py >= 3.10"] +# Add feature for Zarr v3 read/write support. +zarr = ["zarr >= 3.0"] [tool.setuptools.packages.find] where = ["python"] diff --git a/python/lsst/images/_transforms/_ast.py b/python/lsst/images/_transforms/_ast.py index 118259f5..ee7ec1f4 100644 --- a/python/lsst/images/_transforms/_ast.py +++ b/python/lsst/images/_transforms/_ast.py @@ -14,6 +14,8 @@ from collections.abc import Iterable from typing import TYPE_CHECKING, Any, ClassVar, Self +import numpy as np + __all__ = ( "USING_STARLINK_PYAST", "Channel", @@ -23,10 +25,12 @@ "Frame", "FrameSet", "Mapping", + "PolyMap", "ShiftMap", "SkyFrame", "StringStream", "UnitMap", + "ZoomMap", ) if TYPE_CHECKING: @@ -45,10 +49,12 @@ FrameSet, Mapping, Object, + PolyMap, ShiftMap, SkyFrame, StringStream, UnitMap, + ZoomMap, ) except ImportError: import starlink.Ast @@ -157,6 +163,32 @@ def inverted(self) -> Mapping: copy.invert() return Mapping._wrap(copy) + def linearApprox(self, lbnd: Any, ubnd: Any, tol: float) -> np.ndarray | None: + """Best linear approximation to this mapping over a hyper-box. + + Parameters + ---------- + lbnd, ubnd + Per-axis lower / upper input-coordinate bounds of the + box over which the approximation is required. + tol + Maximum permitted deviation from linearity, expressed + as a positive Cartesian displacement in the output + coordinate system. 
+ + Returns + ------- + coeffs + A 1-D array of length ``Nout * (1 + Nin)`` giving the + linear coefficients for each output, ordered + ``[c, m_in0, m_in1, ...]`` per output. ``None`` if no + linear fit within ``tol`` exists. + """ + success, coeffs = self._impl.linearapprox(lbnd, ubnd, tol) + if not success: + return None + return np.asarray(coeffs) + class UnitMap(Mapping): def __init__(self, n_coord: int): super().__init__(starlink.Ast.UnitMap(n_coord)) @@ -175,6 +207,28 @@ def __init__(self, map_a: Mapping, map_b: Mapping, series: bool): _IMPL_TYPE: ClassVar[type[starlink.Ast.CmpMap]] = starlink.Ast.CmpMap + class ZoomMap(Mapping): + def __init__(self, n_coord: int, zoom: float): + super().__init__(starlink.Ast.ZoomMap(n_coord, zoom)) + + _IMPL_TYPE: ClassVar[type[starlink.Ast.ZoomMap]] = starlink.Ast.ZoomMap + + class PolyMap(Mapping): + def __init__(self, coeff_f: Any, coeff_i_or_nout: Any, options: str = ""): + # astshim's PolyMap takes ``nout`` as the second positional; + # starlink.Ast.PolyMap requires an explicit inverse-coefficient + # array. Adapt to both by synthesizing an empty inverse when + # an integer ``nout`` is supplied. 
+ coeff_f_arr = np.asarray(coeff_f, dtype=float) + if isinstance(coeff_i_or_nout, int): + nin = coeff_f_arr.shape[1] - 2 + coeff_i = np.zeros((0, 2 + nin), dtype=float) + else: + coeff_i = np.asarray(coeff_i_or_nout, dtype=float) + super().__init__(starlink.Ast.PolyMap(coeff_f_arr, coeff_i, options)) + + _IMPL_TYPE: ClassVar[type[starlink.Ast.PolyMap]] = starlink.Ast.PolyMap + class Frame(Mapping): def __init__(self, n_axes: int, options: str = ""): super().__init__(starlink.Ast.Frame(n_axes, options)) diff --git a/python/lsst/images/tests/_roundtrip.py b/python/lsst/images/tests/_roundtrip.py index dfb987e6..992cd00f 100644 --- a/python/lsst/images/tests/_roundtrip.py +++ b/python/lsst/images/tests/_roundtrip.py @@ -11,7 +11,7 @@ from __future__ import annotations -__all__ = ("RoundtripFits", "RoundtripJson", "RoundtripNdf", "TemporaryButler") +__all__ = ("RoundtripFits", "RoundtripJson", "RoundtripNdf", "RoundtripZarr", "TemporaryButler") import tempfile import unittest @@ -328,3 +328,45 @@ def _read(self, obj_type: Any, filename: str) -> ReadResult: from .. import ndf return ndf.read(obj_type, filename) + + +class RoundtripZarr[T](RoundtripBase[T]): + """Round-trip helper for the zarr backend. + + Zarr archives are directories rather than single files, so the + base class's ``NamedTemporaryFile`` pattern doesn't fit. + ``_run_without_butler`` is overridden to use a ``TemporaryDirectory`` + and a fresh archive path inside it. + """ + + def inspect(self) -> Any: + """Open the zarr archive's IR for inspection.""" + import zarr as _zarr + + from ..zarr._model import ZarrDocument + + return ZarrDocument.from_zarr(_zarr.storage.LocalStore(self.filename, read_only=True)) + + def _get_extension(self) -> str: + return ".zarr" + + def _write(self, obj: Any, filename: str) -> ArchiveTree: + from .. import zarr as zarr_backend + + return zarr_backend.write(obj, filename) + + def _read(self, obj_type: Any, filename: str) -> ReadResult: + from .. 
import zarr as zarr_backend + + return zarr_backend.read(obj_type, filename) + + def _run_without_butler(self) -> None: + import os + + parent = self._exit_stack.enter_context(tempfile.TemporaryDirectory()) + target = os.path.join(parent, f"out{self._get_extension()}") + self._filename = target + self._serialized = self._write(self._original, target) + read_result = self._read(type(self._original), target) + self._tc.assertIsNone(read_result.butler_info) + self.result = read_result.deserialized diff --git a/python/lsst/images/zarr/__init__.py b/python/lsst/images/zarr/__init__.py new file mode 100644 index 00000000..44bd8bc8 --- /dev/null +++ b/python/lsst/images/zarr/__init__.py @@ -0,0 +1,125 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +"""Zarr v3 archive backend for `lsst.images`. + +This module reads and writes Zarr v3 archives whose root layout is +xarray/CF-shaped (``image``, ``variance``, ``mask`` as siblings sharing +``(y, x)`` dimensions, CF ``flag_masks`` / ``flag_meanings`` / +``flag_descriptions`` on the mask) with OME-NGFF v0.5 multiscales +metadata as a discoverability layer pointing at the same ``image`` +array. The same bytes are visible to ``xarray``, GDAL's Zarr driver, +and OME-Zarr tooling like ``napari`` and ``ome-zarr-py``. + +Supported types +--------------- + +Every image type that already serializes to FITS / JSON / NDF: +`~lsst.images.Image`, `~lsst.images.Mask`, `~lsst.images.MaskedImage`, +`~lsst.images.VisitImage`, `~lsst.images.ColorImage`, plus any object +reachable through the `~lsst.images.serialization.OutputArchive` +interface. 
+ +On-disk layout +-------------- + +A `~lsst.images.MaskedImage` archive contains: + +- ``image``, ``variance``, ``mask`` arrays at the root, shaped + ``(Y, X)`` with shared chunk sizes. +- ``lsst_json`` — 1-D ``uint8`` zarr array containing UTF-8 JSON of + the Pydantic archive tree (the round-trip authority). The same name + convention is used by the FITS backend's ``JSON`` HDU and the NDF + backend's ``/MORE/LSST/JSON`` path. WCS information (including + full SIP / PolyMap distortion coefficients) lives inside this JSON + as part of the projection sub-tree. + +The mask is a 2-D unsigned integer (``uint8`` for ≤8 planes, up to +``uint64`` for 64 planes; >64 raises). Each pixel's bits encode the +applicable mask planes. + +For `~lsst.images.ColorImage`, the three channels are written as flat 2-D arrays +at ``red``, ``green``, ``blue``. + +For ``CellCoadd``, ``image`` / ``variance`` / ``mask`` are siblings +(cell-aligned chunks driven by ``cell_shape``), and ``psf`` is a 4-D +``(Cy, Cx, Py, Px)`` array with single-cell chunks +``(1, 1, Py, Px)``. + +WCS handling +------------ + +The full WCS (frames, mappings, polynomial distortions) round-trips +through the JSON tree at ``lsst_json``. The layout layer also emits +an OME-NGFF v0.5 affine ``coordinateTransformations`` block on the +root group as a discoverability aid for OME tooling. Before emitting, +residuals are sampled on an 11×11 grid; if the worst pixel-equivalent +error exceeds 1.0 pixel, the affine block is dropped and +``lsst.wcs_simplified_dropped: true`` is recorded with the observed +maximum. The OME block is informational only — readers always +reconstruct the projection from the JSON tree. + +Cloud-friendly defaults +----------------------- + +- Default chunk geometry is tile-aligned: ``min(256, dim)`` per + axis for plain images, ``cell_shape`` for ``CellCoadd``, + single-cell for ``CellCoadd``'s 4-D PSF. The per-axis cap is + configurable via the `DEFAULT_CHUNK_AXIS_LIMIT` constant. 
+- Bulk pixel arrays (``image``, ``variance``, ``mask``, and + ``CellCoadd``'s ``psf``) are sharded by default to keep object + counts on S3 / GCS low. The shard size is chosen by a byte-budget + rule (~16 MiB by default; tunable via the + ``LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`` environment variable). + Tiny single-chunk arrays (``lsst_json``, ``wcs_ast``, FITS + opaque-metadata blocks) stay unsharded. +- Subset reads via ``slices=`` to + `~lsst.images.serialization.InputArchive.get_array` exploit zarr's + chunk index: only chunks intersecting the slice are fetched, even + from remote stores. +- Both ``LocalStore`` and ``ZipStore`` are supported; the choice + is driven by URI shape (``*.zarr.zip`` → ``ZipStore``, otherwise + a directory). Remote URIs (``s3://``, ``gs://``, ``http(s)://``) go + through `lsst.resources.ResourcePath` and ``fsspec``. + +Round-trip with FITS +-------------------- + +When an object that originated from a FITS read carries a +`~lsst.images.fits.FitsOpaqueMetadata`, the primary-HDU header is +preserved at ``/lsst/opaque_metadata/fits/primary``. Reading the +zarr back attaches an equivalent ``FitsOpaqueMetadata`` to the +deserialized object so a subsequent FITS write reproduces the +original cards. + +Optional install +---------------- + +This backend requires ``zarr >= 3.0``. Install via the ``[zarr]`` +extra:: + + pip install lsst-images[zarr] + +The top-level ``import lsst.images.zarr`` raises a clear +`ImportError` with this guidance if ``zarr`` is not installed. +""" + +try: + import zarr # noqa: F401 +except ImportError as e: + raise ImportError( + "lsst.images.zarr requires the optional 'zarr' package (>=3.0). " + "Install it directly or via 'pip install lsst-images[zarr]'." 
+ ) from e + +from ._common import * +from ._input_archive import * +from ._output_archive import * diff --git a/python/lsst/images/zarr/_common.py b/python/lsst/images/zarr/_common.py new file mode 100644 index 00000000..01db83aa --- /dev/null +++ b/python/lsst/images/zarr/_common.py @@ -0,0 +1,161 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ( + "DEFAULT_CHUNK_AXIS_LIMIT", + "DEFAULT_TARGET_SHARD_BYTES", + "LSST_NS", + "LSST_VERSION", + "OME_NS", + "OME_VERSION", + "ZarrCompressionOptions", + "ZarrPointerModel", + "archive_path_to_zarr_path", + "mask_dtype_for_plane_count", +) + +import os +from dataclasses import dataclass +from typing import ClassVar, Self + +import numpy as np +import pydantic + +LSST_NS = "lsst" +"""Top-level zarr-attributes namespace key for LSST extensions.""" + +OME_NS = "ome" +"""Top-level zarr-attributes namespace key for OME-NGFF metadata.""" + +OME_VERSION = "0.5" +"""OME-Zarr / NGFF version this backend writes.""" + +LSST_VERSION = 1 +"""Schema version of the ``lsst:`` extension this backend writes. + +Readers refuse versions newer than they understand. Bump on +backwards-incompatible changes to the on-disk layout. +""" + +DEFAULT_CHUNK_AXIS_LIMIT = 256 +"""Per-axis cap on the auto-derived chunk shape for plain image arrays. + +Used by `lsst.images.zarr._layout.chunks_for` when the caller does not +supply an explicit override and the archive class does not have a +class-specific chunk rule. Chunks of ~256 elements per spatial axis +trade some compression ratio for cutout-friendly partial reads. 
+""" + + +def _read_target_shard_bytes() -> int: + """Read ``LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`` or return the default. + + Parsed as a base-10 integer. A non-integer value raises ``ValueError`` + at import time — silent typos are worse than loud failure. + """ + raw = os.environ.get("LSST_IMAGES_ZARR_TARGET_SHARD_BYTES") + if raw is None: + return 16 * 1024 * 1024 + try: + return int(raw) + except ValueError as exc: + raise ValueError(f"LSST_IMAGES_ZARR_TARGET_SHARD_BYTES={raw!r} is not a base-10 integer.") from exc + + +DEFAULT_TARGET_SHARD_BYTES: int = _read_target_shard_bytes() +"""Target uncompressed byte size for an auto-derived shard. + +Read from ``LSST_IMAGES_ZARR_TARGET_SHARD_BYTES`` once at import time; +defaults to 16 MiB. Used by `lsst.images.zarr._layout.default_shards` to +decide how many chunks to combine into a shard. +""" + + +class ZarrPointerModel(pydantic.BaseModel): + """Reference to a zarr archive sub-tree by absolute zarr path. + + Used by `ZarrOutputArchive` / `ZarrInputArchive` to point to + sub-trees that have been hoisted out of the main JSON tree into + separate zarr arrays. The path is interpreted relative to the + archive root, e.g. ``"/lsst/psf/lsst_json"``. + """ + + path: str + """Absolute zarr path (e.g. ``/lsst/psf/lsst_json``).""" + + +@dataclass(frozen=True) +class ZarrCompressionOptions: + """Per-array zarr v3 codec configuration. + + The default codec stack is ``bytes -> blosc(zstd, clevel=5)`` with + byte-shuffle for floats and bit-shuffle for integers (and masks). + All defaults are overridable per-array via the ``compression`` + keyword to ``write()``. 
+ """ + + codec: str = "blosc" + cname: str = "zstd" + clevel: int = 5 + shuffle: str = "shuffle" # 'shuffle' (byte) or 'bitshuffle' or 'noshuffle' + + DEFAULT_FLOAT: ClassVar[Self] + DEFAULT_INT: ClassVar[Self] + + @classmethod + def default_for_dtype(cls, dtype: str | np.dtype) -> Self: + """Return the default codec stack for a numpy dtype.""" + kind = np.dtype(dtype).kind + # 'u' (unsigned int), 'i' (signed int), 'b' (bool) -> bit-shuffle. + if kind in ("u", "i", "b"): + return cls.DEFAULT_INT + return cls.DEFAULT_FLOAT + + +ZarrCompressionOptions.DEFAULT_FLOAT = ZarrCompressionOptions(shuffle="shuffle") +ZarrCompressionOptions.DEFAULT_INT = ZarrCompressionOptions(shuffle="bitshuffle") + + +def archive_path_to_zarr_path(archive_path: str) -> str: + """Translate a serialization archive path to its zarr path. + + The empty archive path maps to the root-level JSON tree at + ``/lsst_json``. Non-empty archive paths are kept verbatim (with a + leading slash). The v1 design's JSON-pointer mapping table is + intentionally absent: arrays land where their archive name says + they do. + """ + if not archive_path: + return "/lsst_json" + stripped = archive_path.strip("/") + return f"/{stripped}" + + +def mask_dtype_for_plane_count(n_planes: int) -> np.dtype: + """Pick the smallest unsigned-integer dtype that holds ``n_planes`` bits. + + Returns ``uint8`` for <=8 planes, ``uint16`` for <=16, ``uint32`` + for <=32, ``uint64`` for <=64. Raises `ValueError` for >64 planes; + a 3-D fallback for that case is tracked as a follow-up. + """ + if n_planes <= 0: + raise ValueError(f"n_planes must be positive, got {n_planes}.") + if n_planes <= 8: + return np.dtype("uint8") + if n_planes <= 16: + return np.dtype("uint16") + if n_planes <= 32: + return np.dtype("uint32") + if n_planes <= 64: + return np.dtype("uint64") + raise ValueError(f"Mask has {n_planes} planes; v1 supports up to 64. 
3-D fallback is a follow-up.") diff --git a/python/lsst/images/zarr/_input_archive.py b/python/lsst/images/zarr/_input_archive.py new file mode 100644 index 00000000..ac957129 --- /dev/null +++ b/python/lsst/images/zarr/_input_archive.py @@ -0,0 +1,246 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ("ZarrInputArchive", "read") + +from collections.abc import Callable, Iterator +from contextlib import contextmanager +from types import EllipsisType +from typing import Any, Self + +import astropy.io.fits +import astropy.table +import numpy as np + +from lsst.resources import ResourcePathExpression + +from .._transforms import FrameSet +from ..fits._common import FitsOpaqueMetadata +from ..serialization import ( + ArchiveReadError, + ArchiveTree, + ArrayReferenceModel, + InlineArrayModel, + InputArchive, + ReadResult, + TableModel, + no_header_updates, +) +from ._common import LSST_VERSION, ZarrPointerModel +from ._layout import deserialize_fits_opaque_metadata +from ._model import ZarrArray, ZarrDocument +from ._store import open_store_for_read + + +class ZarrInputArchive(InputArchive[ZarrPointerModel]): + """Reads zarr archives written by `ZarrOutputArchive`.""" + + def __init__(self, document: ZarrDocument) -> None: + self._document = document + self._validate_root_attributes() + self._deserialized_pointer_cache: dict[str, Any] = {} + self._frame_set_cache: dict[str, FrameSet] = {} + self._opaque_metadata = deserialize_fits_opaque_metadata(document) + + def get_opaque_metadata(self) -> FitsOpaqueMetadata | None: + """Return any FITS opaque metadata recovered from 
the archive.""" + return self._opaque_metadata + + @classmethod + @contextmanager + def open(cls, path: ResourcePathExpression) -> Iterator[Self]: + """Open a zarr archive for reading.""" + with open_store_for_read(path) as store: + doc = ZarrDocument.from_zarr(store) + yield cls(doc) + + @property + def document(self) -> ZarrDocument: + return self._document + + def get_tree[T: ArchiveTree](self, model_type: type[T]) -> T: + """Read and validate the main Pydantic tree at ``/lsst_json``.""" + try: + node = self._document.root.get("/lsst_json") + except KeyError: + raise ArchiveReadError( + "File has no /lsst_json array; this is not an LSST zarr archive." + ) from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError("/lsst_json must be a zarr array, not a group.") + json_bytes = bytes(node.read()) + return model_type.model_validate_json(json_bytes.decode("utf-8")) + + def _validate_root_attributes(self) -> None: + attrs = self._document.root.attributes.lsst + if "archive_class" not in attrs: + raise ArchiveReadError("File is not an LSST zarr archive (missing lsst.archive_class).") + version = attrs.get("__version_remembered_at_load__", LSST_VERSION) + if version > LSST_VERSION: + raise ArchiveReadError( + f"Unsupported lsst:version {version}; this reader supports up to {LSST_VERSION}." 
+ ) + + def deserialize_pointer[U: ArchiveTree, V]( + self, + pointer: ZarrPointerModel, + model_type: type[U], + deserializer: Callable[[U, InputArchive[ZarrPointerModel]], V], + ) -> V: + if (cached := self._deserialized_pointer_cache.get(pointer.path)) is not None: + return cached + try: + node = self._document.root.get(pointer.path) + except KeyError: + raise ArchiveReadError(f"Pointer reference {pointer.path!r} not in store.") from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError(f"Pointer target {pointer.path!r} is not an array.") + json_text = bytes(node.read()).decode("utf-8") + model = model_type.model_validate_json(json_text) + result = deserializer(model, self) + self._deserialized_pointer_cache[pointer.path] = result + if isinstance(result, FrameSet): + self._frame_set_cache[pointer.path] = result + return result + + def get_frame_set(self, pointer: ZarrPointerModel) -> FrameSet: + try: + return self._frame_set_cache[pointer.path] + except KeyError: + raise AssertionError( + f"Frame set at {pointer.path!r} must be deserialised via " + f"deserialize_pointer before any dependent transform can be." + ) from None + + def get_array( + self, + model: ArrayReferenceModel | InlineArrayModel, + *, + slices: tuple[slice, ...] | EllipsisType = ..., + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> np.ndarray: + if isinstance(model, InlineArrayModel): + data: np.ndarray = np.array(model.data, dtype=model.datatype.to_numpy()) + return data if slices is ... else data[slices] + if not isinstance(model.source, str) or not model.source.startswith("zarr:"): + raise ArchiveReadError( + f"ZarrInputArchive cannot resolve array source {model.source!r}; " + f"expected a 'zarr:' reference." 
+ ) + zarr_path = model.source[len("zarr:") :] + try: + node = self._document.root.get(zarr_path) + except KeyError: + raise ArchiveReadError(f"Array reference {zarr_path!r} not in store.") from None + if not isinstance(node, ZarrArray): + raise ArchiveReadError(f"{zarr_path!r} is not an array.") + + # Mask unpack: model claims 3-D (mask_size, y, x); on-disk is 2-D + # (y, x) packed wide-int with flag_masks attribute. + claimed_shape = tuple(model.shape) if model.shape is not None else None + if ( + claimed_shape is not None + and len(claimed_shape) == 3 + and len(node.shape) == 2 + and "flag_masks" in node.attributes.extra + ): + return self._read_packed_mask(node, claimed_shape, np.dtype(model.datatype.to_numpy()), slices) + + # Standard path: forward slices straight to the lazy handle. + return node.read(slices=slices) + + def _read_packed_mask( + self, + node: ZarrArray, + claimed_shape: tuple[int, ...], + element_dtype: np.dtype, + slices: tuple[slice, ...] | EllipsisType, + ) -> np.ndarray: + """Unpack a 2-D wide-int mask back to 3-D ``(mask_size, y, x)``. + + Mask deserialization expects the storage layout that + ``Mask.serialize`` streamed — ``(mask_size, y, x)`` — with one + ``element_dtype`` element per slice along the leading axis, + matching the schema's element packing. Each element's bits + live at packed positions ``[stride*i, stride*(i+1))`` where + ``stride = 8 * element_dtype.itemsize``. Rank-3 ``slices`` + from the deserializer are ``(element_axis, y_slice, + x_slice)``; the leading slice is stripped before forwarding + the spatial slice to the lazy handle and re-applied to the + unpacked output. + """ + mask_size = claimed_shape[-1] + # Forward slice to the lazy handle so only intersecting chunks + # are fetched even on remote stores. + if slices is ...: + spatial_slices: tuple[slice, ...] | EllipsisType = ... + element_slice: slice | EllipsisType = ... 
+ elif len(slices) == 3: + element_slice = slices[0] + spatial_slices = slices[1:] + else: + spatial_slices = slices + element_slice = ... + packed = node.read(slices=spatial_slices) + stride = 8 * element_dtype.itemsize + element_mask = (np.uint64(1) << np.uint64(stride)) - np.uint64(1) + out = np.empty((mask_size,) + packed.shape, dtype=element_dtype) + for i in range(mask_size): + out[i] = ((packed >> np.uint64(stride * i)) & element_mask).astype(element_dtype) + if element_slice is ...: + return out + return out[element_slice] + + def get_table( + self, + model: TableModel, + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> astropy.table.Table: + result = astropy.table.Table(meta=model.meta) + for column_model in model.columns: + if isinstance(column_model.data, InlineArrayModel): + data: Any = column_model.data.data + else: + data = self.get_array(column_model.data, strip_header=strip_header) + result[column_model.name] = astropy.table.Column( + data, + name=column_model.name, + dtype=column_model.data.datatype.to_numpy(), + unit=column_model.unit, + description=column_model.description, + meta=column_model.meta, + ) + return result + + def get_structured_array( + self, + model: TableModel, + strip_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> np.ndarray: + return self.get_table(model, strip_header).as_array() + + +def read[T: Any](cls: type[T], path: ResourcePathExpression, **kwargs: Any) -> ReadResult[T]: + """Read an object from a zarr archive. + + The archive's root attributes name the in-memory class via + ``lsst.archive_class``. Files without this attribute raise; auto- + detect of foreign zarr files is a follow-up. 
+ """ + with ZarrInputArchive.open(path) as archive: + tree_type = cls._get_archive_tree_type(ZarrPointerModel) + tree = archive.get_tree(tree_type) + obj = tree.deserialize(archive, **kwargs) + if (opaque := archive.get_opaque_metadata()) is not None: + obj._opaque_metadata = opaque + return ReadResult(obj, tree.metadata, tree.butler_info) diff --git a/python/lsst/images/zarr/_layout.py b/python/lsst/images/zarr/_layout.py new file mode 100644 index 00000000..140d2c49 --- /dev/null +++ b/python/lsst/images/zarr/_layout.py @@ -0,0 +1,380 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +"""Per-archive-class layout rules for the zarr backend. + +This module centralises the decisions that vary by image type: + +- which OME axes apply (``ColorImage`` has no root multiscale) +- default chunk sizes (clamped to ``DEFAULT_CHUNK_AXIS_LIMIT`` per axis, + cell-aligned for `CellCoadd`, image-aligned for `variance` / `mask` + siblings) +- the affine residual validator that gates the OME + ``coordinateTransformations`` block + +Keeping these in one place lets the output archive populate the IR +generically. 
def axes_for_archive_class(name: str) -> tuple[str, ...]:
    """Look up the OME axis names used for an archive class.

    ``"ColorImage"`` maps to an empty tuple, which signals that no OME
    multiscale is written at the root for that class. Every other class
    name maps to ``("y", "x")``.
    """
    return () if name == "ColorImage" else ("y", "x")
+ ) + return tuple(override) + if archive_class == "CellCoadd" and archive_metadata is not None: + cell_shape = archive_metadata.get("cell_shape") + if cell_shape is not None: + return tuple(min(c, dim) for c, dim in zip(cell_shape, shape, strict=True)) + return tuple(min(DEFAULT_CHUNK_AXIS_LIMIT, dim) for dim in shape) + + +def chunks_aligned_to( + *, + image_chunks: tuple[int, ...], + shape: tuple[int, ...], +) -> tuple[int, ...]: + """Derive a sibling array's chunks from the ``image`` array's chunks. + + Used by `ZarrOutputArchive.add_array` for ``variance`` and + ``mask`` siblings when the user has not provided an explicit + override. The result is per-axis ``min(image_chunks[i], + shape[i])`` so a sibling smaller than ``image`` is not + over-chunked. + """ + if len(image_chunks) != len(shape): + raise ValueError( + f"image_chunks rank {len(image_chunks)} does not match sibling shape rank {len(shape)}." + ) + return tuple(min(c, dim) for c, dim in zip(image_chunks, shape, strict=True)) + + +def default_shards( + *, + chunks: tuple[int, ...], + shape: tuple[int, ...], + dtype: np.dtype, + target_bytes: int, +) -> tuple[int, ...] | None: + """Derive a default shard shape from ``chunks``, ``shape``, and ``dtype``. + + Returns ``None`` when sharding would be a no-op: ``dtype.itemsize`` + is zero (object dtypes), the array is already a single chunk per + axis, the chunk is already at least ``target_bytes`` big, or the + byte budget rounds to ``k == 1`` chunks per growable axis. + + The rule grows only axes whose ``chunks[i] < shape[i]`` (the + others already cover the full extent), uses one uniform multiplier + ``k = round(ratio ** (1 / num_growable_axes))`` to stay close to + the byte budget, and caps each axis at ``chunks[i] * ceil(shape[i] + / chunks[i])`` so a small array does not get a shard larger than + itself. Every shard axis is an integer multiple of the + corresponding chunk axis, as required by zarr v3. 
+ + Parameters + ---------- + chunks + Chunk shape, one int per axis. + shape + Array shape, one int per axis. + dtype + Array dtype; only ``itemsize`` is consulted. + target_bytes + Target uncompressed shard size. Typically + `~lsst.images.zarr._common.DEFAULT_TARGET_SHARD_BYTES`. + + Raises + ------ + ValueError + If ``len(chunks) != len(shape)``. + """ + if len(chunks) != len(shape): + raise ValueError(f"chunks rank {len(chunks)} does not match shape rank {len(shape)}.") + itemsize = dtype.itemsize + if itemsize == 0: + return None + chunk_bytes = math.prod(chunks) * itemsize + if chunk_bytes >= target_bytes: + return None + growable = [i for i in range(len(shape)) if chunks[i] < shape[i]] + if not growable: + return None + ratio = target_bytes / chunk_bytes + k = round(ratio ** (1.0 / len(growable))) + if k <= 1: + return None # budget allows at most a 1x multiplier — no-op shard + shard = list(chunks) + for i in growable: + n_chunks_axis = math.ceil(shape[i] / chunks[i]) + shard[i] = min(chunks[i] * k, chunks[i] * n_chunks_axis) + return tuple(shard) + + +@dataclass +class AffineCheckResult: + """Result of asking AST whether a simplified affine fits a full WCS. + + When ``dropped`` is False, ``coordinate_transformations`` is the + OME-NGFF ``coordinateTransformations`` list to emit. When True, + AST could not find a linear approximation that stays within the + requested per-pixel tolerance over the whole image footprint, and + the caller must omit the block (or emit a unit scale only). + """ + + dropped: bool + coordinate_transformations: list[dict[str, Any]] | None + + +def affine_check( + *, + frame_set: Any, + image_shape: tuple[int, int], + max_residual_pixels: float = 1.0, +) -> AffineCheckResult: + """Build an OME affine ``coordinateTransformations`` from ``frame_set``. + + Delegates to AST's ``linearapprox`` over the image footprint with + a tolerance scaled to ``max_residual_pixels`` of pixel-equivalent + error. 
AST returns the affine coefficients when the approximation + fits and ``None`` otherwise. + + Parameters + ---------- + frame_set + AST FrameSet whose base→current mapping goes from pixel + coordinates to sky. + image_shape + ``(h, w)`` of the image; used as the bounds of the box AST is + asked to approximate over. + max_residual_pixels + Maximum permitted deviation, in pixels, of any point in the + box from the linear prediction. AST is given the equivalent + threshold in output (sky) units after multiplying by the local + pixel scale. + """ + h, w = image_shape + mapping = frame_set.getMapping(frame_set.base, frame_set.current) + + # Local pixel scale near the image origin: convert the user-supplied + # pixel tolerance into the output-coordinate units AST expects. + sample = _frame_set_apply(frame_set, np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])) + origin = sample[0] + pixel_scale_axis0 = float(np.linalg.norm(sample[1] - origin)) + pixel_scale_axis1 = float(np.linalg.norm(sample[2] - origin)) + pixel_scale = float(np.sqrt(pixel_scale_axis0 * pixel_scale_axis1)) + if pixel_scale <= 0.0: + return AffineCheckResult(dropped=True, coordinate_transformations=None) + + tol_output = max_residual_pixels * pixel_scale + coeffs = mapping.linearApprox( + [0.0, 0.0], + [float(max(h - 1, 0)), float(max(w - 1, 0))], + tol_output, + ) + if coeffs is None: + # AST could not find a linear approximation within the requested + # tolerance over the image footprint. + return AffineCheckResult(dropped=True, coordinate_transformations=None) + + # AST coeffs layout for (Nin=2, Nout=2): the first Nout entries are + # the per-output constants; the remaining Nout*Nin entries are the + # Jacobian, ordered column-major (all ∂y/∂x_0 first, then all + # ∂y/∂x_1, etc.). + if len(coeffs) != 6: + raise ValueError( + f"linearApprox returned {len(coeffs)} coefficients; expected 6 for a 2-D pixel→sky mapping." 
+ ) + c0, c1, j00, j10, j01, j11 = (float(x) for x in coeffs) + + # Pixel scale per input axis: length of the corresponding Jacobian + # column in output coordinates. + scale_axis0 = float(np.hypot(j00, j10)) + scale_axis1 = float(np.hypot(j01, j11)) + + # NGFF composes ``coordinateTransformations`` in list order: the + # scale is applied first, then the affine. To avoid double-counting + # the pixel-size factor, normalise each Jacobian column by its + # length so the affine carries only the rotation / shear that the + # scale does not capture. ``pixel_scale`` is the geometric mean of + # the two column norms; if it were zero we'd already have returned + # above, so dividing by ``scale_axis*`` is safe here. + j00_n = j00 / scale_axis0 + j10_n = j10 / scale_axis0 + j01_n = j01 / scale_axis1 + j11_n = j11 / scale_axis1 + affine_matrix = [[j00_n, j01_n, c0], [j10_n, j11_n, c1], [0.0, 0.0, 1.0]] + + coordinate_transformations: list[dict[str, Any]] = [ + {"type": "scale", "scale": [scale_axis0, scale_axis1]}, + {"type": "affine", "affine": affine_matrix}, + ] + return AffineCheckResult( + dropped=False, + coordinate_transformations=coordinate_transformations, + ) + + +def _frame_set_apply(frame_set: Any, pixels: Any) -> Any: + """Apply ``frame_set``'s base->current mapping to a (N, 2) pixel array.""" + pixels = np.asarray(pixels, dtype=float) + mapping = frame_set.getMapping(frame_set.base, frame_set.current) + out = mapping.applyForward(pixels.T) + return np.asarray(out).T + + +def decorate_sub_archives(document: ZarrDocument) -> None: + """Decorate sub-archive groups with ``lsst.archive_class`` and OME attrs. + + A sub-archive is any group below the root that contains an + ``image`` array. Decoration adds ``lsst.archive_class = "Image"`` + and an ``ome.multiscales`` block pointing at the sub-archive's + ``image`` array. Recursive: nested sub-archives are decorated too. 
+ + The root group is left alone — its ``lsst.archive_class`` is set + by ``add_tree`` based on the in-memory object's type. + """ + if not isinstance(document, ZarrDocument): + raise TypeError(type(document).__name__) + _decorate_walk(document.root) + + +def _decorate_walk(group: Any) -> None: + for sub in group.groups.values(): + if "image" in sub.arrays: + sub.attributes.lsst.setdefault("archive_class", "Image") + if "lsst_json" in sub.arrays: + sub.attributes.lsst.setdefault("json", "lsst_json") + if "multiscales" not in sub.attributes.ome: + multiscale = OmeMultiscale( + name="image", + axes=("y", "x"), + dataset_path="image", + ) + sub.attributes.ome["multiscales"] = [multiscale.dump()] + _decorate_walk(sub) + + +def serialize_fits_opaque_metadata(document: ZarrDocument, opaque: FitsOpaqueMetadata) -> None: + """Stage a `FitsOpaqueMetadata` object into the IR. + + Stores the primary-HDU header as a 2-D ``(N, 80)`` ``uint8`` array + at ``/lsst/opaque_metadata/fits/primary`` — one row per FITS card, + one column per character — and sets ``lsst.opaque_metadata_format + = "fits"`` on the root group. The bytes are + ``astropy.io.fits.Header.tostring()`` output verbatim (cards + + ``END`` + padding to a 2880-byte block), so the round-trip is + byte-exact and preserves comments, ``HISTORY``, ``COMMENT``, + ``CONTINUE``, and ``HIERARCH`` cards. No-op if the metadata is + empty or missing a primary header. + """ + primary = opaque.headers.get(ExtensionKey()) + if primary is None or len(primary) == 0: + return + text = primary.tostring() + if len(text) % 80 != 0: + raise ValueError( + f"Header.tostring() returned {len(text)} bytes; expected a " + "multiple of 80 (one 80-char FITS card per row)." + ) + n_cards = len(text) // 80 + cards = np.ascontiguousarray(np.frombuffer(text.encode("ascii"), dtype=np.uint8).reshape(n_cards, 80)) + parent = document.root.ensure_group("/lsst/opaque_metadata/fits") + # Single chunk: the header is always read whole. 
def deserialize_fits_opaque_metadata(document: ZarrDocument) -> FitsOpaqueMetadata | None:
    """Rebuild the FITS opaque metadata stored in ``document``, if any.

    Returns ``None`` when the root ``lsst.opaque_metadata_format``
    attribute is not ``"fits"``, when
    ``/lsst/opaque_metadata/fits/primary`` is missing, or when that
    path is not an array. Otherwise the array's bytes are decoded as
    ASCII, parsed with ``astropy.io.fits.Header.fromstring`` and stored
    as the primary header of a fresh `FitsOpaqueMetadata`.
    """
    root = document.root
    if root.attributes.lsst.get("opaque_metadata_format") != "fits":
        return None
    try:
        primary = root.get("/lsst/opaque_metadata/fits/primary")
    except KeyError:
        return None
    if isinstance(primary, ZarrArray):
        card_text = bytes(primary.read()).decode("ascii")
        parsed = astropy.io.fits.Header.fromstring(card_text)
        restored = FitsOpaqueMetadata()
        restored.headers[ExtensionKey()] = parsed
        return restored
    return None
``ZarrOutputArchive`` +populates a `ZarrDocument`; on context-manager exit, `to_zarr` materializes +it through a configured ``zarr.storage.Store``. + +Reads invert that flow: ``ZarrInputArchive`` opens the store and calls +`ZarrDocument.from_zarr`, which builds the IR around **lazy** ``zarr.Array`` +handles. No array bytes are read until a caller asks for them via +`ZarrArray.read`, which forwards slices straight to the underlying handle. +This keeps subset reads of remote files cheap: only the chunks intersecting +the requested slice are fetched. +""" + +from __future__ import annotations + +__all__ = ( + "CfFlagAttributes", + "MaskPlaneEntry", + "OmeMultiscale", + "OmeOmeroChannel", + "ZarrArray", + "ZarrAttributes", + "ZarrDocument", + "ZarrGroup", + "build_image_array_attrs", +) + +from dataclasses import dataclass, field +from types import EllipsisType +from typing import Any, Self, cast + +import numpy as np +import zarr +from zarr.abc.store import Store +from zarr.codecs import BloscCodec, BytesCodec + +from ._common import LSST_NS, LSST_VERSION, OME_NS, OME_VERSION, ZarrCompressionOptions + + +@dataclass +class ZarrAttributes: + """Namespaced attributes attached to a `ZarrGroup` or `ZarrArray`. + + Three namespaces: + + - ``lsst`` — LSST extensions (always emitted with a ``version`` key). + - ``ome`` — OME-NGFF (emitted only when non-empty). + - ``extra`` — flat top-level keys for CF / xarray conventions + (``_ARRAY_DIMENSIONS``, ``flag_masks``, ``flag_meanings``, + ``flag_descriptions``, ``units``, ``long_name``, …). These live at + the top of ``zarr.json`` ``attributes`` so xarray and CF tooling + see them without unwrapping a namespace. 
+ """ + + lsst: dict[str, Any] = field(default_factory=dict) + ome: dict[str, Any] = field(default_factory=dict) + extra: dict[str, Any] = field(default_factory=dict) + + def dump(self) -> dict[str, Any]: + """Return the raw mapping zarr-python writes to ``zarr.json``.""" + out: dict[str, Any] = dict(self.extra) + # lsst is always present so readers can dispatch on lsst.archive_class. + public_lsst = {k: v for k, v in self.lsst.items() if not k.startswith("__")} + out[LSST_NS] = {"version": LSST_VERSION, **public_lsst} + if self.ome: + out[OME_NS] = {"version": OME_VERSION, **self.ome} + return out + + @classmethod + def load(cls, raw: dict[str, Any]) -> Self: + """Construct from a raw attributes mapping read from zarr.""" + lsst = dict(raw.get(LSST_NS, {})) + version = lsst.pop("version", None) + if version is not None: + # Stash the on-disk version under a private sentinel so the input + # archive can validate without going back to the raw store. + lsst["__version_remembered_at_load__"] = version + ome = dict(raw.get(OME_NS, {})) + ome.pop("version", None) + extra = {k: v for k, v in raw.items() if k not in (LSST_NS, OME_NS)} + return cls(lsst=lsst, ome=ome, extra=extra) + + +@dataclass +class ZarrArray: + """An IR node holding either staged numpy data or a lazy zarr handle. + + Parameters + ---------- + data + Either a ``numpy.ndarray`` (when staged for write by the output + archive) or a ``zarr.Array`` (when read by the input archive). + The two forms never mix in a single instance. + chunks + Per-axis chunk shape. ``None`` lets `to_zarr` derive a fallback + default for any IR node that reached the writer without explicit + chunks (the output archive normally sets these via the + `~lsst.images.zarr._layout.chunks_for` family of rules). + shards + Per-axis shard shape (zarr v3 native). ``None`` means the array + is unsharded. 
Populated by `ZarrOutputArchive` via the + `~lsst.images.zarr._layout.default_shards` rule for arrays large + enough to benefit; tiny / single-chunk arrays stay ``None``. + compression + Codec configuration. ``None`` falls back to + `ZarrCompressionOptions.default_for_dtype`. + attributes + Namespaced attributes for this array's ``zarr.json``. + """ + + data: np.ndarray | zarr.Array + chunks: tuple[int, ...] | None = None + shards: tuple[int, ...] | None = None + compression: ZarrCompressionOptions | None = None + attributes: ZarrAttributes = field(default_factory=ZarrAttributes) + + @property + def shape(self) -> tuple[int, ...]: + return tuple(self.data.shape) + + @property + def dtype(self) -> np.dtype: + return np.dtype(self.data.dtype) + + @classmethod + def from_zarr(cls, zarr_array: zarr.Array) -> Self: + """Wrap an open ``zarr.Array`` without reading its data.""" + attrs = ZarrAttributes.load(dict(zarr_array.attrs)) + # Mirror native zarr v3 ``dimension_names`` into the xarray v2-style + # ``_ARRAY_DIMENSIONS`` attribute when only the v3 form is present, + # so downstream consumers see both. + dim_names = getattr(zarr_array.metadata, "dimension_names", None) + if dim_names and "_ARRAY_DIMENSIONS" not in attrs.extra: + attrs.extra["_ARRAY_DIMENSIONS"] = list(dim_names) + return cls( + data=zarr_array, + chunks=tuple(zarr_array.chunks), + attributes=attrs, + ) + + def read(self, *, slices: tuple[slice, ...] | EllipsisType = ...) -> np.ndarray: + """Materialize this array (or a slice of it) into numpy. + + For a `ZarrArray` backed by a lazy handle, this is the only + place that touches array bytes. ``slices`` is forwarded straight + to the handle so only chunks intersecting the slice are fetched. + """ + if isinstance(self.data, np.ndarray): + return self.data if slices is ... else self.data[slices] + result = self.data[...] if slices is ... 
@dataclass
class ZarrGroup:
    """IR node for a zarr group: child groups, child arrays, and attributes."""

    groups: dict[str, ZarrGroup] = field(default_factory=dict)
    arrays: dict[str, ZarrArray] = field(default_factory=dict)
    attributes: ZarrAttributes = field(default_factory=ZarrAttributes)

    def get(self, path: str) -> ZarrGroup | ZarrArray:
        """Resolve ``path`` (absolute or relative) to a descendant node.

        ``""`` and ``"/"`` resolve to this group. At each level an array
        with the requested name takes precedence over a group with the
        same name.

        Raises
        ------
        KeyError
            If a component is missing, or if the path continues below
            an array. The full ``path`` is the exception argument.
        """
        if path in ("", "/"):
            return self
        node: ZarrGroup | ZarrArray = self
        for name in filter(None, path.strip("/").split("/")):
            if not isinstance(node, ZarrGroup):
                raise KeyError(path)
            for children in (node.arrays, node.groups):
                if name in children:
                    node = children[name]
                    break
            else:
                raise KeyError(path)
        return node

    def ensure_group(self, path: str) -> ZarrGroup:
        """Return the sub-group at ``path``, creating missing levels.

        Raises
        ------
        KeyError
            If any component of ``path`` already names an array.
        """
        if path in ("", "/"):
            return self
        node = self
        for name in filter(None, path.strip("/").split("/")):
            if name in node.arrays:
                raise KeyError(f"{name!r} already exists as an array.")
            node = node.groups.setdefault(name, ZarrGroup())
        return node
def _group_to_zarr(ir: ZarrGroup, zarr_group: zarr.Group) -> None:
    """Write a `ZarrGroup` IR into an open ``zarr.Group``.

    Attributes of ``ir`` are written first, then every child group is
    created and filled recursively, then every array is created, filled
    with its staged numpy data and given its attributes.

    Parameters
    ----------
    ir
        The IR group to materialize.
    zarr_group
        Open, writable zarr group that receives the content.

    Raises
    ------
    TypeError
        If an array node holds something other than a numpy array
        (for example a lazy handle obtained from a read).
    """
    if dumped := ir.attributes.dump():
        zarr_group.update_attributes(dumped)
    for name, sub in ir.groups.items():
        _group_to_zarr(sub, zarr_group.create_group(name))
    for name, array in ir.arrays.items():
        if not isinstance(array.data, np.ndarray):
            raise TypeError(
                f"Cannot write ZarrArray at {name!r}: data is a lazy zarr.Array, "
                "not numpy. Read it first or pass a fresh numpy array."
            )
        chunks = array.chunks or _default_chunks(array.data.shape)
        compression = array.compression or ZarrCompressionOptions.default_for_dtype(str(array.dtype))
        serializer, compressors = _build_codecs(compression)
        # Promote ``_ARRAY_DIMENSIONS`` from the CF-style attribute to the
        # native zarr v3 ``dimension_names`` metadata field; arrays without
        # explicit names fall back to ``[None] * ndim``.
        declared = array.attributes.extra.get("_ARRAY_DIMENSIONS")
        dim_names = [None] * array.data.ndim if declared is None else list(declared)
        zarr_array = zarr_group.create_array(
            name=name,
            shape=array.data.shape,
            chunks=chunks,
            dtype=array.data.dtype,
            shards=array.shards,
            serializer=serializer,
            compressors=compressors,
            dimension_names=dim_names,
        )
        # ``[...]`` selects the whole array for every rank, including
        # rank 0, where a single ``[:]`` is one index too many.
        zarr_array[...] = array.data
        if dumped := array.attributes.dump():
            zarr_array.update_attributes(dumped)
+ dataset_path: str = "image" + coordinate_transformations: list[dict[str, Any]] | None = None + + @staticmethod + def _axis_block(name: str) -> dict[str, Any]: + if name == "c": + return {"name": "c", "type": "channel"} + if name == "t": + return {"name": "t", "type": "time"} + return {"name": name, "type": "space", "unit": "pixel"} + + def dump(self) -> dict[str, Any]: + ndim = len(self.axes) + ct = self.coordinate_transformations + if ct is None: + ct = [{"type": "scale", "scale": [1.0] * ndim}] + return { + "name": self.name, + "axes": [self._axis_block(a) for a in self.axes], + "datasets": [ + { + "path": self.dataset_path, + "coordinateTransformations": ct, + } + ], + } + + +@dataclass +class OmeOmeroChannel: + """OME ``omero/channels`` entry (used only when a channel axis exists).""" + + label: str + color: str | None = None + + def dump(self) -> dict[str, Any]: + out: dict[str, Any] = {"label": self.label} + if self.color is not None: + out["color"] = self.color + return out + + +@dataclass +class MaskPlaneEntry: + """One mask-plane definition.""" + + name: str + bit: int + description: str = "" + + +@dataclass +class CfFlagAttributes: + """CF-conventions flag metadata for a 2-D packed mask array. + + Emits ``flag_masks`` (list of bit values), ``flag_meanings`` + (single space-separated string per CF), and the LSST extension + ``flag_descriptions`` (list of human-readable strings parallel to + ``flag_meanings``). 
+ """ + + planes: list[MaskPlaneEntry] = field(default_factory=list) + + def dump(self) -> dict[str, Any]: + return { + "flag_masks": [int(1 << p.bit) for p in self.planes], + "flag_meanings": " ".join(p.name for p in self.planes), + "flag_descriptions": [p.description for p in self.planes], + } + + @classmethod + def load(cls, raw: dict[str, Any]) -> Self: + meanings = raw.get("flag_meanings", "").split() + masks = [int(m) for m in raw.get("flag_masks", [])] + descriptions = list(raw.get("flag_descriptions", [""] * len(meanings))) + planes = [] + for name, mask, desc in zip(meanings, masks, descriptions, strict=False): + # Recover bit position from the mask value (always a power of 2). + bit = (mask & -mask).bit_length() - 1 + planes.append(MaskPlaneEntry(name=name, bit=bit, description=desc)) + return cls(planes=planes) + + +def build_image_array_attrs( + *, + axes: tuple[str, ...], + units: str | None = None, + long_name: str | None = None, +) -> dict[str, Any]: + """Build the CF / xarray attribute block for an image array. + + Used for arrays of rank 2 or higher. + """ + out: dict[str, Any] = {"_ARRAY_DIMENSIONS": list(axes)} + if units is not None: + out["units"] = units + if long_name is not None: + out["long_name"] = long_name + return out + + +def _build_codecs(options: ZarrCompressionOptions) -> tuple[Any, list[Any]]: + """Build a zarr v3 codec stack from `ZarrCompressionOptions`. + + Returns a ``(serializer, compressors)`` pair suitable for the + ``serializer=`` and ``compressors=`` keyword arguments of + `zarr.Group.create_array`. + """ + if options.codec != "blosc": + raise NotImplementedError(f"Unsupported codec {options.codec!r}.") + serializer = BytesCodec() + # ``cname`` and ``shuffle`` are typed as enum literals on BloscCodec; + # at runtime any equivalent string is accepted, so cast through Any. 
+ compressors = [ + BloscCodec( + cname=cast(Any, options.cname), + clevel=options.clevel, + shuffle=cast(Any, options.shuffle), + ) + ] + return serializer, compressors diff --git a/python/lsst/images/zarr/_output_archive.py b/python/lsst/images/zarr/_output_archive.py new file mode 100644 index 00000000..ccc8fc66 --- /dev/null +++ b/python/lsst/images/zarr/_output_archive.py @@ -0,0 +1,563 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +__all__ = ("ZarrOutputArchive", "write") + +from collections.abc import Callable, Hashable, Iterator, Mapping +from typing import Any, ClassVar, cast + +import astropy.io.fits +import astropy.table +import astropy.units +import numpy as np +import pydantic +import zarr + +from .._mask import Mask, MaskSchema +from .._transforms import FrameSet +from .._transforms._ast import Channel, StringStream +from ..fits._common import FitsOpaqueMetadata +from ..serialization import ( + ArchiveTree, + ArrayReferenceModel, + NestedOutputArchive, + NumberType, + OutputArchive, + TableColumnModel, + TableModel, + no_header_updates, +) +from ._common import ( + DEFAULT_TARGET_SHARD_BYTES, + ZarrCompressionOptions, + ZarrPointerModel, + archive_path_to_zarr_path, + mask_dtype_for_plane_count, +) +from ._layout import ( + affine_check, + axes_for_archive_class, + chunks_aligned_to, + chunks_for, + decorate_sub_archives, + default_shards, + serialize_fits_opaque_metadata, +) +from ._model import ( + CfFlagAttributes, + MaskPlaneEntry, + OmeMultiscale, + ZarrArray, + ZarrDocument, + ZarrGroup, + build_image_array_attrs, +) +from ._store import 
open_store_for_write + + +class ZarrOutputArchive(OutputArchive[ZarrPointerModel]): + """Output archive that populates a ``ZarrDocument`` IR. + + Bytes are not written until the IR is materialized via + ``ZarrDocument.to_zarr``, which the public `write` helper performs + on context-manager exit. + + Parameters + ---------- + chunks + Per-array chunk overrides keyed by the array's archive path + (e.g. ``"image"``). ``None`` for a key means "use the layout + default". + shards, compression + Same shape as ``chunks``. + archive_class + Top-level archive class name (``"VisitImage"``, ``"CellCoadd"``, + …). Used by the layout layer to pick chunk defaults; set by + ``write()`` before ``obj.serialize`` runs so ``add_array`` + sees the right value. + archive_metadata + Class-specific layout hints (``cell_shape`` for ``CellCoadd``, + ``mask_schema`` for the mask packer). + """ + + _prefer_native_mask_arrays: ClassVar[bool] = True + """Tell ``Mask.serialize`` to hand us the 3-D ``(y, x, mask_size)`` + array in one ``add_array`` call. ``add_array`` packs it into a 2-D + wide-integer array on disk with CF ``flag_masks`` / ``flag_meanings`` + attributes. + """ + + def __init__( + self, + *, + chunks: Mapping[str, tuple[int, ...] | None] | None = None, + shards: Mapping[str, tuple[int, ...] | None] | None = None, + compression: Mapping[str, ZarrCompressionOptions | None] | None = None, + archive_class: str = "Image", + archive_metadata: Mapping[str, Any] | None = None, + ) -> None: + self.document = ZarrDocument(root=ZarrGroup()) + self._chunks = dict(chunks) if chunks else {} + self._shards = dict(shards) if shards else {} + self._compression = dict(compression) if compression else {} + self._archive_class = archive_class + self._archive_metadata = dict(archive_metadata) if archive_metadata else {} + self._pointers: dict[Hashable, ZarrPointerModel] = {} + self._frame_sets: list[tuple[FrameSet, ZarrPointerModel]] = [] + self._image_chunks: tuple[int, ...] 
| None = None + + def serialize_direct[T: pydantic.BaseModel]( + self, + name: str, + serializer: Callable[[OutputArchive[ZarrPointerModel]], T], + ) -> T: + nested = NestedOutputArchive[ZarrPointerModel](name, self) + return serializer(nested) + + def serialize_pointer[T: ArchiveTree]( + self, + name: str, + serializer: Callable[[OutputArchive[ZarrPointerModel]], T], + key: Hashable, + ) -> ZarrPointerModel: + if (cached := self._pointers.get(key)) is not None: + return cached + archive_path = name if name.startswith("/") else f"/{name}" + sub_zarr_path = archive_path_to_zarr_path(archive_path) + # Run the serializer first so any nested add_array calls land + # inside the IR before we dump this sub-tree to JSON. + model = self.serialize_direct(name, serializer) + json_bytes = model.model_dump_json().encode("utf-8") + parent = self.document.root.ensure_group(sub_zarr_path) + # Single-chunk storage: the JSON tree is always read whole. + tree_data = np.frombuffer(json_bytes, dtype=np.uint8) + parent.arrays["lsst_json"] = ZarrArray(data=tree_data, chunks=tree_data.shape) + pointer = ZarrPointerModel(path=f"{sub_zarr_path}/lsst_json") + self._pointers[key] = pointer + return pointer + + def serialize_frame_set[T: ArchiveTree]( + self, + name: str, + frame_set: FrameSet, + serializer: Callable[[OutputArchive], T], + key: Hashable, + ) -> ZarrPointerModel: + pointer = self.serialize_pointer(name, serializer, key) + self._frame_sets.append((frame_set, pointer)) + return pointer + + def iter_frame_sets(self) -> Iterator[tuple[FrameSet, ZarrPointerModel]]: + return iter(self._frame_sets) + + def add_array( + self, + array: np.ndarray, + *, + name: str | None = None, + update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> ArrayReferenceModel: + if name is None: + raise ValueError("Anonymous arrays are not supported in ZarrOutputArchive.") + archive_path = name if name.startswith("/") else f"/{name}" + zarr_path = 
archive_path_to_zarr_path(archive_path) + leaf = zarr_path.rsplit("/", 1)[-1] + parent_path = zarr_path[: -(len(leaf) + 1)] or "/" + parent = self.document.root.ensure_group(parent_path) + + # Mask: pack 3-D (mask_size, y, x) -> 2-D (y, x) wide-int packed. + # Mask.serialize emits the byte axis first when the archive opts in + # via _prefer_native_mask_arrays (matching the HDF5/NDF convention); + # we undo that so the on-disk array is the natural xarray layout. + if leaf == "mask" and array.ndim == 3: + array = np.moveaxis(array, 0, -1) + packed, flag_attrs = self._pack_mask(array) + chunks = self._chunks.get(name) or self._chunks.get(leaf) + if chunks is None and self._image_chunks is not None: + chunks = chunks_aligned_to(image_chunks=self._image_chunks, shape=packed.shape) + shards = self._shards.get(name) or self._shards.get(leaf) + if shards is None and chunks is not None: + shards = default_shards( + chunks=tuple(chunks), + shape=tuple(packed.shape), + dtype=packed.dtype, + target_bytes=DEFAULT_TARGET_SHARD_BYTES, + ) + extra: dict[str, Any] = {"_ARRAY_DIMENSIONS": ["y", "x"]} + extra.update(flag_attrs.dump()) + ir_array = ZarrArray( + data=packed, + chunks=chunks, + shards=shards, + compression=self._compression.get(name), + ) + ir_array.attributes.extra = extra + parent.arrays[leaf] = ir_array + # The model reports the schema's element dtype (uint8 / + # uint16 / ...) so the input archive can recover the + # original ``(y, x, mask_size)`` array; the on-disk array + # itself is the wide packed integer. + return ArrayReferenceModel( + source=f"zarr:{zarr_path}", + shape=list(packed.shape), + datatype=NumberType.from_numpy(array.dtype), + ) + + chunks = self._chunks.get(name) or self._chunks.get(leaf) + # variance / other top-level siblings: align to image's chunks. 
+ if ( + chunks is None + and self._image_chunks is not None + and parent_path == "/" + and leaf == "variance" + and array.ndim == len(self._image_chunks) + ): + chunks = chunks_aligned_to(image_chunks=self._image_chunks, shape=array.shape) + + # Default chunks for the top-level image: from layout rules. + if chunks is None and parent_path == "/" and leaf == "image": + chunks = chunks_for( + self._archive_class, + array.shape, + None, + archive_metadata=self._archive_metadata, + ) + + # Default chunks for a CellCoadd-style 4-D PSF: one cell per chunk. + if chunks is None and leaf == "psf" and array.ndim == 4 and parent_path == "/": + chunks = (1, 1, array.shape[2], array.shape[3]) + + shards = self._shards.get(name) or self._shards.get(leaf) + if shards is None and chunks is not None: + shards = default_shards( + chunks=tuple(chunks), + shape=tuple(array.shape), + dtype=array.dtype, + target_bytes=DEFAULT_TARGET_SHARD_BYTES, + ) + ir_array = ZarrArray( + data=np.ascontiguousarray(array), + chunks=chunks, + shards=shards, + compression=self._compression.get(name), + ) + if parent_path == "/" and leaf in ("image", "variance"): + ir_array.attributes.extra = build_image_array_attrs( + axes=("y", "x"), + long_name="science image" if leaf == "image" else "image variance", + ) + parent.arrays[leaf] = ir_array + + # Remember the image's chunks so siblings can align. + if parent_path == "/" and leaf == "image" and chunks is not None: + self._image_chunks = tuple(chunks) + + return ArrayReferenceModel( + source=f"zarr:{zarr_path}", + shape=list(array.shape), + datatype=NumberType.from_numpy(array.dtype), + ) + + def _pack_mask(self, array: np.ndarray) -> tuple[np.ndarray, CfFlagAttributes]: + """Pack a 3-D ``(y, x, mask_size)`` mask into a 2-D wide-int array. + + The schema is taken from ``self._archive_metadata["mask_schema"]``. + Returns the packed array and the CF flag attributes. 
+ """ + schema = self._archive_metadata.get("mask_schema") + if not isinstance(schema, MaskSchema): + raise ValueError( + "Writing a 3-D mask requires archive_metadata['mask_schema'] " + "to be set; the output archive cannot infer the plane " + "definitions otherwise." + ) + n_planes = len(schema) + target_dtype = mask_dtype_for_plane_count(n_planes) + # Pack: each (y, x) pixel's mask_size schema-dtype elements + # become one wide integer. Element 0 occupies bits + # [0, stride), element 1 occupies [stride, 2*stride), etc., + # where stride = 8 * schema.dtype.itemsize. Plane N therefore + # lives at packed bit position N, matching the CF flag_masks + # attribute (1 << N). + stride = 8 * array.dtype.itemsize + packed = np.zeros(array.shape[:2], dtype=target_dtype) + for i in range(array.shape[2]): + packed |= array[..., i].astype(target_dtype) << (stride * i) + # ``MaskSchema`` may carry ``None`` placeholders for retired plane + # bits; drop them in the CF flag list. + planes = [ + MaskPlaneEntry(name=p.name, bit=i, description=p.description) + for i, p in enumerate(schema) + if p is not None + ] + return packed, CfFlagAttributes(planes=planes) + + def add_table( + self, + table: astropy.table.Table, + *, + name: str | None = None, + update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> TableModel: + if name is None: + raise ValueError("Anonymous tables are not supported in ZarrOutputArchive.") + columns = TableColumnModel.from_table(table) + archive_path = name if name.startswith("/") else f"/{name}" + table_zarr_path = f"/lsst/tables{archive_path}" + parent = self.document.root.ensure_group(table_zarr_path) + for c in columns: + assert isinstance(c.data, ArrayReferenceModel) + column_array = np.ascontiguousarray(np.asarray(table[c.name])) + parent.arrays[c.name] = ZarrArray(data=column_array) + c.data.source = f"zarr:{table_zarr_path}/{c.name}" + return TableModel(columns=columns, meta=table.meta) + + def add_structured_array( + self, 
+ array: np.ndarray, + *, + name: str | None = None, + units: Mapping[str, astropy.units.Unit] | None = None, + descriptions: Mapping[str, str] | None = None, + update_header: Callable[[astropy.io.fits.Header], None] = no_header_updates, + ) -> TableModel: + if name is None: + raise ValueError("Anonymous structured arrays are not supported.") + columns = TableColumnModel.from_record_dtype(array.dtype) + archive_path = name if name.startswith("/") else f"/{name}" + table_zarr_path = f"/lsst/tables{archive_path}" + parent = self.document.root.ensure_group(table_zarr_path) + for c in columns: + assert isinstance(c.data, ArrayReferenceModel) + column_array = np.ascontiguousarray(array[c.name]) + parent.arrays[c.name] = ZarrArray(data=column_array) + c.data.source = f"zarr:{table_zarr_path}/{c.name}" + if units and (unit := units.get(c.name)): + c.unit = unit + if descriptions and (description := descriptions.get(c.name)): + c.description = description + return TableModel(columns=columns) + + def add_tree(self, tree: ArchiveTree) -> None: + """Finalize the IR: write JSON tree, WCS, and root attributes. + + Called once after the user's serializer has populated arrays + / sub-trees. Sets the ``lsst.*`` and ``ome.*`` blocks on the + root group, stages ``/lsst_json`` as 1-D ``uint8`` UTF-8 JSON, + and runs the affine residual validator if the archive carries + a frame set. + """ + # Stage the JSON tree at /lsst_json (single chunk — read whole). + # Name mirrors NDF's /MORE/LSST/JSON and FITS's "JSON" HDU. + json_bytes = tree.model_dump_json().encode("utf-8") + tree_data = np.frombuffer(json_bytes, dtype=np.uint8) + self.document.root.arrays["lsst_json"] = ZarrArray(data=tree_data, chunks=tree_data.shape) + + # Stage the AST WCS string at /wcs_ast when a frame set is registered. + wcs_ast_path: str | None = None + if self._frame_sets: + wcs_ast_path = self._stage_wcs_ast(self._frame_sets[0][0]) + + # Root LSST attrs. 
+ lsst = self.document.root.attributes.lsst + lsst["archive_class"] = self._archive_class + lsst["json"] = "lsst_json" + if wcs_ast_path is not None: + lsst["wcs_ast"] = wcs_ast_path + if "cell_grid" in self._archive_metadata: + lsst["cell_grid"] = self._archive_metadata["cell_grid"] + + # data_model / version go to the top level (not under lsst:). + self.document.root.attributes.extra["data_model"] = self._data_model_for(self._archive_class) + self.document.root.attributes.extra["version"] = 1 + + # OME multiscale block, gated by axes_for_archive_class. + axes = axes_for_archive_class(self._archive_class) + if axes and "image" in self.document.root.arrays: + image_array = self.document.root.arrays["image"] + ct: list[dict[str, Any]] | None = None + if self._frame_sets: + fs = self._frame_sets[0][0] + if len(image_array.shape) != 2: + raise ValueError( + f"Top-level image must be 2-D for the OME affine " + f"check; got shape {image_array.shape}." + ) + image_shape: tuple[int, int] = (image_array.shape[0], image_array.shape[1]) + check = affine_check( + frame_set=fs, + image_shape=image_shape, + max_residual_pixels=1.0, + ) + lsst["wcs_simplified_dropped"] = check.dropped + if not check.dropped: + ct = check.coordinate_transformations + multiscale = OmeMultiscale( + name=self._archive_class.lower(), + axes=axes, + dataset_path="image", + coordinate_transformations=ct, + ) + self.document.root.attributes.ome["multiscales"] = [multiscale.dump()] + + # Walk sub-groups and decorate each one that holds an ``image`` + # array (e.g. ``ColorImage`` channels) as its own valid Image + # sub-archive with OME multiscales. + decorate_sub_archives(self.document) + + def _stage_wcs_ast(self, frame_set: FrameSet) -> str: + """Encode an AST FrameSet as UTF-8 text and stage at /wcs_ast. + + Currently dead — left for future use; see ``add_tree``'s frame-set + hook. 
+ """ + from .._transforms._ast import Object as _AstObject + + stream = StringStream() + # FrameSet inherits from Object in our AST bridge; cast for the + # ``Channel.write`` signature which is typed against the base class. + Channel(stream, options="Full=-1,Comment=0,Indent=0").write(cast(_AstObject, frame_set)) + text = stream.getSinkData() + wcs_data = np.frombuffer(text.encode("utf-8"), dtype=np.uint8) + # Single chunk: WCS is always read whole. + self.document.root.arrays["wcs_ast"] = ZarrArray(data=wcs_data, chunks=wcs_data.shape) + return "wcs_ast" + + @staticmethod + def _data_model_for(archive_class: str) -> str: + """Map an archive class name to the public ``data_model`` string.""" + return { + "Image": "org.lsst.image", + "Mask": "org.lsst.mask", + "MaskedImage": "org.lsst.masked_image", + "VisitImage": "org.lsst.visit_image", + "ColorImage": "org.lsst.color_image", + "CellCoadd": "org.lsst.cell_coadd", + }.get(archive_class, f"org.lsst.{archive_class.lower()}") + + +def build_archive_metadata(obj: Any) -> dict[str, Any]: + """Resolve layout-affecting metadata from an in-memory archive object. + + The output archive's chunk and metadata rules consult + ``cell_shape`` (used by `~lsst.images.zarr._layout.chunks_for` to + align chunks to a `CellCoadd`'s cells) and ``mask_schema`` (used + by `_pack_mask` to produce CF flag attributes). Different archive + classes expose this information under different attribute names: + + - ``Image``: nothing (no cell grid, no mask schema). + - ``MaskedImage``: ``mask.schema``. + - ``Mask``: ``schema`` directly on the object. + - ``CellCoadd``: ``mask.schema`` and ``grid.cell_shape``. + + Returns a flat ``dict`` ready to pass as + ``ZarrOutputArchive(archive_metadata=...)``. Keys are present + only when a value was found. 
def build_archive_metadata(obj: Any) -> dict[str, Any]:
    """Resolve layout-affecting metadata from an in-memory archive object.

    The output archive's chunk and metadata rules consult
    ``cell_shape`` (used by `~lsst.images.zarr._layout.chunks_for` to
    align chunks to a `CellCoadd`'s cells) and ``mask_schema`` (used
    by `_pack_mask` to produce CF flag attributes). Different archive
    classes expose this information under different attribute names:

    - ``Image``: nothing (no cell grid, no mask schema).
    - ``MaskedImage``: ``mask.schema``.
    - ``Mask``: ``schema`` directly on the object.
    - ``CellCoadd``: ``mask.schema`` and ``grid.cell_shape``.

    Returns a flat ``dict`` ready to pass as
    ``ZarrOutputArchive(archive_metadata=...)``. Keys are present
    only when a value was found.
    """
    metadata: dict[str, Any] = {}
    cell_shape = _resolve_cell_shape(obj)
    if cell_shape is not None:
        metadata["cell_shape"] = cell_shape
    mask_schema = _resolve_mask_schema(obj)
    if mask_schema is not None:
        metadata["mask_schema"] = mask_schema
    return metadata


def _resolve_cell_shape(obj: Any) -> tuple[int, ...] | None:
    """Return the cell shape as a tuple, or ``None`` if none is found.

    Tries ``obj.cell_shape`` first, then ``obj.grid.cell_shape``
    (used by `CellCoadd`), then ``obj.cell_grid.cell_shape``. Each
    candidate is consulted in turn, so an ``obj.grid`` that carries no
    ``cell_shape`` does not hide a usable ``obj.cell_grid``.
    """
    direct = getattr(obj, "cell_shape", None)
    if direct is not None:
        return tuple(direct)
    for holder_name in ("grid", "cell_grid"):
        holder = getattr(obj, holder_name, None)
        # getattr on a missing (None) holder simply yields the default.
        nested = getattr(holder, "cell_shape", None)
        if nested is not None:
            return tuple(nested)
    return None


def _resolve_mask_schema(obj: Any) -> MaskSchema | None:
    """Return the mask schema, or ``None`` if the object has no mask.

    Looks at ``obj.mask_schema``, then ``obj.mask.schema``, and finally
    ``obj.schema`` when ``obj`` is itself a `Mask`.
    """
    direct = getattr(obj, "mask_schema", None)
    if direct is not None:
        return direct
    mask = getattr(obj, "mask", None)
    if mask is not None:
        nested = getattr(mask, "schema", None)
        if nested is not None:
            return nested
    if isinstance(obj, Mask):
        # Top-level Mask: schema is on the object itself.
        return obj.schema
    return None


def write(
    obj: Any,
    path: Any,
    *,
    chunks: Mapping[str, tuple[int, ...] | None] | None = None,
    shards: Mapping[str, tuple[int, ...] | None] | None = None,
    compression: Mapping[str, ZarrCompressionOptions | None] | None = None,
    metadata: Mapping[str, Any] | None = None,
    butler_info: Any | None = None,
) -> ArchiveTree:
    """Write ``obj`` to a zarr archive at ``path``.

    Parameters mirror the FITS / NDF write helpers. The store
    implementation (LocalStore / ZipStore / FsspecStore) is selected
    from the URI shape by ``_store.open_store_for_write``.

    Parameters
    ----------
    obj
        Object to write; it must provide a ``serialize`` method. Its type
        name becomes the archive class.
    path
        Destination, passed through to ``open_store_for_write``.
    chunks, shards, compression
        Per-array overrides forwarded to `ZarrOutputArchive`.
    metadata
        Extra entries merged into the serialized tree's ``metadata``.
    butler_info
        When not `None`, assigned to the tree's ``butler_info``.

    Returns
    -------
    tree : `ArchiveTree`
        The serialized tree that was written at ``/lsst_json``.
    """
    archive_class = type(obj).__name__
    archive_default_name = getattr(obj, "_archive_default_name", None)
    archive_metadata = build_archive_metadata(obj)

    archive = ZarrOutputArchive(
        chunks=chunks,
        shards=shards,
        compression=compression,
        archive_class=archive_class,
        archive_metadata=archive_metadata,
    )
    if archive_default_name is not None:
        tree = archive.serialize_direct(archive_default_name, obj.serialize)
    else:
        tree = obj.serialize(archive)
    if metadata is not None:
        tree.metadata.update(metadata)
    if butler_info is not None:
        tree.butler_info = butler_info
    archive.add_tree(tree)
    # Stage opaque metadata after add_tree so the namespace attribute
    # writes happen in the right order.
    opaque = getattr(obj, "_opaque_metadata", None)
    if isinstance(opaque, FitsOpaqueMetadata):
        serialize_fits_opaque_metadata(archive.document, opaque)
    with open_store_for_write(path) as store:
        archive.document.to_zarr(store)
        # Consolidate metadata so a single read fetches the whole
        # hierarchy's zarr.json contents — significant perf win on
        # remote stores. ZipStore does not support consolidation; it
        # raises TypeError, which we deliberately ignore (best effort)
        # so zip writes still work.
        try:
            zarr.consolidate_metadata(store)
        except TypeError:
            pass
    return tree
def _is_zip(rp: ResourcePath) -> bool:
    """Return `True` when the path names a zip archive."""
    # ".zarr.zip" also ends with ".zip", so one suffix test covers both.
    return rp.path.endswith(".zip")


def _is_remote(rp: ResourcePath) -> bool:
    """Return `True` for any scheme other than a plain or ``file`` path."""
    return rp.scheme not in ("", "file")


def _close_if_open(store: Store) -> None:
    """Close ``store`` if it reports itself as open.

    A store that was never opened is left alone, so an early failure
    does not trigger a second error during cleanup.
    """
    if getattr(store, "_is_open", False):
        store.close()


@contextmanager
def _open_zip_for_read(local_path: str) -> Iterator[Store]:
    """Yield a read-only ``ZipStore`` for a local file, closing it on exit."""
    zip_store = zarr.storage.ZipStore(local_path, mode="r")
    try:
        yield zip_store
    finally:
        _close_if_open(zip_store)


@contextmanager
def open_store_for_write(path: ResourcePathExpression) -> Iterator[Store]:
    """Open a zarr store for writing.

    Refuses to overwrite a non-empty existing store. The returned
    context manager closes the store on exit; for ``ZipStore`` this
    finalizes the central directory.

    Raises
    ------
    NotImplementedError
        Raised for a remote zip destination.
    OSError
        Raised if the destination already exists and is non-empty.
    """
    rp = ResourcePath(path)
    store: Store
    if _is_zip(rp):
        if _is_remote(rp):
            raise NotImplementedError("Remote ZipStore writes are a follow-up.")
        local = rp.ospath
        if os.path.exists(local) and os.path.getsize(local) > 0:
            raise OSError(f"File {local!r} already exists.")
        store = zarr.storage.ZipStore(local, mode="w")
    elif _is_remote(rp):
        # Imported lazily: fsspec is only needed for remote stores.
        import fsspec

        fs, fs_path = fsspec.url_to_fs(str(rp))
        if fs.exists(fs_path) and fs.ls(fs_path):
            raise OSError(f"Store {rp!s} already exists.")
        store = zarr.storage.FsspecStore(fs=fs, path=fs_path, read_only=False)
    else:
        local = rp.ospath
        if os.path.exists(local) and os.listdir(local):
            raise OSError(f"Directory {local!r} already exists and is non-empty.")
        os.makedirs(local, exist_ok=True)
        store = zarr.storage.LocalStore(local, read_only=False)
    # Every store type is closed on exit, as documented above, not just
    # the zip store.
    try:
        yield store
    finally:
        _close_if_open(store)


@contextmanager
def open_store_for_read(path: ResourcePathExpression) -> Iterator[Store]:
    """Open a zarr store for reading.

    Zip archives are opened through ``ZipStore`` and closed on exit; a
    remote zip is first made available locally via
    ``ResourcePath.as_local`` for the lifetime of the context. Other
    remote paths use ``FsspecStore`` and local directories use
    ``LocalStore``, both read-only.
    """
    rp = ResourcePath(path)
    store: Store
    if _is_zip(rp):
        if _is_remote(rp):
            # Keep the local copy alive for as long as the store is in use.
            with rp.as_local() as local, _open_zip_for_read(local.ospath) as zip_store:
                yield zip_store
        else:
            with _open_zip_for_read(rp.ospath) as zip_store:
                yield zip_store
        return
    if _is_remote(rp):
        import fsspec

        fs, fs_path = fsspec.url_to_fs(str(rp))
        store = zarr.storage.FsspecStore(fs=fs, path=fs_path, read_only=True)
        yield store
        return
    store = zarr.storage.LocalStore(rp.ospath, read_only=True)
    yield store
Without the fix, + real CellCoadd writes fall back to generic 256-pixel chunks + instead of cell-aligned chunks. + """ + cell_shape = self.cell_coadd.grid.cell_shape + with RoundtripZarr(self, self.cell_coadd, "CellCoadd") as roundtrip: + with open_store_for_read(roundtrip.filename) as store: + root = zarr.open_group(store=store, mode="r", zarr_format=3) + self.assertEqual(tuple(root["image"].chunks), (cell_shape.y, cell_shape.x)) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_zarr_common.py b/tests/test_zarr_common.py new file mode 100644 index 00000000..cb757695 --- /dev/null +++ b/tests/test_zarr_common.py @@ -0,0 +1,123 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class CommonTestCase(unittest.TestCase):
    """Unit tests for the helpers in the zarr ``_common`` module."""

    def test_pointer_round_trips(self) -> None:
        pointer = ZarrPointerModel(path="/lsst/psf/lsst_json")
        payload = pointer.model_dump_json()
        self.assertEqual(ZarrPointerModel.model_validate_json(payload), pointer)

    def test_constants(self) -> None:
        self.assertEqual(LSST_NS, "lsst")
        self.assertEqual(OME_NS, "ome")
        self.assertEqual(OME_VERSION, "0.5")
        self.assertGreaterEqual(LSST_VERSION, 1)

    def test_archive_path_translation(self) -> None:
        translations = [
            # Empty archive path -> the canonical root-level JSON tree.
            ("", "/lsst_json"),
            # Non-empty archive paths are kept verbatim.
            ("/image", "/image"),
            ("image", "/image"),
            ("/red/image", "/red/image"),
            ("/psf", "/psf"),
        ]
        for archive_path, zarr_path in translations:
            self.assertEqual(archive_path_to_zarr_path(archive_path), zarr_path)

    def test_compression_defaults(self) -> None:
        float_options = ZarrCompressionOptions.default_for_dtype("float32")
        self.assertEqual(float_options.codec, "blosc")
        self.assertEqual(float_options.shuffle, "shuffle")
        int_options = ZarrCompressionOptions.default_for_dtype("uint8")
        self.assertEqual(int_options.shuffle, "bitshuffle")

    def test_mask_dtype_picks_smallest_fit(self) -> None:
        # (plane count, narrowest unsigned dtype that holds it)
        expectations = [
            (1, "uint8"),
            (8, "uint8"),
            (9, "uint16"),
            (16, "uint16"),
            (17, "uint32"),
            (32, "uint32"),
            (33, "uint64"),
            (64, "uint64"),
        ]
        for plane_count, dtype_name in expectations:
            self.assertEqual(mask_dtype_for_plane_count(plane_count), np.dtype(dtype_name))

    def test_mask_dtype_refuses_more_than_64_planes(self) -> None:
        with self.assertRaisesRegex(ValueError, "supports up to 64"):
            mask_dtype_for_plane_count(65)


@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class TargetShardBytesEnvVarTestCase(unittest.TestCase):
    """`DEFAULT_TARGET_SHARD_BYTES` is resolved from the environment at import.

    Each case imports the module in a fresh interpreter so the value seen
    reflects the environment variable passed to that subprocess.
    """

    def _import_in_subprocess(self, env_value: str | None) -> subprocess.CompletedProcess[str]:
        """Import ``_common`` in a child interpreter and print the constant.

        ``env_value`` of `None` runs with the variable unset.
        """
        child_env = {
            key: value
            for key, value in os.environ.items()
            if key != "LSST_IMAGES_ZARR_TARGET_SHARD_BYTES"
        }
        if env_value is not None:
            child_env["LSST_IMAGES_ZARR_TARGET_SHARD_BYTES"] = env_value
        code = (
            "from lsst.images.zarr._common import DEFAULT_TARGET_SHARD_BYTES;"
            "print(DEFAULT_TARGET_SHARD_BYTES)"
        )
        command = [sys.executable, "-c", code]
        return subprocess.run(
            command,
            env=child_env,
            capture_output=True,
            text=True,
            check=False,
        )

    def test_unset_uses_default(self) -> None:
        completed = self._import_in_subprocess(None)
        self.assertEqual(completed.returncode, 0, completed.stderr)
        self.assertEqual(completed.stdout.strip(), str(16 * 1024 * 1024))

    def test_set_value_overrides(self) -> None:
        completed = self._import_in_subprocess("1234567")
        self.assertEqual(completed.returncode, 0, completed.stderr)
        self.assertEqual(completed.stdout.strip(), "1234567")

    def test_garbage_value_fails_at_import(self) -> None:
        completed = self._import_in_subprocess("not-a-number")
        self.assertNotEqual(completed.returncode, 0)
        self.assertIn("LSST_IMAGES_ZARR_TARGET_SHARD_BYTES", completed.stderr)
        self.assertIn("is not a base-10 integer", completed.stderr)
@unittest.skipUnless(HAVE_ZARR, "zarr is not installed")
class FitsZarrCrossFormatTestCase(unittest.TestCase):
    """A FITS -> Zarr -> FITS chain keeps the primary header cards."""

    def test_fits_to_zarr_to_fits_preserves_primary_header(self) -> None:
        # Cards injected into the first FITS file; they must still be in
        # the primary header after passing through zarr.
        expected_cards = {"ORIGIN": "RUBIN", "EXPTIME": 30.0}

        def update_header(header):
            for keyword, value in expected_cards.items():
                header[keyword] = value

        pixels = np.arange(20, dtype=np.float32).reshape(4, 5)
        original = Image(pixels, bbox=Box.factory[10:14, 20:25])
        with tempfile.TemporaryDirectory() as tmp:
            first_fits = os.path.join(tmp, "a.fits")
            zarr_path = os.path.join(tmp, "b.zarr")
            second_fits = os.path.join(tmp, "c.fits")

            fits_write(original, first_fits, update_header=update_header)
            from_fits = fits_read(Image, first_fits).deserialized
            zarr_write(from_fits, zarr_path)
            from_zarr = zarr_read(Image, zarr_path).deserialized
            fits_write(from_zarr, second_fits)

            with astropy.io.fits.open(second_fits) as hdul:
                primary_header = hdul[0].header
                self.assertEqual(primary_header["ORIGIN"], "RUBIN")
                self.assertEqual(primary_header["EXPTIME"], 30.0)
@unittest.skipUnless(HAVE_ZARR and HAVE_OME_ZARR, "ome-zarr is not installed")
class OmeZarrReaderTestCase(unittest.TestCase):
    """Archives written by ``lsst.images.zarr`` open with ``ome-zarr-py``."""

    def test_ome_zarr_can_open_image(self) -> None:
        from ome_zarr.io import parse_url
        from ome_zarr.reader import Reader

        pixels = np.arange(20, dtype=np.float32).reshape(4, 5)
        original = Image(pixels, bbox=Box.factory[10:14, 20:25])
        with tempfile.TemporaryDirectory() as tmp:
            target = os.path.join(tmp, "out.zarr")
            write(original, target)
            location = parse_url(target)
            self.assertIsNotNone(location)
            nodes = list(Reader(location)())
            self.assertGreaterEqual(len(nodes), 1)
            # Index 0 of ``data`` is resolution level 0.
            level_zero = nodes[0].data[0]
            self.assertEqual(tuple(level_zero.shape), (4, 5))
+# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import os +import tempfile +import unittest + +import astropy.io.fits +import astropy.table +import numpy as np +import pydantic + +from lsst.images import Box, Image +from lsst.images._image import ImageSerializationModel +from lsst.images.fits._common import ExtensionKey, FitsOpaqueMetadata +from lsst.images.serialization import ArchiveReadError + +try: + import zarr + + from lsst.images.serialization import ArrayReferenceModel, NumberType + from lsst.images.zarr import ZarrPointerModel, read, write + from lsst.images.zarr._common import LSST_NS, LSST_VERSION + from lsst.images.zarr._input_archive import ZarrInputArchive + from lsst.images.zarr._model import ZarrArray, ZarrDocument + from lsst.images.zarr._output_archive import ZarrOutputArchive + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +class _CountingStore(zarr.storage.MemoryStore if HAVE_ZARR else object): + """A `zarr.storage.MemoryStore` that counts ``get`` calls. + + The counter is shared across instances created by zarr's + ``with_read_only`` so the test sees every read regardless of which + store wrapper handles it. 
+ """ + + _shared_counter: list[int] = [0] + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + @property + def reads(self) -> int: + return self._shared_counter[0] + + @reads.setter + def reads(self, value: int) -> None: + self._shared_counter[0] = value + + async def get(self, key, prototype, byte_range=None): + self._shared_counter[0] += 1 + return await super().get(key, prototype, byte_range) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveSkeletonTestCase(unittest.TestCase): + """Open + version validation + ``get_tree``.""" + + def test_open_reads_tree(self) -> None: + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(original, target) + with ZarrInputArchive.open(target) as archive: + tree = archive.get_tree(ImageSerializationModel) + self.assertIsNotNone(tree) + + def test_missing_archive_class_raises(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "bare.zarr") + os.makedirs(target) + store = zarr.storage.LocalStore(target, read_only=False) + zarr.create_group(store=store, zarr_format=3) # no lsst attrs + with self.assertRaisesRegex(ArchiveReadError, "not an LSST zarr archive"): + with ZarrInputArchive.open(target): + pass + + def test_future_version_refused(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "future.zarr") + os.makedirs(target) + store = zarr.storage.LocalStore(target, read_only=False) + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION + 1, + "archive_class": "Image", + "json": "lsst_json", + } + } + ) + with self.assertRaisesRegex(ArchiveReadError, "Unsupported lsst:version"): + with ZarrInputArchive.open(target): + pass + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not 
installed") +class ZarrInputArchiveLazySubsetTestCase(unittest.TestCase): + """Subset reads only fetch chunks intersecting the slice.""" + + def test_subset_read_touches_only_intersecting_chunks(self) -> None: + store = _CountingStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Image", + "json": "lsst_json", + } + } + ) + zarr_array = root.create_array(name="image", shape=(16, 16), chunks=(4, 4), dtype="float32") + zarr_array[:] = np.arange(256, dtype=np.float32).reshape(16, 16) + # Stub /lsst_json so the input archive's constructor accepts the file. + tree_arr = root.create_array(name="lsst_json", shape=(2,), chunks=(2,), dtype="uint8") + tree_arr[:] = np.frombuffer(b"{}", dtype=np.uint8) + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + store.reads = 0 + full_ref = ArrayReferenceModel( + source="zarr:/image", + shape=[16, 16], + datatype=NumberType.from_numpy(np.dtype("float32")), + ) + full = archive.get_array(full_ref) + full_reads = store.reads + self.assertEqual(full.shape, (16, 16)) + + store.reads = 0 + subset = archive.get_array(full_ref, slices=(slice(0, 4), slice(0, 4))) + subset_reads = store.reads + self.assertEqual(subset.shape, (4, 4)) + np.testing.assert_array_equal(subset, np.arange(256).reshape(16, 16)[:4, :4]) + self.assertLess(subset_reads, full_reads) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveMaskUnpackTestCase(unittest.TestCase): + """Round-trip a packed 2-D mask through ``get_array``'s unpack path.""" + + def test_unpack_2d_packed_back_to_3d(self) -> None: + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Mask", + "json": "lsst_json", + } + } + ) + # 4x5 mask, 3 planes -> packed in uint8. 
+ on_disk = np.zeros((4, 5), dtype=np.uint8) + on_disk[0, 0] = 0b001 + on_disk[1, 1] = 0b110 + mask_array = root.create_array(name="mask", shape=(4, 5), chunks=(4, 5), dtype="uint8") + mask_array[:] = on_disk + mask_array.update_attributes( + { + "_ARRAY_DIMENSIONS": ["y", "x"], + "flag_masks": [1, 2, 4], + "flag_meanings": "BAD SAT CR", + "flag_descriptions": ["Bad pixel.", "Saturated.", "Cosmic ray."], + } + ) + tree_arr = root.create_array(name="lsst_json", shape=(2,), chunks=(2,), dtype="uint8") + tree_arr[:] = np.frombuffer(b"{}", dtype=np.uint8) + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + # The model records (y, x, mask_size) but the storage layout is the + # transposed (mask_size, y, x) — Mask.deserialize does the final + # moveaxis to recover (y, x, mask_size). + model = ArrayReferenceModel( + source="zarr:/mask", + shape=[4, 5, 1], + datatype=NumberType.from_numpy(np.dtype("uint8")), + ) + result = archive.get_array(model) + self.assertEqual(result.shape, (1, 4, 5)) + self.assertEqual(result[0, 0, 0], 0b001) + self.assertEqual(result[0, 1, 1], 0b110) + + def test_unpack_uint64_with_5_bytes(self) -> None: + # 40 planes packed into uint64 -> mask_size = 5. + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Mask", + "json": "lsst_json", + } + } + ) + on_disk = np.zeros((4, 5), dtype=np.uint64) + on_disk[0, 0] = 0x01_02_03_04_05 # arbitrary bit pattern + mask_array = root.create_array(name="mask", shape=(4, 5), chunks=(4, 5), dtype="uint64") + mask_array[:] = on_disk + mask_array.update_attributes( + { + "_ARRAY_DIMENSIONS": ["y", "x"], + "flag_masks": [1 << i for i in range(40)], + "flag_meanings": " ".join(f"P{i}" for i in range(40)), + "flag_descriptions": [f"Plane {i}." 
for i in range(40)], + } + ) + tree_arr = root.create_array(name="lsst_json", shape=(2,), chunks=(2,), dtype="uint8") + tree_arr[:] = np.frombuffer(b"{}", dtype=np.uint8) + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + model = ArrayReferenceModel( + source="zarr:/mask", + shape=[4, 5, 5], + datatype=NumberType.from_numpy(np.dtype("uint8")), + ) + result = archive.get_array(model) + self.assertEqual(result.shape, (5, 4, 5)) + # Bytes recovered from the packed uint64 (mask_size, y, x order). + self.assertEqual(result[0, 0, 0], 0x05) # low byte + self.assertEqual(result[1, 0, 0], 0x04) + self.assertEqual(result[2, 0, 0], 0x03) + self.assertEqual(result[3, 0, 0], 0x02) + self.assertEqual(result[4, 0, 0], 0x01) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchivePointerTestCase(unittest.TestCase): + """``deserialize_pointer`` cache + JSON sub-tree handling.""" + + def test_deserialize_pointer_caches_results(self) -> None: + class _Sub(pydantic.BaseModel): + label: str + + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + root.update_attributes( + {LSST_NS: {"version": LSST_VERSION, "archive_class": "Image", "json": "lsst_json"}} + ) + # Stub /lsst_json. + tree_arr = root.create_array(name="lsst_json", shape=(2,), chunks=(2,), dtype="uint8") + tree_arr[:] = np.frombuffer(b"{}", dtype=np.uint8) + # Sub-archive with its own /lsst_json at /psf/lsst_json. 
+ json_bytes = b'{"label": "psf"}' + psf = root.create_group("psf") + arr = psf.create_array( + name="lsst_json", shape=(len(json_bytes),), chunks=(len(json_bytes),), dtype="uint8" + ) + arr[:] = np.frombuffer(json_bytes, dtype=np.uint8) + + doc = ZarrDocument.from_zarr(store) + archive = ZarrInputArchive(doc) + + deserialize_calls: list[int] = [] + + def deserializer(model, arch): + deserialize_calls.append(1) + return model + + pointer = ZarrPointerModel(path="/psf/lsst_json") + first = archive.deserialize_pointer(pointer, _Sub, deserializer) + second = archive.deserialize_pointer(pointer, _Sub, deserializer) + self.assertEqual(first.label, "psf") + self.assertIs(first, second) + self.assertEqual(len(deserialize_calls), 1) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrInputArchiveTableTestCase(unittest.TestCase): + """``get_table`` reconstructs columns via ``get_array``.""" + + def test_get_table_reconstructs_columns(self) -> None: + out = ZarrOutputArchive() + out.document.root.attributes.lsst["archive_class"] = "Image" + out.document.root.attributes.lsst["json"] = "lsst_json" + out.document.root.arrays["lsst_json"] = ZarrArray(data=np.frombuffer(b"{}", dtype=np.uint8)) + original = astropy.table.Table( + { + "x": np.arange(4, dtype=np.int32), + "y": np.arange(4, dtype=np.float32), + } + ) + model = out.add_table(original, name="cat") + + store = zarr.storage.MemoryStore() + out.document.to_zarr(store) + doc = ZarrDocument.from_zarr(store) + inp = ZarrInputArchive(doc) + + recovered = inp.get_table(model) + self.assertEqual(recovered.colnames, ["x", "y"]) + np.testing.assert_array_equal(recovered["x"], original["x"]) + np.testing.assert_array_equal(recovered["y"], original["y"]) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrReadHelperTestCase(unittest.TestCase): + """End-to-end public ``read()`` round-trip.""" + + def test_round_trip_image(self) -> None: + original = Image( + np.arange(20, 
dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(original, target) + result = read(Image, target) + self.assertEqual(result.deserialized.array.shape, (4, 5)) + np.testing.assert_array_equal(result.deserialized.array, original.array) + self.assertEqual(result.deserialized.bbox, original.bbox) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOpaqueMetadataReadTestCase(unittest.TestCase): + """FITS opaque metadata is restored on read.""" + + def test_fits_opaque_metadata_round_trips(self) -> None: + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + header = astropy.io.fits.Header() + header["ORIGIN"] = "RUBIN" + header["EXPTIME"] = 30.0 + opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + image._opaque_metadata = opaque + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + recovered = read(Image, target).deserialized + recovered_opaque = recovered._opaque_metadata + self.assertIsInstance(recovered_opaque, FitsOpaqueMetadata) + recovered_header = recovered_opaque.headers[ExtensionKey()] + self.assertEqual(recovered_header["ORIGIN"], "RUBIN") + self.assertEqual(recovered_header["EXPTIME"], 30.0) + + def test_fits_opaque_metadata_preserves_full_card_fidelity(self) -> None: + """Comments, HISTORY, COMMENT, and HIERARCH all survive round-trip.""" + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + header = astropy.io.fits.Header() + header["ORIGIN"] = ("RUBIN", "Source observatory") + header["EXPTIME"] = (30.0, "[s] Total exposure time") + header["HIERARCH LSST INSTRUMENT"] = "LSSTCam" + header.add_history("Bias-subtracted on 2026-05-21") + header.add_history("ISR completed 2026-05-22") + header.add_comment("This file was generated for testing.") + 
opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + image._opaque_metadata = opaque + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + recovered = read(Image, target).deserialized + recovered_header = recovered._opaque_metadata.headers[ExtensionKey()] + # Byte-exact equality of the serialized card stream. + self.assertEqual(recovered_header.tostring(), header.tostring()) + # Spot-check the round-tripped values + comments. + self.assertEqual(recovered_header.comments["ORIGIN"], "Source observatory") + self.assertEqual(recovered_header.comments["EXPTIME"], "[s] Total exposure time") + self.assertEqual(recovered_header["HIERARCH LSST INSTRUMENT"], "LSSTCam") + self.assertEqual( + list(recovered_header["HISTORY"]), + ["Bias-subtracted on 2026-05-21", "ISR completed 2026-05-22"], + ) + self.assertEqual( + list(recovered_header["COMMENT"]), + ["This file was generated for testing."], + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_layout.py b/tests/test_zarr_layout.py new file mode 100644 index 00000000..d1bb4959 --- /dev/null +++ b/tests/test_zarr_layout.py @@ -0,0 +1,300 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import unittest + +import numpy as np + +from lsst.images._transforms._ast import ( + CmpMap, + Frame, + FrameSet, + PolyMap, + ZoomMap, +) + +try: + from lsst.images.zarr._layout import ( + affine_check, + axes_for_archive_class, + chunks_aligned_to, + chunks_for, + decorate_sub_archives, + default_shards, + ) + from lsst.images.zarr._model import ZarrArray, ZarrDocument, ZarrGroup + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class LayoutTestCase(unittest.TestCase): + """Per-archive-class axes and chunk derivation rules.""" + + def test_axes_for_archive_class(self) -> None: + # Standard 2-D images use (y, x). + self.assertEqual(axes_for_archive_class("Image"), ("y", "x")) + self.assertEqual(axes_for_archive_class("MaskedImage"), ("y", "x")) + self.assertEqual(axes_for_archive_class("VisitImage"), ("y", "x")) + self.assertEqual(axes_for_archive_class("Mask"), ("y", "x")) + self.assertEqual(axes_for_archive_class("CellCoadd"), ("y", "x")) + # ColorImage's root has no top-level multiscale; this returns + # an empty tuple to signal "no OME multiscale at this level". + self.assertEqual(axes_for_archive_class("ColorImage"), ()) + + def test_chunks_for_default(self) -> None: + # Plain images clamp to the per-axis chunk limit (256 by default). + self.assertEqual(chunks_for("Image", (4096, 4096), None), (256, 256)) + # Smaller than the limit -> use full dim. 
+ self.assertEqual(chunks_for("Image", (200, 100), None), (200, 100)) + + def test_chunks_for_override(self) -> None: + self.assertEqual(chunks_for("Image", (4096, 4096), (256, 256)), (256, 256)) + + def test_chunks_for_cell_coadd_uses_cell_shape(self) -> None: + result = chunks_for( + "CellCoadd", + (4096, 4096), + None, + archive_metadata={"cell_shape": (256, 256)}, + ) + self.assertEqual(result, (256, 256)) + + def test_chunks_for_cell_coadd_without_metadata_falls_back(self) -> None: + self.assertEqual(chunks_for("CellCoadd", (4096, 4096), None), (256, 256)) + + def test_chunks_aligned_to_matches_image(self) -> None: + # variance / mask follow image's chunks when not overridden. + self.assertEqual( + chunks_aligned_to(image_chunks=(256, 256), shape=(4096, 4096)), + (256, 256), + ) + # If the sibling shape is smaller than image's chunks, clamp. + self.assertEqual( + chunks_aligned_to(image_chunks=(1024, 1024), shape=(300, 600)), + (300, 600), + ) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class AffineValidatorTestCase(unittest.TestCase): + """Affine-residual validator gating the OME affine block.""" + + def _make_linear_frame_set(self, *, scale: float = 0.2) -> FrameSet: + base = Frame(2, "Domain=PIXEL") + sky = Frame(2, "Domain=SKY") + fs = FrameSet(base) + fs.addFrame(FrameSet.BASE, ZoomMap(2, scale), sky) + return fs + + def _make_distorted_frame_set(self) -> FrameSet: + base = Frame(2, "Domain=PIXEL") + sky = Frame(2, "Domain=SKY") + forward_coeffs = [ + [1.0, 1, 1, 0], + [0.001, 1, 0, 2], + [1.0, 2, 0, 1], + [0.001, 2, 2, 0], + ] + poly = PolyMap(forward_coeffs, 2, "IterInverse=1, NIterInverse=20") + cmp = CmpMap(poly, ZoomMap(2, 0.2), True) + fs = FrameSet(base) + fs.addFrame(FrameSet.BASE, cmp, sky) + return fs + + def test_pure_linear_passes(self) -> None: + # NGFF v0.5 composes ``coordinateTransformations`` in list + # order: ``scale`` is applied first, then ``affine``. 
For a + # pure 0.2 pixel→sky scale, the composed effect on a unit + # pixel must be 0.2 — not 0.04 (which would result from + # leaving the scale embedded in both the explicit scale block + # AND the affine's Jacobian). + fs = self._make_linear_frame_set(scale=0.2) + result = affine_check( + frame_set=fs, + image_shape=(64, 64), + max_residual_pixels=1.0, + ) + self.assertFalse(result.dropped) + self.assertIsNotNone(result.coordinate_transformations) + ct = result.coordinate_transformations + assert ct is not None # for type checkers + scale_block, affine_block = ct[0], ct[1] + self.assertEqual(scale_block["type"], "scale") + self.assertEqual(affine_block["type"], "affine") + # The scale block carries the per-axis pixel size; the affine + # has unit-norm columns (a pure rotation/translation here). + self.assertAlmostEqual(scale_block["scale"][0], 0.2) + self.assertAlmostEqual(scale_block["scale"][1], 0.2) + self.assertAlmostEqual(affine_block["affine"][0][0], 1.0) + self.assertAlmostEqual(affine_block["affine"][1][1], 1.0) + # Apply scale, then affine (affine ∘ scale), to a unit pixel vector. 
+ scale = scale_block["scale"] + affine = np.array(affine_block["affine"]) + scaled = np.array([scale[0] * 1.0, scale[1] * 1.0, 1.0]) + composed = affine @ scaled + self.assertAlmostEqual(composed[0], 0.2) + self.assertAlmostEqual(composed[1], 0.2) + + def test_high_distortion_drops_block(self) -> None: + fs = self._make_distorted_frame_set() + result = affine_check( + frame_set=fs, + image_shape=(4096, 4096), + max_residual_pixels=1.0, + ) + self.assertTrue(result.dropped) + self.assertIsNone(result.coordinate_transformations) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class DecorateSubArchivesTestCase(unittest.TestCase): + """`decorate_sub_archives` walks the IR and adds OME / lsst attrs.""" + + def test_sub_group_with_image_gets_lsst_and_ome_attrs(self) -> None: + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "ColorImage" + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((4, 5), dtype="float32")) + + decorate_sub_archives(doc) + + self.assertEqual(red.attributes.lsst["archive_class"], "Image") + self.assertIn("multiscales", red.attributes.ome) + self.assertEqual(red.attributes.ome["multiscales"][0]["datasets"][0]["path"], "image") + + def test_root_archive_class_is_unchanged(self) -> None: + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "ColorImage" + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((4, 5), dtype="float32")) + + decorate_sub_archives(doc) + + # Root keeps ColorImage; only sub-groups are decorated. 
+ self.assertEqual(doc.root.attributes.lsst["archive_class"], "ColorImage") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class DefaultShardsTestCase(unittest.TestCase): + """The `default_shards` byte-budget rule.""" + + TARGET = 16 * 1024 * 1024 # 16 MiB + + def test_4k_float32_image_uses_byte_budget(self) -> None: + result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (2048, 2048)) + + def test_3d_mask_plane_axis_untouched(self) -> None: + # chunks already cover the plane axis; growable axes are y, x only. + result = default_shards( + chunks=(8, 256, 256), + shape=(8, 4096, 4096), + dtype=np.dtype("uint8"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (8, 1536, 1536)) + + def test_tiny_single_chunk_returns_none(self) -> None: + result = default_shards( + chunks=(40,), + shape=(40,), + dtype=np.dtype("uint8"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_chunks_equal_shape_returns_none(self) -> None: + result = default_shards( + chunks=(1024, 1024), + shape=(1024, 1024), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_already_big_chunk_returns_none(self) -> None: + # 4096*4096*4 = 64 MiB > 16 MiB target. + result = default_shards( + chunks=(4096, 4096), + shape=(8192, 8192), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + def test_k_le_one_returns_none(self) -> None: + # chunk=256x256 float32 = 256 KiB; ratio=1.25 -> k=round(sqrt(1.25))=1 + # -> returns None. + chunk_bytes = 256 * 256 * 4 + result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("float32"), + target_bytes=int(chunk_bytes * 1.25), + ) + self.assertIsNone(result) + + def test_cap_at_array_bounds(self) -> None: + # 600x600 float32; chunk_bytes = 256 KiB; ratio = 64; k = 8. 
+ # Uncapped shard would be (2048, 2048) but the array only has + # 3 chunks per axis (ceil(600/256) = 3), so the cap is (768, 768). + result = default_shards( + chunks=(256, 256), + shape=(600, 600), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (768, 768)) + + def test_cell_coadd_psf(self) -> None: + # (25, 25, 150, 150) float32 with (1, 1, 150, 150) chunks. + # chunk_bytes = 90 kB; ratio ~= 186; growable axes are 0 and 1. + # k = round(sqrt(186)) = 14. + result = default_shards( + chunks=(1, 1, 150, 150), + shape=(25, 25, 150, 150), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + self.assertEqual(result, (14, 14, 150, 150)) + + def test_mismatched_ndim_raises(self) -> None: + with self.assertRaisesRegex(ValueError, "rank"): + default_shards( + chunks=(256, 256), + shape=(4096, 4096, 4096), + dtype=np.dtype("float32"), + target_bytes=self.TARGET, + ) + + def test_zero_itemsize_returns_none(self) -> None: + # void(0) has itemsize 0; defensive guard against degenerate dtypes. + result = default_shards( + chunks=(256, 256), + shape=(4096, 4096), + dtype=np.dtype("V0"), + target_bytes=self.TARGET, + ) + self.assertIsNone(result) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_model.py b/tests/test_zarr_model.py new file mode 100644 index 00000000..00a9041c --- /dev/null +++ b/tests/test_zarr_model.py @@ -0,0 +1,256 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import unittest + +import numpy as np + +try: + import zarr + + from lsst.images.zarr._common import LSST_NS, LSST_VERSION, OME_NS, OME_VERSION + from lsst.images.zarr._model import ( + CfFlagAttributes, + MaskPlaneEntry, + OmeMultiscale, + ZarrArray, + ZarrAttributes, + ZarrDocument, + ZarrGroup, + build_image_array_attrs, + ) + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrAttributesTestCase(unittest.TestCase): + """Tests for `ZarrAttributes` namespacing and round-tripping.""" + + def test_dump_separates_namespaces(self) -> None: + attrs = ZarrAttributes() + attrs.lsst["archive_class"] = "MaskedImage" + attrs.ome["multiscales"] = [{"name": "image"}] + attrs.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + attrs.extra["units"] = "adu" + dumped = attrs.dump() + self.assertEqual(dumped[LSST_NS]["archive_class"], "MaskedImage") + self.assertEqual(dumped[LSST_NS]["version"], LSST_VERSION) + self.assertEqual(dumped[OME_NS]["multiscales"], [{"name": "image"}]) + self.assertEqual(dumped[OME_NS]["version"], OME_VERSION) + # CF / xarray attrs sit at the top level, not inside lsst: or ome:. + self.assertEqual(dumped["_ARRAY_DIMENSIONS"], ["y", "x"]) + self.assertEqual(dumped["units"], "adu") + + def test_load_preserves_unknown_keys(self) -> None: + # Forward compatibility: unknown lsst.* keys must survive a + # load -> dump round-trip. 
+ raw = { + LSST_NS: { + "version": LSST_VERSION, + "archive_class": "Image", + "future_thing": {"x": 1}, + }, + OME_NS: {"version": OME_VERSION, "multiscales": []}, + "_ARRAY_DIMENSIONS": ["y", "x"], + "units": "adu", + } + attrs = ZarrAttributes.load(raw) + dumped = attrs.dump() + self.assertEqual(dumped[LSST_NS]["future_thing"], {"x": 1}) + self.assertEqual(dumped["units"], "adu") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrArrayTestCase(unittest.TestCase): + """Tests for `ZarrArray` lazy backing and slice forwarding.""" + + def test_lazy_data_after_from_zarr(self) -> None: + store = zarr.storage.MemoryStore() + root = zarr.create_group(store=store, zarr_format=3) + zarr_array = root.create_array(name="image", shape=(8, 8), chunks=(4, 4), dtype="float32") + zarr_array[:] = np.arange(64, dtype=np.float32).reshape(8, 8) + + ir_array = ZarrArray.from_zarr(zarr_array) + # Lazy invariant: data is the zarr.Array handle, not numpy. + self.assertIsInstance(ir_array.data, zarr.Array) + self.assertNotIsInstance(ir_array.data, np.ndarray) + self.assertEqual(ir_array.shape, (8, 8)) + self.assertEqual(str(ir_array.dtype), "float32") + + def test_subset_does_not_materialize_full_array(self) -> None: + store = _CountingStore() + root = zarr.create_group(store=store, zarr_format=3) + zarr_array = root.create_array(name="image", shape=(16, 16), chunks=(4, 4), dtype="int32") + zarr_array[:] = np.arange(256, dtype=np.int32).reshape(16, 16) + store.reads = 0 # reset after the write phase + + ir_array = ZarrArray.from_zarr(zarr_array) + # Reading shape / dtype must not fetch any chunk data. 
+ self.assertEqual(ir_array.shape, (16, 16)) + self.assertEqual(store.reads, 0) + + subset = ir_array.read(slices=(slice(0, 4), slice(0, 4))) + self.assertEqual(subset.shape, (4, 4)) + np.testing.assert_array_equal(subset, np.arange(256).reshape(16, 16)[:4, :4]) + # A 4x4 subset aligned with chunks=(4, 4) intersects exactly one + # data chunk; allow a small margin for incidental metadata reads, + # but stay tight enough to catch a full 16-chunk materialization. + self.assertLessEqual(store.reads, 4) + + def test_staged_numpy_array_is_eager(self) -> None: + data = np.arange(12, dtype=np.float64).reshape(3, 4) + ir_array = ZarrArray(data=data) + self.assertIs(ir_array.data, data) + self.assertEqual(ir_array.shape, (3, 4)) + + +class _CountingStore(zarr.storage.MemoryStore if HAVE_ZARR else object): + """A MemoryStore that counts get() calls.""" + + def __init__(self) -> None: + super().__init__() + self.reads = 0 + + async def get(self, key, prototype, byte_range=None): # type: ignore[override] + self.reads += 1 + return await super().get(key, prototype, byte_range) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrDocumentTestCase(unittest.TestCase): + """Tests for `ZarrDocument` / `ZarrGroup` round-trip and tree walking.""" + + def test_round_trip_through_memory_store(self) -> None: + # Build a flat IR: image and mask siblings at root. + doc = ZarrDocument(root=ZarrGroup()) + doc.root.attributes.lsst["archive_class"] = "MaskedImage" + doc.root.attributes.lsst["json"] = "lsst_json" + + image = ZarrArray(data=np.ones((4, 4), dtype="float32")) + image.attributes.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + doc.root.arrays["image"] = image + + mask = ZarrArray(data=np.zeros((4, 4), dtype="uint8")) + mask.attributes.extra["_ARRAY_DIMENSIONS"] = ["y", "x"] + mask.attributes.extra["flag_masks"] = [1, 2] + mask.attributes.extra["flag_meanings"] = "BAD SAT" + doc.root.arrays["mask"] = mask + + # Stub a 1-D uint8 'tree' array (JSON bytes). 
+ doc.root.arrays["lsst_json"] = ZarrArray(data=np.frombuffer(b"{}", dtype=np.uint8)) + + store = zarr.storage.MemoryStore() + doc.to_zarr(store) + + # Reload and verify lazy invariant on the image and mask arrays. + recovered = ZarrDocument.from_zarr(store) + self.assertIsInstance(recovered.root.arrays["image"].data, zarr.Array) + self.assertIsInstance(recovered.root.arrays["mask"].data, zarr.Array) + self.assertEqual(recovered.root.attributes.lsst["archive_class"], "MaskedImage") + # CF flag attrs round-trip via the extra namespace. + self.assertEqual( + recovered.root.arrays["mask"].attributes.extra["flag_meanings"], + "BAD SAT", + ) + # xarray dims round-trip. + self.assertEqual( + recovered.root.arrays["image"].attributes.extra["_ARRAY_DIMENSIONS"], + ["y", "x"], + ) + # A full read() through the lazy handle returns the original data. + np.testing.assert_array_equal(recovered.root.arrays["image"].read(), np.ones((4, 4), dtype="float32")) + + def test_get_walks_paths(self) -> None: + doc = ZarrDocument(root=ZarrGroup()) + doc.root.arrays["image"] = ZarrArray(data=np.zeros((2, 2), dtype="float32")) + red = doc.root.ensure_group("/red") + red.arrays["image"] = ZarrArray(data=np.ones((2, 2), dtype="float32")) + + # Absolute and relative paths. 
+ self.assertIs(doc.root.get("/image"), doc.root.arrays["image"]) + self.assertIs(doc.root.get("image"), doc.root.arrays["image"]) + self.assertIs(doc.root.get("/red/image"), red.arrays["image"]) + self.assertIs(doc.root.get("/"), doc.root) + + with self.assertRaises(KeyError): + doc.root.get("/missing") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class OmeCfHelpersTestCase(unittest.TestCase): + """Tests for the OME / CF attribute-shape helpers.""" + + def test_multiscale_emits_expected_shape(self) -> None: + m = OmeMultiscale( + name="visitimage", + axes=("y", "x"), + dataset_path="image", + ) + d = m.dump() + self.assertEqual(d["name"], "visitimage") + self.assertEqual( + d["axes"], + [ + {"name": "y", "type": "space", "unit": "pixel"}, + {"name": "x", "type": "space", "unit": "pixel"}, + ], + ) + self.assertEqual(d["datasets"][0]["path"], "image") + # Default coordinate transform is unit scale until a real one is set. + self.assertEqual( + d["datasets"][0]["coordinateTransformations"], + [{"type": "scale", "scale": [1.0, 1.0]}], + ) + + def test_multiscale_with_affine(self) -> None: + m = OmeMultiscale( + name="image", + axes=("y", "x"), + dataset_path="image", + coordinate_transformations=[ + {"type": "scale", "scale": [0.2, 0.2]}, + { + "type": "affine", + "affine": [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], + }, + ], + ) + d = m.dump() + self.assertEqual(len(d["datasets"][0]["coordinateTransformations"]), 2) + self.assertEqual(d["datasets"][0]["coordinateTransformations"][0]["type"], "scale") + + def test_cf_flag_attributes(self) -> None: + cf = CfFlagAttributes( + planes=[ + MaskPlaneEntry(name="BAD", bit=0, description="Bad pixel."), + MaskPlaneEntry(name="SAT", bit=1, description="Saturated."), + MaskPlaneEntry(name="CR", bit=2, description="Cosmic ray."), + ] + ) + d = cf.dump() + self.assertEqual(d["flag_masks"], [1, 2, 4]) + self.assertEqual(d["flag_meanings"], "BAD SAT CR") + self.assertEqual(d["flag_descriptions"], ["Bad 
pixel.", "Saturated.", "Cosmic ray."]) + + def test_image_array_attrs(self) -> None: + attrs = build_image_array_attrs(axes=("y", "x"), units="adu", long_name="science image") + self.assertEqual(attrs["_ARRAY_DIMENSIONS"], ["y", "x"]) + self.assertEqual(attrs["units"], "adu") + self.assertEqual(attrs["long_name"], "science image") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_ome_compliance.py b/tests/test_zarr_ome_compliance.py new file mode 100644 index 00000000..095c4c6a --- /dev/null +++ b/tests/test_zarr_ome_compliance.py @@ -0,0 +1,78 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. + +from __future__ import annotations + +import os +import shutil +import subprocess +import tempfile +import unittest + +import numpy as np + +from lsst.images import Box, Image, MaskedImage, MaskPlane, MaskSchema + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + +NGFF_VALIDATOR = shutil.which("ngff-validator") + + +@unittest.skipUnless(HAVE_ZARR and NGFF_VALIDATOR, "ngff-validator is not on PATH") +class NgffComplianceTestCase(unittest.TestCase): + """Archives written by zarr backend validate against NGFF schema.""" + + def _validate(self, target: str) -> None: + result = subprocess.run( + [NGFF_VALIDATOR, target], + capture_output=True, + text=True, + check=False, + ) + self.assertEqual( + result.returncode, + 0, + f"ngff-validator failed for {target}:\nSTDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}", + ) + + def test_image_validates(self) -> None: + image = Image( + np.arange(20, 
dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + self._validate(target) + + def test_masked_image_validates(self) -> None: + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + self._validate(target) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_output_archive.py b/tests/test_zarr_output_archive.py new file mode 100644 index 00000000..0b4a6323 --- /dev/null +++ b/tests/test_zarr_output_archive.py @@ -0,0 +1,427 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest +from types import SimpleNamespace + +import astropy.io.fits +import astropy.table +import numpy as np +import pydantic + +from lsst.images import YX, Box, ColorImage, Image, Mask, MaskedImage, MaskPlane, MaskSchema +from lsst.images.fits._common import ExtensionKey, FitsOpaqueMetadata + +try: + import zarr + + from lsst.images.zarr import ZarrPointerModel, write + from lsst.images.zarr._model import ZarrDocument + from lsst.images.zarr._output_archive import ZarrOutputArchive, build_archive_metadata + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +class _Sub(pydantic.BaseModel): + label: str = "sub" + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOutputArchiveSkeletonTestCase(unittest.TestCase): + """Constructor + serialize_direct / serialize_pointer plumbing.""" + + def test_serialize_direct_returns_nested_result(self) -> None: + archive = ZarrOutputArchive() + + def serializer(arch): + return _Sub(label="ok") + + result = archive.serialize_direct("red", serializer) + self.assertEqual(result.label, "ok") + + def test_serialize_pointer_writes_json_subtree(self) -> None: + archive = ZarrOutputArchive() + + def serializer(arch): + return _Sub(label="psf") + + pointer = archive.serialize_pointer("psf", serializer, key=12345) + self.assertIsInstance(pointer, ZarrPointerModel) + self.assertEqual(pointer.path, "/psf/lsst_json") + # Cached on second call. + again = archive.serialize_pointer("psf", serializer, key=12345) + self.assertEqual(again, pointer) + # IR holds the JSON bytes as a 1-D uint8 array. 
+ node = archive.document.root.get("/psf/lsst_json") + self.assertEqual(str(node.dtype), "uint8") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOutputArchiveAddArrayTestCase(unittest.TestCase): + """`add_array` handling for image / variance / mask plus nested arrays.""" + + def test_add_image(self) -> None: + archive = ZarrOutputArchive() + ref = archive.add_array(np.ones((4, 5), dtype=np.float32), name="image") + self.assertEqual(ref.source, "zarr:/image") + self.assertEqual(list(ref.shape), [4, 5]) + node = archive.document.root.get("/image") + self.assertEqual(node.shape, (4, 5)) + self.assertEqual(node.attributes.extra["_ARRAY_DIMENSIONS"], ["y", "x"]) + + def test_add_variance_aligns_to_image_chunks(self) -> None: + archive = ZarrOutputArchive(chunks={"image": (2, 2)}) + archive.add_array(np.ones((4, 5), dtype=np.float32), name="image") + archive.add_array(np.ones((4, 5), dtype=np.float64), name="variance") + var_node = archive.document.root.get("/variance") + self.assertEqual(tuple(var_node.chunks), (2, 2)) + + def test_add_mask_packs_to_2d_with_cf_flag_attrs(self) -> None: + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + # ``Mask.serialize`` emits the byte axis first when the archive opts + # into native-mask arrays — shape ``(mask_size, y, x)``. + in_memory = np.zeros((1, 4, 5), dtype=np.uint8) + in_memory[0, 0, 0] = 0b1 # BAD + in_memory[0, 1, 1] = 0b110 # SAT | CR + + archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema}) + archive.add_array(np.ones((4, 5), dtype=np.float32), name="image") + ref = archive.add_array(in_memory, name="mask") + self.assertEqual(ref.source, "zarr:/mask") + node = archive.document.root.get("/mask") + # 2-D packed integer. + self.assertEqual(node.shape, (4, 5)) + self.assertEqual(str(node.dtype), "uint8") # 3 planes -> uint8 + # Bytes packed correctly. 
+ np.testing.assert_array_equal(node.data[0, 0], 0b1) + np.testing.assert_array_equal(node.data[1, 1], 0b110) + # CF flag attrs. + attrs = node.attributes.extra + self.assertEqual(attrs["flag_masks"], [1, 2, 4]) + self.assertEqual(attrs["flag_meanings"], "BAD SAT CR") + self.assertEqual( + attrs["flag_descriptions"], + ["Bad pixel.", "Saturated.", "Cosmic ray."], + ) + self.assertEqual(attrs["_ARRAY_DIMENSIONS"], ["y", "x"]) + + def test_add_mask_picks_widest_dtype_for_40_planes(self) -> None: + planes = [MaskPlane(f"P{i}", f"Plane {i}.") for i in range(40)] + schema = MaskSchema(planes) + # 40 planes -> mask_size=5 -> (5, y, x). + in_memory = np.zeros((5, 4, 5), dtype=np.uint8) + + archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema}) + archive.add_array(np.ones((4, 5), dtype=np.float32), name="image") + archive.add_array(in_memory, name="mask") + node = archive.document.root.get("/mask") + self.assertEqual(node.shape, (4, 5)) + self.assertEqual(str(node.dtype), "uint64") + + def test_add_mask_refuses_more_than_64_planes(self) -> None: + planes = [MaskPlane(f"P{i}", f"Plane {i}.") for i in range(65)] + schema = MaskSchema(planes) + # 65 planes -> mask_size=9 -> (9, y, x). 
+ in_memory = np.zeros((9, 4, 5), dtype=np.uint8) + + archive = ZarrOutputArchive(archive_metadata={"mask_schema": schema}) + archive.add_array(np.ones((4, 5), dtype=np.float32), name="image") + with self.assertRaisesRegex(ValueError, "supports up to 64"): + archive.add_array(in_memory, name="mask") + + def test_add_anonymous_nested_array(self) -> None: + archive = ZarrOutputArchive() + ref = archive.add_array(np.ones((3,), dtype=np.float32), name="psf/centroids") + self.assertEqual(ref.source, "zarr:/psf/centroids") + self.assertEqual(archive.document.root.get("/psf/centroids").shape, (3,)) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOutputArchiveAddTableTestCase(unittest.TestCase): + """`add_table` / `add_structured_array` plumbing.""" + + def test_add_table_creates_one_array_per_column(self) -> None: + archive = ZarrOutputArchive() + original = astropy.table.Table( + { + "x": np.arange(4, dtype=np.int32), + "y": np.arange(4, dtype=np.float32), + }, + meta={"comment": "small catalog"}, + ) + model = archive.add_table(original, name="cat") + self.assertEqual(len(model.columns), 2) + sources = {c.name: c.data.source for c in model.columns} + self.assertEqual(sources["x"], "zarr:/lsst/tables/cat/x") + self.assertEqual(sources["y"], "zarr:/lsst/tables/cat/y") + # Each column is its own zarr array under the parent group. 
+ x_node = archive.document.root.get("/lsst/tables/cat/x") + self.assertEqual(x_node.shape, (4,)) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrWriteHelperTestCase(unittest.TestCase): + """Public ``write()`` end-to-end for a plain `Image`.""" + + def test_write_image_to_local_directory(self) -> None: + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + tree = write(original, target) + self.assertIsNotNone(tree) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + # Top-level image and tree are present. + self.assertIn("image", doc.root.arrays) + self.assertIn("lsst_json", doc.root.arrays) + self.assertEqual(doc.root.arrays["image"].shape, (4, 5)) + # LSST root attrs. + lsst_attrs = doc.root.attributes.lsst + self.assertEqual(lsst_attrs["archive_class"], "Image") + self.assertEqual(lsst_attrs["json"], "lsst_json") + # OME multiscales points at /image; no projection means + # the unit scale is emitted. + ome = doc.root.attributes.ome + self.assertIn("multiscales", ome) + self.assertEqual(ome["multiscales"][0]["datasets"][0]["path"], "image") + # data_model + version on root. 
+ self.assertEqual(doc.root.attributes.extra["data_model"], "org.lsst.image") + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrWriteOnDiskShapeTestCase(unittest.TestCase): + """Pin the on-disk layout for harder archive classes.""" + + def _round_trip_doc(self, obj): + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(obj, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + return ZarrDocument.from_zarr(store) + + def test_masked_image_layout(self) -> None: + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + masked.mask.set("BAD", image.array % 2 == 0) + + doc = self._round_trip_doc(masked) + self.assertEqual(doc.root.attributes.lsst["archive_class"], "MaskedImage") + # image / variance / mask are sibling root arrays. + self.assertIn("image", doc.root.arrays) + self.assertIn("variance", doc.root.arrays) + self.assertIn("mask", doc.root.arrays) + # Mask is 2-D packed integer with CF flag attrs. + mask = doc.root.arrays["mask"] + self.assertEqual(mask.shape, (4, 5)) + self.assertEqual(mask.attributes.extra["flag_meanings"], "BAD") + # CF / xarray dims on every 2-D array. 
+ for name in ("image", "variance", "mask"): + self.assertEqual( + doc.root.arrays[name].attributes.extra["_ARRAY_DIMENSIONS"], + ["y", "x"], + ) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrColorImageWriteTestCase(unittest.TestCase): + """ColorImage emits decorated red/green/blue sub-archives.""" + + def test_color_image_emits_per_channel_arrays(self) -> None: + arr = np.zeros((4, 5, 3), dtype=np.uint8) + arr[..., 0] = 1 + arr[..., 1] = 2 + arr[..., 2] = 3 + color = ColorImage(arr, bbox=Box.factory[10:14, 20:25]) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(color, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + # Root: ColorImage, no ome.multiscales + # (axes_for_archive_class returns () for ColorImage). + self.assertEqual(doc.root.attributes.lsst["archive_class"], "ColorImage") + self.assertNotIn("multiscales", doc.root.attributes.ome) + # Each channel is a top-level 2-D array. + for channel in ("red", "green", "blue"): + self.assertIn(channel, doc.root.arrays) + self.assertEqual(doc.root.arrays[channel].shape, (4, 5)) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrPsfChunkingTestCase(unittest.TestCase): + """`add_array` defaults a 4-D ``psf`` array to single-cell chunks.""" + + def test_psf_array_uses_single_cell_chunks(self) -> None: + psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + ref = archive.add_array(psf, name="psf") + self.assertEqual(ref.source, "zarr:/psf") + node = archive.document.root.get("/psf") + # Single-cell chunks: leading axes are 1; spatial axes match shape. 
+ self.assertEqual(tuple(node.chunks), (1, 1, 21, 21)) + + def test_psf_user_override_wins(self) -> None: + psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive( + archive_class="CellCoadd", + chunks={"psf": (2, 3, 21, 21)}, + ) + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.chunks), (2, 3, 21, 21)) + + def test_psf_array_gets_default_shards(self) -> None: + # 25x25 cells of 150x150 float32: chunk_bytes = 90 KiB, + # ratio ~ 186, k = round(sqrt(186)) = 14 -> shard (14, 14, 150, 150). + psf = np.zeros((25, 25, 150, 150), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.shards), (14, 14, 150, 150)) + + def test_psf_user_shard_override_wins(self) -> None: + psf = np.zeros((25, 25, 150, 150), dtype=np.float32) + archive = ZarrOutputArchive( + archive_class="CellCoadd", + shards={"psf": (5, 5, 150, 150)}, + ) + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.shards), (5, 5, 150, 150)) + + def test_small_psf_shard_caps_at_array_bounds(self) -> None: + # 2x3 cells of 21x21 float32: chunk_bytes = 1764 B, ratio ~9511, + # 2 growable axes, k = round(sqrt(9511)) = 98. The cap clamps + # each growable axis to chunks[i] * ceil(shape[i]/chunks[i]) = + # 1 * shape[i] = shape[i], yielding shard (2, 3, 21, 21) — the + # whole 6-cell PSF goes into one shard. Inner axes (21, 21) are + # not growable since chunks already cover them. 
+ psf = np.zeros((2, 3, 21, 21), dtype=np.float32) + archive = ZarrOutputArchive(archive_class="CellCoadd") + archive.add_array(psf, name="psf") + node = archive.document.root.get("/psf") + self.assertEqual(tuple(node.shards), (2, 3, 21, 21)) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrOpaqueMetadataWriteTestCase(unittest.TestCase): + """FITS opaque metadata persists at /lsst/opaque_metadata/fits/primary.""" + + def test_fits_opaque_metadata_persists(self) -> None: + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + header = astropy.io.fits.Header() + header["ORIGIN"] = "RUBIN" + header["EXPTIME"] = 30.0 + opaque = FitsOpaqueMetadata() + opaque.headers[ExtensionKey()] = header + image._opaque_metadata = opaque + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + write(image, target) + with zarr.storage.LocalStore(target, read_only=True) as store: + doc = ZarrDocument.from_zarr(store) + self.assertEqual( + doc.root.attributes.lsst.get("opaque_metadata_format"), + "fits", + ) + opaque_node = doc.root.get("/lsst/opaque_metadata/fits/primary") + # ``(N, 80)`` byte array with explicit dim names. + self.assertEqual(len(opaque_node.shape), 2) + self.assertEqual(opaque_node.shape[1], 80) + self.assertEqual( + opaque_node.attributes.extra["_ARRAY_DIMENSIONS"], + ["card", "char"], + ) + # Recover the original header from the raw bytes. 
+ text = bytes(opaque_node.read()).decode("ascii") + recovered = astropy.io.fits.Header.fromstring(text) + self.assertEqual(recovered["ORIGIN"], "RUBIN") + self.assertEqual(recovered["EXPTIME"], 30.0) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class BuildArchiveMetadataTestCase(unittest.TestCase): + """`build_archive_metadata` resolves cell shape and mask schema.""" + + def test_cell_shape_from_grid_attribute(self) -> None: + # CellCoadd exposes its cells via ``.grid.cell_shape`` (a YX), + # not via ``.cell_shape`` directly. The resolver must walk the + # grid attribute so cell-aligned chunks fire on real writes. + grid = SimpleNamespace(cell_shape=YX(y=150, x=200)) + obj = SimpleNamespace(grid=grid) + metadata = build_archive_metadata(obj) + self.assertEqual(metadata["cell_shape"], (150, 200)) + + def test_cell_shape_direct_attribute_wins(self) -> None: + # If both ``.cell_shape`` and ``.grid.cell_shape`` exist the + # direct attribute is preferred (allows callers to override). + grid = SimpleNamespace(cell_shape=YX(y=150, x=200)) + obj = SimpleNamespace(cell_shape=(64, 64), grid=grid) + metadata = build_archive_metadata(obj) + self.assertEqual(metadata["cell_shape"], (64, 64)) + + def test_cell_shape_from_legacy_cell_grid_attribute(self) -> None: + # Older / hypothetical objects may expose ``.cell_grid`` instead + # of ``.grid``; the resolver falls through to that. 
+ cell_grid = SimpleNamespace(cell_shape=YX(y=128, x=128)) + obj = SimpleNamespace(cell_grid=cell_grid) + metadata = build_archive_metadata(obj) + self.assertEqual(metadata["cell_shape"], (128, 128)) + + def test_no_cell_shape_for_plain_image(self) -> None: + image = Image(np.zeros((4, 5), dtype=np.float32), bbox=Box.factory[0:4, 0:5]) + metadata = build_archive_metadata(image) + self.assertNotIn("cell_shape", metadata) + + def test_mask_schema_from_inner_mask(self) -> None: + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + image = Image(np.zeros((4, 5), dtype=np.float32), bbox=Box.factory[0:4, 0:5]) + masked = MaskedImage(image, mask_schema=schema) + metadata = build_archive_metadata(masked) + self.assertIs(metadata["mask_schema"], masked.mask.schema) + + def test_mask_schema_for_top_level_mask(self) -> None: + schema = MaskSchema([MaskPlane("BAD", "Bad pixel.")]) + mask = Mask( + np.zeros((4, 5, schema.mask_size), dtype=schema.dtype), + bbox=Box.factory[0:4, 0:5], + schema=schema, + ) + metadata = build_archive_metadata(mask) + self.assertIs(metadata["mask_schema"], schema) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_round_trip.py b/tests/test_zarr_round_trip.py new file mode 100644 index 00000000..ac24f956 --- /dev/null +++ b/tests/test_zarr_round_trip.py @@ -0,0 +1,172 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import unittest + +import numpy as np + +from lsst.images import Box, ColorImage, Image, Mask, MaskedImage, MaskPlane, MaskSchema + +try: + import zarr + + from lsst.images.tests import RoundtripZarr + from lsst.images.zarr._store import open_store_for_read + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrRoundTripTestCase(unittest.TestCase): + """Full write -> read round-trips for the supported image types.""" + + def test_image_round_trip(self) -> None: + original = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.result + np.testing.assert_array_equal(recovered.array, original.array) + self.assertEqual(recovered.bbox, original.bbox) + + def test_image_round_trip_writes_shards(self) -> None: + # 300x300 float32: chunks (256, 256) -> shard (512, 512) by the + # byte-budget rule (target 16 MiB, ratio ~64, k ~ 8 capped at the + # 2-chunk-per-axis ceiling of 256 * 2 = 512). + original = Image( + np.zeros((300, 300), dtype=np.float32), + bbox=Box.factory[0:300, 0:300], + ) + with RoundtripZarr(self, original) as roundtrip: + with open_store_for_read(roundtrip.filename) as store: + root = zarr.open_group(store=store, mode="r", zarr_format=3) + image_arr = root["image"] + self.assertEqual(tuple(image_arr.chunks), (256, 256)) + self.assertEqual(tuple(image_arr.shards), (512, 512)) + # Single-chunk metadata arrays must NOT be sharded. + lsst_json_arr = root["lsst_json"] + self.assertIsNone(lsst_json_arr.shards) + # Data round-trip is preserved. 
+ np.testing.assert_array_equal(roundtrip.result.array, original.array) + + def test_masked_image_round_trip(self) -> None: + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + original = MaskedImage(image, mask_schema=schema) + original.mask.set("BAD", image.array % 2 == 0) + original.mask.set("SAT", image.array > 10) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.result + np.testing.assert_array_equal(recovered.image.array, original.image.array) + np.testing.assert_array_equal(recovered.mask.array, original.mask.array) + + def test_mask_round_trip(self) -> None: + # Top-level Mask: schema is on the object itself, not on an + # inner ``mask`` attribute. write() must reach it. + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + original = Mask( + np.zeros((4, 5, schema.mask_size), dtype=schema.dtype), + bbox=Box.factory[10:14, 20:25], + schema=schema, + ) + original.set("BAD", np.array([[i % 2 == 0 for i in range(5)] for _ in range(4)])) + original.set("SAT", np.array([[i > 2 for i in range(5)] for _ in range(4)])) + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.result + np.testing.assert_array_equal(recovered.array, original.array) + self.assertEqual(recovered.bbox, original.bbox) + self.assertEqual(list(recovered.schema.names), list(original.schema.names)) + + def test_uint16_mask_packs_with_element_stride(self) -> None: + # 20-plane uint16 schema (mask_size = 2 elements per pixel). + # Setting plane 16 must produce an on-disk packed value whose + # CF flag_masks[16] bit is set — the bit position must match + # the schema's element-stride layout, not the byte-stride + # layout. Without the fix, plane 16 lands at packed bit 8. 
+ schema = MaskSchema( + [MaskPlane(f"P{i}", f"Plane {i}.") for i in range(20)], + dtype=np.uint16, + ) + image = Image( + np.zeros((4, 5), dtype=np.float32), + bbox=Box.factory[10:14, 20:25], + ) + original = MaskedImage(image, mask_schema=schema) + target_pixel = np.zeros((4, 5), dtype=bool) + target_pixel[0, 0] = True + original.mask.set("P16", target_pixel) + with RoundtripZarr(self, original) as roundtrip: + with open_store_for_read(roundtrip.filename) as store: + root = zarr.open_group(store=store, mode="r", zarr_format=3) + mask_arr = root["mask"] + flag_masks = list(mask_arr.attrs["flag_masks"]) + on_disk = int(mask_arr[0, 0]) + self.assertEqual(flag_masks[16], 1 << 16) + self.assertNotEqual(on_disk & flag_masks[16], 0) + recovered = roundtrip.result + np.testing.assert_array_equal(recovered.mask.array, original.mask.array) + + def test_masked_image_with_40_planes_round_trip(self) -> None: + schema = MaskSchema([MaskPlane(f"P{i}", f"Plane {i}.") for i in range(40)]) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + original = MaskedImage(image, mask_schema=schema) + original.mask.set("P0", image.array % 2 == 0) + original.mask.set("P39", image.array > 10) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.result + # 40 planes packed into uint64 on disk; recovered as 5 bytes/pixel. 
+ np.testing.assert_array_equal(recovered.mask.array, original.mask.array) + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class ZarrColorImageRoundTripTestCase(unittest.TestCase): + """ColorImage round-trips through the zarr backend.""" + + def test_color_image_round_trip(self) -> None: + arr = np.zeros((4, 5, 3), dtype=np.uint8) + arr[..., 0] = 1 + arr[..., 1] = 2 + arr[..., 2] = 3 + original = ColorImage(arr, bbox=Box.factory[10:14, 20:25]) + + with RoundtripZarr(self, original) as roundtrip: + recovered = roundtrip.result + np.testing.assert_array_equal(recovered.array, original.array) + self.assertEqual(recovered.bbox, original.bbox) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_store.py b/tests/test_zarr_store.py new file mode 100644 index 00000000..e6c11fc8 --- /dev/null +++ b/tests/test_zarr_store.py @@ -0,0 +1,86 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +try: + import zarr + + from lsst.images.zarr._store import open_store_for_read, open_store_for_write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + + +@unittest.skipUnless(HAVE_ZARR, "zarr is not installed") +class StoreDispatchTestCase(unittest.TestCase): + """URI-based dispatch for zarr stores.""" + + def test_local_directory(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + with open_store_for_write(target) as store: + self.assertIsInstance(store, zarr.storage.LocalStore) + zarr.create_group(store=store, zarr_format=3) + with open_store_for_read(target) as store: + self.assertIsInstance(store, zarr.storage.LocalStore) + root = zarr.open_group(store=store, mode="r") + self.assertEqual(list(root.keys()), []) + + def test_zip_store(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr.zip") + with open_store_for_write(target) as store: + self.assertIsInstance(store, zarr.storage.ZipStore) + zarr.create_group(store=store, zarr_format=3) + with open_store_for_read(target) as store: + self.assertIsInstance(store, zarr.storage.ZipStore) + + def test_create_only_refuses_existing(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr") + with open_store_for_write(target) as store: + zarr.create_group(store=store, zarr_format=3) + with self.assertRaisesRegex(OSError, "already exists"): + with open_store_for_write(target): + pass + + def test_zip_store_round_trips_sharded_array(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "out.zarr.zip") + data = np.arange(300 * 300, dtype=np.float32).reshape(300, 300) + with open_store_for_write(target) as store: + group = zarr.create_group(store=store, zarr_format=3) + arr = group.create_array( + name="image", + shape=data.shape, 
+ chunks=(256, 256), + shards=(512, 512), + dtype=data.dtype, + ) + arr[:] = data + with open_store_for_read(target) as store: + group = zarr.open_group(store=store, mode="r", zarr_format=3) + image = group["image"] + self.assertEqual(tuple(image.chunks), (256, 256)) + self.assertEqual(tuple(image.shards), (512, 512)) + np.testing.assert_array_equal(image[...], data) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_zarr_xarray_interop.py b/tests/test_zarr_xarray_interop.py new file mode 100644 index 00000000..8048d369 --- /dev/null +++ b/tests/test_zarr_xarray_interop.py @@ -0,0 +1,179 @@ +# This file is part of lsst-images. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# Use of this source code is governed by a 3-clause BSD-style +# license that can be found in the LICENSE file. 
+ +from __future__ import annotations + +import os +import tempfile +import unittest + +import numpy as np + +from lsst.images import Box, Image, Mask, MaskedImage, MaskPlane, MaskSchema + +try: + import zarr # noqa: F401 + + from lsst.images.zarr import write + + HAVE_ZARR = True +except ImportError: + HAVE_ZARR = False + +try: + import xarray as xr + + HAVE_XARRAY = True +except ImportError: + HAVE_XARRAY = False + + +@unittest.skipUnless(HAVE_ZARR and HAVE_XARRAY, "xarray is not installed") +class XarrayInteropTestCase(unittest.TestCase): + """``xr.open_zarr`` returns a Dataset with the masked-image siblings.""" + + def _make_masked_image(self) -> MaskedImage: + schema = MaskSchema( + [ + MaskPlane("BAD", "Bad pixel."), + MaskPlane("SAT", "Saturated."), + MaskPlane("CR", "Cosmic ray."), + ] + ) + image = Image( + np.arange(20, dtype=np.float32).reshape(4, 5), + bbox=Box.factory[10:14, 20:25], + ) + masked = MaskedImage(image, mask_schema=schema) + masked.mask.set("BAD", image.array % 2 == 0) + return masked + + def test_open_zarr_returns_dataset_with_masked_image_components(self) -> None: + masked = self._make_masked_image() + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + ds = xr.open_zarr(target, consolidated=False) + # Three data variables sharing the (y, x) dims. + self.assertIn("image", ds.data_vars) + self.assertIn("variance", ds.data_vars) + self.assertIn("mask", ds.data_vars) + self.assertEqual(ds["image"].dims, ("y", "x")) + self.assertEqual(ds["mask"].dims, ("y", "x")) + self.assertEqual(ds["image"].shape, (4, 5)) + # CF flag attrs survive on the mask variable. 
+ self.assertEqual(ds["mask"].attrs["flag_meanings"], "BAD SAT CR") + self.assertEqual(list(ds["mask"].attrs["flag_masks"]), [1, 2, 4]) + + def test_open_zarr_uses_consolidated_metadata(self) -> None: + """``write()`` consolidates metadata so xr.open_zarr uses one fetch.""" + import warnings + + masked = self._make_masked_image() + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + # Default ``consolidated=None`` means "use it if available"; + # if it isn't present xarray emits a ``RuntimeWarning`` and + # falls back to walking every array. Promote that warning to + # an error to confirm the consolidated path is taken. + with warnings.catch_warnings(): + warnings.simplefilter("error", RuntimeWarning) + xr.open_zarr(target) + + def test_open_zarr_data_values_match_in_memory(self) -> None: + """The bytes xarray reads are the same bytes the archive wrote.""" + masked = self._make_masked_image() + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "masked.zarr") + write(masked, target) + ds = xr.open_zarr(target, consolidated=False) + np.testing.assert_array_equal(ds["image"].values, masked.image.array) + np.testing.assert_array_equal(ds["variance"].values, masked.variance.array) + # Mask on disk is a 2-D packed wide-int; compare against the + # equivalent packing of the in-memory (y, x, mask_size) array. + packed = np.zeros(masked.mask.array.shape[:2], dtype=ds["mask"].dtype) + for i in range(masked.mask.array.shape[2]): + packed |= masked.mask.array[..., i].astype(ds["mask"].dtype) << (8 * i) + np.testing.assert_array_equal(ds["mask"].values, packed) + + +@unittest.skipUnless(HAVE_ZARR and HAVE_XARRAY, "xarray is not installed") +class XarrayCfFlagDecodingTestCase(unittest.TestCase): + """A standalone CF-aware reader can decode plane membership. 
+ + Uses ``xarray.open_zarr`` to read the archive without any LSST + code on the read side, then applies the standard CF flag-decoding + rule ``(value & flag_masks[i]) != 0`` to recover the plane + membership of every pixel. The recovered membership must match + what was written. Catches regressions in the on-disk packing + layout (e.g. element-stride vs byte-stride bugs) that would + otherwise be invisible to an internal round-trip. + """ + + def test_uint16_schema_decodes_under_cf_rules(self) -> None: + # 20-plane uint16 schema (mask_size = 2) — exercises the + # multi-element packing path. + plane_names = [f"P{i}" for i in range(20)] + schema = MaskSchema( + [MaskPlane(name, f"Plane {name}.") for name in plane_names], + dtype=np.uint16, + ) + # Set distinct planes in distinct pixels so a single pass can + # cover the whole bit range, including planes that only the + # high element holds (P16..P19). + original = Mask( + np.zeros((4, 5, schema.mask_size), dtype=schema.dtype), + bbox=Box.factory[0:4, 0:5], + schema=schema, + ) + plane_for_pixel = {(0, 0): "P0", (0, 1): "P7", (1, 2): "P8", (2, 3): "P15", (3, 4): "P16"} + for (y, x), plane_name in plane_for_pixel.items(): + sel = np.zeros((4, 5), dtype=bool) + sel[y, x] = True + original.set(plane_name, sel) + + with tempfile.TemporaryDirectory() as tmp: + target = os.path.join(tmp, "mask.zarr") + write(original, target) + ds = xr.open_zarr(target, consolidated=False) + mask_da = ds["mask"] + flag_masks = list(mask_da.attrs["flag_masks"]) + flag_meanings = mask_da.attrs["flag_meanings"].split() + self.assertEqual(flag_meanings, plane_names) + self.assertEqual(flag_masks, [1 << i for i in range(20)]) + mask_values = mask_da.values + # CF decode: plane i is set at (y, x) iff + # (mask_values[y, x] & flag_masks[i]) != 0. 
+ for (y, x), plane_name in plane_for_pixel.items(): + plane_idx = flag_meanings.index(plane_name) + bit = flag_masks[plane_idx] + self.assertNotEqual( + int(mask_values[y, x]) & bit, + 0, + f"plane {plane_name} (bit {bit:#x}) not set at ({y}, {x}); " + f"on-disk value = {int(mask_values[y, x]):#x}", + ) + # All other planes must be unset at this pixel. + for other_idx in range(len(flag_meanings)): + if other_idx == plane_idx: + continue + self.assertEqual( + int(mask_values[y, x]) & flag_masks[other_idx], + 0, + f"plane {flag_meanings[other_idx]} unexpectedly set at ({y}, {x})", + ) + + +if __name__ == "__main__": + unittest.main()