From 67e2b37a1c80844ea8ce84769f628f5edc1700ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Piotr=20C=C5=82apa?= <jakub@hume.ai>
Date: Fri, 6 Feb 2026 13:22:09 +0000
Subject: [PATCH 1/2] WSSample: always validate that keys match across subdirs

---
 wsds/ws_sample.py | 42 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/wsds/ws_sample.py b/wsds/ws_sample.py
index 5d76f8b..70eb2cf 100644
--- a/wsds/ws_sample.py
+++ b/wsds/ws_sample.py
@@ -7,12 +7,15 @@
     from .ws_dataset import WSDataset
 
 
-@dataclass(frozen=True, slots=True)
+@dataclass(frozen=True)
 class WSSample:
     dataset: "WSDataset"
     shard_name: str
     offset: int
     overrides: dict = field(default_factory=dict)
+    # Key verification state (mutable containers to work with frozen dataclass)
+    _verified_subdirs: set = field(default_factory=set, repr=False, compare=False)
+    _reference_key: list = field(default_factory=list, repr=False, compare=False)
 
     def get_audio(self, audio_columns=None):
         candidates = audio_columns or self.dataset._audio_file_keys
@@ -33,9 +36,46 @@ def items(self):
     def values(self):
         yield from (v for _, v in self.items())
 
+    def _verify_key_for_field(self, field: str):
+        """Verify __key__ in this field's subdir matches the reference key."""
+        value = self.dataset.fields.get(field)
+        if value is None:
+            return
+        (subdir, _column) = value[0]
+
+        if subdir in self._verified_subdirs:
+            return
+
+        # Skip computed columns (they don't have their own __key__)
+        if subdir in self.dataset.computed_columns:
+            self._verified_subdirs.add(subdir)
+            return
+
+        # Get __key__ from this subdir
+        try:
+            key = self.dataset.get_shard(subdir, self.shard_name).get_sample("__key__", self.offset)
+        except (WSShardMissingError, KeyError):
+            # Can't verify if shard or key is missing
+            self._verified_subdirs.add(subdir)
+            return
+
+        if not self._reference_key:
+            # First subdir accessed - store as reference
+            self._reference_key.append((subdir, key))
+        else:
+            ref_subdir, ref_key = self._reference_key[0]
+            if key != ref_key:
+                raise ValueError(
+                    f"Key mismatch at offset {self.offset} in shard {self.shard_name}: "
+                    f"{ref_subdir} has '{ref_key}' but {subdir} has '{key}'"
+                )
+
+        self._verified_subdirs.add(subdir)
+
     def __getitem__(self, field):
         if field in self.overrides:
             return self.overrides[field]
+        self._verify_key_for_field(field)
         return self.dataset.get_sample(self.shard_name, field, self.offset)
 
     def __setitem__(self, field, value):

From da238e8eab297332935e162f6df38ad055eb1e8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Piotr=20C=C5=82apa?= <jakub@hume.ai>
Date: Fri, 13 Mar 2026 18:50:35 +0100
Subject: [PATCH 2/2] WSSample: print fields with missing shards last (#37)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* WSSample: print fields with missing shards last

* WSAudio: disable slots because it breaks code auto-reload (#38)

* WSAudio: disable slots because it breaks code auto-reload

* Added WSModalShard (#41)

* Extract audio codec layer from ws_audio.py into audio_codec.py

Separates codec concerns (decoder backends, encoder, format utils) from
the data model layer (AudioReader, WSAudio) for better reusability and
testability.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add ModalFileReader for Modal Volume range requests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Centralize binary column decoding into ws_decode module

Extract duplicated npy/pyd/txt/audio decode logic from WSShard and
WSS3Shard into a shared decode_sample() function. Dispatch is now
based on column type (binary) rather than column-name heuristics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Added WSModalShard

* Big renaming and cleanups (#45)

* Move index SQL queries from WSDataset into WSIndex

- Add old/new index format detection (partition vs dataset_path columns)
- Add _partition_col property for unified SQL partition expression
- Add lookup_by_index() and lookup_by_key() methods to WSIndex
- Add shard_n_samples() and shard_global_offset() via _query_shard() helper
- Simplify WSDataset.__getitem__ to delegate to WSIndex lookups
- Replace all raw index.query() calls in WSDataset with WSIndex methods
- Update ws_tools.py to use new format detection

* Fix env var name in README: WSDS_DATASET_PATH → WSDS_DATASET_SEARCH_PATH

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Move is_notebook() to utils, guard _ipython_display_ for terminal use

- Extract is_notebook() from convplayer.py into utils.py (simplified)
- Remove redundant _ipython_display_ from AudioReader and WSAudio
  (IPython already calls _repr_html_ automatically)
- Add is_notebook() guard to WSDataset and WSSample _ipython_display_
  so they fall back to print() in terminal IPython sessions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Improve naming consistency across codebase

- subdir → column_dir (utils, ws_sample, ws_modal_shard)
- shard_name → shard_ref on shard interfaces and WSSample
- dataset_path → partition in index and shard code
- dataset_dir → dataset_root in ws_indexer

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add get_audio() helper in ws_decode, use in WSSample

Centralizes audio column lookup logic so it can be reused outside
of WSSample (e.g. from plain dicts or other sample types).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Add rng parameter to WSDataset for reproducible sampling

Allows passing rng=42 (or a Random instance) to get deterministic
sample ordering in random_sample() and sql_select(). Also removes
unused needs_key variable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Fix hume_wsds module path remapping and sql_filter pl.first() usage

- Remap hume_wsds.* loader paths to wsds.* for backward compatibility
  with old index files that reference the former package name
- Use pl.first() instead of exprs[0] in sql_filter for correctness

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Update module docstring with rich examples and fix doctests

- Add comprehensive docstring to __init__.py with working doctests
  showcasing SQL queries, random access, lazy loading, and audio
- Fix ws_dataset.py doctests (AudioReader src type, add shard_subsample=1)
- Fix ws_sink.py doctest (remove invalid batch_size param)
- Update tests.py to run wsds module doctests and fix imports

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Apply ruff formatting

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* Moved the library showcase to README.md

* Use shard ref

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Shahbaz Mogal <shahbaz@hume.ai>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Shahbaz Mogal <shahbaz@hume.ai>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Shahbaz Mogal <shahbaz@hume.ai>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Shahbaz Mogal <shahbaz@hume.ai>
---
 README.md                     |  82 +++++++++--
 tests.py                      |  12 +-
 wsds/__init__.py              |   8 +-
 wsds/audio_codec.py           | 251 +++++++++++++++++++++++++++++++
 wsds/convplayer.py            | 121 ++++++++-------
 wsds/pupyarrow/file_reader.py | 116 +++++++++++++++
 wsds/utils.py                 |  56 ++++---
 wsds/ws_audio.py              | 171 +++++----------------
 wsds/ws_dataset.py            | 270 +++++++++++++++++-----------------
 wsds/ws_decode.py             |  56 +++++++
 wsds/ws_feather_index.py      |  39 +++--
 wsds/ws_index.py              | 104 +++++++++++--
 wsds/ws_indexer.py            | 176 +++++++++++-----------
 wsds/ws_modal_shard.py        | 116 +++++++++++++++
 wsds/ws_s3_shard.py           |  38 ++---
 wsds/ws_sample.py             | 130 ++++++++--------
 wsds/ws_shard.py              |  53 +++----
 wsds/ws_sink.py               |   4 +-
 wsds/ws_tools.py              |  50 ++++---
 19 files changed, 1215 insertions(+), 638 deletions(-)
 create mode 100644 wsds/audio_codec.py
 create mode 100644 wsds/ws_decode.py
 create mode 100644 wsds/ws_modal_shard.py

diff --git a/README.md b/README.md
index 8ed0c84..1895568 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,77 @@
-# WSDS
+## wsds — Web-Scale DataSets
 
-wsds merges SQL querying capabilities with native support for multimodal data (speech and video) in a single data
-format and a unified API. It uses shards for efficiency and to support very-scalable parallel data processing.
+**wsds** is a multimodal dataset library that combines the power of SQL querying with native support for speech, audio, and video data. Built for large-scale machine learning workflows, it lets you work with massive datasets efficiently, regardless of where you store your data (SSDs, HDDs, Weka, S3).
 
-wsds has a powerful database query engine integrated into it (built on top of Polars). This makes database-style
-operations like duplicate detection, group by operations and aggregations very fast and easy to write.
-This tight integration let's you run both SQL queries and efficient dataloaders directly on your data without any
-conversion or importing.
+```pycon
+>>> from wsds import WSDataset
+>>> dataset = WSDataset("librilight/v3-vad_ws")
+>>> print(str(dataset))
+WSDataset('librilight/v3-vad_ws', segmented=True)
+     Audio duration: 52.69 k hours
+    Speech duration: 47.44 k hours
+   Number of shards: 623
+  Number of samples: 22 662 659
+<BLANKLINE>
 
-## Getting Started
+```
+
+### Quick start
 
 ```bash
-# create environment
-conda create -n wsds python=3.10
-conda activate wsds
+pip install git+https://github.com/HumeAI/wsds.git
+```
+
+- **SQL Queries on Sharded Data** — Filter and select across your entire dataset using familiar SQL syntax, powered by Polars. Only the columns and shards you need are loaded.
+
+```pycon
+>>> dataset.sql_select('`transcription_wslang_raw.txt`', 'snr', 'tend - tstart as duration')
+INFO: to speed things up wsds is loading a random 24.08% subset of the shards, pass shard_subsample=1 to force it to load the whole dataset
+shape: (5_271_939, 3)
+┌─────────────────────────────────┬──────────┬───────────┐
+│ transcription_wslang_raw.txt    ┆ snr      ┆ duration  │
+│ ---                             ┆ ---      ┆ ---       │
+│ str                             ┆ f16      ┆ f32       │
+╞═════════════════════════════════╪══════════╪═══════════╡
+│  This is a liberal box recordi… ┆ 70.0625  ┆ 1.331058  │
+│  or liberty box recordings dur… ┆ 66.25    ┆ 1.962457  │
+│  For more information or to vo… ┆ 65.6875  ┆ 3.276451  │
+│  The Elder Eddas of Semen-Sekh… ┆ 51.09375 ┆ 4.863482  │
+│  Translated by Erasmus B. Ande… ┆ 70.1875  ┆ 1.843002  │
+│ …                               ┆ …        ┆ …         │
+│  I stared about me.             ┆ 66.1875  ┆ 1.433472  │
+│  and then pointing to the huge… ┆ 64.75    ┆ 3.703003  │
+│  It was there. Where it is now… ┆ 73.75    ┆ 3.651855  │
+│  He shrugged his shoulders, to… ┆ 65.0     ┆ 9.4198    │
+│  the first chance, and he made… ┆ 62.0     ┆ 11.501709 │
+└─────────────────────────────────┴──────────┴───────────┘
+
+```
+
+- **Random Access & Indexing** — Optional SQLite-based indexing enables fast random access by key or integer index across shards.
+
+```pycon
+>>> x = dataset['large/1259/lettersofjaneausten_etk_librivox_64kb_mp3/lettersofjaneausten_22_austen_64kb_032']
 
-# install hume_wsds
-pip install https://github.com/HumeAI/wsds.git
 ```
 
-## Tests
+- **Lazy, On-Demand Loading** — Samples are dict-like objects that load fields only when accessed, keeping memory usage minimal even for terabyte-scale datasets.
+
+```pycon
+>>> x['transcription_wslang_raw.txt'], x['dbu']
+(' The Sherers, I believe, are now really going to go. Joseph has had a bed here the last two nights, and I do not know whether this is not the day of moving. Mrs. Sherer called yesterday to take leave. The weather looks worse again.', -26.34375)
 
-To run tests you currently need a copy of the `librilight` dataset. The tests can be run with:
 ```
-WSDS_DATASET_PATH=/path/to/the/librilight/folder python tests.py
+
+- **Native Audio & Multimodal Support** — First-class handling of speech and audio data, including segmented datasets with voice activity detection and computed columns that reference source audio.
+
+```pycon
+>>> x['audio']
+WSAudio(audio_reader=AudioReader(src=<class '_io.BytesIO'>, sample_rate=None), tstart=614.46246, tend=627.3976)
+
 ```
+
+- **Sharded Architecture** — Data is stored in `.wsds` files (PyArrow IPC format) organized by column type into subdirectories, enabling efficient columnar access patterns.
+
+- **Atomic Writes** — The `WSSink` context manager provides safe, batched, compressed writes with atomic commit semantics.
+
+- **Flexible Data Linking** — Computed columns and `.wsds-link` files let you compose datasets without duplicating data, referencing columns across dataset boundaries.
diff --git a/tests.py b/tests.py
index c8cf3d2..39a9acb 100644
--- a/tests.py
+++ b/tests.py
@@ -1,12 +1,18 @@
-import unittest
 import doctest
-from . import ws_dataset, ws_shard, ws_sink
+import unittest
+
+import wsds
+from wsds import ws_dataset, ws_shard, ws_sink
+
 
 def load_tests(loader, tests, ignore):
+    tests.addTests(doctest.DocTestSuite(wsds))
     tests.addTests(doctest.DocTestSuite(ws_dataset))
     tests.addTests(doctest.DocTestSuite(ws_shard))
     tests.addTests(doctest.DocTestSuite(ws_sink))
+    tests.addTests(doctest.DocFileSuite("README.md"))
     return tests
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/wsds/__init__.py b/wsds/__init__.py
index fc9a1a0..b959670 100644
--- a/wsds/__init__.py
+++ b/wsds/__init__.py
@@ -1,11 +1,7 @@
 """
-# wsds dataset library
 
-Usage example:
->>> from wsds import WSDataset
->>> dataset = WSDataset("librilight/v3-vad_ws")
->>> for sample in dataset.random_samples(5):
->>>     print(sample['__key__'], sample['txt'])
+.. include:: ../README.md
+.. include:: ../docs/dataset-structure.md
 
 """
 
diff --git a/wsds/audio_codec.py b/wsds/audio_codec.py
new file mode 100644
index 0000000..2c64c24
--- /dev/null
+++ b/wsds/audio_codec.py
@@ -0,0 +1,251 @@
+"""Audio codec layer: encoding, decoding, and format utilities.
+
+This module contains all audio encoding/decoding logic, separated from the
+data model layer in ws_audio.py. It provides:
+- Decoder backends (TorchFFmpegAudioDecoder, CompatAudioDecoder)
+- A factory for creating decoders with automatic backend selection
+- MP3 encoding with multi-backend fallback
+- HTML audio rendering utility
+"""
+
+from __future__ import annotations
+
+import io
+import typing
+
+import pyarrow as pa
+
+
+def to_filelike(src: typing.Any) -> typing.BinaryIO:
+    """Coerces files, byte-strings and PyArrow binary buffers into file-like objects."""
+    if hasattr(src, "read"):  # an open file
+        return src
+    # if not an open file then we assume some kind of binary data in memory
+    if hasattr(src, "as_buffer"):  # PyArrow binary data
+        return pa.BufferReader(src.as_buffer())
+    return io.BytesIO(src)
+
+
+class TorchFFmpegAudioDecoder:
+    def __init__(self, src, sample_rate):
+        from torchffmpeg import MediaDecoder
+
+        if hasattr(src, "_optimal_read_size"):
+            buffer_size = src._optimal_read_size
+        else:
+            buffer_size = 128 * 1024
+        self.src = src
+        self.reader = MediaDecoder(to_filelike(self.src), buffer_size=buffer_size)
+        self.metadata = self.reader.get_src_stream_info(self.reader.default_audio_stream)
+
+        if sample_rate is None:
+            sample_rate = int(self.metadata.sample_rate)
+
+        self.sample_rate = sample_rate
+
+        self.reader.add_basic_audio_stream(
+            frames_per_chunk=int(32 * sample_rate),
+            sample_rate=sample_rate,
+            decoder_option={"threads": "4", "thread_type": "frame"},
+        )
+
+    def get_samples_played_in_range(self, tstart=0, tend=None):
+        import torch
+
+        self.reader.seek(max(0, tstart - 1), "key")
+
+        if tend is None:
+            chunks = []
+            more_data = True
+            while more_data:
+                if self.reader.fill_buffer() == 1:
+                    more_data = False
+                (chunk,) = self.reader.pop_chunks()
+                if chunk is not None:
+                    chunks.append(chunk)
+            prefix = int((tstart - chunks[0].pts) * self.sample_rate)
+            if prefix < 0:
+                prefix = 0
+            return torch.cat(chunks)[prefix:].mT
+
+        self.reader.fill_buffer()
+        (chunk,) = self.reader.pop_chunks()
+        prefix = int((tstart - chunk.pts) * self.sample_rate)
+        if prefix < 0:
+            prefix = 0
+        if tend:
+            samples = chunk[prefix : prefix + int((tend - tstart) * self.sample_rate)].mT
+        else:
+            samples = chunk[prefix:].mT
+        while chunk is not None:
+            (chunk,) = self.reader.pop_chunks()
+        return samples
+
+
+class CompatAudioDecoder:
+    def __init__(self, src, sample_rate):
+        import torchaudio
+
+        if not hasattr(torchaudio, "io"):
+            raise ImportError("You need either torchaudio<2.9 or torchcodec installed")
+        self.src = src
+        if hasattr(src, "_optimal_read_size"):
+            buffer_size = src._optimal_read_size
+        else:
+            buffer_size = 128 * 1024
+        self.reader = torchaudio.io.StreamReader(src=to_filelike(self.src), buffer_size=buffer_size)
+        self.metadata = self.reader.get_src_stream_info(0)
+
+        if sample_rate is None:
+            sample_rate = self.metadata.sample_rate
+
+        self.sample_rate = sample_rate
+
+        # fetch 32 seconds because we likely need 30s at maximum but the seeking may be imprecise (and we seek 1s early)
+        # FIXME: check if we can get away with some better settings here (-1, maybe 10s + concatenate the chunks in a loop)
+        self.reader.add_basic_audio_stream(
+            frames_per_chunk=int(32 * sample_rate),
+            sample_rate=sample_rate,
+            decoder_option={"threads": "4", "thread_type": "frame"},
+        )
+
+    def get_samples_played_in_range(self, tstart=0, tend=None):
+        # rough seek
+        self.reader.seek(max(0, tstart - 1), "key")
+
+        if tend is None:
+            import torch
+
+            chunks = []
+            more_data = True
+            while more_data:
+                if self.reader.fill_buffer() == 1:
+                    more_data = False
+                (chunk,) = self.reader.pop_chunks()
+                chunks.append(chunk)
+            prefix = int((tstart - chunks[0].pts) * self.sample_rate)
+            if prefix < 0:
+                prefix = 0
+            return torch.cat(chunks)[prefix:].mT
+
+        self.reader.fill_buffer()
+        (chunk,) = self.reader.pop_chunks()
+        # tight crop (seems accurate down to 1 sample in my tests)
+        prefix = int((tstart - chunk.pts) * self.sample_rate)
+        if prefix < 0:
+            prefix = 0
+        if tend:
+            samples = chunk[prefix : prefix + int((tend - tstart) * self.sample_rate)].mT
+        else:
+            samples = chunk[prefix:].mT
+        # clear out any remaining data
+        while chunk is not None:
+            (chunk,) = self.reader.pop_chunks()
+        return samples
+
+
+def create_decoder(src, sample_rate=None):
+    """Factory: tries torchffmpeg -> torchcodec -> torchaudio, returns a decoder instance.
+
+    Args:
+        src: A file-like object or bytes-like source for audio data.
+        sample_rate: Optional target sample rate for resampling.
+
+    Returns:
+        A decoder instance with .metadata, .sample_rate, and .get_samples_played_in_range() interface.
+    """
+    try:
+        from torchffmpeg import MediaDecoder as _  # noqa: F401
+
+        AudioDecoder = TorchFFmpegAudioDecoder
+    except ImportError:
+        try:
+            from torchcodec.decoders import AudioDecoder
+        except ImportError:
+            AudioDecoder = CompatAudioDecoder
+
+    return AudioDecoder(src, sample_rate=sample_rate)
+
+
+def decode_segment(src, start=0, end=None, sample_rate=None):
+    """One-shot decode: creates decoder, reads segment, returns tensor with .sample_rate attr.
+
+    Handles MP3 skip_samples compensation automatically.
+
+    Args:
+        src: Audio source (file-like, bytes, or PyArrow buffer).
+        start: Start time in seconds.
+        end: End time in seconds (None for rest of file).
+        sample_rate: Optional target sample rate.
+
+    Returns:
+        A torch.Tensor with a .sample_rate attribute.
+    """
+    filelike = to_filelike(src)
+    decoder = create_decoder(filelike, sample_rate)
+
+    skip_samples = 0
+    if decoder.metadata.codec == "mp3":
+        skip_samples = 1105
+
+    if sample_rate is None:
+        sample_rate = decoder.metadata.sample_rate
+
+    seek_adjustment = skip_samples / sample_rate if start > 0 else 0
+    samples = decoder.get_samples_played_in_range(
+        start + seek_adjustment, end + seek_adjustment if end is not None else None
+    )
+    if hasattr(samples, "data"):
+        samples = samples.data
+    samples.sample_rate = sample_rate
+    return samples
+
+
+def encode_mp3(samples) -> bytes:
+    """Encode a torch tensor to MP3 bytes.
+
+    Tries torchffmpeg -> torchcodec -> torchaudio as encoder backends.
+
+    Args:
+        samples: A torch.Tensor with a .sample_rate attribute. Shape: (channels, frames).
+
+    Returns:
+        MP3-encoded bytes.
+    """
+    out = io.BytesIO()
+    try:
+        from torchffmpeg import MediaEncoder
+
+        sample_rate = int(samples.sample_rate)
+        # samples is (channels, frames), write_audio_chunk expects (frames, channels)
+        waveform = samples.mT.float().contiguous()
+        enc = MediaEncoder(out, "mp3")
+        enc.add_audio_stream(sample_rate=sample_rate, num_channels=waveform.size(1), format="flt")
+        with enc.open():
+            enc.write_audio_chunk(0, waveform)
+    except ImportError:
+        try:
+            from torchcodec.encoders import AudioEncoder
+
+            AudioEncoder(samples, sample_rate=int(samples.sample_rate)).to_file_like(out, "mp3")
+        except ImportError:
+            import torchaudio
+
+            torchaudio.save(out, samples, int(samples.sample_rate), format="mp3")
+
+    return out.getvalue()
+
+
+def audio_to_html(samples) -> str:
+    """Encode samples to an HTML <audio> tag with base64 MP3 data.
+
+    Args:
+        samples: A torch.Tensor with a .sample_rate attribute.
+
+    Returns:
+        An HTML string with an embedded audio player.
+    """
+    import base64
+
+    mp3_data = base64.b64encode(encode_mp3(samples)).decode("ascii")
+    return f'<audio controls src="data:audio/mp3;base64,{mp3_data}"></audio>'
diff --git a/wsds/convplayer.py b/wsds/convplayer.py
index ddd3573..5a1ae7a 100644
--- a/wsds/convplayer.py
+++ b/wsds/convplayer.py
@@ -1,10 +1,11 @@
-import torch
-from PIL import Image
 import shutil
 from pathlib import Path
-import whisper
-import torchaudio
+
 import numpy as np
+import torch
+import torchaudio
+import whisper
+from PIL import Image
 
 HEADER = """
 <html>
@@ -267,7 +268,7 @@
 </html>
 """
 
-CSV = '''<div id="vote_results"></div>
+CSV = """<div id="vote_results"></div>
 <script>
   function calcResults() {
     results = {}; document.querySelectorAll('input[type=radio]').forEach((x) => { if(x.checked) results[x.name] = x.value }); results;
@@ -281,33 +282,26 @@
   }
   document.addEventListener('change', calcResults);
   calcResults()
-</script>'''
+</script>"""
+
 
 def mel_img(snd, sr, mel_min=-1, mel_max=2):
     mel = whisper.log_mel_spectrogram(torchaudio.functional.resample(snd, sr, 16000))
     return torch.clamp((mel - mel_min) / (mel_max - mel_min) * 255, 0, 255).numpy().astype(np.uint8)
 
+
 def ticks_img(h):
-    ticks = np.full((10,h), 255, dtype=np.uint8)
-    ticks[5:,::h//10] = 150
-    ticks[:,0] = 50
-    ticks[4:,h//2] = 50
+    ticks = np.full((10, h), 255, dtype=np.uint8)
+    ticks[5:, :: h // 10] = 150
+    ticks[:, 0] = 50
+    ticks[4:, h // 2] = 50
     return ticks
 
+
 from collections import defaultdict
 
-# from https://stackoverflow.com/a/39662359
-def is_notebook() -> bool:
-    try:
-        shell = get_ipython().__class__.__name__
-        if shell == 'ZMQInteractiveShell':
-            return True   # Jupyter notebook or qtconsole
-        elif shell == 'TerminalInteractiveShell':
-            return False  # Terminal running IPython
-        else:
-            return False  # Other type (?)
-    except NameError:
-        return False      # Probably standard Python interpreter
+from .utils import is_notebook
+
 
 class ColumnList:
     def __init__(self, player, side):
@@ -317,11 +311,11 @@ def __init__(self, player, side):
         self.content = defaultdict(list)
 
     def _get_fname(self, name, fmt):
-        return f"{name}-{self.side}-{len(self.content)+1:03d}.{fmt}"
+        return f"{name}-{self.side}-{len(self.content) + 1:03d}.{fmt}"
 
-    def _save_img(self, name, img, fmt='png'):
+    def _save_img(self, name, img, fmt="png"):
         fname = self._get_fname(name, fmt)
-        Image.fromarray(img.T).save(self.player.path/fname)
+        Image.fromarray(img.T).save(self.player.path / fname)
         return fname
 
     def append(self, name, content):
@@ -333,70 +327,91 @@ def put_html(self, name, t_start, html, t_len=None, bg="#eee", flex=10, width=No
         else:
             self.styles[name] = f' style="flex: {flex}"'
         y = t_start * self.player.pixels_per_second
-        height = "" if t_len is None else f' height:{t_len * self.player.pixels_per_second}px;'
-        self.append(name, f'<div class="col-{name}-{self.side}-html label" data-tstart="{t_start}" style="position: absolute; top: {y}px;{height} background-color: {bg};">{html}</div>')
+        height = "" if t_len is None else f" height:{t_len * self.player.pixels_per_second}px;"
+        self.append(
+            name,
+            f'<div class="col-{name}-{self.side}-html label" data-tstart="{t_start}" style="position: absolute; top: {y}px;{height} background-color: {bg};">{html}</div>',
+        )
 
-    def put_img(self, name, t, img, fmt='png', scalex=1, scaley=1):
+    def put_img(self, name, t, img, fmt="png", scalex=1, scaley=1):
         fname = self._save_img(name, img, fmt=fmt)
-        w,h = img.shape
-        self.put_html(name, f'<img src="{fname}" width={w/scalex} height={h/scaley} class="col-{name}-{self.side}-img">', width=w/scalex)
-
-    def append_img(self, name, img, fmt='png', scalex=1, scaley=1, repeat_y=False):
+        w, h = img.shape
+        self.put_html(
+            name,
+            f'<img src="{fname}" width={w / scalex} height={h / scaley} class="col-{name}-{self.side}-img">',
+            width=w / scalex,
+        )
+
+    def append_img(self, name, img, fmt="png", scalex=1, scaley=1, repeat_y=False):
         fname = self._save_img(name, img, fmt=fmt)
-        w,h = img.shape
+        w, h = img.shape
         if not repeat_y:
-            self.append(name, f'<img src="{fname}" width={w/scalex} height={h/scaley} class="col-{name}-{self.side}-img">')
+            self.append(
+                name, f'<img src="{fname}" width={w / scalex} height={h / scaley} class="col-{name}-{self.side}-img">'
+            )
         else:
-            self.append(name, f'<div class="col-{name}-{self.side}-img" style="height: 100%; width: {w}px; background: url(\'{fname}\') repeat-y; background-size: {w/scalex}px {h/scaley}px;"></div>')
+            self.append(
+                name,
+                f'<div class="col-{name}-{self.side}-img" style="height: 100%; width: {w}px; background: url(\'{fname}\') repeat-y; background-size: {w / scalex}px {h / scaley}px;"></div>',
+            )
 
     def __str__(self):
         lines = []
         cols = self.content.items()
-        if self.side == 'right': cols = reversed(cols)
-        for name,c in cols:
-            lines.append(f'<div class="col col-{self.side} col-{name}-{self.side}"{self.styles[name]}>'+('\n'.join(self.content[name]))+'</div>')
-        return '\n'.join(lines)
+        if self.side == "right":
+            cols = reversed(cols)
+        for name, c in cols:
+            lines.append(
+                f'<div class="col col-{self.side} col-{name}-{self.side}"{self.styles[name]}>'
+                + ("\n".join(self.content[name]))
+                + "</div>"
+            )
+        return "\n".join(lines)
+
 
 class ConvPlayer:
     def __init__(self, path, snd, sr, rmdir=False, pixels_per_second=50):
-        if isinstance(path, str): path = Path(path)
+        if isinstance(path, str):
+            path = Path(path)
         self.path = path
         self.pixels_per_second = pixels_per_second
 
-        self.left = ColumnList(self, 'left')
-        self.right = ColumnList(self, 'right')
+        self.left = ColumnList(self, "left")
+        self.right = ColumnList(self, "right")
 
-        if rmdir and path.exists(): shutil.rmtree(path)
+        if rmdir and path.exists():
+            shutil.rmtree(path)
         path.mkdir(exist_ok=True)
 
         self._add_html()
         self._add_audio(snd, sr)
 
     def _add_html(self):
-        self.html = open(self.path/"index.html", "w")
+        self.html = open(self.path / "index.html", "w")
         self.html.write(HEADER)
 
     def _add_audio(self, snd, sr):
-        torchaudio.save(self.path/"snd.m4a", snd, sr)
+        torchaudio.save(self.path / "snd.m4a", snd, sr)
         ticks = ticks_img(self.pixels_per_second * 2)
-        self.left.append_img('ticks', ticks, scaley=2, repeat_y=True)
-        self.right.append_img('ticks', ticks[::-1], scaley=2, repeat_y=True)
-        for i,snd in enumerate(torch.split(snd, 5*60*sr)):
+        self.left.append_img("ticks", ticks, scaley=2, repeat_y=True)
+        self.right.append_img("ticks", ticks[::-1], scaley=2, repeat_y=True)
+        for i, snd in enumerate(torch.split(snd, 5 * 60 * sr)):
             mels = mel_img(snd, sr)
-            for c,i in zip([self.left, self.right], mels):
-                c.append_img('mel', i, scaley=100 / self.pixels_per_second)
+            for c, i in zip([self.left, self.right], mels):
+                c.append_img("mel", i, scaley=100 / self.pixels_per_second)
 
     def close(self, zip=False, show=False):
         self.html.write('<div class="middle-box">\n')
-        self.html.write(str(self.left)+'\n')
+        self.html.write(str(self.left) + "\n")
         self.html.write('<div class="col-separator"></div>')
-        self.html.write(str(self.right)+'\n')
+        self.html.write(str(self.right) + "\n")
         self.html.write(f"<script>pixels_per_second = {self.pixels_per_second}</script>")
         self.html.write(FOOTER)
         self.html.close()
         if show:
             if is_notebook():
-                from IPython.display import HTML 
+                from IPython.display import HTML
+
                 display(HTML(f'<a href="{self.path}/index.html">View player</a>'))
             else:
                 print(f"{self.path}/index.html")
diff --git a/wsds/pupyarrow/file_reader.py b/wsds/pupyarrow/file_reader.py
index d03c153..c9cf562 100644
--- a/wsds/pupyarrow/file_reader.py
+++ b/wsds/pupyarrow/file_reader.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import threading
 import time
 from pathlib import Path
 from typing import BinaryIO
@@ -127,3 +128,118 @@ def _raw_read_end(self, n: int) -> bytes:
         range_header = f"bytes=-{n}"
         resp = self._client.get_object(Bucket=self._bucket, Key=self._key, Range=range_header)
         return resp["Body"].read()
+
+
+class _ModalEventLoop:
+    """A persistent event loop running on a dedicated daemon thread.
+
+    All Modal gRPC work is dispatched here so the client's channel stays bound to a single loop, and
+    callers on the main thread (or Jupyter, or another loop) are never blocked by "loop already running" errors."""
+
+    def __init__(self):
+        import asyncio
+
+        self._loop = asyncio.new_event_loop()
+        self._thread = threading.Thread(target=self._loop.run_forever, daemon=True)
+        self._thread.start()
+
+    def run(self, coro):
+        """Submit *coro* to the background loop and block until it completes."""
+        import asyncio
+
+        return asyncio.run_coroutine_threadsafe(coro, self._loop).result()
+
+    def close(self):
+        if not self._loop.is_closed():  # tolerate repeated close() calls
+            self._loop.call_soon_threadsafe(self._loop.stop)
+            self._thread.join()  # run_forever() has returned once the thread exits
+            self._loop.close()  # release the loop's resources, which stop() alone leaves open
+
+
+# Module-level singleton — created on first use.
+_modal_loop: _ModalEventLoop | None = None
+_modal_loop_lock = threading.Lock()
+
+
+def _get_modal_loop() -> _ModalEventLoop:
+    """Return the module-wide ``_ModalEventLoop``, creating it on the first call."""
+    global _modal_loop
+    if _modal_loop is None:  # skip the lock entirely once the singleton exists
+        with _modal_loop_lock:
+            _modal_loop = _modal_loop or _ModalEventLoop()  # re-check under the lock
+    return _modal_loop
+
+
+class ModalFileReader(FileReader):
+    """FileReader backed by Modal Volume range requests via gRPC.
+
+    Uses the undocumented ``start``/``len`` fields on ``VolumeGetFile2Request``
+    to fetch only the needed byte ranges.  Presigned block URLs returned by the
+    gRPC call are downloaded with ``requests``.
+
+    All async gRPC work runs on a shared daemon-thread event loop (see
+    ``_ModalEventLoop``) so it works regardless of whether the caller already
+    has a running loop (Jupyter, Modal synchronizer, etc.)."""
+
+    # (connect, read) timeouts in seconds: a stalled block download raises instead of hanging forever.
+    _HTTP_TIMEOUT = (10, 60)
+
+    def __init__(self, vol, path: str, *, verbose: bool = False):
+        super().__init__(verbose=verbose)
+        self._vol = vol
+        self._path = path
+        self._size: int | None = None
+        self._loop = _get_modal_loop()
+
+    @classmethod
+    def from_name(cls, volume_name: str, path: str, *, verbose: bool = False) -> "ModalFileReader":
+        """Create a reader for *path* inside the named Modal Volume."""
+        vol = _get_modal_loop().run(cls._hydrate(volume_name))
+        return cls(vol, path, verbose=verbose)
+
+    @staticmethod
+    async def _hydrate(volume_name: str):
+        """Look up the named volume and hydrate it; awaited on the shared loop by ``from_name``."""
+        from modal.volume import _Volume
+
+        vol = _Volume.from_name(volume_name)
+        await vol.hydrate()
+        return vol
+
+    async def _get_range(self, start: int, length: int):
+        """Send a ``VolumeGetFile2`` request with ``start=start`` and ``len=length`` and return the response."""
+        from modal_proto import api_pb2
+
+        req = api_pb2.VolumeGetFile2Request(volume_id=self._vol.object_id, path=self._path, start=start, len=length)
+        return await self._vol._client.stub.VolumeGetFile2(req)
+
+    def _fetch_urls(self, resp) -> bytes:
+        """Download presigned block URLs and concatenate the bytes."""
+        import requests
+
+        chunks = []
+        for url in resp.get_urls:
+            r = requests.get(url, timeout=self._HTTP_TIMEOUT)
+            r.raise_for_status()
+            chunks.append(r.content)
+        return b"".join(chunks)
+
+    def _ensure_size(self) -> int:
+        """Fetch the total file size (cached after first call)."""
+        if self._size is None:
+            resp = self._loop.run(self._get_range(0, 1))
+            self._size = resp.size
+        return self._size
+
+    def _raw_read(self, offset: int, length: int) -> bytes:
+        """Read the range of *length* bytes at *offset*, caching the file size reported by the response."""
+        resp = self._loop.run(self._get_range(offset, length))
+        if self._size is None:
+            self._size = resp.size
+        return self._fetch_urls(resp)
+
+    def _raw_read_end(self, n: int) -> bytes:
+        """Read the last *n* bytes of the file (from offset 0 when the file is shorter than *n*)."""
+        size = self._ensure_size()
+        offset = max(size - n, 0)
+        return self._raw_read(offset, size - offset)
diff --git a/wsds/utils.py b/wsds/utils.py
index fc32f48..8643ade 100644
--- a/wsds/utils.py
+++ b/wsds/utils.py
@@ -26,6 +26,7 @@ def from_s3(cls, s3_client, key, bucket, err):
 class WSShardCorruptedError(Exception):
     fname: str
 
+
 def get_columns(fname):
     if isinstance(fname, Path):
         fname = str(fname)
@@ -93,20 +94,20 @@ def list_all_columns(ds_path, shard_name=None):
 
 def list_all_shards(dataset: str, verbose: bool = False, print_missing: bool = False):
     shards = {}
-    for subdir in Path(dataset).iterdir():
-        if not subdir.is_dir():
+    for column_dir in Path(dataset).iterdir():
+        if not column_dir.is_dir():
             continue
-        shards[subdir] = {file.name for file in subdir.iterdir() if file.suffix == ".wsds"}
-        if not shards[subdir]:
+        shards[column_dir] = {file.name for file in column_dir.iterdir() if file.suffix == ".wsds"}
+        if not shards[column_dir]:
             if verbose:
-                print(f"error: empty folder {subdir}")
-            del shards[subdir]
+                print(f"error: empty folder {column_dir}")
+            del shards[column_dir]
 
     common_shards = {v for shard_values in shards.values() for v in shard_values}
     num_common = len(common_shards)
 
     errors = False
-    for subdir, files in shards.items():
+    for column_dir, files in shards.items():
         missing = common_shards - files
         n_missing = len(missing)
         if n_missing == 0:
@@ -115,7 +116,7 @@ def list_all_shards(dataset: str, verbose: bool = False, print_missing: bool = F
             status = f"[MISSING {n_missing}]"
 
         if verbose:
-            print(f"Path {subdir} has {len(files)}/{num_common} shards {status}")
+            print(f"Path {column_dir} has {len(files)}/{num_common} shards {status}")
 
         if n_missing > 0 and print_missing:
             for m in sorted(missing):
@@ -225,6 +226,15 @@ def scan_ipc(path: str | Path, *args, glob=True, **kwargs):
         return pl.scan_ipc(f, *args, **kwargs)
 
 
+def is_notebook() -> bool:
+    """Detect if running in a Jupyter notebook vs terminal IPython or plain Python."""
+    try:
+        ipython = get_ipython()  # raises NameError when the name is not defined in this interpreter
+    except NameError:
+        return False
+    return ipython.__class__.__name__ == "ZMQInteractiveShell"
+
+
 def format_duration(duration):
     """Formats a duration in seconds as hours (or minutes or kilo-hours)."""
     hours = duration / 3600
@@ -237,11 +247,12 @@ def format_duration(duration):
     else:
         return f"{hours:.2f} hours"
 
+
 def preload_shard(shard_fname):
     try:
         with open(shard_fname, "rb") as f:
             f.seek(-6, os.SEEK_END)
-            if f.read() != b'ARROW1':
+            if f.read() != b"ARROW1":
                 print(f"Invalid file format {shard_fname}: ARROW1 magic not found")
                 return False
     except FileNotFoundError:
@@ -252,10 +263,11 @@ def preload_shard(shard_fname):
         return False
     return True
 
+
 def validate_shards(
-    dataset: "WSDataset", shards: list[tuple[str, str]], subdirs: list[str], tail_bytes: int = 10240
+    dataset: "WSDataset", shards: list[tuple[str, str]], column_dirs: list[str], tail_bytes: int = 10240
 ):
-    """Prefetch and validate shard files for the given shards and subdirs.
+    """Prefetch and validate shard files for the given shards and column dirs.
 
     Uses a ProcessPoolExecutor to load them concurrently, which helps with network filesystems
     where latency is the bottleneck. This is useful before operations that need to read
@@ -263,27 +275,29 @@ def validate_shards(
 
     Args:
         dataset: The WSDataset instance
-        shards: List of (dataset_path, shard_name) tuples identifying the shards
-        subdirs: List of subdirectory names to prefetch
+        shards: List of (partition, shard_name) tuples identifying the shards
+        column_dirs: List of column directory names to prefetch
         tail_bytes: Number of bytes to read from the end of each file (default 10KB)
 
     Returns:
-        List of shards that loaded successfully across all subdirs
+        List of shards that loaded successfully across all column dirs
     """
 
     # Filter out computed columns (they don't have actual shard files)
-    actual_subdirs = [s for s in subdirs if s not in dataset.computed_columns]
+    actual_column_dirs = [s for s in column_dirs if s not in dataset.computed_columns]
 
-    if not actual_subdirs or not shards:
+    if not actual_column_dirs or not shards:
         return []
 
-    # Create all combinations of shards and subdirs
-    shard_files = [dataset.get_shard_path(subdir, shard_name) for shard_name in shards for subdir in actual_subdirs]
+    # Create all combinations of shards and column dirs
+    shard_files = [
+        dataset.get_shard_path(column_dir, shard_name) for shard_name in shards for column_dir in actual_column_dirs
+    ]
 
     with ProcessPoolExecutor(max_workers=min(len(shard_files), 64)) as executor:
         results = list(executor.map(preload_shard, shard_files))
 
-    # Check that all subdirs loaded successfully for each shard
+    # Check that all column dirs loaded successfully for each shard
     # Return all given shards with an ok flag
-    num_subdirs = len(actual_subdirs)
-    return [(shard, all(results[i * num_subdirs : (i + 1) * num_subdirs])) for i, shard in enumerate(shards)]
+    num_column_dirs = len(actual_column_dirs)
+    return [(shard, all(results[i * num_column_dirs : (i + 1) * num_column_dirs])) for i, shard in enumerate(shards)]
diff --git a/wsds/ws_audio.py b/wsds/ws_audio.py
index f560edf..1dd7351 100644
--- a/wsds/ws_audio.py
+++ b/wsds/ws_audio.py
@@ -1,20 +1,9 @@
 from __future__ import annotations
 
-import io
 import typing
 from dataclasses import dataclass
 
-import pyarrow as pa
-
-
-def to_filelike(src: typing.Any) -> typing.BinaryIO:
-    """Coerces files, byte-strings and PyArrow binary buffers into file-like objects."""
-    if hasattr(src, "read"):  # an open file
-        return src
-    # if not an open file then we assume some kind of binary data in memory
-    if hasattr(src, "as_buffer"):  # PyArrow binary data
-        return pa.BufferReader(src.as_buffer())
-    return io.BytesIO(src)
+from .audio_codec import audio_to_html, create_decoder, encode_mp3, to_filelike
 
 
 def load_segment(src, start, end, sample_rate=None):
@@ -28,90 +17,17 @@ def load_segment(src, start, end, sample_rate=None):
     return AudioReader(src).read_segment(start, end, sample_rate=sample_rate)
 
 
-class CompatAudioDecoder:
-    def __init__(self, src, sample_rate):
-        import torchaudio
-
-        if not hasattr(torchaudio, "io"):
-            raise ImportError("You need either torchaudio<2.9 or torchcodec installed")
-        self.src = src
-        if hasattr(src, "_optimal_read_size"):
-            buffer_size = src._optimal_read_size
-        else:
-            buffer_size = 128 * 1024
-        self.reader = torchaudio.io.StreamReader(src=to_filelike(self.src), buffer_size=buffer_size)
-        self.metadata = self.reader.get_src_stream_info(0)
-
-        if sample_rate is None:
-            sample_rate = self.metadata.sample_rate
-
-        self.sample_rate = sample_rate
-
-        # fetch 32 seconds because we likely need 30s at maximum but the seeking may be imprecise (and we seek 1s early)
-        # FIXME: check if we can get away with some better settings here (-1, maybe 10s + concatenate the chunks in a loop)
-        self.reader.add_basic_audio_stream(
-            frames_per_chunk=int(32 * sample_rate),
-            sample_rate=sample_rate,
-            decoder_option={"threads": "4", "thread_type": "frame"}
-        )
-
-    def get_samples_played_in_range(self, tstart=0, tend=None):
-        # rought seek
-        self.reader.seek(max(0, tstart - 1), "key")
-
-        if tend == None:
-            import torch
-            chunks = []
-            more_data = True
-            while more_data:
-                if self.reader.fill_buffer() == 1:
-                    more_data = False
-                (chunk,) = self.reader.pop_chunks()
-                chunks.append(chunk)
-            prefix = int((tstart - chunks[0].pts) * self.sample_rate)
-            if prefix < 0: prefix = 0
-            return torch.cat(chunks)[prefix:].mT
-
-        self.reader.fill_buffer()
-        (chunk,) = self.reader.pop_chunks()
-        # tight crop (seems accurate down to 1 sample in my tests)
-        prefix = int((tstart - chunk.pts) * self.sample_rate)
-        if prefix < 0: prefix = 0
-        if tend:
-            samples = chunk[prefix : prefix + int((tend - tstart) * self.sample_rate)].mT
-        else:
-            samples = chunk[prefix:].mT
-        # clear out any remaining data
-        while chunk is not None:
-            (chunk,) = self.reader.pop_chunks()
-        return samples
-
-
-def _audio_to_mp3(samples):
-    from io import BytesIO
-
-    out = BytesIO()
-    try:
-        from torchcodec.encoders import AudioEncoder
-        AudioEncoder(samples, sample_rate=int(samples.sample_rate)).to_file_like(out, "mp3")
-    except ImportError:
-        import torchaudio
-        torchaudio.save(out, samples, int(samples.sample_rate), format="mp3")
-
-    return out.getvalue()
-
-
-@dataclass(slots=True)
+@dataclass()
 class AudioReader:
     """A lazy seeking-capable audio reader for random-access to recordings stored in wsds shards."""
 
     src: typing.Any
-    reader: CompatAudioDecoder | None = None
-    sample_rate: int | None = None
+    _decoder: typing.Any = None
+    _sample_rate: int | None = None
     skip_samples: int = 0
 
     def __repr__(self):
-        return f"AudioReader(src={type(self.src)}, sample_rate={self.sample_rate})"
+        return f"AudioReader(src={type(self.src)}, sample_rate={self._sample_rate})"
 
     def unwrap(self):
         """Return the raw audio bytes"""
@@ -122,72 +38,63 @@ def unwrap(self):
         else:
             raise TypeError(f"Unsupported AudioReader src type: {type(self.src)}")
 
-    # we materialize the reader on first use
-    def get_reader(self, sample_rate=None):
+    def get_decoder(self, sample_rate=None):
+        """Lazily creates/caches decoder via audio_codec.create_decoder()."""
         sample_rate_switch = False
-        if self.sample_rate is not None:
-            sample_rate_switch = self.sample_rate != sample_rate
-
-        if self.reader is None or sample_rate_switch:
-            try:
-                from torchcodec.decoders import AudioDecoder
-            except ImportError:
-                AudioDecoder = CompatAudioDecoder
-
-            reader = AudioDecoder(to_filelike(self.src), sample_rate=sample_rate)
-            # mp3 has encoder delays that are not handled well when seeking (http://mp3decoders.mp3-tech.org/decoders_lame.html)
-            if reader.metadata.codec == "mp3":
+        if self._sample_rate is not None:
+            sample_rate_switch = self._sample_rate != sample_rate
+
+        if self._decoder is None or sample_rate_switch:
+            decoder = create_decoder(to_filelike(self.src), sample_rate=sample_rate)
+            # mp3 has encoder delays that are not handled well when seeking
+            if decoder.metadata.codec == "mp3":
                 self.skip_samples = 1105
 
             if sample_rate is None:
-                sample_rate = reader.metadata.sample_rate
+                sample_rate = decoder.metadata.sample_rate
 
-            self.reader = reader
-            self.sample_rate = sample_rate
+            self._decoder = decoder
+            self._sample_rate = sample_rate
 
-        return self.reader, self.sample_rate
+        return self._decoder, self._sample_rate
 
     @property
     def metadata(self):
-        reader, sample_rate = self.get_reader()
-        return reader.metadata
+        decoder, sample_rate = self.get_decoder()
+        return decoder.metadata
+
+    @property
+    def sample_rate(self):
+        _, sr = self.get_decoder()
+        return sr
 
     def read_segment(self, start=0, end=None, sample_rate=None):
-        reader, sample_rate = self.get_reader(sample_rate)
+        decoder, sample_rate = self.get_decoder(sample_rate)
         seek_adjustment = self.skip_samples / sample_rate if start > 0 else 0
-        _samples = reader.get_samples_played_in_range(
+        _samples = decoder.get_samples_played_in_range(
             start + seek_adjustment, end + seek_adjustment if end is not None else None
         )
         if hasattr(_samples, "data"):
             samples = _samples.data
+        else:
+            samples = _samples
         samples.sample_rate = sample_rate
         return samples
 
     def load(self, sample_rate=None):
         samples = self.read_segment(sample_rate=sample_rate)
-        sample_rate = samples.sample_rate
         return samples
 
     def _repr_html_(self):
-        import base64
-
-        samples = self.read_segment()
-        mp3_data = base64.b64encode(_audio_to_mp3(samples)).decode("ascii")
-        return f'<audio controls src="data:audio/mp3;base64,{mp3_data}"></audio>'
+        return audio_to_html(self.read_segment())
 
     def _display_(self):
         import marimo
 
-        samples = self.read_segment()
-        return marimo.audio(_audio_to_mp3(samples))
-
-    def _ipython_display_(self):
-        from IPython.display import HTML, display
-
-        display(HTML(self._repr_html_()))
+        return marimo.audio(encode_mp3(self.read_segment()))
 
 
-@dataclass(frozen=True, slots=True)
+@dataclass(frozen=True)
 class WSAudio:
     """A lazy reference to a single sample from a segmented audio file."""
 
@@ -248,19 +155,9 @@ def metadata(self):
         return self.audio_reader.metadata
 
     def _repr_html_(self):
-        import base64
-
-        samples = self.load()
-        mp3_data = base64.b64encode(_audio_to_mp3(samples)).decode("ascii")
-        return f'<audio controls src="data:audio/mp3;base64,{mp3_data}"></audio>'
+        return audio_to_html(self.load())
 
     def _display_(self):
         import marimo
 
-        samples = self.load()
-        return marimo.audio(_audio_to_mp3(samples))
-
-    def _ipython_display_(self):
-        from IPython.display import HTML, display
-
-        display(HTML(self._repr_html_()))
+        return marimo.audio(encode_mp3(self.load()))
diff --git a/wsds/ws_dataset.py b/wsds/ws_dataset.py
index bdbfa95..35dd5cd 100644
--- a/wsds/ws_dataset.py
+++ b/wsds/ws_dataset.py
@@ -14,8 +14,8 @@
     list_all_columns,
     list_all_shards,
     parse_key,
-    validate_shards,
     scan_ipc,
+    validate_shards,
 )
 from .ws_index import WSIndex
 from .ws_sample import WSSample
@@ -25,8 +25,8 @@
 class WSDataset:
     """A multimodal dataset.
 
-    A dataset works like a table (dataframe) of samples. Samples are split into directories,
-    with each directory storing a subset of columns. Inside these directories are shards, with
+    A dataset works like a table (dataframe) of samples. Samples are split into column directories,
+    with each column directory storing a subset of columns. Inside these directories are shards, with
     each shard storing a subset of rows. This enables very efficient parallelization of data
     processing.
 
@@ -35,33 +35,38 @@ class WSDataset:
     load the requested data.
 
     Examples:
-    >>> dataset = WSDataset("librilight/v3-vad_ws")
+    >>> dataset = WSDataset("librilight/v3-vad_ws", rng=42)
     >>> sample = dataset["large/5304/the_tinted_venus_1408_librivox_64kb_mp3/tintedvenus_05_anstey_64kb_090"]
     >>> print(repr(sample["transcription_wslang_raw.txt"]))
     ' I will accompany you," she said.'
     >>> sample['audio']
-    WSAudio(audio_reader=AudioReader(src=<class 'pyarrow.lib.BinaryScalar'>, sample_rate=None), tstart=1040.2133, tend=1042.8413)
+    WSAudio(audio_reader=AudioReader(src=<class '_io.BytesIO'>, sample_rate=None), tstart=1040.2133, tend=1042.8413)
     """
 
-    dataset_dir: Path
-    """Path to the dataset directory."""
+    dataset_root: Path
+    """Path to the dataset root directory."""
     fields: dict
     """List of fields available for each sample."""
     computed_columns: dict
     """List of computed columns (e.g. the source audio or video link). @private"""
 
-    # FIXME: this should be overridable with metadata in index.sqlite3
-    _audio_file_keys = ["flac", "mp3", "sox", "wav", "m4a", "ogg", "wma", "opus", "audio"]
-
     def __init__(
         self,
-        dataset_dir: str | Path,
+        dataset_root: str | Path,
         include_in_progress: bool = True,
         key_folder: str | None = None,
         disable_memory_map: bool = False,
         ignore_index: bool = False,
+        rng: random.Random | int | None = None,
     ):
-        self.dataset_dir = self._resolve_path(dataset_dir)
+        self.dataset_root = self._resolve_path(dataset_root)
+
+        if isinstance(rng, int):
+            self.rng = random.Random(rng)
+        elif rng is not None:
+            self.rng = rng
+        else:
+            self.rng = random
 
         if include_in_progress is not True:
             print("NOTE: include_in_progress is deprecated and all subdirs are included by default")
@@ -71,7 +76,7 @@ def __init__(
         self.index = None
         self.segmented = False
         self.disable_memory_map = disable_memory_map
-        index_file = self.dataset_dir / "index.sqlite3"
+        index_file = self.dataset_root / "index.sqlite3"
         if not ignore_index and index_file.exists():
             self.index = WSIndex(index_file)
             meta = self.index.metadata
@@ -79,11 +84,11 @@ def __init__(
         else:
             meta = {}
 
-        if 'fields' in meta:
-            self.fields = meta['fields']
+        if "fields" in meta:
+            self.fields = meta["fields"]
         else:
-            dataset_path, shard_name  = next(self.index.shards()) if self.index else ("", None)
-            self.fields = list_all_columns(self.dataset_dir / dataset_path, shard_name)
+            partition, shard_name = next(self.index.shards()) if self.index else ("", None)
+            self.fields = list_all_columns(self.dataset_root / partition, shard_name)
 
         if "computed_columns" in meta:
             self.computed_columns = meta["computed_columns"]
@@ -91,7 +96,7 @@ def __init__(
             self.computed_columns = {}
 
         # look for additional columns that are not in the index (like a wsds-link to S3 storage)
-        self.fields.update(list_all_columns(self.dataset_dir))
+        self.fields.update(list_all_columns(self.dataset_root))
 
         # Normalize old-style single-tuple fields to list-of-tuples
         for k, v in self.fields.items():
@@ -136,7 +141,7 @@ def random_sample(self):
         True
         """
         assert self.index is not None, "Random access is only supported for indexed datasets"
-        return self[random.randrange(self.index.n_samples)]
+        return self[self.rng.randrange(self.index.n_samples)]
 
     def __iter__(self):
         """Starts at a random position in the dataset and yields samples sequentially.
@@ -159,38 +164,18 @@ def random_chunks(self, max_N: int):
 
     def __getitem__(self, key_or_index: str | int):
         """Returns a sample with the given __key__ or sample index."""
-        # Figure out the shard name, local offset (wrt shard) and global offset for the given key or index
-        shard_name, local_offset, global_offset = None, None, None
-
-        if self.index.has_dataset_path:
-            dataset_path = 's.dataset_path'
-        else:
-            dataset_path = "''"
-
         if isinstance(key_or_index, int):
-            r = self.index.query(
-                f"SELECT s.shard, global_offset, {dataset_path} FROM shards AS s WHERE s.global_offset <= ? ORDER BY s.global_offset DESC LIMIT 1",
-                key_or_index,
-            ).fetchone()
+            r = self.index.lookup_by_index(key_or_index)
             if not r:
                 return None
-
-            shard_name, shard_global_offset, dataset_path = r
+            partition, shard_name, local_offset = r
             global_offset = key_or_index
-            local_offset = global_offset - shard_global_offset
         elif isinstance(key_or_index, str):
-            # FIXME: push `parse_key` to the index class
             file_name, offset_of_key_wrt_file = self.parse_key(key_or_index)
-            r = self.index.query(
-                f"SELECT s.shard, s.global_offset, f.offset, {dataset_path} FROM files AS f, shards AS s WHERE f.name = ? AND s.shard_id == f.shard_id",
-                file_name,
-            ).fetchone()
+            r = self.index.lookup_by_key(file_name, offset_of_key_wrt_file)
             if not r:
                 return None
-
-            shard_name, shard_global_offset, file_offset_in_shard, dataset_path = r
-            local_offset = file_offset_in_shard + offset_of_key_wrt_file
-            global_offset = shard_global_offset + local_offset
+            partition, shard_name, local_offset, global_offset = r
         else:
             raise TypeError(f"Invalid key type: {type(key_or_index)}")
 
@@ -199,23 +184,21 @@ def __getitem__(self, key_or_index: str | int):
             overrides.update(
                 {filter_name: filter_df.row(global_offset)[0] for filter_name, filter_df in self._filter_dfs.items()}
             )
-        return WSSample(self, (dataset_path, shard_name), local_offset, overrides=overrides)
+        return WSSample(self, (partition, shard_name), local_offset, overrides=overrides)
 
     def sequential_from(self, sample, max_N=None):
         """Yields samples sequentially from the given `sample`, stopping after `max_N` samples."""
-        shard_name, i = sample.shard_name, sample.offset
-        max_N = min(i + (max_N or sys.maxsize), self._shard_n_samples(shard_name))
+        shard_ref, i = sample.shard_ref, sample.offset
+        max_N = min(i + (max_N or sys.maxsize), self._shard_n_samples(shard_ref))
         # without an index, we still return the sample but you'll get an error on first field access
 
         shard_global_offset = None
         if self._filter_dfs is not None:
             # We need to know the global shard offset to know what filter values to use for the sample
-            shard_global_offset = self.index.query(
-                "SELECT global_offset FROM shards WHERE shard = ?", shard_name
-            ).fetchone()[0]
+            shard_global_offset = self.index.shard_global_offset(shard_ref)
 
         while i < max_N:
-            sample = WSSample(self, shard_name, i)
+            sample = WSSample(self, shard_ref, i)
             if self.index is None:
                 # if we don't have an index we have to try loading
                 # the sample to check if it exists
@@ -230,19 +213,16 @@ def sequential_from(self, sample, max_N=None):
             yield sample
             i += 1
 
-    def _shard_n_samples(self, shard_name: (str, str)) -> int:
+    def _shard_n_samples(self, shard_ref: (str, str)) -> int:
         if not self.index:
             return sys.maxsize
-        r = self.index.query("SELECT n_samples FROM shards WHERE shard = ?", shard_name[1]).fetchone()
-        if r is None:
-            raise IndexError(f"Shard not found: {shard_name}")
-        return r[0]
+        return self.index.shard_n_samples(shard_ref)
 
-    def iter_shard(self, shard_name):
-        dataset_path, shard_name = shard_name
+    def iter_shard(self, shard_ref):
+        partition, shard_name = shard_ref
         if shard_name.endswith(".wsds"):
             shard_name = shard_name[:-5]
-        return self.sequential_from(WSSample(self, (dataset_path, shard_name), 0))
+        return self.sequential_from(WSSample(self, (partition, shard_name), 0))
 
     def __len__(self):
         """Returns the number of samples in the dataset.
@@ -257,12 +237,11 @@ def __len__(self):
     def _parse_sql_queries_polars(self, *queries, shard_subsample=1, rng=None, shard_pipe=None):
         """Parses SQL queries via Polars to:
         - extract the Polars expressions for each query
-        - use the expressions to build a list of subdirs to load shards from"""
+        - use the expressions to build a list of column dirs to load shards from"""
 
-        subdirs = defaultdict(list)
+        column_dirs = defaultdict(list)
         exprs = []
         needed_special_columns = []
-        needs_key = False
         for query in queries:
             if "." in query and query in self.fields:
                 print(f"TIP: You seem to have passes a column name ({query}) which has dots in it.")
@@ -276,70 +255,66 @@ def _parse_sql_queries_polars(self, *queries, shard_subsample=1, rng=None, shard
 
             expr = pl.sql_expr(query)
             for col in expr.meta.root_names():
-                if col == "__key__" or col == '__shard_path__' or col == '__shard_offset__':
+                if col == "__key__" or col == "__shard_path__" or col == "__shard_offset__":
                     # __key__ exists in all shards
                     needed_special_columns.append(col)
                     continue
-                subdir, field = self.fields[col][0]
+                column_dir, field = self.fields[col][0]
                 # Check if this is a computed/remote column (e.g., source-linked or S3-backed field)
-                if subdir in self.computed_columns:
+                if column_dir in self.computed_columns:
                     raise ValueError(
                         f"Column '{col}' is a computed/remote column and cannot be used in SQL queries. "
                         f"Use sample['{col}'] to access it instead."
                     )
                 assert col == field, "renamed fields are not supported in SQL queries yet"
-                subdirs[subdir].append(field)
+                column_dirs[column_dir].append(field)
             exprs.append(expr)
 
-        # If only __key__ is in the query, we need to load shards from at least one subdir
-        (key_subdir, _column) = self.fields["__key__"][0]
+        # If only __key__ is in the query, we need to load shards from at least one column_dir
+        (key_column_dir, _column) = self.fields["__key__"][0]
         if needed_special_columns:
-            if subdirs:
-                key_subdir = list(subdirs.keys())[0]
-            subdirs[key_subdir] += needed_special_columns
+            if column_dirs:
+                key_column_dir = list(column_dirs.keys())[0]
+            column_dirs[key_column_dir] += needed_special_columns
 
         if rng is None:
-            rng = random
+            rng = self.rng
         shard_list = self.get_shard_list()
         if shard_subsample != 1:
             shard_list = rng.sample(shard_list, int(len(shard_list) * shard_subsample))
 
         # Prefetch shard tails concurrently to warm up the filesystem cache
-        verified_shard_list = validate_shards(self, shard_list, list(subdirs.keys()))
+        verified_shard_list = validate_shards(self, shard_list, list(column_dirs.keys()))
 
         row_merge = []
-        subdir_samples = {}
+        column_dir_samples = {}
         missing = defaultdict(list)
-        for shard, shard_ok in verified_shard_list:
+        for shard_ref, shard_ok in verified_shard_list:
             col_merge = []
-            for subdir, fields in subdirs.items():
-                shard_path = self.get_shard_path(subdir, shard)
+            for column_dir, fields in column_dirs.items():
+                shard_path = self.get_shard_path(column_dir, shard_ref)
                 if shard_ok:
                     df = scan_ipc(
-                        shard_path, glob=False,
-                        include_file_paths="__shard_path__" if subdir == key_subdir else None,
-                        row_index_name="__shard_offset__" if subdir == key_subdir else None,
+                        shard_path,
+                        glob=False,
+                        include_file_paths="__shard_path__" if column_dir == key_column_dir else None,
+                        row_index_name="__shard_offset__" if column_dir == key_column_dir else None,
                     ).select(fields)
-                    if subdir not in subdir_samples:
-                        subdir_samples[subdir] = df.clear().collect()
+                    if column_dir not in column_dir_samples:
+                        column_dir_samples[column_dir] = df.clear().collect()
                 else:
                     # create a fake dataframe with all NULL rows and matching schema
                     if self.index:
-                        if self.index.has_dataset_path:
-                            (n_samples,) = self.index.query(
-                                "SELECT n_samples FROM shards WHERE shards.dataset_path = ? AND shards.shard = ?", *shard
-                            ).fetchone()
-                        else:
-                            (n_samples,) = self.index.query(
-                                "SELECT n_samples FROM shards WHERE shards.shard = ?", shard[1]
-                            ).fetchone()
+                        n_samples = self.index.shard_n_samples(shard_ref)
                         df = pl.defer(
-                            lambda subdir=subdir, n_samples=n_samples: subdir_samples[subdir].clear(n=n_samples),
-                            schema=lambda subdir=subdir: subdir_samples[subdir].schema,
+                            lambda column_dir=column_dir, n_samples=n_samples: column_dir_samples[column_dir].clear(
+                                n=n_samples
+                            ),
+                            schema=lambda column_dir=column_dir: column_dir_samples[column_dir].schema,
                         )
                     else:
                         df = None
-                    missing[subdir].append(shard)
+                    missing[column_dir].append(shard_ref)
                 if df is not None:
                     col_merge.append(df)
             if col_merge:
@@ -351,14 +326,14 @@ def _parse_sql_queries_polars(self, *queries, shard_subsample=1, rng=None, shard
         if missing:
             filled = " (filled them with NULLs)" if self.index else " (skipped them)"
             print(f"WARNING: You are missing or invalid shards for some of the columns{filled}:")
-            for subdir, shards in missing.items():
-                msg = f"{subdir}: {shards[:10]}"
+            for column_dir, shards in missing.items():
+                msg = f"{column_dir}: {shards[:10]}"
                 if len(shards) > 10:
                     msg += f" ... ({len(shards) - 10} more)"
                 print(msg)
             if not row_merge:
                 raise WSShardMissingError(
-                    f"No usable shards found (columns: {', '.join(subdirs)}) for dataset in: {str(self.dataset_dir)}"
+                    f"No usable shards found (columns: {', '.join(column_dirs)}) for dataset in: {str(self.dataset_root)}"
                 )
 
         return exprs, pl.concat(row_merge)
@@ -386,19 +361,28 @@ def _check_for_subsampling(self, shard_subsample):
                 shard_subsample = 1
             else:
                 shard_subsample = 150 / self.index.n_shards
-                if not hasattr(self, '_shown_subsampling_info'):
-                    print(f"INFO: to speed things up wsds is loading a random {shard_subsample*100:.2f}% subset of the shards, pass shard_subsample=1 to force it to load the whole dataset")
+                if not hasattr(self, "_shown_subsampling_info"):
+                    print(
+                        f"INFO: to speed things up wsds is loading a random {shard_subsample * 100:.2f}% subset of the shards, pass shard_subsample=1 to force it to load the whole dataset"
+                    )
                     self._shown_subsampling_info = True
         return shard_subsample
 
     def sql_select(
-        self, *queries, return_as_lazyframe=False, shard_subsample=None, rng=42, shard_pipe=None,
+        self,
+        *queries,
+        return_as_lazyframe=False,
+        shard_subsample=None,
+        rng=42,
+        shard_pipe=None,
     ) -> pl.DataFrame | pl.LazyFrame:
         """Given a list of SQL expressions, returns a Polars DataFrame/ LazyFrame with the results."""
         if isinstance(rng, int):
             rng = random.Random(rng)
         exprs, df = self._parse_sql_queries_polars(
-            *queries, shard_subsample=self._check_for_subsampling(shard_subsample), rng=rng,
+            *queries,
+            shard_subsample=self._check_for_subsampling(shard_subsample),
+            rng=rng,
             shard_pipe=shard_pipe,
         )
 
@@ -412,8 +396,10 @@ def sql_filter(self, query, shard_subsample=None, rng=42):
         if isinstance(rng, int):
             rng = random.Random(rng)
 
-        exprs, df = self._parse_sql_queries_polars(query, '__key__', shard_subsample=self._check_for_subsampling(shard_subsample), rng=rng)
-        return df.filter(exprs[0]).select("__key__").filter(pl.col("__key__").is_not_null()).collect()["__key__"]
+        exprs, df = self._parse_sql_queries_polars(
+            query, "__key__", shard_subsample=self._check_for_subsampling(shard_subsample), rng=rng
+        )
+        return df.filter(pl.first()).select("__key__").filter(pl.col("__key__").is_not_null()).collect()["__key__"]
 
     def filtered(
         self,
@@ -430,10 +416,10 @@ def filtered(
 
         Examples:
         >>> dataset = WSDataset("librilight/v3-vad_ws")
-        >>> next(dataset.filtered('pq < 3', shuffle=False))['__key__']  # first low-quality sample
+        >>> next(dataset.filtered('pq < 3', shuffle=False, shard_subsample=1))['__key__']  # first low-quality sample
         'large/6454/over_plum_pudding_1305_librivox_64kb_mp3/plumpudding_09_bangs_64kb_072'
-        >>> next(dataset.filtered("CAST(`transcription_wslang_raw.txt` AS string) ILIKE '%between New Orleans and St. Louis%'", shuffle=False))['__key__']
-        'large/107/oldtimes_jg_librivox_64kb_mp3/oldtimesonthemississippi_07_twain_64kb_032'
+        >>> next(dataset.filtered("CAST(`transcription_wslang_raw.txt` AS string) ILIKE '%between New Orleans%'", shuffle=False, shard_subsample=1))['__key__']
+        'large/10244/carpentersna_1612_librivox_64kb_mp3/geographicalreaderna_40_carpenter_64kb_034'
         """
         import polars as pl
 
@@ -458,17 +444,17 @@ def filtered(
     # Helper and internal API
     #
     def _resolve_path(self, path_str: str) -> Path:
-        """If the 'path' is relative and does not exist, we search for it using 'WSDS_DATASET_PATH' env var.
-        WSDS_DATASET_PATH is a colon-separated list of directories where datasets are stored.
+        """If the 'path' is relative and does not exist, we search for it using 'WSDS_DATASET_SEARCH_PATH' env var.
+        WSDS_DATASET_SEARCH_PATH is a colon-separated list of directories where datasets are stored.
 
         Example:
-            WSDS_DATASET_PATH=/path/to/datasets:/another/path/to/datasets"""
+            WSDS_DATASET_SEARCH_PATH=/path/to/datasets:/another/path/to/datasets"""
 
         path = Path(path_str)
         if path.is_absolute() or path.exists():
             return path
 
-        for base_path_str in os.environ.get("WSDS_DATASET_PATH", "").split(":"):
+        for base_path_str in os.environ.get("WSDS_DATASET_SEARCH_PATH", "").split(":"):
             base_path = Path(base_path_str)
             if (base_path / path).exists():
                 return base_path / path
@@ -479,11 +465,11 @@ def get_shard_list(self, ignore_index=False):
         if not ignore_index and self.index:
             return list(self.index.shards())
         else:
-            return list_all_shards(self.dataset_dir)
+            return list_all_shards(self.dataset_root)
 
-    def get_shard_path(self, subdir, shard_name):
-        dataset_path, shard_name = shard_name
-        dir = self.dataset_dir / dataset_path / subdir
+    def get_shard_path(self, column_dir, shard_ref):
+        partition, shard_name = shard_ref
+        dir = self.dataset_root / partition / column_dir
         return (Path(dir) / shard_name).with_suffix(".wsds")
 
     def _get_loader_class(self, spec: dict):
@@ -491,6 +477,8 @@ def _get_loader_class(self, spec: dict):
         loader_class = spec["loader"]
         if isinstance(loader_class, list):
             loader_mod, loader_name = loader_class
+            if loader_mod.startswith("hume_wsds."):
+                loader_mod = "wsds." + loader_mod[len("hume_wsds.") :]
             loader_module = importlib.import_module(loader_mod)
             return getattr(loader_module, loader_name)
         return loader_class
@@ -499,11 +487,11 @@ def _register_wsds_links(self):
         # Collect links first to avoid modifying dict during iteration
         links_to_register = []
         for value in self.fields.values():
-            (subdir, _column) = value[0]
-            if subdir.endswith(".wsds-link"):
-                spec = json.loads((self.dataset_dir / subdir).read_text())
-                self.computed_columns[subdir] = spec
-                links_to_register.append((subdir, spec))
+            (column_dir, _column) = value[0]
+            if column_dir.endswith(".wsds-link"):
+                spec = json.loads((self.dataset_root / column_dir).read_text())
+                self.computed_columns[column_dir] = spec
+                links_to_register.append((column_dir, spec))
 
         # Ask each loader class what columns it provides
         for link_file, spec in links_to_register:
@@ -516,41 +504,41 @@ def _register_wsds_links(self):
                     self.fields[col_name] = [(link_file, col_name)]
 
     def add_computed(self, name, **link):
-        subdir = name + ".wsds-computed"
-        self.computed_columns[subdir] = link
-        self.fields[name] = [(subdir, name)]
+        column_dir = name + ".wsds-computed"
+        self.computed_columns[column_dir] = link
+        self.fields[name] = [(column_dir, name)]
 
-    def get_linked_dataset(self, dataset_dir):
-        dataset_dir = self.dataset_dir / dataset_dir
-        if dataset_dir not in self._linked_datasets:
-            self._linked_datasets[dataset_dir] = WSDataset(dataset_dir)
-        return self._linked_datasets[dataset_dir]
+    def get_linked_dataset(self, relative_path):
+        linked_root = self.dataset_root / relative_path
+        if linked_root not in self._linked_datasets:
+            self._linked_datasets[linked_root] = WSDataset(linked_root)
+        return self._linked_datasets[linked_root]
 
-    def get_linked_shard(self, link, shard_name):
+    def get_linked_shard(self, link, shard_ref):
         loader_class = self._get_loader_class(link)
-        return loader_class.from_link(link, self, shard_name)
+        return loader_class.from_link(link, self, shard_ref)
 
-    def get_shard(self, subdir, shard_name):
-        shard_path = self.get_shard_path(subdir, shard_name)
+    def get_shard(self, column_dir, shard_ref):
+        shard_path = self.get_shard_path(column_dir, shard_ref)
 
         shard = self._open_shards.get(shard_path.parent, None)
-        if shard is not None and shard.shard_name == shard_name:
+        if shard is not None and shard.shard_ref == shard_ref:
             return shard
 
-        if subdir in self.computed_columns:
-            shard = self.get_linked_shard(self.computed_columns[subdir], shard_name)
+        if column_dir in self.computed_columns:
+            shard = self.get_linked_shard(self.computed_columns[column_dir], shard_ref)
         else:
-            shard = WSShard(self, shard_path, shard_name=shard_name)
+            shard = WSShard(self, shard_path, shard_ref=shard_ref)
 
         self._open_shards[shard_path.parent] = shard
         return shard
 
-    def get_sample(self, shard_name, field, offset):
+    def get_sample(self, shard_ref, field, offset):
         alternatives = self.fields[field]
         last_err = None
-        for subdir, column in alternatives:
+        for column_dir, column in alternatives:
             try:
-                return self.get_shard(subdir, shard_name).get_sample(column, offset)
+                return self.get_shard(column_dir, shard_ref).get_sample(column, offset)
             except WSShardMissingError as e:
                 last_err = e
                 continue
@@ -576,8 +564,8 @@ def __str__(self):
 
     def __repr__(self):
         if self.index is None:
-            return f"WSDataset({repr(str(self.dataset_dir))}, segmented={self.segmented}, index=None)"
-        return f"WSDataset({repr(str(self.dataset_dir))}, segmented={self.segmented})"
+            return f"WSDataset({repr(str(self.dataset_root))}, segmented={self.segmented}, index=None)"
+        return f"WSDataset({repr(str(self.dataset_root))}, segmented={self.segmented})"
 
     def _display_(self):
         import marimo
@@ -593,6 +581,12 @@ def _display_(self):
         )
 
     def _ipython_display_(self):
+        from .utils import is_notebook
+
+        if not is_notebook():
+            print(str(self))
+            return
+
         from IPython.display import Markdown, display
 
         if self.index is None:
diff --git a/wsds/ws_decode.py b/wsds/ws_decode.py
new file mode 100644
index 0000000..2c85e09
--- /dev/null
+++ b/wsds/ws_decode.py
@@ -0,0 +1,56 @@
+import pickle
+
+import numpy as np
+
+from .ws_audio import AudioReader
+
+AUDIO_FILE_KEYS = frozenset(  # column names decode_sample wraps in AudioReader; a frozenset has no defined order
+    [
+        "audio",  # recommended so all shards can have the same columns
+        "flac",
+        "mp3",
+        "sox",
+        "wav",
+        "m4a",
+        "ogg",
+        "wma",
+        "opus",  # fallback for old datasets
+    ]
+)
+
+
+def decode_sample(column: str, data):
+    """Decode a binary column value from a file-like object based on column name.
+
+    Handles .npy (numpy), .pyd (pickle), .txt (UTF-8 string), and audio columns.
+    Must only be called on binary columns; raises ValueError for any other column name.
+    """
+    if column.endswith("npy"):  # plain suffix match (no dot), so a column named just "npy" matches too
+        return np.load(data)
+    elif column.endswith("pyd"):
+        return pickle.load(data)  # SECURITY: unpickling can execute arbitrary code; only read trusted shards
+    elif column.endswith("txt"):
+        return data if isinstance(data, str) else data.read().decode("utf-8")  # str values pass through as-is
+    elif column in AUDIO_FILE_KEYS:  # audio columns are matched by exact name, not by suffix
+        return AudioReader(data)
+    raise ValueError(f"Unknown binary column type: {column}")
+
+
+def get_audio(sample, audio_columns=None):
+    """Find and return the first audio column value from a dict-like sample.
+
+    Args:
+        sample: A dict-like object (e.g. WSSample) supporting `in`, `keys()` and `__getitem__`.
+        audio_columns: Optional list of column names to try, in order. Defaults to sorted(AUDIO_FILE_KEYS).
+
+    Returns:
+        The audio value (typically an AudioReader or WSAudio).
+
+    Raises:
+        KeyError: If no audio column is found in the sample.
+    """
+    candidates = audio_columns or sorted(AUDIO_FILE_KEYS)  # a set has no stable order; sort so the pick is repeatable
+    for col in candidates:
+        if col in sample:
+            return sample[col]
+    raise KeyError(f"No audio column found (tried {list(candidates)}), available keys: {list(sample.keys())}")
diff --git a/wsds/ws_feather_index.py b/wsds/ws_feather_index.py
index ae3b356..9400e57 100644
--- a/wsds/ws_feather_index.py
+++ b/wsds/ws_feather_index.py
@@ -10,7 +10,7 @@ class WSFeatherIndex:
 
     Uses feather files:
     - `shard-index.feather`: shard metadata with columns:
-        shard_id, dataset_path, shard_name, n_samples, segment_id (global offset),
+        shard_id, partition, shard_name, n_samples, segment_id (global offset),
         audio_duration, speech_duration
     - `episode-index.feather`: episode/file info sorted by segment_id, with columns:
         segment_id, shard_id, episode_id, audio_duration, speech_duration
@@ -98,12 +98,12 @@ def metadata(self) -> dict:
     #
 
     def shards(self):
-        """Iterate over all shards as (dataset_path, shard_name) tuples.
+        """Iterate over all shards as (partition, shard_name) tuples.
 
         Yields tuples in the order shards were added to the index.
         """
         for row in self._shard_df.iter_rows(named=True):
-            yield (row["dataset_path"], row["shard_name"])
+            yield (row["partition"], row["shard_name"])
 
     #
     # Shard lookups
@@ -116,7 +116,7 @@ def get_shard_by_global_index(self, global_index: int) -> tuple[str, int, str] |
             global_index: The global sample index (0-based across the entire dataset).
 
         Returns:
-            Tuple of (shard_name, shard_global_offset, dataset_path) or None if not found.
+            Tuple of (shard_name, shard_global_offset, partition) or None if not found.
             The local offset within the shard is: global_index - shard_global_offset.
         """
         if global_index < 0 or global_index >= self.n_samples:
@@ -125,13 +125,13 @@ def get_shard_by_global_index(self, global_index: int) -> tuple[str, int, str] |
         # Binary search for the shard containing this index
         # search_sorted with side="right" returns index where global_index would be inserted
         # We want the shard where global_offset <= global_index, so subtract 1
-        idx = self._shard_df.select(pl.col('segment_id').search_sorted(global_index, side='right')).item()
+        idx = self._shard_df.select(pl.col("segment_id").search_sorted(global_index, side="right")).item()
 
         if idx < 0:
             return None
 
         row = self._shard_df.row(idx, named=True)
-        return (row["shard_name"], int(row["segment_id"]), row["dataset_path"])
+        return (row["shard_name"], int(row["segment_id"]), row["partition"])
 
     def get_shard_by_file_name(self, file_name: str) -> tuple[str, int, int, str] | None:
         """Find the shard containing a given source file.
@@ -140,14 +140,14 @@ def get_shard_by_file_name(self, file_name: str) -> tuple[str, int, int, str] |
             file_name: The source file name (without segment suffix).
 
         Returns:
-            Tuple of (shard_name, shard_global_offset, file_offset_in_shard, dataset_path)
+            Tuple of (shard_name, shard_global_offset, file_offset_in_shard, partition)
             or None if not found.
         """
         if self._name_df is None:
             raise RuntimeError("episode-name-index.feather is required to search by episode name")
 
         # Binary search in sorted name series
-        idx = self._name_df.select(pl.col('name').search_sorted(file_name, side='right')).item()
+        idx = self._name_df.select(pl.col("name").search_sorted(file_name, side="right")).item()
 
         if idx >= len(self._names) or self._names[idx] != file_name:
             return None
@@ -156,7 +156,6 @@ def get_shard_by_file_name(self, file_name: str) -> tuple[str, int, int, str] |
         shard_id = name_row["shard_id"]
         episode_id = name_row["episode_id"]
 
-        # Get shard info by shard_id (shard_id is the row index)
         shard_row = self._shard_df.row(shard_id, named=True)
         shard_global_offset = int(shard_row["segment_id"])
 
@@ -172,7 +171,7 @@ def get_shard_by_file_name(self, file_name: str) -> tuple[str, int, int, str] |
             shard_row["shard_name"],
             shard_global_offset,
             file_offset,
-            shard_row["dataset_path"],
+            shard_row["partition"],
         )
 
     def get_shard_global_offset(self, shard_name: str) -> int | None:
@@ -193,16 +192,14 @@ def get_shard_n_samples(self, shard: tuple[str, str]) -> int | None:
         """Get the number of samples in a shard.
 
         Args:
-            shard: Tuple of (dataset_path, shard_name).
+            shard: Tuple of (partition, shard_name).
 
         Returns:
             The number of samples in the shard, or None if not found.
         """
-        dataset_path, shard_name = shard
-        if dataset_path:
-            filtered = self._shard_df.filter(
-                (pl.col("dataset_path") == dataset_path) & (pl.col("shard_name") == shard_name)
-            )
+        partition, shard_name = shard
+        if partition:
+            filtered = self._shard_df.filter((pl.col("partition") == partition) & (pl.col("shard_name") == shard_name))
         else:
             filtered = self._shard_df.filter(pl.col("shard_name") == shard_name)
 
@@ -214,16 +211,14 @@ def get_shard_info(self, shard: tuple[str, str]) -> tuple[int, int] | None:
         """Get n_samples and shard_id for a shard.
 
         Args:
-            shard: Tuple of (dataset_path, shard_name).
+            shard: Tuple of (partition, shard_name).
 
         Returns:
             Tuple of (n_samples, shard_id) or None if not found.
         """
-        dataset_path, shard_name = shard
-        if dataset_path:
-            filtered = self._shard_df.filter(
-                (pl.col("dataset_path") == dataset_path) & (pl.col("shard_name") == shard_name)
-            )
+        partition, shard_name = shard
+        if partition:
+            filtered = self._shard_df.filter((pl.col("partition") == partition) & (pl.col("shard_name") == shard_name))
         else:
             filtered = self._shard_df.filter(pl.col("shard_name") == shard_name)
 
diff --git a/wsds/ws_index.py b/wsds/ws_index.py
index 33f6881..b750c33 100644
--- a/wsds/ws_index.py
+++ b/wsds/ws_index.py
@@ -2,6 +2,7 @@
 import json
 import sqlite3
 from pathlib import Path
+
 from . import utils
 
 
@@ -16,10 +17,10 @@ def __enter__(self):
         self.fname.unlink(missing_ok=True)
         self.conn = sqlite3.connect(self.fname)
 
-        self.conn.execute('PRAGMA journal_mode = OFF;')
-        self.conn.execute('PRAGMA synchronous = 0;')
-        self.conn.execute('PRAGMA locking_mode = EXCLUSIVE;')
-        self.conn.execute('PRAGMA temp_store = MEMORY;')
+        self.conn.execute("PRAGMA journal_mode = OFF;")
+        self.conn.execute("PRAGMA synchronous = 0;")
+        self.conn.execute("PRAGMA locking_mode = EXCLUSIVE;")
+        self.conn.execute("PRAGMA temp_store = MEMORY;")
 
         self.conn.execute("""
         CREATE TABLE files (
@@ -35,10 +36,10 @@ def __enter__(self):
             shard TEXT NOT NULL,
             n_samples INTEGER NOT NULL,
             global_offset INTEGER NOT NULL,
-            dataset_path TEXT NULL
+            partition TEXT NULL
         );""")
         self.conn.execute("""
-        CREATE UNIQUE INDEX shard_name ON shards (shard, dataset_path);
+        CREATE UNIQUE INDEX shard_name ON shards (shard, partition);
         """)
         self.conn.execute("""
         CREATE UNIQUE INDEX shard_global_offset ON shards (global_offset);
@@ -59,8 +60,8 @@ def append_metadata(self, metadata):
     def append(self, s):
         # we ensure plain Python types for everything passed in, otherwise sqlite will silently save invalid data
         shard_id = self.conn.execute(
-            "INSERT INTO shards (shard, n_samples, global_offset, dataset_path) VALUES (?, ?, ?, ?);",
-            (str(s["shard_name"]), int(s["n_samples"]), self.global_offset, str(s["dataset_path"])),
+            "INSERT INTO shards (shard, n_samples, global_offset, partition) VALUES (?, ?, ?, ?);",
+            (str(s["shard_name"]), int(s["n_samples"]), self.global_offset, str(s["partition"])),
         ).lastrowid
         for name, offset, audio_duration, speech_duration in s["index"]:
             try:
@@ -75,7 +76,9 @@ def append(self, s):
                         (name,),
                     ).fetchone()
                     if audio_duration - old_duration < 10e-3:
-                        print(f"Skipping duplicate episode: {repr(name)} ({utils.format_duration(audio_duration)} long)")
+                        print(
+                            f"Skipping duplicate episode: {repr(name)} ({utils.format_duration(audio_duration)} long)"
+                        )
                         continue
                     raise ValueError(
                         f"Detected duplicate file name: {repr(name)} in shard \n{repr(s['shard_name'])}, previously seen in {repr(old_shard)}"
@@ -100,7 +103,14 @@ def __init__(self, fname: str):
             raise ValueError(f"WSIndex not found: {fname}")
         # immutable=1,ro=True greatly speeds up all queries when the database is on a remote/cluster file system
         self.conn = sqlite3.connect(f"file:{fname}?immutable=1,ro=True", uri=True)
-        self.has_dataset_path = self.conn.execute("SELECT COUNT(*) FROM pragma_table_info('shards') WHERE name='dataset_path'").fetchone()[0]
+
+        # Detect index format: new indexes have a 'partition' column, old indexes have 'dataset_path'
+        columns = {
+            row[0]
+            for row in self.conn.execute("SELECT name FROM pragma_table_info('shards')").fetchall()
+        }
+        self.has_partition = "partition" in columns
+        self.has_dataset_path = "dataset_path" in columns
 
     @functools.cached_property
     def n_shards(self):
@@ -126,16 +136,82 @@ def speech_duration(self):
             return self.metadata["speech_duration"]
         return self.conn.execute("SELECT SUM(speech_duration) FROM files;").fetchone()[0]
 
+    @functools.cached_property
+    def _partition_col(self):
+        """SQL expression for a shard's partition; the column forms rely on `shards` being aliased as `s`."""
+        if self.has_partition:
+            return "s.partition"  # schema written by the current index writer
+        if self.has_dataset_path:
+            return "s.dataset_path"  # older schema: the same value under its previous column name
+        return "''"  # neither column exists: select a constant empty string instead
+
     def shards(self):
-        dataset_path = 'dataset_path' if self.has_dataset_path else "''"
-        return self.conn.execute(f"SELECT {dataset_path}, shard FROM shards ORDER BY rowid;")
+        return self.conn.execute(f"SELECT {self._partition_col}, shard FROM shards AS s ORDER BY rowid;")
+
+    def lookup_by_index(self, index: int):
+        """Look up a sample by global index. Returns (partition, shard_name, local_offset), or None if out of range."""
+        r = self.conn.execute(  # the last shard starting at or before `index` is the only one that can contain it
+            f"SELECT s.shard, s.global_offset, s.n_samples, {self._partition_col} FROM shards AS s"
+            f" WHERE s.global_offset <= ? ORDER BY s.global_offset DESC LIMIT 1",
+            (index,),
+        ).fetchone()
+        if not r:  # negative index or empty index: no shard starts at or before it
+            return None
+        shard_name, shard_global_offset, n_samples, partition = r
+        local_offset = index - shard_global_offset
+        return (partition, shard_name, local_offset) if local_offset < n_samples else None  # past the last sample
+
+    def lookup_by_key(self, file_name: str, offset_of_key_wrt_file: int):
+        """Look up a sample by file name and offset within file.
+        Returns (partition, shard_name, local_offset, global_offset), or None if the file name is not indexed."""
+        r = self.conn.execute(
+            f"SELECT s.shard, s.global_offset, f.offset, {self._partition_col}"
+            f" FROM files AS f, shards AS s WHERE f.name = ? AND s.shard_id == f.shard_id",
+            (file_name,),
+        ).fetchone()
+        if not r:
+            return None
+        shard_name, shard_global_offset, file_offset_in_shard, partition = r
+        local_offset = file_offset_in_shard + offset_of_key_wrt_file  # sample position inside its shard
+        global_offset = shard_global_offset + local_offset  # same sample counted from the shard's global offset
+        return partition, shard_name, local_offset, global_offset
+
+    def _query_shard(self, columns, shard_ref):
+        """Fetch `columns` (SQL text, interpolated verbatim) for a (partition, shard_name) ref; None if no row."""
+        partition, shard_name = shard_ref
+        if self.has_partition or self.has_dataset_path:
+            return self.conn.execute(
+                f"SELECT {columns} FROM shards AS s WHERE {self._partition_col} = ? AND s.shard = ?",
+                (partition, shard_name),
+            ).fetchone()
+        else:  # index has no partition column at all: match on the shard name alone
+            return self.conn.execute(
+                f"SELECT {columns} FROM shards WHERE shard = ?", (shard_name,)
+            ).fetchone()
+
+    def shard_n_samples(self, shard_ref):
+        """Return the number of samples in the (partition, shard_name) shard; raise IndexError if it is not indexed."""
+        r = self._query_shard("n_samples", shard_ref)
+        if r is None:
+            raise IndexError(f"Shard not found: {shard_ref}")
+        return r[0]
+
+    def shard_global_offset(self, shard_ref):
+        """Return the global offset of the (partition, shard_name) shard; raise IndexError if it is not indexed."""
+        r = self._query_shard("global_offset", shard_ref)
+        if r is None:
+            raise IndexError(f"Shard not found: {shard_ref}")
+        return r[0]
 
     def dataframe(self):
         import polars as pl
-        df = pl.read_database_uri("""
+
+        df = pl.read_database_uri(
+            """
             SELECT f.name, audio_duration, speech_duration, s.shard, s.n_samples
             FROM files as f, shards as s
-            WHERE f.shard_id == s.shard_id""", f"sqlite://{self.fname}"
+            WHERE f.shard_id == s.shard_id""",
+            f"sqlite://{self.fname}",
         )
         return df
 
diff --git a/wsds/ws_indexer.py b/wsds/ws_indexer.py
index 98b96e2..25d738f 100644
--- a/wsds/ws_indexer.py
+++ b/wsds/ws_indexer.py
@@ -1,5 +1,8 @@
 """
-Core wsds/Polars indexing logic for extracting and merging episode indices.
+wsds index creation
+
+`extract_batch_index` – extracts episode-start sample offsets for all shards (for both source and segmented datasets)
+`merge_batch_indices` – merges extracted indices across multiple partitions into a single SQLite wsds index
 """
 
 import json
@@ -21,41 +24,35 @@ def extract_episodes(episode_idx: pl.DataFrame) -> pl.DataFrame:
     Takes a DataFrame with segment keys (e.g. "episode_123_0", "episode_123_1")
     and aggregates them into episodes by extracting the base key and summing durations.
     """
-    return (episode_idx
-        .with_columns(
-            pl.col('__key__').str.extract(r"(.*)_[0-9]+$", 1),
-            shard = pl.col('__shard_path__').str.extract(r"([^/]+).wsds$", 1),
+    return (
+        episode_idx.with_columns(
+            pl.col("__key__").str.extract(r"(.*)_[0-9]+$", 1),
+            shard=pl.col("__shard_path__").str.extract(r"([^/]+).wsds$", 1),
         )
-        .group_by('__key__', maintain_order=True)
+        .group_by("__key__", maintain_order=True)
         .agg(
-            pl.sum('speech_duration'),
-            pl.len().alias('segments'),
-            pl.first('shard'),
-            pl.first('offset'),
+            pl.sum("speech_duration"),
+            pl.len().alias("segments"),
+            pl.first("shard"),
+            pl.first("offset"),
         )
     )
 
 
 def make_shard_idx(
-    sample_idx: pl.DataFrame,
-    n_samples_expr: pl.Expr,
-    dataset_path: Path | str,
-    shard_id_offset: int = 0
+    sample_idx: pl.DataFrame, n_samples_expr: pl.Expr, partition: Path | str, shard_id_offset: int = 0
 ) -> pl.DataFrame:
     """
     Create a shard index from a sample/episode index.
 
     Groups samples by shard and computes aggregate statistics.
     """
-    return (sample_idx
-        .group_by('shard', maintain_order=True)
-        .agg(
-            n_samples_expr,
-            pl.sum('audio_duration')
-        )
-        .with_row_index('shard_id', offset=shard_id_offset)
+    return (
+        sample_idx.group_by("shard", maintain_order=True)
+        .agg(n_samples_expr, pl.sum("audio_duration"))
+        .with_row_index("shard_id", offset=shard_id_offset)
         .with_columns(
-            dataset_path = pl.lit(str(dataset_path)),
+            partition=pl.lit(str(partition)),
         )
     )
 
@@ -66,7 +63,7 @@ def write_index(
     episode_idx: pl.DataFrame,
     fields: dict,
     source_path: str | None = None,
-    vad_column: str | None = None
+    vad_column: str | None = None,
 ):
     """
     Write a wsds SQLite index file with shard and episode data.
@@ -79,8 +76,8 @@ def write_index(
         source_path: Path to source dataset (for computed audio columns)
         vad_column: VAD column name (for segmented datasets)
     """
-    audio_duration, speech_duration = episode_idx.select('audio_duration', 'speech_duration').sum().row(0)
-    with AtomicFile(f'{path}/index.sqlite3') as fname:
+    audio_duration, speech_duration = episode_idx.select("audio_duration", "speech_duration").sum().row(0)
+    with AtomicFile(f"{path}/index.sqlite3") as fname:
         with WSDSIndexWriter(fname) as index:
             metadata = {}
             if source_path and vad_column:
@@ -93,20 +90,19 @@ def write_index(
                 }
                 fields = {k: v for k, v in fields.items()}
                 fields["audio"] = ("audio.wsds-computed", "audio")
-                metadata['segmented'] = True
+                metadata["segmented"] = True
             else:
-                metadata['segmented'] = False
-            metadata.update({"fields": fields, 'audio_duration': audio_duration, 'speech_duration': speech_duration})
+                metadata["segmented"] = False
+            metadata.update({"fields": fields, "audio_duration": audio_duration, "speech_duration": speech_duration})
             index.append_metadata(metadata)
 
-        conn = dict(connection=f'sqlite:///{fname}', if_table_exists='append', engine='adbc')
-        shard_idx.drop('audio_duration').write_database(table_name='shards', **conn)
-        episode_idx.with_columns(pl.col('speech_duration').fill_null(-1)).write_database(table_name='files', **conn)
+        conn = dict(connection=f"sqlite:///{fname}", if_table_exists="append", engine="adbc")
+        shard_idx.drop("audio_duration").write_database(table_name="shards", **conn)
+        episode_idx.with_columns(pl.col("speech_duration").fill_null(-1)).write_database(table_name="files", **conn)
 
 
 def extract_batch_index(
-    batch_path: Path | str,
-    overwrite: bool = False
+    batch_path: Path | str, overwrite: bool = False
 ) -> tuple[str, str | None, str | None, str | None]:
     """
     Extract episode indices from a single batch directory.
@@ -124,8 +120,8 @@ def extract_batch_index(
     batch = Path(batch_path)
 
     # Process source dataset
-    ds_path = batch / 'source'
-    out_file = ds_path / 'episode-list.feather'
+    ds_path = batch / "source"
+    out_file = ds_path / "episode-list.feather"
 
     if out_file.exists() and not overwrite:
         print(f"Skipping, {out_file} already exists")
@@ -143,7 +139,7 @@ def extract_batch_index(
         except Exception as e:
             return str(batch), "error initializing source dataset", repr(e), traceback.format_exc()
 
-        print(f"Loaded dataset {source_ds.dataset_dir} in {time.perf_counter() - start:.1f}s")
+        print(f"Loaded dataset {source_ds.dataset_root} in {time.perf_counter() - start:.1f}s")
 
         # Update fields.json
         fields = {}
@@ -151,30 +147,34 @@ def extract_batch_index(
             if isinstance(v[0], str) and v[1] in ["sample_source_id", "src_key"]:
                 continue
             fields[k] = v
-        with AtomicFile(ds_path / 'fields.json') as fname:
-            with open(fname, 'w') as f:
+        with AtomicFile(ds_path / "fields.json") as fname:
+            with open(fname, "w") as f:
                 json.dump(fields, f)
 
         try:
             start = time.perf_counter()
-            source_idx = (
-                source_ds
-                .sql_select('__key__', 'load_duration AS audio_duration', '__shard_path__', '__shard_offset__ AS offset', shard_subsample=1)
-                .with_columns(
-                    pl.col('audio_duration').cast(pl.Float32),
-                    speech_duration=pl.lit(None).cast(pl.Float32()),
-                    shard=pl.col('__shard_path__').str.extract(r"([^/]+).wsds$", 1),
-                )
+            source_idx = source_ds.sql_select(
+                "__key__",
+                "load_duration AS audio_duration",
+                "__shard_path__",
+                "__shard_offset__ AS offset",
+                shard_subsample=1,
+            ).with_columns(
+                pl.col("audio_duration").cast(pl.Float32),
+                speech_duration=pl.lit(None).cast(pl.Float32()),
+                shard=pl.col("__shard_path__").str.extract(r"([^/]+).wsds$", 1),
             )
-            source_idx.write_ipc(batch / 'source/episode-list.feather', compression='zstd')
+            source_idx.write_ipc(batch / "source/episode-list.feather", compression="zstd")
 
-            print(f"Extracted {len(source_idx)} episodes from {source_ds.dataset_dir} in {time.perf_counter() - start:.1f}s")
+            print(
+                f"Extracted {len(source_idx)} episodes from {source_ds.dataset_root} in {time.perf_counter() - start:.1f}s"
+            )
         except Exception as e:
             return str(batch), "error extracting source episodes", repr(e), traceback.format_exc()
 
     # Process filtered_vad dataset
-    ds_path = batch / 'filtered_vad'
-    out_file = ds_path / 'episode-list.feather'
+    ds_path = batch / "filtered_vad"
+    out_file = ds_path / "episode-list.feather"
 
     if out_file.exists() and not overwrite:
         print(f"Skipping, {out_file} already exists")
@@ -187,7 +187,7 @@ def extract_batch_index(
             print(f"Error initializing WSDataset at {batch / 'filtered_vad'}: {e}")
             return str(batch), "error initializing filtered_vad dataset", repr(e), traceback.format_exc()
 
-        print(f"Loaded dataset {vad_ds.dataset_dir} in {time.perf_counter() - start:.1f}s")
+        print(f"Loaded dataset {vad_ds.dataset_root} in {time.perf_counter() - start:.1f}s")
 
         # Update fields.json
         fields = {}
@@ -195,21 +195,23 @@ def extract_batch_index(
             if isinstance(v[0], str) and v[1] in ["sample_source_id", "src_key"]:
                 continue
             fields[k] = v
-        with AtomicFile(ds_path / 'fields.json') as fname:
-            with open(fname, 'w') as f:
+        with AtomicFile(ds_path / "fields.json") as fname:
+            with open(fname, "w") as f:
                 json.dump(fields, f)
 
         try:
             start = time.perf_counter()
-            vad_idx = (
-                vad_ds
-                .sql_select('__key__', 'tend - tstart AS speech_duration', '__shard_path__', '__shard_offset__ AS offset', shard_subsample=1,
-                    shard_pipe=extract_episodes)
-                .join(source_idx['__key__', 'audio_duration'], on='__key__')
-            )
-            vad_idx.write_ipc(batch / 'filtered_vad/episode-list.feather', compression='zstd')
-
-            print(f"Extracted {len(vad_idx)} episodes from {vad_ds.dataset_dir} in {time.perf_counter() - start:.1f}s")
+            vad_idx = vad_ds.sql_select(
+                "__key__",
+                "tend - tstart AS speech_duration",
+                "__shard_path__",
+                "__shard_offset__ AS offset",
+                shard_subsample=1,
+                shard_pipe=extract_episodes,
+            ).join(source_idx["__key__", "audio_duration"], on="__key__")
+            vad_idx.write_ipc(batch / "filtered_vad/episode-list.feather", compression="zstd")
+
+            print(f"Extracted {len(vad_idx)} episodes from {vad_ds.dataset_root} in {time.perf_counter() - start:.1f}s")
         except Exception as e:
             return str(batch), "error extracting filtered_vad episodes", repr(e), traceback.format_exc()
 
@@ -246,7 +248,7 @@ def merge_batch_indices(
 
     for batch in batches:
         ds_path = Path(batch) / dataset_kind
-        idx_file = ds_path / 'episode-list.feather'
+        idx_file = ds_path / "episode-list.feather"
         if idx_file.exists():
             size += idx_file.stat().st_size
 
@@ -259,21 +261,22 @@ def merge_batch_indices(
             # create shard index
             shard_idx = make_shard_idx(
                 episode_idx,
-                n_samples_expr=pl.len().alias('n_samples') if dataset_kind == 'source' else pl.sum('segments').alias('n_samples'),
-                dataset_path=os.path.relpath(ds_path, dst),
+                n_samples_expr=pl.len().alias("n_samples")
+                if dataset_kind == "source"
+                else pl.sum("segments").alias("n_samples"),
+                partition=os.path.relpath(ds_path, dst),
                 shard_id_offset=n_shards,
             )
             n_shards += len(shard_idx)
             # replace shard names with unique indices
-            episode_idx = (episode_idx
-                .rename({'__key__': 'name'})
-                .join(shard_idx.select('shard', 'shard_id'), on='shard')
+            episode_idx = episode_idx.rename({"__key__": "name"}).join(
+                shard_idx.select("shard", "shard_id"), on="shard"
             )
             episode_idxs.append(episode_idx)
             shard_idxs.append(shard_idx)
 
             merge_field_errors = []
-            with open(ds_path / 'fields.json') as f:
+            with open(ds_path / "fields.json") as f:
                 for k, v in json.load(f).items():
                     if k not in merged_fields:
                         merged_fields[k] = v
@@ -281,38 +284,41 @@ def merge_batch_indices(
                         if v != merged_fields[k]:
                             merge_field_errors.append(k)
             if merge_field_errors:
-                errors.append((str(idx_file), f"error merging fields", None, ', '.join(merge_field_errors)))
+                errors.append((str(idx_file), "error merging fields", None, ", ".join(merge_field_errors)))
         else:
             errors.append((str(idx_file), "missing file", None, None))
 
     merged_episode_idx = (
         pl.concat(episode_idxs)
-        .unique(subset=['name'])
-        .sort('name')
-        .select('name', 'shard_id', 'offset', 'audio_duration', 'speech_duration')
+        .unique(subset=["name"])
+        .sort("name")
+        .select("name", "shard_id", "offset", "audio_duration", "speech_duration")
     )
-    merged_shard_idx = (
-        pl.concat(shard_idxs)
-        .with_columns(
-            global_offset=pl.col('n_samples').cum_sum() - pl.col('n_samples'),
-        )
+    merged_shard_idx = pl.concat(shard_idxs).with_columns(
+        global_offset=pl.col("n_samples").cum_sum() - pl.col("n_samples"),
     )
 
-    print(f"Merged {len(merged_episode_idx)} {dataset_kind} episodes ({size/1024/1024:.1f} MB) for {dest_path} in {time.perf_counter() - start:.2f} s")
+    print(
+        f"Merged {len(merged_episode_idx)} {dataset_kind} episodes ({size / 1024 / 1024:.1f} MB) for {dest_path} in {time.perf_counter() - start:.2f} s"
+    )
 
     start = time.perf_counter()
     dst.mkdir(exist_ok=True, parents=True)
 
-    merged_episode_idx.write_ipc(dst / 'episode-index.feather')
-    merged_shard_idx.write_ipc(dst / 'shard-index.feather')
+    merged_episode_idx.write_ipc(dst / "episode-index.feather")
+    merged_shard_idx.write_ipc(dst / "shard-index.feather")
     print(f"Saved feather indices to {dst} in {time.perf_counter() - start:.2f} s")
 
     try:
         start = time.perf_counter()
         write_index(
-            dst, merged_shard_idx, merged_episode_idx, merged_fields,
-            vad_column='vad.npy' if dataset_kind == 'filtered_vad' else None,
-            source_path='../source')
+            dst,
+            merged_shard_idx,
+            merged_episode_idx,
+            merged_fields,
+            vad_column="vad.npy" if dataset_kind == "filtered_vad" else None,
+            source_path="../source",
+        )
         print(f"Saved index to {dst} in {time.perf_counter() - start:.2f} s")
 
     except Exception as e:
@@ -322,7 +328,7 @@ def merge_batch_indices(
     for path, error, exc_repr, tb in errors:
         print("    ", path, "-", error, exc_repr or "")
 
-    with open(dst / 'indexing.log', 'w') as f:
+    with open(dst / "indexing.log", "w") as f:
         for path, error, exc_repr, tb in errors:
             f.write(f"{path} - {error}")
             if exc_repr:
diff --git a/wsds/ws_modal_shard.py b/wsds/ws_modal_shard.py
new file mode 100644
index 0000000..0c48f20
--- /dev/null
+++ b/wsds/ws_modal_shard.py
@@ -0,0 +1,116 @@
+import os
+import typing
+from typing import TYPE_CHECKING, Optional, Tuple
+
+from .pupyarrow.file_reader import ModalFileReader
+from .pupyarrow.pupyarrow import FeatherFile, LazyBinaryArray
+from .ws_decode import decode_sample
+from .ws_shard import WSShardInterface
+
+if TYPE_CHECKING:
+    from .ws_dataset import WSDataset
+
+
+class WSModalShard(WSShardInterface):
+    """A shard reader that loads data from a Modal Volume via range requests.
+
+    Uses ModalFileReader (gRPC ``VolumeGetFile2`` with ``start``/``len``) so
+    that only the IPC footer and the specific batch(es) needed are fetched,
+    rather than downloading the entire shard file."""
+
+    def __init__(self, dataset: "WSDataset", volume_name: str, path: str, shard_ref: Optional[Tuple[str, str]]=None):
+        self.dataset = dataset
+        self.shard_ref = shard_ref
+        self.volume_name = volume_name
+        self.path = path
+
+        self._reader = ModalFileReader.from_name(volume_name, path)
+        self._feather = FeatherFile(self._reader)
+        self.batch_size = int(self._feather.schema.custom_metadata["batch_size"])
+
+        # cache
+        self._start = None
+        self._end = None
+        self._batch = None
+
+    @classmethod
+    def from_link(cls, link, dataset, shard_ref):
+        """Create a Modal shard from a link spec.
+
+        The volume path is built as ``<prefix>/<partition>/<column_dir>/<shard>.wsds``.
+        ``column_dir`` is read from the link's optional ``"subdir"`` key (needed when the
+        volume mirrors the local dataset directory layout with per-column subdirectories)."""
+        partition, shard = shard_ref
+        prefix = link.get("prefix", "")
+        column_dir = link.get("subdir", "")
+        parts = [p for p in (prefix, partition, column_dir, f"{shard}.wsds") if p]
+        path = os.path.normpath("/".join(parts))
+        # Strip leading "../" — partition is relative to the index but
+        # volume paths are absolute from the volume root.
+        while path.startswith("../"):
+            path = path[3:]
+        return cls(dataset, link["volume_name"], path, shard_ref=shard_ref)
+
+    @classmethod
+    def get_columns(cls, link, dataset):
+        """Return columns provided by this Modal link."""
+        if "columns" in link:
+            return {col: col for col in link["columns"]}
+        columns = cls._discover_columns(link)
+        return {col: col for col in columns if col != "__key__"}
+
+    @classmethod
+    def _discover_columns(cls, link):
+        """Read one shard's footer from the Modal Volume to discover column names."""
+        import modal
+
+        vol = modal.Volume.from_name(link["volume_name"])
+        prefix = link["prefix"]
+        for entry in vol.listdir(prefix):
+            if entry.path.endswith(".wsds"):
+                reader = ModalFileReader.from_name(link["volume_name"], entry.path)
+                try:
+                    return FeatherFile(reader).schema.names
+                finally:
+                    reader.close()  # release the reader even if the footer cannot be parsed
+        raise ValueError(f"No .wsds files found in modal volume '{link['volume_name']}' at prefix '{prefix}'")
+
+    def _modal_path(self) -> str:
+        return f"modal://{self.volume_name}/{self.path}"
+
+    def get_sample(self, column: str, offset: int) -> typing.Any:
+        if self._batch is None or offset < self._start or offset >= self._end:
+            i = offset // self.batch_size
+            if i >= self._feather.num_record_batches:
+                raise IndexError(f"{offset} is out of range for shard {self._modal_path()}")
+            self._batch = self._feather.record_batch(i)
+            if i < self._feather.num_record_batches - 1:
+                if self._batch.num_rows < self.batch_size:
+                    raise ValueError(
+                        f"Batch {i} in shard {self._modal_path()} is incomplete "
+                        f"(has only {self._batch.num_rows} rows instead of {self.batch_size})"
+                    )
+            self._start = i * self.batch_size
+            self._end = self._start + self.batch_size
+
+        j = offset % self.batch_size
+        if j >= self._batch.num_rows:
+            raise IndexError(f"{offset} is out of range for shard {self._modal_path()}")
+        try:
+            col = self._batch.column(column)
+        except KeyError:
+            raise KeyError(f"column {column} not found in shard {self._modal_path()}")
+        data = col[j]
+        try:
+            if isinstance(col, LazyBinaryArray):
+                data._optimal_read_size = 2 * 1024 * 1024
+                return decode_sample(column, data)
+        except Exception as e:
+            raise ValueError(f"Failed to decode column {column} in shard {self._modal_path()} (offset {offset}): {e}")
+        return data
+
+    def __repr__(self):
+        r = f"WSModalShard('{self._modal_path()}')"
+        if self._batch is not None:  # same "is a batch cached" test as get_sample
+            r += f" # cached_region = [{self._start}, {self._end}]"
+        return r
diff --git a/wsds/ws_s3_shard.py b/wsds/ws_s3_shard.py
index 3f0c4c3..6e30dab 100644
--- a/wsds/ws_s3_shard.py
+++ b/wsds/ws_s3_shard.py
@@ -1,15 +1,12 @@
 import os
-import pickle
 import typing
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional, Tuple
 from urllib.parse import urlparse
 
-import numpy as np
-
 from .pupyarrow.file_reader import S3FileReader
-from .pupyarrow.pupyarrow import FeatherFile
+from .pupyarrow.pupyarrow import FeatherFile, LazyBinaryArray
 from .utils import WSShardMissingError
-from .ws_audio import AudioReader
+from .ws_decode import decode_sample
 from .ws_shard import WSShardInterface
 
 if TYPE_CHECKING:
@@ -23,9 +20,9 @@ class WSS3Shard(WSShardInterface):
     IPC footer and the specific batch(es) needed are fetched, rather than
     downloading the entire shard file."""
 
-    def __init__(self, dataset: "WSDataset", bucket: str, key: str, shard_name=None, s3_client=None):
+    def __init__(self, dataset: "WSDataset", bucket: str, key: str, shard_ref: Optional[Tuple[str, str]]=None, s3_client=None):
         self.dataset = dataset
-        self.shard_name = shard_name
+        self.shard_ref = shard_ref
         self.bucket = bucket
         self.key = key
 
@@ -47,14 +44,14 @@ def __init__(self, dataset: "WSDataset", bucket: str, key: str, shard_name=None,
         self._batch = None
 
     @classmethod
-    def from_s3_url(cls, dataset: "WSDataset", url: str, shard_name=None, s3_client=None):
+    def from_s3_url(cls, dataset: "WSDataset", url: str, shard_ref: Optional[Tuple[str, str]]=None, s3_client=None):
         """Construct from an s3://bucket/key URL."""
         parsed = urlparse(url)
         if parsed.scheme != "s3":
             raise ValueError(f"expected s3:// URL, got: {url}")
         bucket = parsed.netloc
         key = parsed.path.lstrip("/")
-        return cls(dataset, bucket, key, shard_name=shard_name, s3_client=s3_client)
+        return cls(dataset, bucket, key, shard_ref=shard_ref, s3_client=s3_client)
 
     @classmethod
     def get_columns(cls, link, dataset):
@@ -65,13 +62,13 @@ def get_columns(cls, link, dataset):
         return {col: col for col in columns if col != "__key__"}
 
     @classmethod
-    def from_link(cls, link, dataset, shard_name):
+    def from_link(cls, link, dataset, shard_ref):
         """Create an S3 shard from a link spec."""
-        dataset_path, shard = shard_name
+        partition, shard = shard_ref
         prefix = link["prefix"]
-        key = f"{prefix}/{dataset_path}/{shard}.wsds" if dataset_path else f"{prefix}/{shard}.wsds"
+        key = f"{prefix}/{partition}/{shard}.wsds" if partition else f"{prefix}/{shard}.wsds"
         s3_client = cls._make_s3_client(link.get("endpoint_url"))
-        return cls(dataset, link["bucket"], os.path.normpath(key), shard_name=shard_name, s3_client=s3_client)
+        return cls(dataset, link["bucket"], os.path.normpath(key), shard_ref=shard_ref, s3_client=s3_client)
 
     @classmethod
     def _make_s3_client(cls, endpoint_url=None):
@@ -124,19 +121,12 @@ def get_sample(self, column: str, offset: int) -> typing.Any:
             raise KeyError(f"column {column} not found in shard {self._s3_path()}")
         data = col[j]
         try:
-            if column.endswith("npy"):
-                return np.load(data)
-            elif column.endswith("pyd"):
-                return pickle.load(data)
-            elif column.endswith("txt"):
-                return data if isinstance(data, str) else data.decode("utf-8")
-            elif column in self.dataset._audio_file_keys:
+            if isinstance(col, LazyBinaryArray):
                 data._optimal_read_size = 2 * 1024 * 1024
-                return AudioReader(data)
-            else:
-                return data
+                return decode_sample(column, data)
         except Exception as e:
             raise ValueError(f"Failed to decode column {column} in shard {self._s3_path()} (offset {offset}): {e}")
+        return data
 
     def __repr__(self):
         r = f"WSS3Shard('{self._s3_path()}')"
diff --git a/wsds/ws_sample.py b/wsds/ws_sample.py
index 70eb2cf..bfd7434 100644
--- a/wsds/ws_sample.py
+++ b/wsds/ws_sample.py
@@ -2,6 +2,7 @@
 from typing import TYPE_CHECKING
 
 from .utils import WSShardMissingError, validate_shards
+from .ws_decode import get_audio as _get_audio
 
 if TYPE_CHECKING:
     from .ws_dataset import WSDataset
@@ -10,22 +11,15 @@
 @dataclass(frozen=True)
 class WSSample:
     dataset: "WSDataset"
-    shard_name: str
+    shard_ref: tuple[str, str]
     offset: int
     overrides: dict = field(default_factory=dict)
     # Key verification state (mutable containers to work with frozen dataclass)
-    _verified_subdirs: set = field(default_factory=set, repr=False, compare=False)
+    _verified_column_dirs: set = field(default_factory=set, repr=False, compare=False)
     _reference_key: list = field(default_factory=list, repr=False, compare=False)
 
     def get_audio(self, audio_columns=None):
-        candidates = audio_columns or self.dataset._audio_file_keys
-
-        r = self.get_one_of(*candidates)
-
-        if not r:
-            raise KeyError(f"No audio column (tried {candidates}) found among: {list(self.keys())}")
-
-        return r
+        return _get_audio(self, audio_columns)
 
     def keys(self):
         return self.dataset.fields.keys() | (self.overrides.keys() if self.overrides else set())
@@ -37,46 +31,46 @@ def values(self):
         yield from (v for _, v in self.items())
 
     def _verify_key_for_field(self, field: str):
-        """Verify __key__ in this field's subdir matches the reference key."""
+        """Verify __key__ in this field's column_dir matches the reference key."""
         value = self.dataset.fields.get(field)
         if value is None:
             return
-        (subdir, _column) = value[0]
+        (column_dir, _column) = value[0]
 
-        if subdir in self._verified_subdirs:
+        if column_dir in self._verified_column_dirs:
             return
 
         # Skip computed columns (they don't have their own __key__)
-        if subdir in self.dataset.computed_columns:
-            self._verified_subdirs.add(subdir)
+        if column_dir in self.dataset.computed_columns:
+            self._verified_column_dirs.add(column_dir)
             return
 
-        # Get __key__ from this subdir
+        # Get __key__ from this column_dir
         try:
-            key = self.dataset.get_shard(subdir, self.shard_name).get_sample("__key__", self.offset)
+            key = self.dataset.get_shard(column_dir, self.shard_ref).get_sample("__key__", self.offset)
         except (WSShardMissingError, KeyError):
             # Can't verify if shard or key is missing
-            self._verified_subdirs.add(subdir)
+            self._verified_column_dirs.add(column_dir)
             return
 
         if not self._reference_key:
-            # First subdir accessed - store as reference
-            self._reference_key.append((subdir, key))
+            # First column_dir accessed - store as reference
+            self._reference_key.append((column_dir, key))
         else:
-            ref_subdir, ref_key = self._reference_key[0]
+            ref_column_dir, ref_key = self._reference_key[0]
             if key != ref_key:
                 raise ValueError(
-                    f"Key mismatch at offset {self.offset} in shard {self.shard_name}: "
-                    f"{ref_subdir} has '{ref_key}' but {subdir} has '{key}'"
+                    f"Key mismatch at offset {self.offset} in shard {self.shard_ref}: "
+                    f"{ref_column_dir} has '{ref_key}' but {column_dir} has '{key}'"
                 )
 
-        self._verified_subdirs.add(subdir)
+        self._verified_column_dirs.add(column_dir)
 
     def __getitem__(self, field):
         if field in self.overrides:
             return self.overrides[field]
         self._verify_key_for_field(field)
-        return self.dataset.get_sample(self.shard_name, field, self.offset)
+        return self.dataset.get_sample(self.shard_ref, field, self.offset)
 
     def __setitem__(self, field, value):
         self.overrides[field] = value
@@ -117,53 +111,54 @@ def __repr_field__(self, field, repr=repr):
 
     def __repr__(self, repr=repr):
         r = [
-            f"WSSample({self.dataset.__repr__()}, shard_name={repr(self.shard_name)}, offset={repr(self.offset)}, fields={'{'}"
+            f"WSSample({self.dataset.__repr__()}, shard_ref={repr(self.shard_ref)}, offset={repr(self.offset)}, fields={'{'}"
         ]
         other = []
         txt = []
         arrays = []
 
-        # Group columns by subdirectory
-        subdir_columns = {}
+        # Group columns by column directory
+        columns_by_dir = {}
         for k in self.keys():
             if k in self.overrides:
-                subdir = "__overrides__"
+                column_dir = "__overrides__"
             elif k in self.dataset.fields:
                 value = self.dataset.fields[k]
-                (subdir, _column) = value[0]
+                (column_dir, _column) = value[0]
             else:
-                subdir = "__unknown__"
-            if subdir not in subdir_columns:
-                subdir_columns[subdir] = []
-            subdir_columns[subdir].append(k)
-
-        # Prefetch shard tails concurrently for all subdirs that will be accessed
-        subdirs_to_prefetch = [s for s in subdir_columns.keys() if s not in ("__overrides__", "__unknown__")]
-        if subdirs_to_prefetch:
-            validate_shards(self.dataset, [self.shard_name], subdirs_to_prefetch)
-
-        # Identify large subdirectories (>10 columns)
-        large_subdirs = {
-            subdir: cols
-            for subdir, cols in subdir_columns.items()
-            if len(cols) > 10 and subdir not in ("__overrides__", "__unknown__")
+                column_dir = "__unknown__"
+            if column_dir not in columns_by_dir:
+                columns_by_dir[column_dir] = []
+            columns_by_dir[column_dir].append(k)
+
+        # Prefetch shard tails concurrently for all column dirs that will be accessed
+        dirs_to_prefetch = [s for s in columns_by_dir.keys() if s not in ("__overrides__", "__unknown__")]
+        if dirs_to_prefetch:
+            validate_shards(self.dataset, [self.shard_ref], dirs_to_prefetch)
+
+        # Identify large column directories (>10 columns)
+        large_dirs = {
+            column_dir: cols
+            for column_dir, cols in columns_by_dir.items()
+            if len(cols) > 10 and column_dir not in ("__overrides__", "__unknown__")
         }
 
-        # Columns in small subdirectories go through normal classification
-        small_subdir_keys = set()
-        for subdir, cols in subdir_columns.items():
-            if subdir not in large_subdirs:
-                small_subdir_keys.update(cols)
+        # Columns in small column directories go through normal classification
+        small_dir_keys = set()
+        for column_dir, cols in columns_by_dir.items():
+            if column_dir not in large_dirs:
+                small_dir_keys.update(cols)
 
-        for k in small_subdir_keys:
+        missing = []
+        for k in small_dir_keys:
             try:
                 v = self[k]
-            except WSShardMissingError as err:
-                arrays.append(k)
-            except KeyError as err:
+            except WSShardMissingError:
+                missing.append(k)
+            except KeyError:
                 other.append(k)
             else:
-                if k == '__key__':
+                if k == "__key__":
                     other.insert(0, k)
                 elif hasattr(v, "shape") and v.shape:
                     arrays.append(k)
@@ -184,9 +179,9 @@ def print_keys(keys, max_keys=None):
             r.append("# Arrays:")
             print_keys(arrays)
 
-        # Handle large subdirectories
-        for subdir, cols in sorted(large_subdirs.items()):
-            r.append(f"# {subdir} ({len(cols)} columns, showing top 10):")
+        # Handle large column directories
+        for column_dir, cols in sorted(large_dirs.items()):
+            r.append(f"# {column_dir} ({len(cols)} columns, showing top 10):")
 
             # Try to get float values for sorting by highest value
             float_values = []
@@ -213,6 +208,10 @@ def print_keys(keys, max_keys=None):
             if len(cols) > 10:
                 r.append(f"  # ... and {len(cols) - 10} more columns")
 
+        if missing:
+            r.append("# Missing shards:")
+            print_keys(missing)
+
         r.append("})\n")
         return "\n".join(r)
 
@@ -231,13 +230,18 @@ def marimo_repr(x):
             else:
                 return repr(x)
 
-        # print(repr(special))
         html = marimo.md(f"```python\n{self.__repr__(repr=marimo_repr)}\n```").text
         for k, v in special.items():
             html = html.replace(k, v)
         return marimo.Html(html)
 
     def _ipython_display_(self):
+        from .utils import is_notebook
+
+        if not is_notebook():
+            print(repr(self))
+            return
+
         import random
 
         from IPython.display import HTML, display
@@ -253,9 +257,11 @@ def ipython_repr(x):
                 return repr(x)
 
         # Jupyter Markdown renders client-side so we cannot use the same trick as Marimo
-        html = ('<pre style="font-family: monospace; white-space: pre-wrap;">' +
-            self.__repr__(repr=ipython_repr).replace("<", "&lt;").replace(">", "&gt;") +
-            '</pre>')
+        html = (
+            '<pre style="font-family: monospace; white-space: pre-wrap;">'
+            + self.__repr__(repr=ipython_repr).replace("<", "&lt;").replace(">", "&gt;")
+            + "</pre>"
+        )
 
         for k, v in special.items():
             html = html.replace(k, v)
diff --git a/wsds/ws_shard.py b/wsds/ws_shard.py
index 15f4ab4..4d3d514 100644
--- a/wsds/ws_shard.py
+++ b/wsds/ws_shard.py
@@ -1,29 +1,26 @@
 import io
-import pickle
 import re
 import typing
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-import numpy as np
 import pyarrow as pa
 
+from .utils import WSShardMissingError
 from .ws_audio import AudioReader, WSAudio
+from .ws_decode import decode_sample
 from .ws_sample import WSSample
-from .utils import WSShardMissingError
 
 if TYPE_CHECKING:
     from .ws_dataset import WSDataset
 
 
 class WSShardInterface:
-    shard_name: str
+    shard_ref: tuple[str, str]
     """Used by WSDataset to invalidate cached shards."""
 
     @classmethod
-    def get_columns(
-        cls, link: dict, source_dataset: "WSDataset", derived_dataset: "WSDataset"
-    ) -> dict[str, str] | None:
+    def get_columns(cls, link: dict, dataset: "WSDataset") -> dict[str, str] | None:
         """Return columns this link provides: {column_name: column_name}.
 
         Override this to provide multiple columns from a single link.
@@ -45,9 +42,9 @@ class WSShard(WSShardInterface):
     batch_size: int
     dataset: "WSDataset"
 
-    def __init__(self, dataset, fname, shard_name=None):
+    def __init__(self, dataset, fname, shard_ref=None):
         self.dataset = dataset
-        self.shard_name = shard_name
+        self.shard_ref = shard_ref
         self.fname = fname
 
         try:
@@ -85,19 +82,11 @@ def get_sample(self, column: str, offset: int) -> typing.Any:
         if self._data.schema.get_field_index(column) == -1:
             raise KeyError(f"column {column} not found in shard {self.fname}")
         data = self._data[column][j]
+        col_type = self._data.schema.field(column).type
         try:
-            # FIXME: implement proper encoders and decoders
-            if column.endswith("npy"):
-                return np.load(io.BytesIO(data.as_buffer()))
-            elif column.endswith("pyd"):
-                return pickle.load(io.BytesIO(data.as_buffer()))
-            elif column.endswith("txt"):
-                return data.as_buffer().to_pybytes().decode("utf-8")
-            elif column in self.dataset._audio_file_keys:
-                return AudioReader(data)
-            else:
-                # FIXME: we need to handle audio decoding here to avoid copying the entire audio buffer
-                return data.as_py(maps_as_pydicts="strict")
+            if pa.types.is_binary(col_type) or pa.types.is_large_binary(col_type):
+                return decode_sample(column, io.BytesIO(data.as_buffer()))
+            return data.as_py(maps_as_pydicts="strict")
         except Exception as e:
             raise ValueError(f"Failed to decode column {column} in shard {self.fname} (offset {offset}): {e}")
 
@@ -114,7 +103,7 @@ class WSSourceAudioShard(WSShardInterface):
 
     It is used via the `WSDataset.add_computed` method or the `.wsds-link` file mechanism."""
 
-    shard_name: str
+    shard_ref: tuple[str, str]
     source_dataset: "WSDataset"  # noqa: F821
     derived_dataset: "WSDataset"  # noqa: F821
     vad_column: str
@@ -125,23 +114,23 @@ class WSSourceAudioShard(WSShardInterface):
     _source_reader: AudioReader = None
 
     @classmethod
-    def from_link(cls, link, dataset, shard_name):
+    def from_link(cls, link, dataset, shard_ref):
         source_dataset = dataset.get_linked_dataset(link["dataset_dir"])
-        return cls(shard_name, source_dataset, dataset, link["vad_column"])
+        return cls(shard_ref, source_dataset, dataset, link["vad_column"])
 
     def get_timestamps(self, segment_offset):
         return self._source_sample[self.vad_column][segment_offset]
 
     def get_sample(self, _column, offset):
         file_name, segment_offset = self.derived_dataset.parse_key(
-            WSSample(self.derived_dataset, self.shard_name, offset)["__key__"]
+            WSSample(self.derived_dataset, self.shard_ref, offset)["__key__"]
         )
 
         if self._source_file_name != file_name:
             self._source_sample = self.source_dataset[file_name]
             try:
                 self._source_reader = self._source_sample.get_audio()
-            except KeyError as err:
+            except KeyError:
                 raise WSShardMissingError("no audio shards found")
             self._source_file_name = file_name
 
@@ -153,8 +142,8 @@ class WSYoutubeVideoShard(WSSourceAudioShard):
     re_pattern: re.Pattern[str]
 
     @classmethod
-    def from_link(cls, link, dataset, shard_name):
-        self = super().from_link(link, dataset, shard_name)
+    def from_link(cls, link, dataset, shard_ref):
+        self = super().from_link(link, dataset, shard_ref)
         self.re_pattern = re.compile(link["youtube_id_regexp"])
         return self
 
@@ -179,7 +168,7 @@ class WSSourceLink(WSShardInterface):
     {"dataset_dir": "../source", "loader": ["wsds.ws_shard", "WSSourceLink"], "key_prefix": "source."}
     """
 
-    shard_name: str
+    shard_ref: (str, str)
     source_dataset: "WSDataset"
     derived_dataset: "WSDataset"
     key_prefix: str
@@ -202,14 +191,14 @@ def get_columns(cls, link, dataset):
         return columns
 
     @classmethod
-    def from_link(cls, link, dataset, shard_name):
+    def from_link(cls, link, dataset, shard_ref):
         source_dataset = dataset.get_linked_dataset(link["dataset_dir"])
         key_prefix = link.get("key_prefix", "source.")
-        return cls(shard_name, source_dataset, dataset, key_prefix)
+        return cls(shard_ref, source_dataset, dataset, key_prefix)
 
     def get_sample(self, column: str, offset: int):
         # Parse the derived dataset's key to get the source file name
-        derived_key = WSSample(self.derived_dataset, self.shard_name, offset)["__key__"]
+        derived_key = WSSample(self.derived_dataset, self.shard_ref, offset)["__key__"]
         file_name, _segment_offset = self.derived_dataset.parse_key(derived_key)
 
         if self._source_file_name != file_name:
diff --git a/wsds/ws_sink.py b/wsds/ws_sink.py
index f506c62..5c7101b 100644
--- a/wsds/ws_sink.py
+++ b/wsds/ws_sink.py
@@ -33,13 +33,13 @@ class WSBatchedSink:
     Automatically batches data and infers the schema from the first batch.
 
     Example:
-    >>> with WSBatchedSink('output.feather', batch_size=2, throwaway=True) as sink: sink.write({'a': 1, 'b': 'x'})
+    >>> with WSBatchedSink('output.feather', throwaway=True) as sink: sink.write({'a': 1, 'b': 'x'})
     """
 
     def __init__(
         self,
         fname,  # final output file name, intermediate output goes into a temporary file
-        min_batch_size_bytes: int = 1024*1024, # minimum size of a batch in bytes (1MB by default)
+        min_batch_size_bytes: int = 1024 * 1024,  # minimum size of a batch in bytes (1MB by default)
         compression: str | None = "zstd",
         throwaway=False,  # discard the temp file, useful for testing and benchmarking
     ):
diff --git a/wsds/ws_tools.py b/wsds/ws_tools.py
index 6a0e2eb..e65f8a2 100644
--- a/wsds/ws_tools.py
+++ b/wsds/ws_tools.py
@@ -153,14 +153,14 @@ def shards(dataset: Path, verbose=False, complete_in_progress=False):
 
         shard_names = list_all_shards(dataset, verbose=True, print_missing=False)
         print()
-        for subdir in Path(dataset).iterdir():
-            if not subdir.is_dir():
+        for column_dir in Path(dataset).iterdir():
+            if not column_dir.is_dir():
                 continue
-            shards = [(Path(dataset_path) / subdir / shard).with_suffix(".wsds") for dataset_path, shard in shard_names]
+            shards = [(Path(partition) / column_dir / shard).with_suffix(".wsds") for partition, shard in shard_names]
             schemas = {shard: get_shard_schema(shard) for shard in shards}
             unique = set(s for s in schemas.values() if s)
             if len(unique) > 1:
-                print(f"Found schema conflicts for {subdir}:\n")
+                print(f"Found schema conflicts for {column_dir}:\n")
                 for schema in unique:
                     matching_shards = [shard for shard, shard_schema in schemas.items() if schema == shard_schema]
                     prefix = f"  in {len(matching_shards)} shards: "
@@ -170,7 +170,7 @@ def shards(dataset: Path, verbose=False, complete_in_progress=False):
                             print(indented(" " * len(prefix), shard))
                         print()
             if None not in schemas.values() and complete_in_progress:
-                os.rename(subdir, str(subdir).replace('.in-progress', ''))
+                os.rename(column_dir, str(column_dir).replace('.in-progress', ''))
 
     @staticmethod
     def load_test_yaml(test_yaml_path: Path):
@@ -306,39 +306,39 @@ def keys(dataset: Path, verbose=False, skip_audio=True):
 
         dataset = Path(dataset)
         if next(dataset.iterdir()).suffix == ".wsds":
-            subdirs = [dataset]
+            column_dirs = [dataset]
             dataset = dataset.parent
         else:
-            subdirs = list(dataset.iterdir())
+            column_dirs = list(dataset.iterdir())
             if skip_audio:
-                subdirs = [dir for dir in subdirs if dir.name != "audio"]
+                column_dirs = [dir for dir in column_dirs if dir.name != "audio"]
 
         ds = WSDataset(dataset)
         shards = ds.get_shard_list()
         missing_shards = defaultdict(int)
         for shard in tqdm(shards, desc=str(dataset)):
             expected_keys = generate_all_keys_for_shard(ds.index, shard)
-            for subdir in subdirs:
-                if not subdir.is_dir():
+            for column_dir in column_dirs:
+                if not column_dir.is_dir():
                     continue
-                shard_fname = (Path(shard[0]) / subdir / shard[1]).with_suffix(".wsds")
+                shard_fname = (Path(shard[0]) / column_dir / shard[1]).with_suffix(".wsds")
                 if not shard_fname.exists():
-                    missing_shards[subdir] += 1
+                    missing_shards[column_dir] += 1
                 else:
                     try:
                         if not pl.scan_ipc(shard_fname).select((pl.col("__key__") == expected_keys).all()).collect().item():
-                            tqdm.write(f"Shard {shard} in {subdir} has keys that don't match the index.")
+                            tqdm.write(f"Shard {shard} in {column_dir} has keys that don't match the index.")
                     except pl.exceptions.ShapeError as err:
-                        tqdm.write(f"Shard {shard} in {subdir} has {pl.scan_ipc(shard_fname).select(pl.len()).collect().item()} keys while we expect {len(expected_keys)}.")
+                        tqdm.write(f"Shard {shard} in {column_dir} has {pl.scan_ipc(shard_fname).select(pl.len()).collect().item()} keys while we expect {len(expected_keys)}.")
                     reader = pa.RecordBatchFileReader(pa.memory_map(str(shard_fname)))
                     batch_size = int(reader.schema.metadata[b'batch_size'])
                     for i in range(reader.num_record_batches - 1):
                         batch = reader.get_batch(i)
                         if len(batch) != batch_size:
-                            tqdm.write(f"Batch {i} in shard {shard} in {subdir} has incorrect length.")
-        for subdir, count in missing_shards.items():
+                            tqdm.write(f"Batch {i} in shard {shard} in {column_dir} has incorrect length.")
+        for column_dir, count in missing_shards.items():
             tqdm.write("")
-            tqdm.write(f"{subdir}: missing {count} shards")
+            tqdm.write(f"{column_dir}: missing {count} shards")
 
     @staticmethod
     def all(base_path, skip_audio=True):
@@ -374,10 +374,14 @@ def get_shard_schema(fname):
 
 
 def generate_all_keys_for_shard(index, shard):
-    if index.has_dataset_path:
-        N, shard_id = index.query("SELECT n_samples, shard_id FROM shards WHERE shards.dataset_path = ? AND shards.shard = ?", *shard).fetchone()
+    partition, shard_name = shard
+    if index.has_partition or index.has_dataset_path:
+        N, shard_id = index.query(
+            f"SELECT n_samples, shard_id FROM shards AS s WHERE {index._partition_col} = ? AND s.shard = ?",
+            partition, shard_name,
+        ).fetchone()
     else:
-        N, shard_id = index.query("SELECT n_samples, shard_id FROM shards WHERE shards.shard = ?", shard[1]).fetchone()
+        N, shard_id = index.query("SELECT n_samples, shard_id FROM shards WHERE shards.shard = ?", shard_name).fetchone()
     files = index.query("SELECT name, offset FROM files WHERE files.shard_id == ?", shard_id).fetchall()
     df = pl.DataFrame(files, schema=["name", "offset"], orient="row")
     if not index.metadata["segmented"]:
@@ -420,7 +424,7 @@ def init(
         with WSDSIndexWriter(fname) as index:
             with multiprocessing.Pool(num_workers) as p:
                 for r in progress_bar(p.imap_unordered(shard_extractor, all_shards), total=len(all_shards)):
-                    r["dataset_path"] = ""
+                    r["partition"] = ""
                     try:
                         index.append(r)
                     except:
@@ -477,7 +481,7 @@ def init_split(
                 try:
                     with multiprocessing.Pool(num_workers) as p:
                         for r in p.imap_unordered(shard_extractor, all_shards):
-                            r["dataset_path"] = Path(split) / new_dataset
+                            r["partition"] = Path(split) / new_dataset
                             try:
                                 index.append(r)
                             except:
@@ -492,7 +496,7 @@ def init_split(
             new_fields = {
                 k: v
                 for k, v in ds.fields.items()
-                for (_subdir, col) in [v[0]]
+                for (_column_dir, col) in [v[0]]
                 if col not in ["sample_source_id", "src_key"]
             }
             if vad_column: