From 64306d4f37595a0d06cd216e5750840a4ce3185a Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Tue, 5 May 2026 15:20:37 +0000
Subject: [PATCH 1/2] fix VoxelImage.from_numpy segfault on GPU build (host
 write to device mem)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported on Colab T4: notebook §3 crashes the kernel silently at
`_core.VoxelImage.from_numpy(arr, max_grid_size)`.

Root cause: the binding was filling the new iMultiFab with a CPU loop:

    auto fab = img->mf->array(mfi);  // Array4 view into iMultiFab data
    for (k...) for (j...) for (i...)
        fab(i, j, k) = ptr[idx];     // host write — but in CUDA mode this
                                     // is DEVICE memory → segfault

Works fine on CPU builds (where iMultiFab data is host memory) but every
attempt to ingest a NumPy array on the GPU build dies before any Python
print can flush. Affects every entry point that goes through from_numpy —
so high-level oi.tortuosity / oi.percolation_check / oi.volume_fraction
on the GPU wheel were ALL broken.

Switch to the AMReX idiom: stage the host data in a Gpu::DeviceVector,
then use ParallelFor with an AMREX_GPU_DEVICE lambda so the assignment
runs on the actual hardware that owns the iMultiFab memory.

* CPU build (#ifndef AMREX_USE_GPU): src_ptr is the host pointer and
  ParallelFor expands to a serial/OMP host loop. Identical behaviour to
  before.

* GPU build (#ifdef AMREX_USE_GPU): copy host → device once via
  Gpu::copyAsync + streamSynchronize, then ParallelFor launches a kernel
  that reads from the device buffer and writes through the Array4 view to
  its own memory. Final streamSynchronize before FillBoundary makes sure
  the kernel is done before the ghost-cell exchange reads it.

Note: this fix requires a wheel rebuild + new release. The pure-Python
preload helper in 4.2.9 already lets _core.so load on GPU runtimes; this
change makes it actually USABLE on them.
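For anyone hitting the same class of bug elsewhere, here is the idiom in
isolation. This is an illustrative sketch, not the binding code:
fill_from_host is a made-up name, the includes are the usual AMReX set,
and the only AMReX calls relied on are the ones used in the diff below.

    #include <AMReX.H>
    #include <AMReX_iMultiFab.H>
    #include <AMReX_GpuContainers.H>

    void fill_from_host (amrex::iMultiFab& mf, const int* host_ptr,
                         int nx, int ny, int nz)
    {
    #ifdef AMREX_USE_GPU
        // Device build: host_ptr cannot be dereferenced inside a kernel,
        // so stage the data in device memory once, up front.
        const std::size_t total = std::size_t(nx) * ny * nz;
        amrex::Gpu::DeviceVector<int> dvec(total);
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice,
                              host_ptr, host_ptr + total, dvec.begin());
        amrex::Gpu::streamSynchronize();
        const int* src = dvec.data();
    #else
        const int* src = host_ptr;  // host build: read the buffer directly
    #endif
        for (amrex::MFIter mfi(mf, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
            const amrex::Box& bx = mfi.tilebox();
            auto const& a = mf.array(mfi);
            // On GPU builds ParallelFor launches a kernel; on CPU builds it
            // runs a plain loop. Either way the write executes on the
            // hardware that owns mf's memory.
            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept {
                const std::size_t idx = std::size_t(k) * ny * nx
                                      + std::size_t(j) * nx + std::size_t(i);
                a(i, j, k) = src[idx];
            });
        }
    #ifdef AMREX_USE_GPU
        amrex::Gpu::streamSynchronize();  // dvec must outlive the kernels
    #endif
    }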
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 python/bindings/module.cpp | 48 +++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index 2512552..d151874 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -169,29 +169,45 @@ voxelimage_from_numpy(py::array_t<int>
     img->dm.define(img->ba);
     img->mf = std::make_shared<amrex::iMultiFab>(img->ba, img->dm, 1, 1);
 
-    const auto* ptr = static_cast<const int*>(buf.ptr);
+    const auto* host_ptr = static_cast<const int*>(buf.ptr);
+    const std::size_t total = static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
+                              static_cast<std::size_t>(nz);
+
+    // Stage the NumPy data in a buffer the kernel below can dereference safely.
+    // On CPU builds this is just the host pointer (no copy). On CUDA builds the
+    // iMultiFab data lives in device memory, so we have to copy the host array
+    // to a device-side buffer first — writing to fab(i,j,k) directly from host
+    // would segfault (T4, A100, etc.) because the Array4 view points at
+    // device memory.
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::DeviceVector<int> dvec(total);
+    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_ptr, host_ptr + total, dvec.begin());
+    amrex::Gpu::streamSynchronize();
+    const int* src_ptr = dvec.data();
+#else
+    const int* src_ptr = host_ptr;
+#endif
+
+    const int nx_l = nx;
+    const int ny_l = ny;
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
 #endif
     for (amrex::MFIter mfi(*(img->mf), amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
         const amrex::Box& bx = mfi.tilebox();
-        auto fab = img->mf->array(mfi);
-        const auto lo = amrex::lbound(bx);
-        const auto hi = amrex::ubound(bx);
-
-        for (int k = lo.z; k <= hi.z; ++k) {
-            for (int j = lo.y; j <= hi.y; ++j) {
-                for (int i = lo.x; i <= hi.x; ++i) {
-                    std::size_t idx = static_cast<std::size_t>(k) * static_cast<std::size_t>(ny) *
-                                      static_cast<std::size_t>(nx) +
-                                      static_cast<std::size_t>(j) * static_cast<std::size_t>(nx) +
-                                      static_cast<std::size_t>(i);
-                    fab(i, j, k) = ptr[idx];
-                }
-            }
-        }
+        auto const& fab = img->mf->array(mfi);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const std::size_t idx = static_cast<std::size_t>(k) * static_cast<std::size_t>(ny_l) *
+                                    static_cast<std::size_t>(nx_l) +
+                                    static_cast<std::size_t>(j) * static_cast<std::size_t>(nx_l) +
+                                    static_cast<std::size_t>(i);
+            fab(i, j, k) = src_ptr[idx];
+        });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     img->mf->FillBoundary(img->geom.periodicity());
 
     return img;

From f02166b757ca051be46a6ae4f6a42e6f38ebb7c3 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Tue, 5 May 2026 15:26:28 +0000
Subject: [PATCH 2/2] profiling notebook: GPU-aware install, drop legacy
 probes, polish prose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The profiling notebook had grown a few rough edges as we iterated:

1. The install cell hard-coded `pip install openimpala`, ignoring the GPU
   runtime entirely. On a Colab T4 every solve was running on the CPU —
   exactly the failure mode §1a is supposed to detect. Rewrite to mirror
   tutorials/02/04/07: detect nvidia-smi, install openimpala-cuda on GPU
   runtimes, fall back to openimpala otherwise.

2. §1a's build_info() probe carried a 60-line subprocess banner fallback
   for "pre-4.0.2 wheels". The published wheel has been at 4.2.x for some
   time now and build_info() is in every wheel we publish. Drop the legacy
   fallback and the AttributeError handler.

3. Drop colloquialisms in markdown headers and prose:
   - "Profiling & Bottleneck Hunt" -> "Profiling and Performance Tuning"
   - "exists for one job" / "If you only have 5 minutes" — removed
   - "*(focused, post-diagnosis)*" / "*(optional)*" parentheticals
   - "wonky workaround" / "the *classic* signature" — recast neutrally
   - The "issue #256 acceptance target" reference in §9b — replaced with
     an explanation that scales naturally to readers who weren't following
     the issue thread

4. Add a proper contents table to the intro and clean up §2/§5/§6/§7/§9
   intros with consistent structure: one-paragraph context, bullet list of
   what to look for, one paragraph on interpretation.

5. Fix a latent bug in §12's recommendation: the CPU-detection branch
   checked `backend == "cpu"`, but build_info()'s actual values are
   `cpp-cpu` / `cpp-cuda` / `cpp-hip` / `pure-python`. The check never
   fired in practice. Switch to the already-computed `is_gpu_build` flag
   so the "rebuild for GPU" recommendation actually appears when relevant.

No analysis logic changes — every code cell that produced a chart, a fit,
or a number still does so identically. This is purely a polish pass to
make the notebook read as a tool rather than a scratchpad.
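The §12 change in miniature (paraphrased from the diff below; the print
bodies are abbreviated here):

    # Before (dead branch: backend is "cpp-cpu", "cpp-cuda", "cpp-hip",
    # or "pure-python", never the bare string "cpu"):
    if gpu_name and backend == "cpu":
        print("rebuild for GPU ...")

    # After (reuses the flag §1a already computed):
    if gpu_name and not is_gpu_build:
        print("install openimpala-cuda ...")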
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- notebooks/profiling_and_tuning.ipynb | 439 +++++---------------------- 1 file changed, 83 insertions(+), 356 deletions(-) diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index 8542cb8..eeb1e35 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -19,38 +19,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/notebooks/profiling_and_tuning.ipynb)\n", - "\n", - "# OpenImpala Profiling & Bottleneck Hunt\n", - "\n", - "This notebook exists for **one job**: find out where OpenImpala is spending\n", - "its time, and whether the slowdown is in Python bindings, AMReX/HYPRE setup,\n", - "or the linear solve itself.\n", - "\n", - "Each `oi.tortuosity(numpy_array, ...)` call rebuilds the entire AMReX +\n", - "HYPRE stack from scratch and runs a percolation flood-fill before the solve.\n", - "That per-call overhead can dominate for small/medium problems. We isolate it\n", - "in the first four sections, *then* run the deeper C++/GPU profilers.\n", - "\n", - "**Sections**\n", - "\n", - "1. Setup\n", - "2. Synthetic datasets (tiny \u2192 medium)\n", - "3. **Python-level stage breakdown** \u2014 time every stage of one solve\n", - "4. **Fixed-overhead extraction** \u2014 fit `t(N) = a + b\u00b7N\u1d56` across sizes\n", - "5. **Amortization test** \u2014 reuse VoxelImage across repeated solves\n", - "6. **cProfile** \u2014 Python-level hottest frames\n", - "7. **AMReX TinyProfiler** \u2014 C++ function breakdown\n", - "8. **Nsight Systems** \u2014 GPU kernel timeline (if GPU present)\n", - "9. Solver comparison (focused, post-diagnosis)\n", - "10. `max_grid_size` tuning (focused, post-diagnosis)\n", - "11. Native-binary comparison *(optional, isolates binding overhead)*\n", - "12. Summary & HPC recommendations\n", - "\n", - "If you only have 5 minutes, run Sections 1\u20136. Sections 7+ need a real GPU\n", - "and a built `openimpala` executable for full value.\n" - ] + "source": "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/notebooks/profiling_and_tuning.ipynb)\n\n# OpenImpala — Profiling and Performance Tuning\n\nThis notebook is the diagnostic complement to the seven user tutorials. Where the tutorials show *what* to do, this one shows *why* a given solve takes the time it does — and what to change when it takes too long.\n\nEvery `oi.tortuosity(numpy_array, ...)` call rebuilds the full AMReX + HYPRE stack from scratch and runs a percolation flood fill before the linear solve. That per-call setup cost can dominate small and medium problems. 
The first half of the notebook isolates and quantifies that cost; the second half goes below the binding layer with C++ and GPU profilers.\n\n## Contents\n\n| Section | Topic |\n|---:|---|\n| 1 | Setup — environment detection and dependency install |\n| 1a | Build verification — confirm the GPU build is actually active |\n| 2 | Synthetic datasets at 16³ – 128³ |\n| 3 | Stage-by-stage timing of a single solve |\n| 3b | Per-stage scaling: O(N) compute vs constant overhead |\n| 4 | Fixed-overhead extraction via `t(N) = a + b·Nᵖ` |\n| 5 | Amortization: reusing the VoxelImage across solves |\n| 6 | cProfile — Python-level hot frames |\n| 7 | AMReX TinyProfiler — C++ function breakdown |\n| 8 | NVIDIA Nsight Systems — GPU kernel timeline |\n| 9 | Solver comparison and scaling validation |\n| 10 | `max_grid_size` tuning |\n| 11 | Native-binary comparison (optional) |\n| 12 | Summary and HPC recommendations |\n\nSections 1 – 6 run on any environment in a few minutes. Sections 7+ produce the most useful output on a Colab GPU runtime (T4 / A100 / L4)." }, { "cell_type": "markdown", @@ -72,10 +41,10 @@ " r = subprocess.run([\"nvidia-smi\", \"--query-gpu=name\", \"--format=csv,noheader\"],\n", " capture_output=True, text=True, timeout=10)\n", " gpu_name = r.stdout.strip()\n", - " print(f\"GPU detected: {gpu_name}\" if gpu_name else \"No GPU \u2014 CPU only.\")\n", + " print(f\"GPU detected: {gpu_name}\" if gpu_name else \"No GPU — CPU only.\")\n", "except FileNotFoundError:\n", " gpu_name = \"\"\n", - " print(\"nvidia-smi not found \u2014 CPU only.\")\n" + " print(\"nvidia-smi not found — CPU only.\")\n" ] }, { @@ -83,11 +52,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "!apt-get install -y libopenmpi-dev > /dev/null 2>&1\n", - "!pip install openimpala porespy py-spy > /dev/null 2>&1\n", - "print(\"Dependencies installed.\")\n" - ] + "source": "# OpenImpala ships two PyPI wheels:\n# openimpala — pure-Python + SciPy fallback (CPU only)\n# openimpala-cuda — compiled C++ HYPRE/AMReX backend with CUDA kernels\n#\n# On a GPU runtime we install the CUDA wheel; otherwise we fall back to the\n# pure-Python package.\n!apt-get install -y libopenmpi-dev > /dev/null 2>&1\nif gpu_name:\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda porespy py-spy\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala porespy py-spy\nprint(\"Dependencies installed.\")" }, { "cell_type": "code", @@ -121,98 +86,19 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 1a. Is the openimpala build actually using the GPU?\n", - "\n", - "Most \"OpenImpala is slow on Colab\" reports turn out to be the pip wheel\n", - "being **CPU-only + OpenMP** while a perfectly good T4 sits idle. We\n", - "probe the build by running a trivial solve in a subprocess with\n", - "`verbose=1` and inspecting the init banner.\n" - ] + "source": "## 1a. Build verification\n\nBefore profiling anything, confirm that the wheel `pip` installed actually has the GPU backend you expect. The most common cause of \"OpenImpala feels slow on Colab\" is a CPU-only wheel running on a GPU-equipped host — every solve in the rest of the notebook would be CPU-bound and ~10–50× slower than necessary on a single-phase 3D diffusion problem at 128³+.\n\n`oi.build_info()` returns a dict of compile-time feature flags plus the runtime GPU device count. We check that the backend matches the hardware, and warn loudly if it doesn't." 
}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Use oi.build_info() when the wheel provides it (4.0.2+). Older wheels\n", - "# fall back to the subprocess banner probe so the notebook still works.\n", - "backend = None\n", - "info = None\n", - "try:\n", - " info = oi.build_info()\n", - " backend = info[\"backend\"] # \"cpp-cuda\", \"cpp-hip\", \"cpp-cpu\", \"pure-python\"\n", - " print(f\"openimpala backend: {backend}\")\n", - " print(f\" version: {info.get('version', 'unknown')}\")\n", - " print(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\n", - " print(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\n", - " print(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\n", - " print(f\" TinyProfile: {info.get('tiny_profile', False)}\")\n", - " if info.get(\"gpu_device_count\", -1) > 0:\n", - " print(f\" GPU devices: {info['gpu_device_count']}\")\n", - "except AttributeError:\n", - " # Pre-4.0.2 wheel \u2014 no build_info(). Fall back to banner probe.\n", - " import subprocess, tempfile, os, textwrap\n", - " probe = textwrap.dedent(\"\"\"\n", - " import numpy as np, openimpala as oi\n", - " arr = np.zeros((8, 8, 8), dtype=np.int32)\n", - " with oi.Session():\n", - " try: oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n", - " except Exception as e: print(f'__ERR__ {e}')\n", - " \"\"\").strip()\n", - " tmp = tempfile.mkdtemp()\n", - " path = os.path.join(tmp, \"probe.py\")\n", - " with open(path, \"w\") as f:\n", - " f.write(probe)\n", - " r = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=90)\n", - " banner = (r.stdout or \"\") + (r.stderr or \"\")\n", - " low = banner.lower()\n", - " if \"cuda\" in low or (\"gpu\" in low and \"no gpu\" not in low):\n", - " backend = \"cpp-cuda\"\n", - " elif \"hip\" in low and \"rocm\" in low:\n", - " backend = \"cpp-hip\"\n", - " else:\n", - " backend = \"cpp-cpu\"\n", - " print(f\"openimpala backend (banner probe): {backend}\")\n", - " print(\"(install openimpala 4.0.2+ for a proper build_info() API)\")\n", - "\n", - "is_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n", - "\n", - "if gpu_name and not is_gpu_build:\n", - " print()\n", - " print(\"=\" * 70)\n", - " print(\" \u26a0 CPU-ONLY BUILD DETECTED ON A GPU-EQUIPPED HOST\")\n", - " print(\"=\" * 70)\n", - " print(f\" Host GPU: {gpu_name}\")\n", - " print(f\" openimpala build: {backend}\")\n", - " print()\n", - " print(\" Your GPU is idle. 
Every solve in this notebook is CPU-bound.\")\n", - " print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build is\")\n", - " print(\" typically 10-50x faster \u2014 this is the dominant source of\")\n", - " print(\" 'OpenImpala feels slow on Colab'.\")\n", - " print()\n", - " print(\" Fix: install the CUDA wheel instead.\")\n", - " print(\" !pip uninstall -y openimpala\")\n", - " print(\" !pip install openimpala-cuda\")\n", - " print(\" Then Runtime -> Restart session and re-run from the top.\")\n", - " print(\"=\" * 70)\n", - "elif is_gpu_build:\n", - " print(f\"\\n\u2713 {backend.upper()} build active on {gpu_name or 'accelerator'}\")\n", - "else:\n", - " print(\"\\nCPU-only run (no accelerator detected).\")\n" - ] + "source": "info = oi.build_info()\nbackend = info[\"backend\"] # one of: cpp-cuda, cpp-hip, cpp-cpu, pure-python\n\nprint(f\"openimpala backend: {backend}\")\nprint(f\" version: {info.get('version', 'unknown')}\")\nprint(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\nprint(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\nprint(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\nprint(f\" TinyProfile: {info.get('tiny_profile', False)}\")\nif info.get(\"gpu_device_count\", -1) > 0:\n print(f\" GPU devices: {info['gpu_device_count']}\")\n\nis_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n\nif gpu_name and not is_gpu_build:\n print()\n print(\"=\" * 70)\n print(\" WARNING: CPU-only build detected on a GPU-equipped host\")\n print(\"=\" * 70)\n print(f\" Host GPU: {gpu_name}\")\n print(f\" openimpala build: {backend}\")\n print()\n print(\" The GPU is idle. Every solve in this notebook will run on the CPU,\")\n print(\" which is typically 10–50× slower than the CUDA build for\")\n print(\" single-phase 3D diffusion on 128³+ grids.\")\n print()\n print(\" To fix:\")\n print(\" !pip uninstall -y openimpala\")\n print(\" !pip install openimpala-cuda\")\n print(\" Then Runtime → Restart session and re-run from the top.\")\n print(\"=\" * 70)\nelif is_gpu_build:\n print(f\"\\n{backend.upper()} build active on {gpu_name or 'accelerator'}.\")\nelse:\n print(\"\\nCPU-only run (no accelerator detected).\")" }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 2. Synthetic datasets\n", - "\n", - "Small sizes chosen so a full profiling run completes in minutes, not hours.\n", - "`data_micro` (16\u00b3) is tiny enough that its solve time is ~all overhead \u2014 we\n", - "use it to extract the per-call fixed cost.\n" - ] + "source": "## 2. Synthetic datasets\n\nPoreSpy is used to generate five synthetic porous structures at 16³, 32³, 64³, 96³, and 128³. Sizes are kept small enough that a full profiling run completes in minutes rather than hours, and large enough that the cubic scaling of voxel count is visible in the timings.\n\nThe 16³ dataset is small enough that its solve time is dominated by per-call setup rather than compute — useful for extracting the fixed overhead in §4." }, { "cell_type": "code", @@ -239,14 +125,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Python-level stage breakdown \u2014 one call, one bar chart\n", + "## 3. Python-level stage breakdown — one call, one bar chart\n", "\n", "`oi.tortuosity(numpy_array, ...)` is not a single operation. From\n", "`facade.py` it is roughly:\n", "\n", "```\n", - "numpy \u2192 VoxelImage.from_numpy (copy + AMReX BoxArray/Geometry/iMultiFab build)\n", - "PercolationCheck(img, ...) 
(flood fill \u2014 runs every call!)\n", + "numpy → VoxelImage.from_numpy (copy + AMReX BoxArray/Geometry/iMultiFab build)\n", + "PercolationCheck(img, ...) (flood fill — runs every call!)\n", "VolumeFraction(img, ...) (cheap count)\n", "TortuosityHypre(img, ..., solver) (HYPRE grid/stencil/matrix assembly)\n", "solver.value() (actual linear solve + flux integration)\n", @@ -254,7 +140,7 @@ "\n", "We time each stage independently. Whichever bar dominates *is* the\n", "bottleneck. If `from_numpy` or `TortuosityHypre(...)` construction dominates,\n", - "your slowdown is in binding/setup \u2014 not the solver.\n" + "your slowdown is in binding/setup — not the solver.\n" ] }, { @@ -307,7 +193,7 @@ " out[\"_iters\"] = solver_obj.iterations\n", " return out\n", "\n", - "# Warm-up \u2014 first call pays lazy-init inside AMReX/HYPRE\n", + "# Warm-up — first call pays lazy-init inside AMReX/HYPRE\n", "_ = time_stages(datasets[32])\n", "\n", "stages = time_stages(data_medium, solver=\"pcg\")\n", @@ -337,24 +223,24 @@ "ax.set_yticks(y)\n", "ax.set_yticklabels(stage_names, fontsize=9)\n", "ax.set_xlabel(\"Wall time (s)\")\n", - "ax.set_title(\"Single oi.tortuosity(64\u00b3) call \u2014 stage breakdown\", fontweight=\"bold\")\n", + "ax.set_title(\"Single oi.tortuosity(64³) call — stage breakdown\", fontweight=\"bold\")\n", "ax.invert_yaxis()\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "slowest = max(stage_names, key=lambda k: stages[k])\n", - "print(f\"\\nBottleneck at 64\u00b3: {slowest} ({100*stages[slowest]/stages['_total']:.1f}%)\")\n" + "print(f\"\\nBottleneck at 64³: {slowest} ({100*stages[slowest]/stages['_total']:.1f}%)\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### \u00a73b Per-stage scaling \u2014 does this stage scale with N, or is it fixed overhead?\n", + "### §3b Per-stage scaling — does this stage scale with N, or is it fixed overhead?\n", "\n", - "Run the same decomposition at 32\u00b3 and 128\u00b3 and plot the ratio per stage.\n", - "A stage with ratio \u2248 `(128/32)\u00b3 = 64\u00d7` is doing O(N) work \u2014 legitimate\n", - "compute. A stage with ratio \u2248 1\u00d7 is constant overhead \u2014 every call pays\n", + "Run the same decomposition at 32³ and 128³ and plot the ratio per stage.\n", + "A stage with ratio ≈ `(128/32)³ = 64×` is doing O(N) work — legitimate\n", + "compute. 
A stage with ratio ≈ 1× is constant overhead — every call pays\n", "the same price regardless of problem size, which is the *classic* signature\n", "of binding / setup cost.\n" ] @@ -370,8 +256,8 @@ "\n", "# Ideal O(N) ratio when scaling voxel count by (128/32)^3\n", "N_ratio = (128 / 32) ** 3\n", - "print(f\"Ideal O(N) work ratio for 32->128: {N_ratio:.0f}\u00d7\")\n", - "print(f\"{'stage':<55s} {'32\u00b3 (s)':>10s} {'128\u00b3 (s)':>10s} {'ratio':>8s} behaviour\")\n", + "print(f\"Ideal O(N) work ratio for 32->128: {N_ratio:.0f}×\")\n", + "print(f\"{'stage':<55s} {'32³ (s)':>10s} {'128³ (s)':>10s} {'ratio':>8s} behaviour\")\n", "print(\"-\" * 110)\n", "\n", "rows = []\n", @@ -379,13 +265,13 @@ " ts, tb = stages_small[k], stages_big[k]\n", " r = tb / max(ts, 1e-9)\n", " if r > 0.5 * N_ratio:\n", - " label = \"scales O(N) \u2713\"\n", + " label = \"scales O(N) ✓\"\n", " elif r < 2.0:\n", " label = \"~constant (overhead)\"\n", " else:\n", " label = \"sub-linear\"\n", " rows.append({\"stage\": k, \"t_32\": ts, \"t_128\": tb, \"ratio\": r, \"label\": label})\n", - " print(f\"{k:<55s} {ts:>10.3f} {tb:>10.3f} {r:>8.1f}\u00d7 {label}\")\n", + " print(f\"{k:<55s} {ts:>10.3f} {tb:>10.3f} {r:>8.1f}× {label}\")\n", "\n", "df_scale = pd.DataFrame(rows)\n" ] @@ -399,15 +285,15 @@ "fig, ax = plt.subplots(figsize=(11, 4))\n", "y = np.arange(len(stage_names))\n", "w = 0.4\n", - "ax.barh(y - w/2, df_scale[\"t_32\"], w, color=\"#3498db\", alpha=0.85, label=\"32\u00b3\")\n", - "ax.barh(y + w/2, df_scale[\"t_128\"], w, color=\"#e74c3c\", alpha=0.85, label=\"128\u00b3\")\n", + "ax.barh(y - w/2, df_scale[\"t_32\"], w, color=\"#3498db\", alpha=0.85, label=\"32³\")\n", + "ax.barh(y + w/2, df_scale[\"t_128\"], w, color=\"#e74c3c\", alpha=0.85, label=\"128³\")\n", "for i, r in enumerate(df_scale[\"ratio\"]):\n", - " ax.text(df_scale[\"t_128\"].iloc[i] * 1.02, i + w/2, f\"{r:.0f}\u00d7\",\n", + " ax.text(df_scale[\"t_128\"].iloc[i] * 1.02, i + w/2, f\"{r:.0f}×\",\n", " va=\"center\", fontsize=8, color=\"#2c3e50\")\n", "ax.set_yticks(y)\n", "ax.set_yticklabels(stage_names, fontsize=9)\n", "ax.set_xlabel(\"Wall time (s)\")\n", - "ax.set_title(f\"Per-stage scaling (ideal O(N) ratio = {N_ratio:.0f}\u00d7)\", fontweight=\"bold\")\n", + "ax.set_title(f\"Per-stage scaling (ideal O(N) ratio = {N_ratio:.0f}×)\", fontweight=\"bold\")\n", "ax.axvline(0, color=\"#bdc3c7\", linewidth=0.5)\n", "ax.invert_yaxis()\n", "ax.legend(loc=\"lower right\")\n", @@ -416,9 +302,9 @@ "\n", "constant_stages = df_scale[df_scale[\"ratio\"] < 2.0]\n", "if not constant_stages.empty:\n", - " print(\"\\n\u26a0 Stages that DON'T scale with problem size (pure overhead):\")\n", + " print(\"\\n⚠ Stages that DON'T scale with problem size (pure overhead):\")\n", " for _, r in constant_stages.iterrows():\n", - " print(f\" {r['stage']:55s} ratio={r['ratio']:.1f}\u00d7\")\n", + " print(f\" {r['stage']:55s} ratio={r['ratio']:.1f}×\")\n", " print(\" Every oi.tortuosity() call pays these, regardless of image size.\")\n" ] }, @@ -430,7 +316,7 @@ "\n", "A call's wall time has two parts: a fixed per-call overhead `a` (binding\n", "dispatch, AMReX/HYPRE setup, flood fill) and a size-dependent compute part\n", - "`b\u00b7N\u1d56`. Fit `t(N) = a + b\u00b7N\u1d56` across sizes.\n", + "`b·Nᵖ`. 
Fit `t(N) = a + b·Nᵖ` across sizes.\n", "\n", "If `a` is a meaningful fraction of the medium-size solve time, every solve\n", "you run is paying a tax that has nothing to do with the actual physics.\n" @@ -444,7 +330,7 @@ "source": [ "overhead_rows = []\n", "for n, d in datasets.items():\n", - " # Two trials \u2014 median to reduce jitter without paying for 3\u00d7\n", + " # Two trials — median to reduce jitter without paying for 3×\n", " ts = []\n", " for _ in range(2):\n", " t0 = time.perf_counter()\n", @@ -479,7 +365,7 @@ " print(f\"Fit failed: {e}\")\n", " a_raw, b_fit, p_fit = T.min(), 0.0, 1.0\n", "\n", - "# Clamp: a negative intercept means there is no meaningful fixed overhead \u2014\n", + "# Clamp: a negative intercept means there is no meaningful fixed overhead —\n", "# pretending otherwise misleads the summary.\n", "a_fit = max(a_raw, 0.0)\n", "overhead_meaningful = a_fit > 0.01 * T.max()\n", @@ -490,10 +376,10 @@ "ax.loglog(N, T, \"o\", color=\"#2c3e50\", markersize=10, label=\"Measured\")\n", "xs = np.logspace(np.log10(N.min()), np.log10(N.max()), 200)\n", "ax.loglog(xs, model(xs, a_fit, b_fit, p_fit), \"-\", color=\"#3498db\",\n", - " label=f\"Fit: {a_fit:.2f}s + {b_fit:.2e}\u00b7N^{p_fit:.2f}\")\n", + " label=f\"Fit: {a_fit:.2f}s + {b_fit:.2e}·N^{p_fit:.2f}\")\n", "if overhead_meaningful:\n", " ax.axhline(a_fit, color=\"#e74c3c\", linestyle=\"--\",\n", - " label=f\"Fixed overhead \u2248 {a_fit:.2f}s\")\n", + " label=f\"Fixed overhead ≈ {a_fit:.2f}s\")\n", "ax.set_xlabel(\"Voxels (N)\")\n", "ax.set_ylabel(\"Wall time (s)\")\n", "ax.set_title(\"Fixed overhead vs compute\", fontweight=\"bold\")\n", @@ -502,13 +388,13 @@ "plt.show()\n", "\n", "if a_raw < 0:\n", - " print(f\"\\nFit intercept was negative ({a_raw:.2f}s) \u2014 clamped to 0.\")\n", + " print(f\"\\nFit intercept was negative ({a_raw:.2f}s) — clamped to 0.\")\n", " print(\"Interpretation: no detectable per-call fixed overhead; all time is compute.\")\n", "print(f\"Estimated fixed overhead per call: {a_fit:.2f}s\")\n", "print(f\"Compute scaling: O(N^{p_fit:.2f})\")\n", "\n", "if p_fit > 1.2:\n", - " print(f\"\\n\u26a0 Super-linear compute scaling (O(N^{p_fit:.2f}) vs ideal O(N)).\")\n", + " print(f\"\\n⚠ Super-linear compute scaling (O(N^{p_fit:.2f}) vs ideal O(N)).\")\n", " print(\" Likely cause: Krylov iteration count growing with N. A multigrid\")\n", " print(\" preconditioner (SMG/PFMG) or MLMG would bring this closer to O(N).\")\n", "\n", @@ -519,17 +405,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 5. Amortization test \u2014 reuse VoxelImage across calls\n", - "\n", - "If most time is spent rebuilding AMReX/HYPRE state per call, then calling\n", - "the solver several times on the **same** `VoxelImage` should get cheaper\n", - "after the first call (matrix is already assembled inside the solver).\n", - "\n", - "We bypass `oi.tortuosity` and hit `_core.TortuosityHypre` directly so the\n", - "`VoxelImage` is reused. Compare this to calling `oi.tortuosity(numpy, ...)`\n", - "five times \u2014 each rebuild pays the full overhead.\n" - ] + "source": "## 5. Amortization test — reusing the VoxelImage\n\nIf most per-call time goes into rebuilding AMReX state, then running the solver multiple times against the *same* `VoxelImage` should be markedly cheaper than the equivalent number of calls through the high-level facade. This cell measures the gap.\n\nTwo paths are compared on `data_medium`, five calls each:\n\n* **Naive**: five calls to `oi.tortuosity(numpy_array, ...)`. 
Every call rebuilds the geometry, BoxArray, distribution mapping, iMultiFab, percolation mask, and HYPRE matrix from the bare NumPy input.\n* **Amortized**: one call to `_numpy_to_voxelimage` to build the `VoxelImage` once, then five direct calls to `_core.TortuosityHypre` reusing it.\n\nThe difference is the per-call setup tax. Sweeping (e.g. directional tortuosity, parameter studies) should always use the amortized pattern." }, { "cell_type": "code", @@ -540,7 +416,7 @@ "from openimpala.facade import _numpy_to_voxelimage, _parse_direction, _parse_solver\n", "\n", "def naive_repeat(data, n=5, solver=\"pcg\"):\n", - " \"\"\"Top-level facade \u2014 rebuilds everything every call.\"\"\"\n", + " \"\"\"Top-level facade — rebuilds everything every call.\"\"\"\n", " ts = []\n", " for _ in range(n):\n", " t0 = time.perf_counter()\n", @@ -568,11 +444,11 @@ "naive = naive_repeat(data_medium, n_repeat)\n", "amort = amortized_repeat(data_medium, n_repeat)\n", "\n", - "print(f\"Naive oi.tortuosity(numpy) \u00d7 {n_repeat}: \" + \", \".join(f\"{t:.2f}s\" for t in naive))\n", + "print(f\"Naive oi.tortuosity(numpy) × {n_repeat}: \" + \", \".join(f\"{t:.2f}s\" for t in naive))\n", "print(f\"Amortized (reuse VoxelImage): \" + \", \".join(f\"{t:.2f}s\" for t in amort))\n", "print(f\"\\nNaive total: {sum(naive):.2f}s (mean {np.mean(naive):.2f}s/call)\")\n", "print(f\"Amortized total: {sum(amort):.2f}s (mean {np.mean(amort):.2f}s/call)\")\n", - "print(f\"Speedup from reuse: {sum(naive)/max(sum(amort), 1e-9):.2f}\u00d7\")\n" + "print(f\"Speedup from reuse: {sum(naive)/max(sum(amort), 1e-9):.2f}×\")\n" ] }, { @@ -602,20 +478,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 6. cProfile \u2014 Python-level hottest frames\n", - "\n", - "`cProfile` shows cumulative time in Python frames, including time spent\n", - "inside pybind11 wrappers. Look for big chunks in:\n", - "\n", - "- `_numpy_to_voxelimage` or `VoxelImage.from_numpy` \u2192 data copy cost\n", - "- `PercolationCheck.__init__` \u2192 flood-fill dominates\n", - "- `TortuosityHypre.__init__` \u2192 matrix assembly dominates\n", - "- `TortuosityHypre.value` \u2192 the actual solve\n", - "\n", - "Anything above these (e.g. `_parse_direction`, `_parse_solver`) appearing\n", - "in the top 20 indicates pure Python dispatch waste.\n" - ] + "source": "## 6. cProfile — Python-level hot frames\n\ncProfile captures cumulative time per Python frame, including time spent inside pybind11 wrappers around C++ calls. This is a different view from §3 — instead of timing named stages, it counts every function call and surfaces unexpected frame costs.\n\nThings to look for in the top 20 frames:\n\n* `_numpy_to_voxelimage`, `VoxelImage.from_numpy` — data ingestion\n* `PercolationCheck.__init__` — flood-fill setup\n* `TortuosityHypre.__init__` — HYPRE matrix assembly\n* `TortuosityHypre.value` — the linear solve\n\nIf pure-Python helpers like `_parse_direction` or `_parse_solver` show up high, the binding is doing too much Python-side dispatch." }, { "cell_type": "code", @@ -638,24 +501,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 7. AMReX TinyProfiler \u2014 C++ internal breakdown\n", - "\n", - "Now we go *below* the binding layer. 
AMReX's TinyProfiler instruments\n", - "every `BL_PROFILE`-annotated function in the C++ source and prints a table\n", - "when the session finalizes.\n", - "\n", - "This is especially important given what \u00a73 just showed us: `solver.value()`\n", - "is ~96% of wall time at 64\u00b3 and scales super-linearly to 128\u00b3. TinyProfiler\n", - "will break that down into `TortuosityHypre::solve`, `setupMatrixEquation`,\n", - "`preconditionPhaseFab`, `global_fluxes`, `computePlaneFluxes`, etc.\n", - "\n", - "**Implementation note**: TinyProfiler prints via C++ streams on AMReX\n", - "finalize, which Python's `sys.stdout` redirection cannot capture (and\n", - "recycling a Session inside the same process is fragile). We run the solve\n", - "in a short-lived **subprocess** \u2014 Python's `capture_output=True` grabs\n", - "fd 1 + fd 2 at the OS level, so we get everything.\n" - ] + "source": "## 7. AMReX TinyProfiler — C++ function breakdown\n\n§3 showed that `solver.value()` accounts for the bulk of wall time at 64³ and that it scales super-linearly to 128³. To break that down further we need to go below the binding layer, into AMReX's own profiler.\n\n`AMReX::TinyProfiler` instruments every `BL_PROFILE`-annotated function in the C++ source and prints a tabular breakdown when AMReX finalises. The compiled wheel sets `-DAMReX_TINY_PROFILE=ON`, so the instrumentation is always on.\n\nThe profiler writes to C++ streams on `amrex::Finalize()`, which Python's `sys.stdout` redirection can't capture from inside a long-running session. We therefore run a single profiled solve in a short-lived subprocess — Python's `capture_output=True` collects stdout and stderr at the OS level and gives us the complete table." }, { "cell_type": "code", @@ -671,7 +517,7 @@ " Why a subprocess: TinyProfiler prints via C++ streams on AMReX finalize,\n", " which sys.stdout redirection can't capture. 
A subprocess gives us a clean\n", " AMReX lifecycle and Python's ``capture_output=True`` grabs fd 1 + fd 2\n", - " at the OS level \u2014 including everything TinyProfiler emits.\n", + " at the OS level — including everything TinyProfiler emits.\n", "\n", " Returns (result_dict, combined_stdout_stderr).\n", " \"\"\"\n", @@ -712,15 +558,15 @@ " break\n", "\n", " if proc.returncode != 0:\n", - " print(f\"\u26a0 subprocess exited with code {proc.returncode}\")\n", + " print(f\"⚠ subprocess exited with code {proc.returncode}\")\n", " return result, combined\n", "\n", - "print(\"Running a profiled solve on 64^3 in a subprocess\u2026\")\n", + "print(\"Running a profiled solve on 64^3 in a subprocess…\")\n", "prof_result, prof_output = run_with_profiler(data_medium, solver=\"pcg\")\n", "if prof_result is not None:\n", " print(f\"tau={prof_result['tau']:.4f} converged={prof_result['converged']} iters={prof_result['iters']}\")\n", "else:\n", - " print(\"No result line found \u2014 solve may have failed; check output below.\")\n", + " print(\"No result line found — solve may have failed; check output below.\")\n", "print(f\"Captured output length: {len(prof_output)} chars\")\n", "\n", "# Show the tail (TinyProfiler dumps near the end, on finalize)\n", @@ -774,7 +620,7 @@ " print(df_prof.to_string(index=False))\n", "elif not has_tp_header:\n", " print(\"=\" * 70)\n", - " print(\" \u26a0 No TinyProfiler table was emitted by this openimpala build\")\n", + " print(\" ⚠ No TinyProfiler table was emitted by this openimpala build\")\n", " print(\"=\" * 70)\n", " print(\" The subprocess ran to completion (result captured above) but\")\n", " print(\" AMReX did not dump a TinyProfiler table on finalize. This means\")\n", @@ -784,13 +630,13 @@ " print(\" CMake: -DAMReX_TINY_PROFILE=ON\")\n", " print(\" GNUmake: TINY_PROFILE=TRUE\")\n", " print()\n", - " print(\" Until then \u00a77 cannot break down the C++ solve into components.\")\n", - " print(\" \u00a73 (Python stage breakdown) + \u00a73b (per-stage scaling) already\")\n", + " print(\" Until then §7 cannot break down the C++ solve into components.\")\n", + " print(\" §3 (Python stage breakdown) + §3b (per-stage scaling) already\")\n", " print(\" show that solver.value() is ~85-95% of wall time and scales\")\n", - " print(\" super-linearly \u2014 that IS the bottleneck regardless.\")\n", + " print(\" super-linearly — that IS the bottleneck regardless.\")\n", " print(\"=\" * 70)\n", "else:\n", - " print(\"TinyProfiler header detected but no rows parsed \u2014 regex mismatch.\")\n", + " print(\"TinyProfiler header detected but no rows parsed — regex mismatch.\")\n", " print(\"Dump tail of raw output above and adjust parse_tiny_profiler().\")\n" ] }, @@ -824,7 +670,7 @@ " ax.set_yticks(y)\n", " ax.set_yticklabels(df_prof[\"function\"], fontsize=9)\n", " ax.set_xlabel(\"Exclusive time (s)\")\n", - " ax.set_title(\"AMReX TinyProfiler \u2014 C++ function breakdown\", fontweight=\"bold\")\n", + " ax.set_title(\"AMReX TinyProfiler — C++ function breakdown\", fontweight=\"bold\")\n", " ax.invert_yaxis()\n", "\n", " from matplotlib.patches import Patch\n", @@ -839,22 +685,22 @@ " plt.tight_layout()\n", " plt.show()\n", "else:\n", - " print(\"Skipping chart \u2014 no TinyProfiler data.\")\n" + " print(\"Skipping chart — no TinyProfiler data.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. NVIDIA Nsight Systems \u2014 GPU kernel timeline\n", + "## 8. 
NVIDIA Nsight Systems — GPU kernel timeline\n", "\n", "If a GPU is available, `nsys` captures a kernel-level timeline: launches,\n", "memory transfers, idle gaps. Signatures of Python-binding overhead on the\n", "GPU path are:\n", "\n", - "- Many tiny launches with gaps \u2192 Python loop driving the GPU\n", - "- Frequent H2D transfers of the same data \u2192 `numpy \u2192 VoxelImage` copies leaking out\n", - "- Long idle gaps between kernels \u2192 CPU-side serial work between solver calls\n", + "- Many tiny launches with gaps → Python loop driving the GPU\n", + "- Frequent H2D transfers of the same data → `numpy → VoxelImage` copies leaking out\n", + "- Long idle gaps between kernels → CPU-side serial work between solver calls\n", "\n", "We use a standalone script so the subprocess is clean.\n" ] @@ -870,9 +716,9 @@ "\n", "if not nsys_available:\n", " if not gpu_name:\n", - " print(\"No GPU \u2014 skipping Nsight. (Runtime > Change runtime type > T4 GPU)\")\n", + " print(\"No GPU — skipping Nsight. (Runtime > Change runtime type > T4 GPU)\")\n", " else:\n", - " print(\"nsys not found \u2014 install via: apt-get install nsight-systems\")\n", + " print(\"nsys not found — install via: apt-get install nsight-systems\")\n", "else:\n", " print(f\"nsys: {shutil.which('nsys')} GPU: {gpu_name}\")\n", " script = '''\n", @@ -943,9 +789,9 @@ " print(\"CUDA memory transfer summary (H2D/D2H):\")\n", " print(df_mem.to_string(index=False))\n", " print(\"\\nRed flags:\")\n", - " print(\" \u2022 Frequent H2D transfers of similar size \u2192 numpy\u2192VoxelImage\")\n", + " print(\" • Frequent H2D transfers of similar size → numpy→VoxelImage\")\n", " print(\" copy leaking out of bindings into the hot path\")\n", - " print(\" \u2022 Large D2H transfer per call \u2192 result readback overhead\")\n", + " print(\" • Large D2H transfer per call → result readback overhead\")\n", " else:\n", " print(mem.stdout[:1500])\n", " else:\n", @@ -955,13 +801,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 9. Solver comparison *(focused, post-diagnosis)*\n", - "\n", - "Now that you know where time goes, compare HYPRE solvers on `data_medium`\n", - "at a single grid size. `n_repeats=1` \u2014 this sweep is for picking a solver,\n", - "not measuring steady-state noise.\n" - ] + "source": "## 9. Solver comparison\n\nWith the bottlenecks identified, this section compares HYPRE solver / preconditioner combinations on `data_medium` at a single grid size. Each solver runs once — the goal is to pick a baseline configuration, not to measure steady-state noise. Repeat sweeps belong in `tests/benchmarks/`.\n\nThe combinations cover the textbook choices for symmetric Poisson-like operators (PCG with PFMG/SMG multigrid preconditioning), non-symmetric Krylov methods (GMRES, FlexGMRES, BiCGSTAB), and standalone multigrid (PFMG, SMG, AMReX MLMG)." 
}, { "cell_type": "code", @@ -1000,9 +840,9 @@ " \"tau\": res.tortuosity, \"ok\": res.solver_converged})\n", " print(f\" {label:18s} t={dt:6.2f}s iters={res.iterations:4d} tau={res.tortuosity:.4f}\")\n", " except TypeError as e:\n", - " # Older wheels don't know the 'preconditioner' kwarg \u2014 fall back without it.\n", + " # Older wheels don't know the 'preconditioner' kwarg — fall back without it.\n", " if \"preconditioner\" in str(e) and pc is not None:\n", - " print(f\" {label:18s} SKIP \u2014 wheel predates preconditioner plumbing\")\n", + " print(f\" {label:18s} SKIP — wheel predates preconditioner plumbing\")\n", " records.append({\"solver\": label, \"t\": np.nan, \"iters\": np.nan,\n", " \"tau\": np.nan, \"ok\": False})\n", " continue\n", @@ -1010,7 +850,7 @@ " except Exception as e:\n", " records.append({\"solver\": label, \"t\": np.nan, \"iters\": np.nan,\n", " \"tau\": np.nan, \"ok\": False})\n", - " print(f\" {label:18s} FAILED \u2014 {e}\")\n", + " print(f\" {label:18s} FAILED — {e}\")\n", "\n", "df_solvers = pd.DataFrame(records)\n", "df_solvers\n" @@ -1033,11 +873,11 @@ "ax1.set_yticks(y)\n", "ax1.set_yticklabels(df_solvers[\"solver\"])\n", "ax1.set_xlabel(\"Wall time (s)\", color=\"#2c3e50\")\n", - "ax1.set_title(f\"Solver comparison \u2014 64\u00b3, best by wall time: {best_solver}\",\n", + "ax1.set_title(f\"Solver comparison — 64³, best by wall time: {best_solver}\",\n", " fontweight=\"bold\")\n", "ax1.invert_yaxis()\n", "\n", - "# Overlay iteration counts on a twin axis \u2014 this is the key diagnostic.\n", + "# Overlay iteration counts on a twin axis — this is the key diagnostic.\n", "# A solver with few iterations but expensive each (multigrid) can still\n", "# win overall; wall time alone hides that.\n", "ax2 = ax1.twiny()\n", @@ -1065,7 +905,7 @@ "plt.tight_layout()\n", "plt.show()\n", "\n", - "# Flag solvers with few iterations \u2014 likely multigrid/preconditioned.\n", + "# Flag solvers with few iterations — likely multigrid/preconditioned.\n", "# If one converged in <10 iterations but is slower per iteration, it may\n", "# still be the better choice at larger problem sizes.\n", "low_iter = ok[ok[\"iters\"] < 10]\n", @@ -1075,25 +915,13 @@ " print(f\" {r['solver']:10s} {int(r['iters'])} iters {r['t']:.2f}s\")\n", " print(\" These scale O(N) and will overtake PCG at larger grids.\")\n", "\n", - "print(f\"\\nBest solver at 64\u00b3 by wall time: {best_solver}\")\n" + "print(f\"\\nBest solver at 64³ by wall time: {best_solver}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "### 9b. Scaling validation \u2014 does the multigrid preconditioner break super-linear PCG?\n", - "\n", - "\u00a73b showed plain PCG scales as `t(N) ~ N^1.76` \u2014 that's the root cause of\n", - "slow large-grid solves, not any Python/binding overhead. A multigrid\n", - "preconditioner should restore near-`O(N)` scaling. The issue #256\n", - "acceptance target is **p < 1.2** at 64\u00b3 \u2192 128\u00b3.\n", - "\n", - "Fits `t = a\u00b7N^p` across 32\u00b3 / 64\u00b3 / 96\u00b3 / 128\u00b3 for three representative\n", - "choices: plain PCG, PCG+PFMG (Krylov + multigrid preconditioner), and MLMG\n", - "(AMReX-native geometric multigrid). Uses a uniform phase-0 block so each\n", - "solve is cheap and always percolates.\n" - ] + "source": "### 9b. Scaling validation — does the multigrid preconditioner restore O(N) compute?\n\n§3b found plain PCG scaling as `t(N) ~ N^1.76` — the dominant source of slow large-grid solves, well above any Python or binding overhead. 
Theory predicts that pairing PCG with a geometric multigrid preconditioner (PFMG or SMG) drops the iteration count to a near-constant, restoring near-linear `O(N)` scaling overall. AMReX's native MLMG is also `O(N)` by construction.\n\nThis cell fits `t = a · N^p` across 32³, 64³, 96³, and 128³ for three representative configurations:\n\n* **`pcg`** — plain conjugate gradient (the baseline, super-linear)\n* **`pcg+pfmg`** — PCG with HYPRE's PFMG preconditioner\n* **`mlmg`** — AMReX's native geometric multigrid\n\nThe test problem is a uniform phase-0 block (always percolates, trivially convergent) so each solve is cheap and the differences come purely from the algorithmic scaling.\n\nA `p` value below ~1.2 indicates near-linear scaling; the plain PCG baseline typically lands around `p ≈ 1.7–1.8`." }, { "cell_type": "code", @@ -1111,7 +939,7 @@ "scaling_records = []\n", "for label, kw in scaling_combos:\n", " for N in scaling_sizes:\n", - " arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 \u2014 always percolates\n", + " arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 — always percolates\n", " try:\n", " t0 = time.perf_counter()\n", " res = oi.tortuosity(arr, phase=0, direction=\"z\",\n", @@ -1122,10 +950,10 @@ " \"ok\": bool(res.solver_converged)})\n", " print(f\" {label:10s} {N:3d}^3 t={dt:7.3f}s iters={res.iterations:4d}\")\n", " except TypeError as e:\n", - " # Wheel predates the preconditioner kwarg \u2014 skip only those rows.\n", + " # Wheel predates the preconditioner kwarg — skip only those rows.\n", " msg = str(e)\n", " if (\"preconditioner\" in msg) and (\"preconditioner\" in kw):\n", - " print(f\" {label:10s} {N:3d}^3 SKIP \u2014 wheel predates preconditioner kwarg\")\n", + " print(f\" {label:10s} {N:3d}^3 SKIP — wheel predates preconditioner kwarg\")\n", " scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n", " \"iters\": 0, \"ok\": False})\n", " continue\n", @@ -1133,7 +961,7 @@ " except Exception as e:\n", " scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n", " \"iters\": 0, \"ok\": False})\n", - " print(f\" {label:10s} {N:3d}^3 FAILED \u2014 {e}\")\n", + " print(f\" {label:10s} {N:3d}^3 FAILED — {e}\")\n", "\n", "df_scale = pd.DataFrame(scaling_records)\n", "df_scale\n" @@ -1144,52 +972,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Fit t = a \u00b7 N^p in log-log per solver, then overlay the curves with an\n", - "# O(N) reference line and the O(N^1.76) baseline from \u00a73b.\n", - "fig, ax = plt.subplots(figsize=(9, 5.5))\n", - "palette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\n", - "fit_results = {}\n", - "for label, _ in scaling_combos:\n", - " sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n", - " if len(sub) < 2:\n", - " fit_results[label] = np.nan\n", - " continue\n", - " p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n", - " fit_results[label] = p\n", - " ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n", - " markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n", - "\n", - "# Reference lines \u2014 anchored at the first successful measurement\n", - "anchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\n", - "if not anchor.empty:\n", - " N_anchor = float(anchor.iloc[0][\"N\"])\n", - " t_anchor = float(anchor.iloc[0][\"t\"])\n", - " Ns = np.array(scaling_sizes, dtype=float)\n", - " 
ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n", - " \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n", - " ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n", - " \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (\u00a73b baseline)\")\n", - "\n", - "ax.set_xlabel(\"Grid size N (domain = N^3)\")\n", - "ax.set_ylabel(\"Wall time (s)\")\n", - "ax.set_title(\"Scaling validation \u2014 multigrid vs. plain Krylov\",\n", - " fontweight=\"bold\")\n", - "ax.legend(loc=\"upper left\", framealpha=0.9)\n", - "ax.grid(True, which=\"both\", alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "# Verdict against acceptance target (issue #256: p < 1.2)\n", - "print(\"\\nScaling exponents vs. acceptance target p < 1.2:\")\n", - "for label, p in fit_results.items():\n", - " if np.isnan(p):\n", - " print(f\" {label:10s} n/a (insufficient data)\")\n", - " elif p < 1.2:\n", - " print(f\" {label:10s} p = {p:.2f} \u2713 near-linear \u2014 target met\")\n", - " else:\n", - " print(f\" {label:10s} p = {p:.2f} \u2717 super-linear \u2014 target NOT met\")\n" - ] + "source": "# Fit t = a · N^p in log-log per solver, then overlay the curves with an\n# O(N) reference line and the O(N^1.76) baseline from §3b.\nfig, ax = plt.subplots(figsize=(9, 5.5))\npalette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\nfit_results = {}\nfor label, _ in scaling_combos:\n sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n if len(sub) < 2:\n fit_results[label] = np.nan\n continue\n p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n fit_results[label] = p\n ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n\n# Reference lines — anchored at the first successful measurement\nanchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\nif not anchor.empty:\n N_anchor = float(anchor.iloc[0][\"N\"])\n t_anchor = float(anchor.iloc[0][\"t\"])\n Ns = np.array(scaling_sizes, dtype=float)\n ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (§3b baseline)\")\n\nax.set_xlabel(\"Grid size N (domain = N^3)\")\nax.set_ylabel(\"Wall time (s)\")\nax.set_title(\"Scaling validation — multigrid vs. plain Krylov\",\n fontweight=\"bold\")\nax.legend(loc=\"upper left\", framealpha=0.9)\nax.grid(True, which=\"both\", alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Verdict against the near-linear target (p < 1.2)\nprint(\"\\nScaling exponents vs. near-linear target (p < 1.2):\")\nfor label, p in fit_results.items():\n if np.isnan(p):\n print(f\" {label:10s} n/a (insufficient data)\")\n elif p < 1.2:\n print(f\" {label:10s} p = {p:.2f} near-linear — target met\")\n else:\n print(f\" {label:10s} p = {p:.2f} super-linear — target NOT met\")" }, { "cell_type": "markdown", @@ -1197,7 +980,7 @@ "source": [ "## 10. `max_grid_size` tuning\n", "\n", - "Small boxes \u2192 better CPU cache + MPI parallelism. Large boxes \u2192 better\n", + "Small boxes → better CPU cache + MPI parallelism. Large boxes → better\n", "GPU kernel saturation. 
Sweep a few values with the best solver.\n" ] }, @@ -1224,7 +1007,7 @@ "ax.plot(df_grid[\"max_grid_size\"], df_grid[\"t\"], \"o-\", linewidth=2, markersize=9)\n", "ax.set_xlabel(\"max_grid_size\")\n", "ax.set_ylabel(\"Wall time (s)\")\n", - "ax.set_title(f\"Grid sweep \u2014 64\u00b3, solver={best_solver}\", fontweight=\"bold\")\n", + "ax.set_title(f\"Grid sweep — 64³, solver={best_solver}\", fontweight=\"bold\")\n", "plt.tight_layout()\n", "plt.show()\n", "print(f\"Optimal max_grid_size: {best_gs}\")\n" @@ -1233,16 +1016,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 11. Native-binary comparison *(optional)*\n", - "\n", - "Run the native `openimpala` executable on the same data (via a dumped\n", - "TIFF stack) and compare to the Python-binding call. The delta is\n", - "binding/orchestration overhead, nothing else.\n", - "\n", - "Skip this if you don't have the native binary built \u2014 \u00a73\u2013\u00a75 already\n", - "expose the Python-side story.\n" - ] + "source": "## 11. Native-binary comparison — optional\n\nIf the native `Diffusion` executable is available on `PATH`, this cell runs it on the same data via a `tiff` dump and compares wall time against the equivalent `oi.tortuosity` call. The difference quantifies how much overhead the Python/pybind11 binding layer adds on top of the C++ solver.\n\nThis section is informational — a CTest-driven comparison lives at `tests/benchmarks/`. Skip if no native binary is available; §3 – §5 already isolate the binding-side cost." }, { "cell_type": "code", @@ -1252,7 +1026,7 @@ "source": [ "native = shutil.which(\"openimpala\")\n", "if not native:\n", - " print(\"No native openimpala binary on PATH \u2014 skipping comparison.\")\n", + " print(\"No native openimpala binary on PATH — skipping comparison.\")\n", "else:\n", " import tifffile, tempfile, textwrap, os\n", " tmp = tempfile.mkdtemp()\n", @@ -1277,12 +1051,12 @@ " t_native = time.perf_counter() - t0\n", "\n", " # The native binary exits non-zero on input/parse errors but still returns\n", - " # quickly \u2014 don't compare that 0.1s \"win\" to a real Python solve.\n", + " # quickly — don't compare that 0.1s \"win\" to a real Python solve.\n", " if res.returncode != 0:\n", " print(\"=\" * 70)\n", - " print(f\" \u26a0 Native binary exited with code {res.returncode} in {t_native:.2f}s\")\n", + " print(f\" ⚠ Native binary exited with code {res.returncode} in {t_native:.2f}s\")\n", " print(\"=\" * 70)\n", - " print(\" It did NOT actually solve \u2014 skipping the comparison.\")\n", + " print(\" It did NOT actually solve — skipping the comparison.\")\n", " print()\n", " print(\" stderr tail:\")\n", " for line in (res.stderr or \"\").strip().splitlines()[-10:]:\n", @@ -1295,7 +1069,7 @@ " print(\" Likely cause: the native binary on PATH expects a different\")\n", " print(\" inputs-file schema than what this cell writes (field names or\")\n", " print(\" types differ between versions). 
Fix the inputs file to match,\")\n", - " print(\" or ignore this comparison \u2014 \u00a73/\u00a74 already prove the bindings\")\n", + " print(\" or ignore this comparison — §3/§4 already prove the bindings\")\n", " print(\" add negligible overhead.\")\n", " else:\n", " t0 = time.perf_counter()\n", @@ -1321,54 +1095,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Collect headline numbers from the diagnostics above\n", - "stage_total = stages[\"_total\"]\n", - "stage_top = max((k for k in stages if not k.startswith(\"_\")), key=lambda k: stages[k])\n", - "fixed_overhead = a_fit\n", - "naive_mean = float(np.mean(naive))\n", - "amort_mean = float(np.mean(amort))\n", - "\n", - "print(\"=\" * 64)\n", - "print(\" DIAGNOSIS\")\n", - "print(\"=\" * 64)\n", - "print(f\" Build backend: {backend.upper()}\"\n", - " + (f\" (host GPU: {gpu_name})\" if gpu_name else \"\"))\n", - "print(f\" Single-call bottleneck: {stage_top}\")\n", - "print(f\" ({100*stages[stage_top]/stage_total:.1f}% of {stage_total:.2f}s)\")\n", - "print(f\" Compute scaling: O(N^{p_fit:.2f})\"\n", - " + (\" \u26a0 super-linear\" if p_fit > 1.2 else \"\"))\n", - "print(f\" Fixed per-call overhead (fit): {fixed_overhead:.2f}s\"\n", - " + (\" (no meaningful fixed cost)\" if not overhead_meaningful else \"\"))\n", - "print(f\" Per-call rebuild tax: {naive_mean - amort_mean:.2f}s \"\n", - " f\"({100*(naive_mean-amort_mean)/max(naive_mean,1e-9):.0f}% of naive call)\")\n", - "print(f\" Best solver: {best_solver}\")\n", - "print(f\" Best max_grid_size: {best_gs}\")\n", - "print(\"=\" * 64)\n", - "\n", - "# Headline recommendation \u2014 the #1 thing to change given what we found\n", - "print(\"\\n TOP RECOMMENDATION\")\n", - "print(\" \" + \"-\" * 62)\n", - "if gpu_name and backend == \"cpu\":\n", - " print(f\" You have a {gpu_name} sitting idle. The pip wheel is CPU-only.\")\n", - " print(\" Rebuild openimpala with -DAMReX_GPU_BACKEND=CUDA \u2014 expect 10-50x.\")\n", - " print(\" Everything below is secondary until this is addressed.\")\n", - "elif p_fit > 1.2:\n", - " print(f\" Solver is scaling O(N^{p_fit:.2f}), not O(N). PCG iterations grow\")\n", - " print(\" with problem size. Switch to a multigrid preconditioner (SMG/PFMG\")\n", - " print(\" via HYPRE, or MLMG) to restore linear scaling at large grids.\")\n", - "else:\n", - " print(f\" Solver dominates; best found is {best_solver} with max_grid_size={best_gs}.\")\n", - " print(\" No free lunch here \u2014 further speedup needs algorithmic changes.\")\n", - "\n", - "# If the rebuild tax is large, recommend the amortized path\n", - "if naive_mean > 2 * amort_mean:\n", - " print(\"\\n\u26a0 Per-call setup dominates. For sweeps, build VoxelImage once:\")\n", - " print(\" img = openimpala.facade._numpy_to_voxelimage(arr, max_grid_size)\")\n", - " print(\" s = openimpala._core.TortuosityHypre(img, vf, 0, d, st, '.', 0, 1, 0, False)\")\n", - " print(\" s.value()\")\n", - " print(\" \u2026instead of calling oi.tortuosity(numpy, ...) 
each time.\")\n" - ] + "source": "# Headline numbers from the diagnostics above\nstage_total = stages[\"_total\"]\nstage_top = max((k for k in stages if not k.startswith(\"_\")), key=lambda k: stages[k])\nfixed_overhead = a_fit\nnaive_mean = float(np.mean(naive))\namort_mean = float(np.mean(amort))\n\nprint(\"=\" * 64)\nprint(\" DIAGNOSIS\")\nprint(\"=\" * 64)\nprint(f\" Build backend: {backend.upper()}\"\n + (f\" (host GPU: {gpu_name})\" if gpu_name else \"\"))\nprint(f\" Single-call bottleneck: {stage_top}\")\nprint(f\" ({100*stages[stage_top]/stage_total:.1f}% of {stage_total:.2f}s)\")\nprint(f\" Compute scaling: O(N^{p_fit:.2f})\"\n + (\" (super-linear)\" if p_fit > 1.2 else \"\"))\nprint(f\" Fixed per-call overhead (fit): {fixed_overhead:.2f}s\"\n + (\" (no meaningful fixed cost)\" if not overhead_meaningful else \"\"))\nprint(f\" Per-call rebuild tax: {naive_mean - amort_mean:.2f}s \"\n f\"({100*(naive_mean-amort_mean)/max(naive_mean,1e-9):.0f}% of naive call)\")\nprint(f\" Best solver: {best_solver}\")\nprint(f\" Best max_grid_size: {best_gs}\")\nprint(\"=\" * 64)\n\n# Headline recommendation — the single most impactful change for this run\nprint(\"\\n TOP RECOMMENDATION\")\nprint(\" \" + \"-\" * 62)\nif gpu_name and not is_gpu_build:\n print(f\" A {gpu_name} is available but the installed wheel is CPU-only.\")\n print(\" Switch to openimpala-cuda for an expected 10–50× speedup on\")\n print(\" single-phase 3D diffusion at 128³+. Everything below is\")\n print(\" secondary until this is resolved.\")\nelif p_fit > 1.2:\n print(f\" Compute scales as O(N^{p_fit:.2f}). Plain Krylov iterations grow\")\n print(\" with problem size — switch to a multigrid preconditioner\")\n print(\" (PFMG/SMG via HYPRE, or AMReX MLMG) to restore O(N).\")\nelse:\n print(f\" Solver dominates wall time. Best configuration on this run:\")\n print(f\" solver={best_solver}, max_grid_size={best_gs}.\")\n print(\" Further speedup needs algorithmic changes or larger hardware.\")\n\n# Amortization recommendation — only fires when the rebuild tax is meaningful\nif naive_mean > 2 * amort_mean:\n print()\n print(\" Per-call setup is a significant fraction of wall time. For sweeps,\")\n print(\" build the VoxelImage once and reuse it:\")\n print()\n print(\" from openimpala.facade import _numpy_to_voxelimage\")\n print(\" from openimpala import _core\")\n print(\" img = _numpy_to_voxelimage(arr, max_grid_size)\")\n print(\" s = _core.TortuosityHypre(img, vf, 0, d, st, '.', 0, 1, 0, False)\")\n print(\" s.value()\")\n print()\n print(\" …rather than calling oi.tortuosity(numpy_array, ...) per iteration.\")" }, { "cell_type": "code",