From 64306d4f37595a0d06cd216e5750840a4ce3185a Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Tue, 5 May 2026 15:20:37 +0000
Subject: [PATCH 1/2] fix VoxelImage.from_numpy segfault on GPU build (host
 write to device mem)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported on Colab T4: notebook §3 crashes the kernel silently at
`_core.VoxelImage.from_numpy(arr, max_grid_size)`.

Root cause: the binding was filling the new iMultiFab with a CPU loop:

    auto fab = img->mf->array(mfi);  // Array4 view into iMultiFab data
    for (k...) for (j...) for (i...)
        fab(i, j, k) = ptr[idx];     // host write — but in CUDA mode this
                                     // is DEVICE memory → segfault

Works fine on CPU builds (where iMultiFab data is host memory) but every
attempt to ingest a NumPy array on the GPU build dies before any Python
print can flush. Affects every entry point that goes through from_numpy —
so high-level oi.tortuosity / oi.percolation_check / oi.volume_fraction
on the GPU wheel were ALL broken.

Switch to the AMReX idiom: stage the host data in a Gpu::DeviceVector,
then use ParallelFor with an AMREX_GPU_DEVICE lambda so the assignment
runs on the actual hardware that owns the iMultiFab memory.

* CPU build (#ifndef AMREX_USE_GPU): src_ptr is the host pointer and
  ParallelFor expands to a serial/OMP host loop. Identical behaviour to
  before.

* GPU build (#ifdef AMREX_USE_GPU): copy host → device once via
  Gpu::copyAsync + streamSynchronize, then ParallelFor launches a kernel
  that reads from the device buffer and writes through the Array4 view to
  its own memory. Final streamSynchronize before FillBoundary makes sure
  the kernel is done before the ghost-cell exchange reads it.

Note: this fix requires a wheel rebuild + new release. The pure-Python
preload helper in 4.2.9 already lets _core.so load on GPU runtimes; this
change makes it actually USABLE on them.
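For anyone hitting the same class of bug elsewhere, here is the idiom in
isolation. This is an illustrative sketch, not the binding code:
fill_from_host is a made-up name, the includes are the usual AMReX set,
and the only AMReX calls relied on are the ones used in the diff below.

    #include <AMReX.H>
    #include <AMReX_iMultiFab.H>
    #include <AMReX_GpuContainers.H>

    void fill_from_host (amrex::iMultiFab& mf, const int* host_ptr,
                         int nx, int ny, int nz)
    {
    #ifdef AMREX_USE_GPU
        // Device build: host_ptr cannot be dereferenced inside a kernel,
        // so stage the data in device memory once, up front.
        const std::size_t total = std::size_t(nx) * ny * nz;
        amrex::Gpu::DeviceVector<int> dvec(total);
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice,
                              host_ptr, host_ptr + total, dvec.begin());
        amrex::Gpu::streamSynchronize();
        const int* src = dvec.data();
    #else
        const int* src = host_ptr;  // host build: read the buffer directly
    #endif
        for (amrex::MFIter mfi(mf, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
            const amrex::Box& bx = mfi.tilebox();
            auto const& a = mf.array(mfi);
            // On GPU builds ParallelFor launches a kernel; on CPU builds it
            // runs a plain loop. Either way the write executes on the
            // hardware that owns mf's memory.
            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept {
                const std::size_t idx = std::size_t(k) * ny * nx
                                      + std::size_t(j) * nx + std::size_t(i);
                a(i, j, k) = src[idx];
            });
        }
    #ifdef AMREX_USE_GPU
        amrex::Gpu::streamSynchronize();  // dvec must outlive the kernels
    #endif
    }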
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 python/bindings/module.cpp | 48 +++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 16 deletions(-)

diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index 2512552..d151874 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -169,29 +169,45 @@ voxelimage_from_numpy(py::array_t<int>
     img->dm.define(img->ba);
     img->mf = std::make_shared<amrex::iMultiFab>(img->ba, img->dm, 1, 1);
 
-    const auto* ptr = static_cast<const int*>(buf.ptr);
+    const auto* host_ptr = static_cast<const int*>(buf.ptr);
+    const std::size_t total = static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
+                              static_cast<std::size_t>(nz);
+
+    // Stage the NumPy data in a buffer the kernel below can dereference safely.
+    // On CPU builds this is just the host pointer (no copy). On CUDA builds the
+    // iMultiFab data lives in device memory, so we have to copy the host array
+    // to a device-side buffer first — writing to fab(i,j,k) directly from host
+    // would segfault (T4, A100, etc.) because the Array4 view points at
+    // device memory.
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::DeviceVector<int> dvec(total);
+    amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_ptr, host_ptr + total, dvec.begin());
+    amrex::Gpu::streamSynchronize();
+    const int* src_ptr = dvec.data();
+#else
+    const int* src_ptr = host_ptr;
+#endif
+
+    const int nx_l = nx;
+    const int ny_l = ny;
 
 #ifdef AMREX_USE_OMP
 #pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
 #endif
     for (amrex::MFIter mfi(*(img->mf), amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
         const amrex::Box& bx = mfi.tilebox();
-        auto fab = img->mf->array(mfi);
-        const auto lo = amrex::lbound(bx);
-        const auto hi = amrex::ubound(bx);
-
-        for (int k = lo.z; k <= hi.z; ++k) {
-            for (int j = lo.y; j <= hi.y; ++j) {
-                for (int i = lo.x; i <= hi.x; ++i) {
-                    std::size_t idx = static_cast<std::size_t>(k) * static_cast<std::size_t>(ny) *
-                                      static_cast<std::size_t>(nx) +
-                                      static_cast<std::size_t>(j) * static_cast<std::size_t>(nx) +
-                                      static_cast<std::size_t>(i);
-                    fab(i, j, k) = ptr[idx];
-                }
-            }
-        }
+        auto const& fab = img->mf->array(mfi);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const std::size_t idx = static_cast<std::size_t>(k) * static_cast<std::size_t>(ny_l) *
+                                    static_cast<std::size_t>(nx_l) +
+                                    static_cast<std::size_t>(j) * static_cast<std::size_t>(nx_l) +
+                                    static_cast<std::size_t>(i);
+            fab(i, j, k) = src_ptr[idx];
+        });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     img->mf->FillBoundary(img->geom.periodicity());
 
     return img;

From f02166b757ca051be46a6ae4f6a42e6f38ebb7c3 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Tue, 5 May 2026 15:26:28 +0000
Subject: [PATCH 2/2] profiling notebook: GPU-aware install, drop legacy
 probes, polish prose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The profiling notebook had grown a few rough edges as we iterated:

1. The install cell hard-coded `pip install openimpala`, ignoring the GPU
   runtime entirely. On a Colab T4 every solve was running on the CPU —
   exactly the failure mode §1a is supposed to detect. Rewrite to mirror
   tutorials/02/04/07: detect nvidia-smi, install openimpala-cuda on GPU
   runtimes, fall back to openimpala otherwise.

2. §1a's build_info() probe carried a 60-line subprocess banner fallback
   for "pre-4.0.2 wheels". The published wheel has been at 4.2.x for some
   time now and build_info() is in every wheel we publish. Drop the legacy
   fallback and the AttributeError handler.

3. Drop colloquialisms in markdown headers and prose:
   - "Profiling & Bottleneck Hunt" -> "Profiling and Performance Tuning"
   - "exists for one job" / "If you only have 5 minutes" — removed
   - "*(focused, post-diagnosis)*" / "*(optional)*" parentheticals
   - "wonky workaround" / "the *classic* signature" — recast neutrally
   - The "issue #256 acceptance target" reference in §9b — replaced with
     an explanation that scales naturally to readers who weren't following
     the issue thread

4. Add a proper contents table to the intro and clean up §2/§5/§6/§7/§9
   intros with consistent structure: one-paragraph context, bullet list of
   what to look for, one paragraph on interpretation.

5. Fix a latent bug in §12's recommendation: the CPU-detection branch
   checked `backend == "cpu"`, but build_info()'s actual values are
   `cpp-cpu` / `cpp-cuda` / `cpp-hip` / `pure-python`. The check never
   fired in practice. Switch to the already-computed `is_gpu_build` flag
   so the "rebuild for GPU" recommendation actually appears when relevant.

No analysis logic changes — every code cell that produced a chart, a fit,
or a number still does so identically. This is purely a polish pass to
make the notebook read as a tool rather than a scratchpad.
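The §12 change in miniature (paraphrased from the diff below; the print
bodies are abbreviated here):

    # Before (dead branch: backend is "cpp-cpu", "cpp-cuda", "cpp-hip",
    # or "pure-python", never the bare string "cpu"):
    if gpu_name and backend == "cpu":
        print("rebuild for GPU ...")

    # After (reuses the flag §1a already computed):
    if gpu_name and not is_gpu_build:
        print("install openimpala-cuda ...")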
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- notebooks/profiling_and_tuning.ipynb | 439 +++++---------------------- 1 file changed, 83 insertions(+), 356 deletions(-) diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb index 8542cb8..eeb1e35 100644 --- a/notebooks/profiling_and_tuning.ipynb +++ b/notebooks/profiling_and_tuning.ipynb @@ -19,38 +19,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/notebooks/profiling_and_tuning.ipynb)\n", - "\n", - "# OpenImpala Profiling & Bottleneck Hunt\n", - "\n", - "This notebook exists for **one job**: find out where OpenImpala is spending\n", - "its time, and whether the slowdown is in Python bindings, AMReX/HYPRE setup,\n", - "or the linear solve itself.\n", - "\n", - "Each `oi.tortuosity(numpy_array, ...)` call rebuilds the entire AMReX +\n", - "HYPRE stack from scratch and runs a percolation flood-fill before the solve.\n", - "That per-call overhead can dominate for small/medium problems. We isolate it\n", - "in the first four sections, *then* run the deeper C++/GPU profilers.\n", - "\n", - "**Sections**\n", - "\n", - "1. Setup\n", - "2. Synthetic datasets (tiny \u2192 medium)\n", - "3. **Python-level stage breakdown** \u2014 time every stage of one solve\n", - "4. **Fixed-overhead extraction** \u2014 fit `t(N) = a + b\u00b7N\u1d56` across sizes\n", - "5. **Amortization test** \u2014 reuse VoxelImage across repeated solves\n", - "6. **cProfile** \u2014 Python-level hottest frames\n", - "7. **AMReX TinyProfiler** \u2014 C++ function breakdown\n", - "8. **Nsight Systems** \u2014 GPU kernel timeline (if GPU present)\n", - "9. Solver comparison (focused, post-diagnosis)\n", - "10. `max_grid_size` tuning (focused, post-diagnosis)\n", - "11. Native-binary comparison *(optional, isolates binding overhead)*\n", - "12. Summary & HPC recommendations\n", - "\n", - "If you only have 5 minutes, run Sections 1\u20136. Sections 7+ need a real GPU\n", - "and a built `openimpala` executable for full value.\n" - ] + "source": "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BASE-Laboratory/OpenImpala/blob/master/notebooks/profiling_and_tuning.ipynb)\n\n# OpenImpala — Profiling and Performance Tuning\n\nThis notebook is the diagnostic complement to the seven user tutorials. Where the tutorials show *what* to do, this one shows *why* a given solve takes the time it does — and what to change when it takes too long.\n\nEvery `oi.tortuosity(numpy_array, ...)` call rebuilds the full AMReX + HYPRE stack from scratch and runs a percolation flood fill before the linear solve. That per-call setup cost can dominate small and medium problems. 
The first half of the notebook isolates and quantifies that cost; the second half goes below the binding layer with C++ and GPU profilers.\n\n## Contents\n\n| Section | Topic |\n|---:|---|\n| 1 | Setup — environment detection and dependency install |\n| 1a | Build verification — confirm the GPU build is actually active |\n| 2 | Synthetic datasets at 16³ – 128³ |\n| 3 | Stage-by-stage timing of a single solve |\n| 3b | Per-stage scaling: O(N) compute vs constant overhead |\n| 4 | Fixed-overhead extraction via `t(N) = a + b·Nᵖ` |\n| 5 | Amortization: reusing the VoxelImage across solves |\n| 6 | cProfile — Python-level hot frames |\n| 7 | AMReX TinyProfiler — C++ function breakdown |\n| 8 | NVIDIA Nsight Systems — GPU kernel timeline |\n| 9 | Solver comparison and scaling validation |\n| 10 | `max_grid_size` tuning |\n| 11 | Native-binary comparison (optional) |\n| 12 | Summary and HPC recommendations |\n\nSections 1 – 6 run on any environment in a few minutes. Sections 7+ produce the most useful output on a Colab GPU runtime (T4 / A100 / L4)." }, { "cell_type": "markdown", @@ -72,10 +41,10 @@ " r = subprocess.run([\"nvidia-smi\", \"--query-gpu=name\", \"--format=csv,noheader\"],\n", " capture_output=True, text=True, timeout=10)\n", " gpu_name = r.stdout.strip()\n", - " print(f\"GPU detected: {gpu_name}\" if gpu_name else \"No GPU \u2014 CPU only.\")\n", + " print(f\"GPU detected: {gpu_name}\" if gpu_name else \"No GPU — CPU only.\")\n", "except FileNotFoundError:\n", " gpu_name = \"\"\n", - " print(\"nvidia-smi not found \u2014 CPU only.\")\n" + " print(\"nvidia-smi not found — CPU only.\")\n" ] }, { @@ -83,11 +52,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "!apt-get install -y libopenmpi-dev > /dev/null 2>&1\n", - "!pip install openimpala porespy py-spy > /dev/null 2>&1\n", - "print(\"Dependencies installed.\")\n" - ] + "source": "# OpenImpala ships two PyPI wheels:\n# openimpala — pure-Python + SciPy fallback (CPU only)\n# openimpala-cuda — compiled C++ HYPRE/AMReX backend with CUDA kernels\n#\n# On a GPU runtime we install the CUDA wheel; otherwise we fall back to the\n# pure-Python package.\n!apt-get install -y libopenmpi-dev > /dev/null 2>&1\nif gpu_name:\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda porespy py-spy\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala porespy py-spy\nprint(\"Dependencies installed.\")" }, { "cell_type": "code", @@ -121,98 +86,19 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 1a. Is the openimpala build actually using the GPU?\n", - "\n", - "Most \"OpenImpala is slow on Colab\" reports turn out to be the pip wheel\n", - "being **CPU-only + OpenMP** while a perfectly good T4 sits idle. We\n", - "probe the build by running a trivial solve in a subprocess with\n", - "`verbose=1` and inspecting the init banner.\n" - ] + "source": "## 1a. Build verification\n\nBefore profiling anything, confirm that the wheel `pip` installed actually has the GPU backend you expect. The most common cause of \"OpenImpala feels slow on Colab\" is a CPU-only wheel running on a GPU-equipped host — every solve in the rest of the notebook would be CPU-bound and ~10–50× slower than necessary on a single-phase 3D diffusion problem at 128³+.\n\n`oi.build_info()` returns a dict of compile-time feature flags plus the runtime GPU device count. We check that the backend matches the hardware, and warn loudly if it doesn't." 
}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Use oi.build_info() when the wheel provides it (4.0.2+). Older wheels\n", - "# fall back to the subprocess banner probe so the notebook still works.\n", - "backend = None\n", - "info = None\n", - "try:\n", - " info = oi.build_info()\n", - " backend = info[\"backend\"] # \"cpp-cuda\", \"cpp-hip\", \"cpp-cpu\", \"pure-python\"\n", - " print(f\"openimpala backend: {backend}\")\n", - " print(f\" version: {info.get('version', 'unknown')}\")\n", - " print(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\n", - " print(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\n", - " print(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\n", - " print(f\" TinyProfile: {info.get('tiny_profile', False)}\")\n", - " if info.get(\"gpu_device_count\", -1) > 0:\n", - " print(f\" GPU devices: {info['gpu_device_count']}\")\n", - "except AttributeError:\n", - " # Pre-4.0.2 wheel \u2014 no build_info(). Fall back to banner probe.\n", - " import subprocess, tempfile, os, textwrap\n", - " probe = textwrap.dedent(\"\"\"\n", - " import numpy as np, openimpala as oi\n", - " arr = np.zeros((8, 8, 8), dtype=np.int32)\n", - " with oi.Session():\n", - " try: oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n", - " except Exception as e: print(f'__ERR__ {e}')\n", - " \"\"\").strip()\n", - " tmp = tempfile.mkdtemp()\n", - " path = os.path.join(tmp, \"probe.py\")\n", - " with open(path, \"w\") as f:\n", - " f.write(probe)\n", - " r = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=90)\n", - " banner = (r.stdout or \"\") + (r.stderr or \"\")\n", - " low = banner.lower()\n", - " if \"cuda\" in low or (\"gpu\" in low and \"no gpu\" not in low):\n", - " backend = \"cpp-cuda\"\n", - " elif \"hip\" in low and \"rocm\" in low:\n", - " backend = \"cpp-hip\"\n", - " else:\n", - " backend = \"cpp-cpu\"\n", - " print(f\"openimpala backend (banner probe): {backend}\")\n", - " print(\"(install openimpala 4.0.2+ for a proper build_info() API)\")\n", - "\n", - "is_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n", - "\n", - "if gpu_name and not is_gpu_build:\n", - " print()\n", - " print(\"=\" * 70)\n", - " print(\" \u26a0 CPU-ONLY BUILD DETECTED ON A GPU-EQUIPPED HOST\")\n", - " print(\"=\" * 70)\n", - " print(f\" Host GPU: {gpu_name}\")\n", - " print(f\" openimpala build: {backend}\")\n", - " print()\n", - " print(\" Your GPU is idle. 
Every solve in this notebook is CPU-bound.\")\n", - " print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build is\")\n", - " print(\" typically 10-50x faster \u2014 this is the dominant source of\")\n", - " print(\" 'OpenImpala feels slow on Colab'.\")\n", - " print()\n", - " print(\" Fix: install the CUDA wheel instead.\")\n", - " print(\" !pip uninstall -y openimpala\")\n", - " print(\" !pip install openimpala-cuda\")\n", - " print(\" Then Runtime -> Restart session and re-run from the top.\")\n", - " print(\"=\" * 70)\n", - "elif is_gpu_build:\n", - " print(f\"\\n\u2713 {backend.upper()} build active on {gpu_name or 'accelerator'}\")\n", - "else:\n", - " print(\"\\nCPU-only run (no accelerator detected).\")\n" - ] + "source": "info = oi.build_info()\nbackend = info[\"backend\"] # one of: cpp-cuda, cpp-hip, cpp-cpu, pure-python\n\nprint(f\"openimpala backend: {backend}\")\nprint(f\" version: {info.get('version', 'unknown')}\")\nprint(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\nprint(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\nprint(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\nprint(f\" TinyProfile: {info.get('tiny_profile', False)}\")\nif info.get(\"gpu_device_count\", -1) > 0:\n print(f\" GPU devices: {info['gpu_device_count']}\")\n\nis_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n\nif gpu_name and not is_gpu_build:\n print()\n print(\"=\" * 70)\n print(\" WARNING: CPU-only build detected on a GPU-equipped host\")\n print(\"=\" * 70)\n print(f\" Host GPU: {gpu_name}\")\n print(f\" openimpala build: {backend}\")\n print()\n print(\" The GPU is idle. Every solve in this notebook will run on the CPU,\")\n print(\" which is typically 10–50× slower than the CUDA build for\")\n print(\" single-phase 3D diffusion on 128³+ grids.\")\n print()\n print(\" To fix:\")\n print(\" !pip uninstall -y openimpala\")\n print(\" !pip install openimpala-cuda\")\n print(\" Then Runtime → Restart session and re-run from the top.\")\n print(\"=\" * 70)\nelif is_gpu_build:\n print(f\"\\n{backend.upper()} build active on {gpu_name or 'accelerator'}.\")\nelse:\n print(\"\\nCPU-only run (no accelerator detected).\")" }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 2. Synthetic datasets\n", - "\n", - "Small sizes chosen so a full profiling run completes in minutes, not hours.\n", - "`data_micro` (16\u00b3) is tiny enough that its solve time is ~all overhead \u2014 we\n", - "use it to extract the per-call fixed cost.\n" - ] + "source": "## 2. Synthetic datasets\n\nPoreSpy is used to generate five synthetic porous structures at 16³, 32³, 64³, 96³, and 128³. Sizes are kept small enough that a full profiling run completes in minutes rather than hours, and large enough that the cubic scaling of voxel count is visible in the timings.\n\nThe 16³ dataset is small enough that its solve time is dominated by per-call setup rather than compute — useful for extracting the fixed overhead in §4." }, { "cell_type": "code", @@ -239,14 +125,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Python-level stage breakdown \u2014 one call, one bar chart\n", + "## 3. Python-level stage breakdown — one call, one bar chart\n", "\n", "`oi.tortuosity(numpy_array, ...)` is not a single operation. From\n", "`facade.py` it is roughly:\n", "\n", "```\n", - "numpy \u2192 VoxelImage.from_numpy (copy + AMReX BoxArray/Geometry/iMultiFab build)\n", - "PercolationCheck(img, ...) 
(flood fill \u2014 runs every call!)\n", + "numpy → VoxelImage.from_numpy (copy + AMReX BoxArray/Geometry/iMultiFab build)\n", + "PercolationCheck(img, ...) (flood fill — runs every call!)\n", "VolumeFraction(img, ...) (cheap count)\n", "TortuosityHypre(img, ..., solver) (HYPRE grid/stencil/matrix assembly)\n", "solver.value() (actual linear solve + flux integration)\n", @@ -254,7 +140,7 @@ "\n", "We time each stage independently. Whichever bar dominates *is* the\n", "bottleneck. If `from_numpy` or `TortuosityHypre(...)` construction dominates,\n", - "your slowdown is in binding/setup \u2014 not the solver.\n" + "your slowdown is in binding/setup — not the solver.\n" ] }, { @@ -307,7 +193,7 @@ " out[\"_iters\"] = solver_obj.iterations\n", " return out\n", "\n", - "# Warm-up \u2014 first call pays lazy-init inside AMReX/HYPRE\n", + "# Warm-up — first call pays lazy-init inside AMReX/HYPRE\n", "_ = time_stages(datasets[32])\n", "\n", "stages = time_stages(data_medium, solver=\"pcg\")\n", @@ -337,24 +223,24 @@ "ax.set_yticks(y)\n", "ax.set_yticklabels(stage_names, fontsize=9)\n", "ax.set_xlabel(\"Wall time (s)\")\n", - "ax.set_title(\"Single oi.tortuosity(64\u00b3) call \u2014 stage breakdown\", fontweight=\"bold\")\n", + "ax.set_title(\"Single oi.tortuosity(64³) call — stage breakdown\", fontweight=\"bold\")\n", "ax.invert_yaxis()\n", "plt.tight_layout()\n", "plt.show()\n", "\n", "slowest = max(stage_names, key=lambda k: stages[k])\n", - "print(f\"\\nBottleneck at 64\u00b3: {slowest} ({100*stages[slowest]/stages['_total']:.1f}%)\")\n" + "print(f\"\\nBottleneck at 64³: {slowest} ({100*stages[slowest]/stages['_total']:.1f}%)\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### \u00a73b Per-stage scaling \u2014 does this stage scale with N, or is it fixed overhead?\n", + "### §3b Per-stage scaling — does this stage scale with N, or is it fixed overhead?\n", "\n", - "Run the same decomposition at 32\u00b3 and 128\u00b3 and plot the ratio per stage.\n", - "A stage with ratio \u2248 `(128/32)\u00b3 = 64\u00d7` is doing O(N) work \u2014 legitimate\n", - "compute. A stage with ratio \u2248 1\u00d7 is constant overhead \u2014 every call pays\n", + "Run the same decomposition at 32³ and 128³ and plot the ratio per stage.\n", + "A stage with ratio ≈ `(128/32)³ = 64×` is doing O(N) work — legitimate\n", + "compute. 
A stage with ratio ≈ 1× is constant overhead — every call pays\n", "the same price regardless of problem size, which is the *classic* signature\n", "of binding / setup cost.\n" ] @@ -370,8 +256,8 @@ "\n", "# Ideal O(N) ratio when scaling voxel count by (128/32)^3\n", "N_ratio = (128 / 32) ** 3\n", - "print(f\"Ideal O(N) work ratio for 32->128: {N_ratio:.0f}\u00d7\")\n", - "print(f\"{'stage':<55s} {'32\u00b3 (s)':>10s} {'128\u00b3 (s)':>10s} {'ratio':>8s} behaviour\")\n", + "print(f\"Ideal O(N) work ratio for 32->128: {N_ratio:.0f}×\")\n", + "print(f\"{'stage':<55s} {'32³ (s)':>10s} {'128³ (s)':>10s} {'ratio':>8s} behaviour\")\n", "print(\"-\" * 110)\n", "\n", "rows = []\n", @@ -379,13 +265,13 @@ " ts, tb = stages_small[k], stages_big[k]\n", " r = tb / max(ts, 1e-9)\n", " if r > 0.5 * N_ratio:\n", - " label = \"scales O(N) \u2713\"\n", + " label = \"scales O(N) ✓\"\n", " elif r < 2.0:\n", " label = \"~constant (overhead)\"\n", " else:\n", " label = \"sub-linear\"\n", " rows.append({\"stage\": k, \"t_32\": ts, \"t_128\": tb, \"ratio\": r, \"label\": label})\n", - " print(f\"{k:<55s} {ts:>10.3f} {tb:>10.3f} {r:>8.1f}\u00d7 {label}\")\n", + " print(f\"{k:<55s} {ts:>10.3f} {tb:>10.3f} {r:>8.1f}× {label}\")\n", "\n", "df_scale = pd.DataFrame(rows)\n" ] @@ -399,15 +285,15 @@ "fig, ax = plt.subplots(figsize=(11, 4))\n", "y = np.arange(len(stage_names))\n", "w = 0.4\n", - "ax.barh(y - w/2, df_scale[\"t_32\"], w, color=\"#3498db\", alpha=0.85, label=\"32\u00b3\")\n", - "ax.barh(y + w/2, df_scale[\"t_128\"], w, color=\"#e74c3c\", alpha=0.85, label=\"128\u00b3\")\n", + "ax.barh(y - w/2, df_scale[\"t_32\"], w, color=\"#3498db\", alpha=0.85, label=\"32³\")\n", + "ax.barh(y + w/2, df_scale[\"t_128\"], w, color=\"#e74c3c\", alpha=0.85, label=\"128³\")\n", "for i, r in enumerate(df_scale[\"ratio\"]):\n", - " ax.text(df_scale[\"t_128\"].iloc[i] * 1.02, i + w/2, f\"{r:.0f}\u00d7\",\n", + " ax.text(df_scale[\"t_128\"].iloc[i] * 1.02, i + w/2, f\"{r:.0f}×\",\n", " va=\"center\", fontsize=8, color=\"#2c3e50\")\n", "ax.set_yticks(y)\n", "ax.set_yticklabels(stage_names, fontsize=9)\n", "ax.set_xlabel(\"Wall time (s)\")\n", - "ax.set_title(f\"Per-stage scaling (ideal O(N) ratio = {N_ratio:.0f}\u00d7)\", fontweight=\"bold\")\n", + "ax.set_title(f\"Per-stage scaling (ideal O(N) ratio = {N_ratio:.0f}×)\", fontweight=\"bold\")\n", "ax.axvline(0, color=\"#bdc3c7\", linewidth=0.5)\n", "ax.invert_yaxis()\n", "ax.legend(loc=\"lower right\")\n", @@ -416,9 +302,9 @@ "\n", "constant_stages = df_scale[df_scale[\"ratio\"] < 2.0]\n", "if not constant_stages.empty:\n", - " print(\"\\n\u26a0 Stages that DON'T scale with problem size (pure overhead):\")\n", + " print(\"\\n⚠ Stages that DON'T scale with problem size (pure overhead):\")\n", " for _, r in constant_stages.iterrows():\n", - " print(f\" {r['stage']:55s} ratio={r['ratio']:.1f}\u00d7\")\n", + " print(f\" {r['stage']:55s} ratio={r['ratio']:.1f}×\")\n", " print(\" Every oi.tortuosity() call pays these, regardless of image size.\")\n" ] }, @@ -430,7 +316,7 @@ "\n", "A call's wall time has two parts: a fixed per-call overhead `a` (binding\n", "dispatch, AMReX/HYPRE setup, flood fill) and a size-dependent compute part\n", - "`b\u00b7N\u1d56`. Fit `t(N) = a + b\u00b7N\u1d56` across sizes.\n", + "`b·Nᵖ`. 
Fit `t(N) = a + b·Nᵖ` across sizes.\n", "\n", "If `a` is a meaningful fraction of the medium-size solve time, every solve\n", "you run is paying a tax that has nothing to do with the actual physics.\n" @@ -444,7 +330,7 @@ "source": [ "overhead_rows = []\n", "for n, d in datasets.items():\n", - " # Two trials \u2014 median to reduce jitter without paying for 3\u00d7\n", + " # Two trials — median to reduce jitter without paying for 3×\n", " ts = []\n", " for _ in range(2):\n", " t0 = time.perf_counter()\n", @@ -479,7 +365,7 @@ " print(f\"Fit failed: {e}\")\n", " a_raw, b_fit, p_fit = T.min(), 0.0, 1.0\n", "\n", - "# Clamp: a negative intercept means there is no meaningful fixed overhead \u2014\n", + "# Clamp: a negative intercept means there is no meaningful fixed overhead —\n", "# pretending otherwise misleads the summary.\n", "a_fit = max(a_raw, 0.0)\n", "overhead_meaningful = a_fit > 0.01 * T.max()\n", @@ -490,10 +376,10 @@ "ax.loglog(N, T, \"o\", color=\"#2c3e50\", markersize=10, label=\"Measured\")\n", "xs = np.logspace(np.log10(N.min()), np.log10(N.max()), 200)\n", "ax.loglog(xs, model(xs, a_fit, b_fit, p_fit), \"-\", color=\"#3498db\",\n", - " label=f\"Fit: {a_fit:.2f}s + {b_fit:.2e}\u00b7N^{p_fit:.2f}\")\n", + " label=f\"Fit: {a_fit:.2f}s + {b_fit:.2e}·N^{p_fit:.2f}\")\n", "if overhead_meaningful:\n", " ax.axhline(a_fit, color=\"#e74c3c\", linestyle=\"--\",\n", - " label=f\"Fixed overhead \u2248 {a_fit:.2f}s\")\n", + " label=f\"Fixed overhead ≈ {a_fit:.2f}s\")\n", "ax.set_xlabel(\"Voxels (N)\")\n", "ax.set_ylabel(\"Wall time (s)\")\n", "ax.set_title(\"Fixed overhead vs compute\", fontweight=\"bold\")\n", @@ -502,13 +388,13 @@ "plt.show()\n", "\n", "if a_raw < 0:\n", - " print(f\"\\nFit intercept was negative ({a_raw:.2f}s) \u2014 clamped to 0.\")\n", + " print(f\"\\nFit intercept was negative ({a_raw:.2f}s) — clamped to 0.\")\n", " print(\"Interpretation: no detectable per-call fixed overhead; all time is compute.\")\n", "print(f\"Estimated fixed overhead per call: {a_fit:.2f}s\")\n", "print(f\"Compute scaling: O(N^{p_fit:.2f})\")\n", "\n", "if p_fit > 1.2:\n", - " print(f\"\\n\u26a0 Super-linear compute scaling (O(N^{p_fit:.2f}) vs ideal O(N)).\")\n", + " print(f\"\\n⚠ Super-linear compute scaling (O(N^{p_fit:.2f}) vs ideal O(N)).\")\n", " print(\" Likely cause: Krylov iteration count growing with N. A multigrid\")\n", " print(\" preconditioner (SMG/PFMG) or MLMG would bring this closer to O(N).\")\n", "\n", @@ -519,17 +405,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 5. Amortization test \u2014 reuse VoxelImage across calls\n", - "\n", - "If most time is spent rebuilding AMReX/HYPRE state per call, then calling\n", - "the solver several times on the **same** `VoxelImage` should get cheaper\n", - "after the first call (matrix is already assembled inside the solver).\n", - "\n", - "We bypass `oi.tortuosity` and hit `_core.TortuosityHypre` directly so the\n", - "`VoxelImage` is reused. Compare this to calling `oi.tortuosity(numpy, ...)`\n", - "five times \u2014 each rebuild pays the full overhead.\n" - ] + "source": "## 5. Amortization test — reusing the VoxelImage\n\nIf most per-call time goes into rebuilding AMReX state, then running the solver multiple times against the *same* `VoxelImage` should be markedly cheaper than the equivalent number of calls through the high-level facade. This cell measures the gap.\n\nTwo paths are compared on `data_medium`, five calls each:\n\n* **Naive**: five calls to `oi.tortuosity(numpy_array, ...)`. 
Every call rebuilds the geometry, BoxArray, distribution mapping, iMultiFab, percolation mask, and HYPRE matrix from the bare NumPy input.\n* **Amortized**: one call to `_numpy_to_voxelimage` to build the `VoxelImage` once, then five direct calls to `_core.TortuosityHypre` reusing it.\n\nThe difference is the per-call setup tax. Sweeping (e.g. directional tortuosity, parameter studies) should always use the amortized pattern." }, { "cell_type": "code", @@ -540,7 +416,7 @@ "from openimpala.facade import _numpy_to_voxelimage, _parse_direction, _parse_solver\n", "\n", "def naive_repeat(data, n=5, solver=\"pcg\"):\n", - " \"\"\"Top-level facade \u2014 rebuilds everything every call.\"\"\"\n", + " \"\"\"Top-level facade — rebuilds everything every call.\"\"\"\n", " ts = []\n", " for _ in range(n):\n", " t0 = time.perf_counter()\n", @@ -568,11 +444,11 @@ "naive = naive_repeat(data_medium, n_repeat)\n", "amort = amortized_repeat(data_medium, n_repeat)\n", "\n", - "print(f\"Naive oi.tortuosity(numpy) \u00d7 {n_repeat}: \" + \", \".join(f\"{t:.2f}s\" for t in naive))\n", + "print(f\"Naive oi.tortuosity(numpy) × {n_repeat}: \" + \", \".join(f\"{t:.2f}s\" for t in naive))\n", "print(f\"Amortized (reuse VoxelImage): \" + \", \".join(f\"{t:.2f}s\" for t in amort))\n", "print(f\"\\nNaive total: {sum(naive):.2f}s (mean {np.mean(naive):.2f}s/call)\")\n", "print(f\"Amortized total: {sum(amort):.2f}s (mean {np.mean(amort):.2f}s/call)\")\n", - "print(f\"Speedup from reuse: {sum(naive)/max(sum(amort), 1e-9):.2f}\u00d7\")\n" + "print(f\"Speedup from reuse: {sum(naive)/max(sum(amort), 1e-9):.2f}×\")\n" ] }, { @@ -602,20 +478,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 6. cProfile \u2014 Python-level hottest frames\n", - "\n", - "`cProfile` shows cumulative time in Python frames, including time spent\n", - "inside pybind11 wrappers. Look for big chunks in:\n", - "\n", - "- `_numpy_to_voxelimage` or `VoxelImage.from_numpy` \u2192 data copy cost\n", - "- `PercolationCheck.__init__` \u2192 flood-fill dominates\n", - "- `TortuosityHypre.__init__` \u2192 matrix assembly dominates\n", - "- `TortuosityHypre.value` \u2192 the actual solve\n", - "\n", - "Anything above these (e.g. `_parse_direction`, `_parse_solver`) appearing\n", - "in the top 20 indicates pure Python dispatch waste.\n" - ] + "source": "## 6. cProfile — Python-level hot frames\n\ncProfile captures cumulative time per Python frame, including time spent inside pybind11 wrappers around C++ calls. This is a different view from §3 — instead of timing named stages, it counts every function call and surfaces unexpected frame costs.\n\nThings to look for in the top 20 frames:\n\n* `_numpy_to_voxelimage`, `VoxelImage.from_numpy` — data ingestion\n* `PercolationCheck.__init__` — flood-fill setup\n* `TortuosityHypre.__init__` — HYPRE matrix assembly\n* `TortuosityHypre.value` — the linear solve\n\nIf pure-Python helpers like `_parse_direction` or `_parse_solver` show up high, the binding is doing too much Python-side dispatch." }, { "cell_type": "code", @@ -638,24 +501,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 7. AMReX TinyProfiler \u2014 C++ internal breakdown\n", - "\n", - "Now we go *below* the binding layer. 
AMReX's TinyProfiler instruments\n", - "every `BL_PROFILE`-annotated function in the C++ source and prints a table\n", - "when the session finalizes.\n", - "\n", - "This is especially important given what \u00a73 just showed us: `solver.value()`\n", - "is ~96% of wall time at 64\u00b3 and scales super-linearly to 128\u00b3. TinyProfiler\n", - "will break that down into `TortuosityHypre::solve`, `setupMatrixEquation`,\n", - "`preconditionPhaseFab`, `global_fluxes`, `computePlaneFluxes`, etc.\n", - "\n", - "**Implementation note**: TinyProfiler prints via C++ streams on AMReX\n", - "finalize, which Python's `sys.stdout` redirection cannot capture (and\n", - "recycling a Session inside the same process is fragile). We run the solve\n", - "in a short-lived **subprocess** \u2014 Python's `capture_output=True` grabs\n", - "fd 1 + fd 2 at the OS level, so we get everything.\n" - ] + "source": "## 7. AMReX TinyProfiler — C++ function breakdown\n\n§3 showed that `solver.value()` accounts for the bulk of wall time at 64³ and that it scales super-linearly to 128³. To break that down further we need to go below the binding layer, into AMReX's own profiler.\n\n`AMReX::TinyProfiler` instruments every `BL_PROFILE`-annotated function in the C++ source and prints a tabular breakdown when AMReX finalises. The compiled wheel sets `-DAMReX_TINY_PROFILE=ON`, so the instrumentation is always on.\n\nThe profiler writes to C++ streams on `amrex::Finalize()`, which Python's `sys.stdout` redirection can't capture from inside a long-running session. We therefore run a single profiled solve in a short-lived subprocess — Python's `capture_output=True` collects stdout and stderr at the OS level and gives us the complete table." }, { "cell_type": "code", @@ -671,7 +517,7 @@ " Why a subprocess: TinyProfiler prints via C++ streams on AMReX finalize,\n", " which sys.stdout redirection can't capture. 
A subprocess gives us a clean\n", " AMReX lifecycle and Python's ``capture_output=True`` grabs fd 1 + fd 2\n", - " at the OS level \u2014 including everything TinyProfiler emits.\n", + " at the OS level — including everything TinyProfiler emits.\n", "\n", " Returns (result_dict, combined_stdout_stderr).\n", " \"\"\"\n", @@ -712,15 +558,15 @@ " break\n", "\n", " if proc.returncode != 0:\n", - " print(f\"\u26a0 subprocess exited with code {proc.returncode}\")\n", + " print(f\"⚠ subprocess exited with code {proc.returncode}\")\n", " return result, combined\n", "\n", - "print(\"Running a profiled solve on 64^3 in a subprocess\u2026\")\n", + "print(\"Running a profiled solve on 64^3 in a subprocess…\")\n", "prof_result, prof_output = run_with_profiler(data_medium, solver=\"pcg\")\n", "if prof_result is not None:\n", " print(f\"tau={prof_result['tau']:.4f} converged={prof_result['converged']} iters={prof_result['iters']}\")\n", "else:\n", - " print(\"No result line found \u2014 solve may have failed; check output below.\")\n", + " print(\"No result line found — solve may have failed; check output below.\")\n", "print(f\"Captured output length: {len(prof_output)} chars\")\n", "\n", "# Show the tail (TinyProfiler dumps near the end, on finalize)\n", @@ -774,7 +620,7 @@ " print(df_prof.to_string(index=False))\n", "elif not has_tp_header:\n", " print(\"=\" * 70)\n", - " print(\" \u26a0 No TinyProfiler table was emitted by this openimpala build\")\n", + " print(\" ⚠ No TinyProfiler table was emitted by this openimpala build\")\n", " print(\"=\" * 70)\n", " print(\" The subprocess ran to completion (result captured above) but\")\n", " print(\" AMReX did not dump a TinyProfiler table on finalize. This means\")\n", @@ -784,13 +630,13 @@ " print(\" CMake: -DAMReX_TINY_PROFILE=ON\")\n", " print(\" GNUmake: TINY_PROFILE=TRUE\")\n", " print()\n", - " print(\" Until then \u00a77 cannot break down the C++ solve into components.\")\n", - " print(\" \u00a73 (Python stage breakdown) + \u00a73b (per-stage scaling) already\")\n", + " print(\" Until then §7 cannot break down the C++ solve into components.\")\n", + " print(\" §3 (Python stage breakdown) + §3b (per-stage scaling) already\")\n", " print(\" show that solver.value() is ~85-95% of wall time and scales\")\n", - " print(\" super-linearly \u2014 that IS the bottleneck regardless.\")\n", + " print(\" super-linearly — that IS the bottleneck regardless.\")\n", " print(\"=\" * 70)\n", "else:\n", - " print(\"TinyProfiler header detected but no rows parsed \u2014 regex mismatch.\")\n", + " print(\"TinyProfiler header detected but no rows parsed — regex mismatch.\")\n", " print(\"Dump tail of raw output above and adjust parse_tiny_profiler().\")\n" ] }, @@ -824,7 +670,7 @@ " ax.set_yticks(y)\n", " ax.set_yticklabels(df_prof[\"function\"], fontsize=9)\n", " ax.set_xlabel(\"Exclusive time (s)\")\n", - " ax.set_title(\"AMReX TinyProfiler \u2014 C++ function breakdown\", fontweight=\"bold\")\n", + " ax.set_title(\"AMReX TinyProfiler — C++ function breakdown\", fontweight=\"bold\")\n", " ax.invert_yaxis()\n", "\n", " from matplotlib.patches import Patch\n", @@ -839,22 +685,22 @@ " plt.tight_layout()\n", " plt.show()\n", "else:\n", - " print(\"Skipping chart \u2014 no TinyProfiler data.\")\n" + " print(\"Skipping chart — no TinyProfiler data.\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 8. NVIDIA Nsight Systems \u2014 GPU kernel timeline\n", + "## 8. 
NVIDIA Nsight Systems — GPU kernel timeline\n", "\n", "If a GPU is available, `nsys` captures a kernel-level timeline: launches,\n", "memory transfers, idle gaps. Signatures of Python-binding overhead on the\n", "GPU path are:\n", "\n", - "- Many tiny launches with gaps \u2192 Python loop driving the GPU\n", - "- Frequent H2D transfers of the same data \u2192 `numpy \u2192 VoxelImage` copies leaking out\n", - "- Long idle gaps between kernels \u2192 CPU-side serial work between solver calls\n", + "- Many tiny launches with gaps → Python loop driving the GPU\n", + "- Frequent H2D transfers of the same data → `numpy → VoxelImage` copies leaking out\n", + "- Long idle gaps between kernels → CPU-side serial work between solver calls\n", "\n", "We use a standalone script so the subprocess is clean.\n" ] @@ -870,9 +716,9 @@ "\n", "if not nsys_available:\n", " if not gpu_name:\n", - " print(\"No GPU \u2014 skipping Nsight. (Runtime > Change runtime type > T4 GPU)\")\n", + " print(\"No GPU — skipping Nsight. (Runtime > Change runtime type > T4 GPU)\")\n", " else:\n", - " print(\"nsys not found \u2014 install via: apt-get install nsight-systems\")\n", + " print(\"nsys not found — install via: apt-get install nsight-systems\")\n", "else:\n", " print(f\"nsys: {shutil.which('nsys')} GPU: {gpu_name}\")\n", " script = '''\n", @@ -943,9 +789,9 @@ " print(\"CUDA memory transfer summary (H2D/D2H):\")\n", " print(df_mem.to_string(index=False))\n", " print(\"\\nRed flags:\")\n", - " print(\" \u2022 Frequent H2D transfers of similar size \u2192 numpy\u2192VoxelImage\")\n", + " print(\" • Frequent H2D transfers of similar size → numpy→VoxelImage\")\n", " print(\" copy leaking out of bindings into the hot path\")\n", - " print(\" \u2022 Large D2H transfer per call \u2192 result readback overhead\")\n", + " print(\" • Large D2H transfer per call → result readback overhead\")\n", " else:\n", " print(mem.stdout[:1500])\n", " else:\n", @@ -955,13 +801,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 9. Solver comparison *(focused, post-diagnosis)*\n", - "\n", - "Now that you know where time goes, compare HYPRE solvers on `data_medium`\n", - "at a single grid size. `n_repeats=1` \u2014 this sweep is for picking a solver,\n", - "not measuring steady-state noise.\n" - ] + "source": "## 9. Solver comparison\n\nWith the bottlenecks identified, this section compares HYPRE solver / preconditioner combinations on `data_medium` at a single grid size. Each solver runs once — the goal is to pick a baseline configuration, not to measure steady-state noise. Repeat sweeps belong in `tests/benchmarks/`.\n\nThe combinations cover the textbook choices for symmetric Poisson-like operators (PCG with PFMG/SMG multigrid preconditioning), non-symmetric Krylov methods (GMRES, FlexGMRES, BiCGSTAB), and standalone multigrid (PFMG, SMG, AMReX MLMG)." 
}, { "cell_type": "code", @@ -1000,9 +840,9 @@ " \"tau\": res.tortuosity, \"ok\": res.solver_converged})\n", " print(f\" {label:18s} t={dt:6.2f}s iters={res.iterations:4d} tau={res.tortuosity:.4f}\")\n", " except TypeError as e:\n", - " # Older wheels don't know the 'preconditioner' kwarg \u2014 fall back without it.\n", + " # Older wheels don't know the 'preconditioner' kwarg — fall back without it.\n", " if \"preconditioner\" in str(e) and pc is not None:\n", - " print(f\" {label:18s} SKIP \u2014 wheel predates preconditioner plumbing\")\n", + " print(f\" {label:18s} SKIP — wheel predates preconditioner plumbing\")\n", " records.append({\"solver\": label, \"t\": np.nan, \"iters\": np.nan,\n", " \"tau\": np.nan, \"ok\": False})\n", " continue\n", @@ -1010,7 +850,7 @@ " except Exception as e:\n", " records.append({\"solver\": label, \"t\": np.nan, \"iters\": np.nan,\n", " \"tau\": np.nan, \"ok\": False})\n", - " print(f\" {label:18s} FAILED \u2014 {e}\")\n", + " print(f\" {label:18s} FAILED — {e}\")\n", "\n", "df_solvers = pd.DataFrame(records)\n", "df_solvers\n" @@ -1033,11 +873,11 @@ "ax1.set_yticks(y)\n", "ax1.set_yticklabels(df_solvers[\"solver\"])\n", "ax1.set_xlabel(\"Wall time (s)\", color=\"#2c3e50\")\n", - "ax1.set_title(f\"Solver comparison \u2014 64\u00b3, best by wall time: {best_solver}\",\n", + "ax1.set_title(f\"Solver comparison — 64³, best by wall time: {best_solver}\",\n", " fontweight=\"bold\")\n", "ax1.invert_yaxis()\n", "\n", - "# Overlay iteration counts on a twin axis \u2014 this is the key diagnostic.\n", + "# Overlay iteration counts on a twin axis — this is the key diagnostic.\n", "# A solver with few iterations but expensive each (multigrid) can still\n", "# win overall; wall time alone hides that.\n", "ax2 = ax1.twiny()\n", @@ -1065,7 +905,7 @@ "plt.tight_layout()\n", "plt.show()\n", "\n", - "# Flag solvers with few iterations \u2014 likely multigrid/preconditioned.\n", + "# Flag solvers with few iterations — likely multigrid/preconditioned.\n", "# If one converged in <10 iterations but is slower per iteration, it may\n", "# still be the better choice at larger problem sizes.\n", "low_iter = ok[ok[\"iters\"] < 10]\n", @@ -1075,25 +915,13 @@ " print(f\" {r['solver']:10s} {int(r['iters'])} iters {r['t']:.2f}s\")\n", " print(\" These scale O(N) and will overtake PCG at larger grids.\")\n", "\n", - "print(f\"\\nBest solver at 64\u00b3 by wall time: {best_solver}\")\n" + "print(f\"\\nBest solver at 64³ by wall time: {best_solver}\")\n" ] }, { "cell_type": "markdown", "metadata": {}, - "source": [ - "### 9b. Scaling validation \u2014 does the multigrid preconditioner break super-linear PCG?\n", - "\n", - "\u00a73b showed plain PCG scales as `t(N) ~ N^1.76` \u2014 that's the root cause of\n", - "slow large-grid solves, not any Python/binding overhead. A multigrid\n", - "preconditioner should restore near-`O(N)` scaling. The issue #256\n", - "acceptance target is **p < 1.2** at 64\u00b3 \u2192 128\u00b3.\n", - "\n", - "Fits `t = a\u00b7N^p` across 32\u00b3 / 64\u00b3 / 96\u00b3 / 128\u00b3 for three representative\n", - "choices: plain PCG, PCG+PFMG (Krylov + multigrid preconditioner), and MLMG\n", - "(AMReX-native geometric multigrid). Uses a uniform phase-0 block so each\n", - "solve is cheap and always percolates.\n" - ] + "source": "### 9b. Scaling validation — does the multigrid preconditioner restore O(N) compute?\n\n§3b found plain PCG scaling as `t(N) ~ N^1.76` — the dominant source of slow large-grid solves, well above any Python or binding overhead. 
Theory predicts that pairing PCG with a geometric multigrid preconditioner (PFMG or SMG) drops the iteration count to a near-constant, restoring near-linear `O(N)` scaling overall. AMReX's native MLMG is also `O(N)` by construction.\n\nThis cell fits `t = a · N^p` across 32³, 64³, 96³, and 128³ for three representative configurations:\n\n* **`pcg`** — plain conjugate gradient (the baseline, super-linear)\n* **`pcg+pfmg`** — PCG with HYPRE's PFMG preconditioner\n* **`mlmg`** — AMReX's native geometric multigrid\n\nThe test problem is a uniform phase-0 block (always percolates, trivially convergent) so each solve is cheap and the differences come purely from the algorithmic scaling.\n\nA `p` value below ~1.2 indicates near-linear scaling; the plain PCG baseline typically lands around `p ≈ 1.7–1.8`." }, { "cell_type": "code", @@ -1111,7 +939,7 @@ "scaling_records = []\n", "for label, kw in scaling_combos:\n", " for N in scaling_sizes:\n", - " arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 \u2014 always percolates\n", + " arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 — always percolates\n", " try:\n", " t0 = time.perf_counter()\n", " res = oi.tortuosity(arr, phase=0, direction=\"z\",\n", @@ -1122,10 +950,10 @@ " \"ok\": bool(res.solver_converged)})\n", " print(f\" {label:10s} {N:3d}^3 t={dt:7.3f}s iters={res.iterations:4d}\")\n", " except TypeError as e:\n", - " # Wheel predates the preconditioner kwarg \u2014 skip only those rows.\n", + " # Wheel predates the preconditioner kwarg — skip only those rows.\n", " msg = str(e)\n", " if (\"preconditioner\" in msg) and (\"preconditioner\" in kw):\n", - " print(f\" {label:10s} {N:3d}^3 SKIP \u2014 wheel predates preconditioner kwarg\")\n", + " print(f\" {label:10s} {N:3d}^3 SKIP — wheel predates preconditioner kwarg\")\n", " scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n", " \"iters\": 0, \"ok\": False})\n", " continue\n", @@ -1133,7 +961,7 @@ " except Exception as e:\n", " scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n", " \"iters\": 0, \"ok\": False})\n", - " print(f\" {label:10s} {N:3d}^3 FAILED \u2014 {e}\")\n", + " print(f\" {label:10s} {N:3d}^3 FAILED — {e}\")\n", "\n", "df_scale = pd.DataFrame(scaling_records)\n", "df_scale\n" @@ -1144,52 +972,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Fit t = a \u00b7 N^p in log-log per solver, then overlay the curves with an\n", - "# O(N) reference line and the O(N^1.76) baseline from \u00a73b.\n", - "fig, ax = plt.subplots(figsize=(9, 5.5))\n", - "palette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\n", - "fit_results = {}\n", - "for label, _ in scaling_combos:\n", - " sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n", - " if len(sub) < 2:\n", - " fit_results[label] = np.nan\n", - " continue\n", - " p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n", - " fit_results[label] = p\n", - " ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n", - " markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n", - "\n", - "# Reference lines \u2014 anchored at the first successful measurement\n", - "anchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\n", - "if not anchor.empty:\n", - " N_anchor = float(anchor.iloc[0][\"N\"])\n", - " t_anchor = float(anchor.iloc[0][\"t\"])\n", - " Ns = np.array(scaling_sizes, dtype=float)\n", - " 
ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n", - " \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n", - " ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n", - " \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (\u00a73b baseline)\")\n", - "\n", - "ax.set_xlabel(\"Grid size N (domain = N^3)\")\n", - "ax.set_ylabel(\"Wall time (s)\")\n", - "ax.set_title(\"Scaling validation \u2014 multigrid vs. plain Krylov\",\n", - " fontweight=\"bold\")\n", - "ax.legend(loc=\"upper left\", framealpha=0.9)\n", - "ax.grid(True, which=\"both\", alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()\n", - "\n", - "# Verdict against acceptance target (issue #256: p < 1.2)\n", - "print(\"\\nScaling exponents vs. acceptance target p < 1.2:\")\n", - "for label, p in fit_results.items():\n", - " if np.isnan(p):\n", - " print(f\" {label:10s} n/a (insufficient data)\")\n", - " elif p < 1.2:\n", - " print(f\" {label:10s} p = {p:.2f} \u2713 near-linear \u2014 target met\")\n", - " else:\n", - " print(f\" {label:10s} p = {p:.2f} \u2717 super-linear \u2014 target NOT met\")\n" - ] + "source": "# Fit t = a · N^p in log-log per solver, then overlay the curves with an\n# O(N) reference line and the O(N^1.76) baseline from §3b.\nfig, ax = plt.subplots(figsize=(9, 5.5))\npalette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\nfit_results = {}\nfor label, _ in scaling_combos:\n sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n if len(sub) < 2:\n fit_results[label] = np.nan\n continue\n p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n fit_results[label] = p\n ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n\n# Reference lines — anchored at the first successful measurement\nanchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\nif not anchor.empty:\n N_anchor = float(anchor.iloc[0][\"N\"])\n t_anchor = float(anchor.iloc[0][\"t\"])\n Ns = np.array(scaling_sizes, dtype=float)\n ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (§3b baseline)\")\n\nax.set_xlabel(\"Grid size N (domain = N^3)\")\nax.set_ylabel(\"Wall time (s)\")\nax.set_title(\"Scaling validation — multigrid vs. plain Krylov\",\n fontweight=\"bold\")\nax.legend(loc=\"upper left\", framealpha=0.9)\nax.grid(True, which=\"both\", alpha=0.3)\nplt.tight_layout()\nplt.show()\n\n# Verdict against the near-linear target (p < 1.2)\nprint(\"\\nScaling exponents vs. near-linear target (p < 1.2):\")\nfor label, p in fit_results.items():\n if np.isnan(p):\n print(f\" {label:10s} n/a (insufficient data)\")\n elif p < 1.2:\n print(f\" {label:10s} p = {p:.2f} near-linear — target met\")\n else:\n print(f\" {label:10s} p = {p:.2f} super-linear — target NOT met\")" }, { "cell_type": "markdown", @@ -1197,7 +980,7 @@ "source": [ "## 10. `max_grid_size` tuning\n", "\n", - "Small boxes \u2192 better CPU cache + MPI parallelism. Large boxes \u2192 better\n", + "Small boxes → better CPU cache + MPI parallelism. Large boxes → better\n", "GPU kernel saturation. 
Sweep a few values with the best solver.\n" ] }, @@ -1224,7 +1007,7 @@ "ax.plot(df_grid[\"max_grid_size\"], df_grid[\"t\"], \"o-\", linewidth=2, markersize=9)\n", "ax.set_xlabel(\"max_grid_size\")\n", "ax.set_ylabel(\"Wall time (s)\")\n", - "ax.set_title(f\"Grid sweep \u2014 64\u00b3, solver={best_solver}\", fontweight=\"bold\")\n", + "ax.set_title(f\"Grid sweep — 64³, solver={best_solver}\", fontweight=\"bold\")\n", "plt.tight_layout()\n", "plt.show()\n", "print(f\"Optimal max_grid_size: {best_gs}\")\n" @@ -1233,16 +1016,7 @@ { "cell_type": "markdown", "metadata": {}, - "source": [ - "## 11. Native-binary comparison *(optional)*\n", - "\n", - "Run the native `openimpala` executable on the same data (via a dumped\n", - "TIFF stack) and compare to the Python-binding call. The delta is\n", - "binding/orchestration overhead, nothing else.\n", - "\n", - "Skip this if you don't have the native binary built \u2014 \u00a73\u2013\u00a75 already\n", - "expose the Python-side story.\n" - ] + "source": "## 11. Native-binary comparison — optional\n\nIf the native `Diffusion` executable is available on `PATH`, this cell runs it on the same data via a `tiff` dump and compares wall time against the equivalent `oi.tortuosity` call. The difference quantifies how much overhead the Python/pybind11 binding layer adds on top of the C++ solver.\n\nThis section is informational — a CTest-driven comparison lives at `tests/benchmarks/`. Skip if no native binary is available; §3 – §5 already isolate the binding-side cost." }, { "cell_type": "code", @@ -1252,7 +1026,7 @@ "source": [ "native = shutil.which(\"openimpala\")\n", "if not native:\n", - " print(\"No native openimpala binary on PATH \u2014 skipping comparison.\")\n", + " print(\"No native openimpala binary on PATH — skipping comparison.\")\n", "else:\n", " import tifffile, tempfile, textwrap, os\n", " tmp = tempfile.mkdtemp()\n", @@ -1277,12 +1051,12 @@ " t_native = time.perf_counter() - t0\n", "\n", " # The native binary exits non-zero on input/parse errors but still returns\n", - " # quickly \u2014 don't compare that 0.1s \"win\" to a real Python solve.\n", + " # quickly — don't compare that 0.1s \"win\" to a real Python solve.\n", " if res.returncode != 0:\n", " print(\"=\" * 70)\n", - " print(f\" \u26a0 Native binary exited with code {res.returncode} in {t_native:.2f}s\")\n", + " print(f\" ⚠ Native binary exited with code {res.returncode} in {t_native:.2f}s\")\n", " print(\"=\" * 70)\n", - " print(\" It did NOT actually solve \u2014 skipping the comparison.\")\n", + " print(\" It did NOT actually solve — skipping the comparison.\")\n", " print()\n", " print(\" stderr tail:\")\n", " for line in (res.stderr or \"\").strip().splitlines()[-10:]:\n", @@ -1295,7 +1069,7 @@ " print(\" Likely cause: the native binary on PATH expects a different\")\n", " print(\" inputs-file schema than what this cell writes (field names or\")\n", " print(\" types differ between versions). 
Fix the inputs file to match,\")\n", - " print(\" or ignore this comparison \u2014 \u00a73/\u00a74 already prove the bindings\")\n", + " print(\" or ignore this comparison — §3/§4 already prove the bindings\")\n", " print(\" add negligible overhead.\")\n", " else:\n", " t0 = time.perf_counter()\n", @@ -1321,54 +1095,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# Collect headline numbers from the diagnostics above\n", - "stage_total = stages[\"_total\"]\n", - "stage_top = max((k for k in stages if not k.startswith(\"_\")), key=lambda k: stages[k])\n", - "fixed_overhead = a_fit\n", - "naive_mean = float(np.mean(naive))\n", - "amort_mean = float(np.mean(amort))\n", - "\n", - "print(\"=\" * 64)\n", - "print(\" DIAGNOSIS\")\n", - "print(\"=\" * 64)\n", - "print(f\" Build backend: {backend.upper()}\"\n", - " + (f\" (host GPU: {gpu_name})\" if gpu_name else \"\"))\n", - "print(f\" Single-call bottleneck: {stage_top}\")\n", - "print(f\" ({100*stages[stage_top]/stage_total:.1f}% of {stage_total:.2f}s)\")\n", - "print(f\" Compute scaling: O(N^{p_fit:.2f})\"\n", - " + (\" \u26a0 super-linear\" if p_fit > 1.2 else \"\"))\n", - "print(f\" Fixed per-call overhead (fit): {fixed_overhead:.2f}s\"\n", - " + (\" (no meaningful fixed cost)\" if not overhead_meaningful else \"\"))\n", - "print(f\" Per-call rebuild tax: {naive_mean - amort_mean:.2f}s \"\n", - " f\"({100*(naive_mean-amort_mean)/max(naive_mean,1e-9):.0f}% of naive call)\")\n", - "print(f\" Best solver: {best_solver}\")\n", - "print(f\" Best max_grid_size: {best_gs}\")\n", - "print(\"=\" * 64)\n", - "\n", - "# Headline recommendation \u2014 the #1 thing to change given what we found\n", - "print(\"\\n TOP RECOMMENDATION\")\n", - "print(\" \" + \"-\" * 62)\n", - "if gpu_name and backend == \"cpu\":\n", - " print(f\" You have a {gpu_name} sitting idle. The pip wheel is CPU-only.\")\n", - " print(\" Rebuild openimpala with -DAMReX_GPU_BACKEND=CUDA \u2014 expect 10-50x.\")\n", - " print(\" Everything below is secondary until this is addressed.\")\n", - "elif p_fit > 1.2:\n", - " print(f\" Solver is scaling O(N^{p_fit:.2f}), not O(N). PCG iterations grow\")\n", - " print(\" with problem size. Switch to a multigrid preconditioner (SMG/PFMG\")\n", - " print(\" via HYPRE, or MLMG) to restore linear scaling at large grids.\")\n", - "else:\n", - " print(f\" Solver dominates; best found is {best_solver} with max_grid_size={best_gs}.\")\n", - " print(\" No free lunch here \u2014 further speedup needs algorithmic changes.\")\n", - "\n", - "# If the rebuild tax is large, recommend the amortized path\n", - "if naive_mean > 2 * amort_mean:\n", - " print(\"\\n\u26a0 Per-call setup dominates. For sweeps, build VoxelImage once:\")\n", - " print(\" img = openimpala.facade._numpy_to_voxelimage(arr, max_grid_size)\")\n", - " print(\" s = openimpala._core.TortuosityHypre(img, vf, 0, d, st, '.', 0, 1, 0, False)\")\n", - " print(\" s.value()\")\n", - " print(\" \u2026instead of calling oi.tortuosity(numpy, ...) 
each time.\")\n" - ] + "source": "# Headline numbers from the diagnostics above\nstage_total = stages[\"_total\"]\nstage_top = max((k for k in stages if not k.startswith(\"_\")), key=lambda k: stages[k])\nfixed_overhead = a_fit\nnaive_mean = float(np.mean(naive))\namort_mean = float(np.mean(amort))\n\nprint(\"=\" * 64)\nprint(\" DIAGNOSIS\")\nprint(\"=\" * 64)\nprint(f\" Build backend: {backend.upper()}\"\n + (f\" (host GPU: {gpu_name})\" if gpu_name else \"\"))\nprint(f\" Single-call bottleneck: {stage_top}\")\nprint(f\" ({100*stages[stage_top]/stage_total:.1f}% of {stage_total:.2f}s)\")\nprint(f\" Compute scaling: O(N^{p_fit:.2f})\"\n + (\" (super-linear)\" if p_fit > 1.2 else \"\"))\nprint(f\" Fixed per-call overhead (fit): {fixed_overhead:.2f}s\"\n + (\" (no meaningful fixed cost)\" if not overhead_meaningful else \"\"))\nprint(f\" Per-call rebuild tax: {naive_mean - amort_mean:.2f}s \"\n f\"({100*(naive_mean-amort_mean)/max(naive_mean,1e-9):.0f}% of naive call)\")\nprint(f\" Best solver: {best_solver}\")\nprint(f\" Best max_grid_size: {best_gs}\")\nprint(\"=\" * 64)\n\n# Headline recommendation — the single most impactful change for this run\nprint(\"\\n TOP RECOMMENDATION\")\nprint(\" \" + \"-\" * 62)\nif gpu_name and not is_gpu_build:\n print(f\" A {gpu_name} is available but the installed wheel is CPU-only.\")\n print(\" Switch to openimpala-cuda for an expected 10–50× speedup on\")\n print(\" single-phase 3D diffusion at 128³+. Everything below is\")\n print(\" secondary until this is resolved.\")\nelif p_fit > 1.2:\n print(f\" Compute scales as O(N^{p_fit:.2f}). Plain Krylov iterations grow\")\n print(\" with problem size — switch to a multigrid preconditioner\")\n print(\" (PFMG/SMG via HYPRE, or AMReX MLMG) to restore O(N).\")\nelse:\n print(f\" Solver dominates wall time. Best configuration on this run:\")\n print(f\" solver={best_solver}, max_grid_size={best_gs}.\")\n print(\" Further speedup needs algorithmic changes or larger hardware.\")\n\n# Amortization recommendation — only fires when the rebuild tax is meaningful\nif naive_mean > 2 * amort_mean:\n print()\n print(\" Per-call setup is a significant fraction of wall time. For sweeps,\")\n print(\" build the VoxelImage once and reuse it:\")\n print()\n print(\" from openimpala.facade import _numpy_to_voxelimage\")\n print(\" from openimpala import _core\")\n print(\" img = _numpy_to_voxelimage(arr, max_grid_size)\")\n print(\" s = _core.TortuosityHypre(img, vf, 0, d, st, '.', 0, 1, 0, False)\")\n print(\" s.value()\")\n print()\n print(\" …rather than calling oi.tortuosity(numpy_array, ...) per iteration.\")" }, { "cell_type": "code",