Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 156 additions & 29 deletions notebooks/profiling_and_tuning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,30 @@
"metadata": {},
"outputs": [],
"source": [
"def detect_build_backend():\n",
" \"\"\"Return ('cuda'|'hip'|'cpu', raw_init_banner).\"\"\"\n",
"# Use oi.build_info() when the wheel provides it (4.0.2+). Older wheels\n",
"# fall back to the subprocess banner probe so the notebook still works.\n",
"backend = None\n",
"info = None\n",
"try:\n",
" info = oi.build_info()\n",
" backend = info[\"backend\"] # \"cpp-cuda\", \"cpp-hip\", \"cpp-cpu\", \"pure-python\"\n",
" print(f\"openimpala backend: {backend}\")\n",
" print(f\" version: {info.get('version', 'unknown')}\")\n",
" print(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\n",
" print(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\n",
" print(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\n",
" print(f\" TinyProfile: {info.get('tiny_profile', False)}\")\n",
" if info.get(\"gpu_device_count\", -1) > 0:\n",
" print(f\" GPU devices: {info['gpu_device_count']}\")\n",
"except AttributeError:\n",
" # Pre-4.0.2 wheel \u2014 no build_info(). Fall back to banner probe.\n",
" import subprocess, tempfile, os, textwrap\n",
" probe = textwrap.dedent(\"\"\"\n",
" import numpy as np, openimpala as oi\n",
" arr = np.zeros((8, 8, 8), dtype=np.int32) # uniform phase 0\n",
" arr = np.zeros((8, 8, 8), dtype=np.int32)\n",
" with oi.Session():\n",
" try:\n",
" oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
" except Exception as e:\n",
" print(f'__ERR__ {e}')\n",
" try: oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
" except Exception as e: print(f'__ERR__ {e}')\n",
" \"\"\").strip()\n",
" tmp = tempfile.mkdtemp()\n",
" path = os.path.join(tmp, \"probe.py\")\n",
Expand All @@ -155,36 +168,37 @@
" r = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=90)\n",
" banner = (r.stdout or \"\") + (r.stderr or \"\")\n",
" low = banner.lower()\n",
" if \"cuda\" in low or \"device 0\" in low or \"gpu\" in low and \"no gpu\" not in low:\n",
" return \"cuda\", banner\n",
" if \"hip\" in low and \"rocm\" in low:\n",
" return \"hip\", banner\n",
" return \"cpu\", banner\n",
" if \"cuda\" in low or (\"gpu\" in low and \"no gpu\" not in low):\n",
" backend = \"cpp-cuda\"\n",
" elif \"hip\" in low and \"rocm\" in low:\n",
" backend = \"cpp-hip\"\n",
" else:\n",
" backend = \"cpp-cpu\"\n",
" print(f\"openimpala backend (banner probe): {backend}\")\n",
" print(\"(install openimpala 4.0.2+ for a proper build_info() API)\")\n",
"\n",
"backend, banner = detect_build_backend()\n",
"print(f\"openimpala backend: {backend.upper()}\")\n",
"print(\"\\nInit banner (first 600 chars):\")\n",
"print(banner[:600])\n",
"is_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n",
"\n",
"if gpu_name and backend == \"cpu\":\n",
" print(\"\\n\" + \"=\" * 70)\n",
"if gpu_name and not is_gpu_build:\n",
" print()\n",
" print(\"=\" * 70)\n",
" print(\" \u26a0 CPU-ONLY BUILD DETECTED ON A GPU-EQUIPPED HOST\")\n",
" print(\"=\" * 70)\n",
" print(f\" Host GPU: {gpu_name}\")\n",
" print(f\" openimpala build: CPU + OpenMP (no CUDA)\")\n",
" print(f\" Host GPU: {gpu_name}\")\n",
" print(f\" openimpala build: {backend}\")\n",
" print()\n",
" print(\" Your GPU is idle. Every solve in this notebook is CPU-bound.\")\n",
" print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build\")\n",
" print(\" is typically 10-50x faster. This is almost certainly the\")\n",
" print(\" dominant source of 'OpenImpala feels slow on Colab'.\")\n",
" print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build is\")\n",
" print(\" typically 10-50x faster \u2014 this is the dominant source of\")\n",
" print(\" 'OpenImpala feels slow on Colab'.\")\n",
" print()\n",
" print(\" Next steps:\")\n",
" print(\" - Install a CUDA-enabled openimpala wheel if one exists, or\")\n",
" print(\" - Build from source with -DAMReX_GPU_BACKEND=CUDA, or\")\n",
" print(\" - Use an Apptainer image built with CUDA support.\")\n",
" print(\" Fix: install the CUDA wheel instead.\")\n",
" print(\" !pip uninstall -y openimpala\")\n",
" print(\" !pip install openimpala-cuda\")\n",
" print(\" Then Runtime -> Restart session and re-run from the top.\")\n",
" print(\"=\" * 70)\n",
"elif backend != \"cpu\":\n",
" print(f\"\\n\u2713 Using {backend.upper()} backend on {gpu_name or 'accelerator'}\")\n",
"elif is_gpu_build:\n",
" print(f\"\\n\u2713 {backend.upper()} build active on {gpu_name or 'accelerator'}\")\n",
"else:\n",
" print(\"\\nCPU-only run (no accelerator detected).\")\n"
]
Expand Down Expand Up @@ -1064,6 +1078,119 @@
"print(f\"\\nBest solver at 64\u00b3 by wall time: {best_solver}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9b. Scaling validation \u2014 does the multigrid preconditioner break super-linear PCG?\n",
"\n",
"\u00a73b showed plain PCG scales as `t(N) ~ N^1.76` \u2014 that's the root cause of\n",
"slow large-grid solves, not any Python/binding overhead. A multigrid\n",
"preconditioner should restore near-`O(N)` scaling. The issue #256\n",
"acceptance target is **p < 1.2** at 64\u00b3 \u2192 128\u00b3.\n",
"\n",
"Fits `t = a\u00b7N^p` across 32\u00b3 / 64\u00b3 / 96\u00b3 / 128\u00b3 for three representative\n",
"choices: plain PCG, PCG+PFMG (Krylov + multigrid preconditioner), and MLMG\n",
"(AMReX-native geometric multigrid). Uses a uniform phase-0 block so each\n",
"solve is cheap and always percolates.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scaling_sizes = [32, 64, 96, 128]\n",
"scaling_combos = [\n",
" (\"pcg\", {\"solver\": \"pcg\"}),\n",
" (\"pcg+pfmg\", {\"solver\": \"pcg\", \"preconditioner\": \"pfmg\"}),\n",
" (\"mlmg\", {\"solver\": \"mlmg\"}),\n",
"]\n",
"\n",
"scaling_records = []\n",
"for label, kw in scaling_combos:\n",
" for N in scaling_sizes:\n",
" arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 \u2014 always percolates\n",
" try:\n",
" t0 = time.perf_counter()\n",
" res = oi.tortuosity(arr, phase=0, direction=\"z\",\n",
" max_grid_size=min(32, N), verbose=0, **kw)\n",
" dt = time.perf_counter() - t0\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": dt,\n",
" \"iters\": int(res.iterations),\n",
" \"ok\": bool(res.solver_converged)})\n",
" print(f\" {label:10s} {N:3d}^3 t={dt:7.3f}s iters={res.iterations:4d}\")\n",
" except TypeError as e:\n",
" # Wheel predates the preconditioner kwarg \u2014 skip only those rows.\n",
" msg = str(e)\n",
" if (\"preconditioner\" in msg) and (\"preconditioner\" in kw):\n",
" print(f\" {label:10s} {N:3d}^3 SKIP \u2014 wheel predates preconditioner kwarg\")\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
" \"iters\": 0, \"ok\": False})\n",
" continue\n",
" raise\n",
" except Exception as e:\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
" \"iters\": 0, \"ok\": False})\n",
" print(f\" {label:10s} {N:3d}^3 FAILED \u2014 {e}\")\n",
"\n",
"df_scale = pd.DataFrame(scaling_records)\n",
"df_scale\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fit t = a \u00b7 N^p in log-log per solver, then overlay the curves with an\n",
"# O(N) reference line and the O(N^1.76) baseline from \u00a73b.\n",
"fig, ax = plt.subplots(figsize=(9, 5.5))\n",
"palette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\n",
"fit_results = {}\n",
"for label, _ in scaling_combos:\n",
" sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n",
" if len(sub) < 2:\n",
" fit_results[label] = np.nan\n",
" continue\n",
" p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n",
" fit_results[label] = p\n",
" ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n",
" markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n",
"\n",
"# Reference lines \u2014 anchored at the first successful measurement\n",
"anchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\n",
"if not anchor.empty:\n",
" N_anchor = float(anchor.iloc[0][\"N\"])\n",
" t_anchor = float(anchor.iloc[0][\"t\"])\n",
" Ns = np.array(scaling_sizes, dtype=float)\n",
" ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n",
" \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n",
" ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n",
" \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (\u00a73b baseline)\")\n",
"\n",
"ax.set_xlabel(\"Grid size N (domain = N^3)\")\n",
"ax.set_ylabel(\"Wall time (s)\")\n",
"ax.set_title(\"Scaling validation \u2014 multigrid vs. plain Krylov\",\n",
" fontweight=\"bold\")\n",
"ax.legend(loc=\"upper left\", framealpha=0.9)\n",
"ax.grid(True, which=\"both\", alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Verdict against acceptance target (issue #256: p < 1.2)\n",
"print(\"\\nScaling exponents vs. acceptance target p < 1.2:\")\n",
"for label, p in fit_results.items():\n",
" if np.isnan(p):\n",
" print(f\" {label:10s} n/a (insufficient data)\")\n",
" elif p < 1.2:\n",
" print(f\" {label:10s} p = {p:.2f} \u2713 near-linear \u2014 target met\")\n",
" else:\n",
" print(f\" {label:10s} p = {p:.2f} \u2717 super-linear \u2014 target NOT met\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
88 changes: 88 additions & 0 deletions python/bindings/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <AMReX.H>
#include <AMReX_Box.H>
Expand All @@ -23,6 +24,15 @@
#include <AMReX_RealBox.H>
#include <AMReX_iMultiFab.H>

#ifdef AMREX_USE_GPU
#include <AMReX_Gpu.H>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif

#include <HYPRE_config.h>

#include "VoxelImage.H"

namespace py = pybind11;
Expand Down Expand Up @@ -56,6 +66,79 @@ static bool amrex_initialized() {
return amrex::Initialized();
}

// =========================================================================
// Build info — lets users verify which wheel they installed (CPU vs CUDA),
// whether TinyProfile is on, OpenMP thread count, and visible GPU devices.
// Critical for Colab users: §1a of the profiling notebook runs this first.
// =========================================================================
/// @brief Collect compile-time feature flags plus runtime GPU availability.
/// @return py::dict with boolean flags (cuda_enabled, hip_enabled,
///         gpu_enabled, openmp_enabled, mpi_enabled, tiny_profile,
///         hypre_cuda, hypre_hip), "openmp_max_threads" (int),
///         "gpu_device_count" (int; -1 = GPU build but AMReX not yet
///         initialised, 0 = CPU build), and a "backend" summary string
///         ("cpp-cuda" / "cpp-hip" / "cpp-cpu").
static py::dict build_info() {
py::dict info;

// Summary string consumed by the profiling notebook, which reads
// info["backend"] and branches on "cpp-cuda" / "cpp-hip" / "cpp-cpu".
// Without this key the notebook's dict access raises KeyError (not the
// AttributeError it catches for pre-4.0.2 wheels).
// NOTE(review): if the Python facade also injects "backend", this value is
// simply overwritten there — harmless either way; confirm against facade.
#if defined(OPENIMPALA_USE_CUDA)
info["backend"] = "cpp-cuda";
#elif defined(OPENIMPALA_USE_HIP)
info["backend"] = "cpp-hip";
#else
info["backend"] = "cpp-cpu";
#endif

#ifdef OPENIMPALA_USE_CUDA
info["cuda_enabled"] = true;
#else
info["cuda_enabled"] = false;
#endif

#ifdef OPENIMPALA_USE_HIP
info["hip_enabled"] = true;
#else
info["hip_enabled"] = false;
#endif

#ifdef OPENIMPALA_USE_GPU
info["gpu_enabled"] = true;
#else
info["gpu_enabled"] = false;
#endif

#ifdef _OPENMP
info["openmp_enabled"] = true;
info["openmp_max_threads"] = omp_get_max_threads();
#else
info["openmp_enabled"] = false;
info["openmp_max_threads"] = 1;
#endif

#ifdef AMREX_USE_MPI
info["mpi_enabled"] = true;
#else
info["mpi_enabled"] = false;
#endif

#ifdef AMREX_TINY_PROFILE
info["tiny_profile"] = true;
#else
info["tiny_profile"] = false;
#endif

#ifdef HYPRE_USING_CUDA
info["hypre_cuda"] = true;
#else
info["hypre_cuda"] = false;
#endif

#ifdef HYPRE_USING_HIP
info["hypre_hip"] = true;
#else
info["hypre_hip"] = false;
#endif

// Runtime GPU device count — only meaningful when the wheel has GPU support
// AND AMReX has been initialised (otherwise the query can segfault).
#ifdef AMREX_USE_GPU
if (amrex::Initialized()) {
    info["gpu_device_count"] = amrex::Gpu::Device::numDevicesUsed();
} else {
    info["gpu_device_count"] = -1; // unknown until init_amrex() called
}
#else
info["gpu_device_count"] = 0;
#endif

return info;
}

// =========================================================================
// NumPy → VoxelImage factory
// =========================================================================
Expand Down Expand Up @@ -128,6 +211,11 @@ PYBIND11_MODULE(_core, m) {
"Shut down the AMReX runtime (no-op if not initialised).");
m.def("amrex_initialized", &amrex_initialized,
"Return True if the AMReX runtime is currently active.");
m.def("build_info", &build_info,
"Return a dict of compile-time feature flags (cuda, openmp, mpi, tiny_profile, "
"hypre_cuda) plus runtime GPU device count. Use this to verify which wheel "
"is installed (CPU vs. CUDA) — critical for Colab users who need the GPU wheel "
"to actually use their T4/A100.");

// --- VoxelImage opaque handle ---
py::class_<OpenImpala::VoxelImage, std::shared_ptr<OpenImpala::VoxelImage>>(
Expand Down
12 changes: 5 additions & 7 deletions python/bindings/solvers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ void init_solvers(py::module_& m) {
.def(py::init([](std::shared_ptr<VoxelImage> img, amrex::Real vf, int phase,
OpenImpala::Direction dir, TortuosityHypre::SolverType solver_type,
const std::string& results_path, amrex::Real vlo, amrex::Real vhi,
int verbose, bool write_plotfile,
OpenImpala::PrecondType preconditioner) {
int verbose, bool write_plotfile, OpenImpala::PrecondType preconditioner) {
return new TortuosityHypre(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir,
solver_type, results_path, vlo, vhi, verbose,
write_plotfile, preconditioner);
Expand Down Expand Up @@ -98,14 +97,13 @@ void init_solvers(py::module_& m) {
amrex::Real vlo, amrex::Real vhi, int verbose, bool write_plotfile,
amrex::Real eps, int maxiter, int max_coarsening_level) {
return new TortuosityMLMG(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir,
results_path, vlo, vhi, verbose, write_plotfile,
eps, maxiter, max_coarsening_level);
results_path, vlo, vhi, verbose, write_plotfile, eps,
maxiter, max_coarsening_level);
}),
py::arg("img"), py::arg("vf"), py::arg("phase"), py::arg("dir"),
py::arg("results_path"), py::arg("vlo") = 0.0, py::arg("vhi") = 1.0,
py::arg("verbose") = 0, py::arg("write_plotfile") = false,
py::arg("eps") = 1.0e-9, py::arg("maxiter") = 200,
py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>())
py::arg("verbose") = 0, py::arg("write_plotfile") = false, py::arg("eps") = 1.0e-9,
py::arg("maxiter") = 200, py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>())

.def(
"value",
Expand Down
3 changes: 2 additions & 1 deletion python/openimpala/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __getattr__(name):
# Symbols that live in the facade module
_FACADE_ATTRS = {
"volume_fraction", "percolation_check", "tortuosity", "estimate_memory",
"read_image",
"read_image", "build_info",
}

if name in _CORE_ATTRS:
Expand Down Expand Up @@ -108,4 +108,5 @@ def __getattr__(name):
"tortuosity",
"estimate_memory",
"read_image",
"build_info",
]
Loading
Loading