Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 156 additions & 29 deletions notebooks/profiling_and_tuning.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -136,17 +136,30 @@
"metadata": {},
"outputs": [],
"source": [
"def detect_build_backend():\n",
" \"\"\"Return ('cuda'|'hip'|'cpu', raw_init_banner).\"\"\"\n",
"# Use oi.build_info() when the wheel provides it (4.0.2+). Older wheels\n",
"# fall back to the subprocess banner probe so the notebook still works.\n",
"backend = None\n",
"info = None\n",
"try:\n",
" info = oi.build_info()\n",
" backend = info[\"backend\"] # \"cpp-cuda\", \"cpp-hip\", \"cpp-cpu\", \"pure-python\"\n",
" print(f\"openimpala backend: {backend}\")\n",
" print(f\" version: {info.get('version', 'unknown')}\")\n",
" print(f\" OpenMP threads: {info.get('openmp_max_threads', 1)}\")\n",
" print(f\" MPI enabled: {info.get('mpi_enabled', False)}\")\n",
" print(f\" HYPRE CUDA: {info.get('hypre_cuda', False)}\")\n",
" print(f\" TinyProfile: {info.get('tiny_profile', False)}\")\n",
" if info.get(\"gpu_device_count\", -1) > 0:\n",
" print(f\" GPU devices: {info['gpu_device_count']}\")\n",
"except AttributeError:\n",
" # Pre-4.0.2 wheel \u2014 no build_info(). Fall back to banner probe.\n",
" import subprocess, tempfile, os, textwrap\n",
" probe = textwrap.dedent(\"\"\"\n",
" import numpy as np, openimpala as oi\n",
" arr = np.zeros((8, 8, 8), dtype=np.int32) # uniform phase 0\n",
" arr = np.zeros((8, 8, 8), dtype=np.int32)\n",
" with oi.Session():\n",
" try:\n",
" oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
" except Exception as e:\n",
" print(f'__ERR__ {e}')\n",
" try: oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
" except Exception as e: print(f'__ERR__ {e}')\n",
" \"\"\").strip()\n",
" tmp = tempfile.mkdtemp()\n",
" path = os.path.join(tmp, \"probe.py\")\n",
Expand All @@ -155,36 +168,37 @@
" r = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=90)\n",
" banner = (r.stdout or \"\") + (r.stderr or \"\")\n",
" low = banner.lower()\n",
" if \"cuda\" in low or \"device 0\" in low or \"gpu\" in low and \"no gpu\" not in low:\n",
" return \"cuda\", banner\n",
" if \"hip\" in low and \"rocm\" in low:\n",
" return \"hip\", banner\n",
" return \"cpu\", banner\n",
" if \"cuda\" in low or (\"gpu\" in low and \"no gpu\" not in low):\n",
" backend = \"cpp-cuda\"\n",
" elif \"hip\" in low and \"rocm\" in low:\n",
" backend = \"cpp-hip\"\n",
" else:\n",
" backend = \"cpp-cpu\"\n",
" print(f\"openimpala backend (banner probe): {backend}\")\n",
" print(\"(install openimpala 4.0.2+ for a proper build_info() API)\")\n",
"\n",
"backend, banner = detect_build_backend()\n",
"print(f\"openimpala backend: {backend.upper()}\")\n",
"print(\"\\nInit banner (first 600 chars):\")\n",
"print(banner[:600])\n",
"is_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n",
"\n",
"if gpu_name and backend == \"cpu\":\n",
" print(\"\\n\" + \"=\" * 70)\n",
"if gpu_name and not is_gpu_build:\n",
" print()\n",
" print(\"=\" * 70)\n",
" print(\" \u26a0 CPU-ONLY BUILD DETECTED ON A GPU-EQUIPPED HOST\")\n",
" print(\"=\" * 70)\n",
" print(f\" Host GPU: {gpu_name}\")\n",
" print(f\" openimpala build: CPU + OpenMP (no CUDA)\")\n",
" print(f\" Host GPU: {gpu_name}\")\n",
" print(f\" openimpala build: {backend}\")\n",
" print()\n",
" print(\" Your GPU is idle. Every solve in this notebook is CPU-bound.\")\n",
" print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build\")\n",
" print(\" is typically 10-50x faster. This is almost certainly the\")\n",
" print(\" dominant source of 'OpenImpala feels slow on Colab'.\")\n",
" print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build is\")\n",
" print(\" typically 10-50x faster \u2014 this is the dominant source of\")\n",
" print(\" 'OpenImpala feels slow on Colab'.\")\n",
" print()\n",
" print(\" Next steps:\")\n",
" print(\" - Install a CUDA-enabled openimpala wheel if one exists, or\")\n",
" print(\" - Build from source with -DAMReX_GPU_BACKEND=CUDA, or\")\n",
" print(\" - Use an Apptainer image built with CUDA support.\")\n",
" print(\" Fix: install the CUDA wheel instead.\")\n",
" print(\" !pip uninstall -y openimpala\")\n",
" print(\" !pip install openimpala-cuda\")\n",
" print(\" Then Runtime -> Restart session and re-run from the top.\")\n",
" print(\"=\" * 70)\n",
"elif backend != \"cpu\":\n",
" print(f\"\\n\u2713 Using {backend.upper()} backend on {gpu_name or 'accelerator'}\")\n",
"elif is_gpu_build:\n",
" print(f\"\\n\u2713 {backend.upper()} build active on {gpu_name or 'accelerator'}\")\n",
"else:\n",
" print(\"\\nCPU-only run (no accelerator detected).\")\n"
]
Expand Down Expand Up @@ -1064,6 +1078,119 @@
"print(f\"\\nBest solver at 64\u00b3 by wall time: {best_solver}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 9b. Scaling validation \u2014 does the multigrid preconditioner break super-linear PCG?\n",
"\n",
"\u00a73b showed plain PCG scales as `t(N) ~ N^1.76` \u2014 that's the root cause of\n",
"slow large-grid solves, not any Python/binding overhead. A multigrid\n",
"preconditioner should restore near-`O(N)` scaling. The issue #256\n",
"acceptance target is **p < 1.2** at 64\u00b3 \u2192 128\u00b3.\n",
"\n",
"Fits `t = a\u00b7N^p` across 32\u00b3 / 64\u00b3 / 96\u00b3 / 128\u00b3 for three representative\n",
"choices: plain PCG, PCG+PFMG (Krylov + multigrid preconditioner), and MLMG\n",
"(AMReX-native geometric multigrid). Uses a uniform phase-0 block so each\n",
"solve is cheap and always percolates.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"scaling_sizes = [32, 64, 96, 128]\n",
"scaling_combos = [\n",
" (\"pcg\", {\"solver\": \"pcg\"}),\n",
" (\"pcg+pfmg\", {\"solver\": \"pcg\", \"preconditioner\": \"pfmg\"}),\n",
" (\"mlmg\", {\"solver\": \"mlmg\"}),\n",
"]\n",
"\n",
"scaling_records = []\n",
"for label, kw in scaling_combos:\n",
" for N in scaling_sizes:\n",
" arr = np.zeros((N, N, N), dtype=np.int32) # uniform phase 0 \u2014 always percolates\n",
" try:\n",
" t0 = time.perf_counter()\n",
" res = oi.tortuosity(arr, phase=0, direction=\"z\",\n",
" max_grid_size=min(32, N), verbose=0, **kw)\n",
" dt = time.perf_counter() - t0\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": dt,\n",
" \"iters\": int(res.iterations),\n",
" \"ok\": bool(res.solver_converged)})\n",
" print(f\" {label:10s} {N:3d}^3 t={dt:7.3f}s iters={res.iterations:4d}\")\n",
" except TypeError as e:\n",
" # Wheel predates the preconditioner kwarg \u2014 skip only those rows.\n",
" msg = str(e)\n",
" if (\"preconditioner\" in msg) and (\"preconditioner\" in kw):\n",
" print(f\" {label:10s} {N:3d}^3 SKIP \u2014 wheel predates preconditioner kwarg\")\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
" \"iters\": 0, \"ok\": False})\n",
" continue\n",
" raise\n",
" except Exception as e:\n",
" scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
" \"iters\": 0, \"ok\": False})\n",
" print(f\" {label:10s} {N:3d}^3 FAILED \u2014 {e}\")\n",
"\n",
"df_scale = pd.DataFrame(scaling_records)\n",
"df_scale\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fit t = a \u00b7 N^p in log-log per solver, then overlay the curves with an\n",
"# O(N) reference line and the O(N^1.76) baseline from \u00a73b.\n",
"fig, ax = plt.subplots(figsize=(9, 5.5))\n",
"palette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\n",
"fit_results = {}\n",
"for label, _ in scaling_combos:\n",
" sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n",
" if len(sub) < 2:\n",
" fit_results[label] = np.nan\n",
" continue\n",
" p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n",
" fit_results[label] = p\n",
" ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n",
" markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n",
"\n",
"# Reference lines \u2014 anchored at the first successful measurement\n",
"anchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\n",
"if not anchor.empty:\n",
" N_anchor = float(anchor.iloc[0][\"N\"])\n",
" t_anchor = float(anchor.iloc[0][\"t\"])\n",
" Ns = np.array(scaling_sizes, dtype=float)\n",
" ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n",
" \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n",
" ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n",
" \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (\u00a73b baseline)\")\n",
"\n",
"ax.set_xlabel(\"Grid size N (domain = N^3)\")\n",
"ax.set_ylabel(\"Wall time (s)\")\n",
"ax.set_title(\"Scaling validation \u2014 multigrid vs. plain Krylov\",\n",
" fontweight=\"bold\")\n",
"ax.legend(loc=\"upper left\", framealpha=0.9)\n",
"ax.grid(True, which=\"both\", alpha=0.3)\n",
"plt.tight_layout()\n",
"plt.show()\n",
"\n",
"# Verdict against acceptance target (issue #256: p < 1.2)\n",
"print(\"\\nScaling exponents vs. acceptance target p < 1.2:\")\n",
"for label, p in fit_results.items():\n",
" if np.isnan(p):\n",
" print(f\" {label:10s} n/a (insufficient data)\")\n",
" elif p < 1.2:\n",
" print(f\" {label:10s} p = {p:.2f} \u2713 near-linear \u2014 target met\")\n",
" else:\n",
" print(f\" {label:10s} p = {p:.2f} \u2717 super-linear \u2014 target NOT met\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
88 changes: 88 additions & 0 deletions python/bindings/module.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <AMReX.H>
#include <AMReX_Box.H>
Expand All @@ -23,6 +24,15 @@
#include <AMReX_RealBox.H>
#include <AMReX_iMultiFab.H>

#ifdef AMREX_USE_GPU
#include <AMReX_Gpu.H>
#endif
#ifdef _OPENMP
#include <omp.h>
#endif

#include <HYPRE_config.h>

#include "VoxelImage.H"

namespace py = pybind11;
Expand Down Expand Up @@ -56,6 +66,79 @@ static bool amrex_initialized() {
return amrex::Initialized();
}

// =========================================================================
// Build info — lets users verify which wheel they installed (CPU vs CUDA),
// whether TinyProfile is on, OpenMP thread count, and visible GPU devices.
// Critical for Colab users: §1a of the profiling notebook runs this first.
// =========================================================================
/// @brief Collect compile-time feature flags plus runtime GPU availability.
/// @return py::dict with boolean flags (cuda_enabled, hip_enabled,
///         gpu_enabled, openmp_enabled, mpi_enabled, tiny_profile,
///         hypre_cuda, hypre_hip), "openmp_max_threads" (int),
///         "gpu_device_count" (int; -1 = GPU build but AMReX not yet
///         initialised, 0 = CPU build), and a "backend" summary string
///         ("cpp-cuda" / "cpp-hip" / "cpp-cpu").
static py::dict build_info() {
py::dict info;

// Summary string consumed by the profiling notebook, which reads
// info["backend"] and branches on "cpp-cuda" / "cpp-hip" / "cpp-cpu".
// Without this key the notebook's dict access raises KeyError (not the
// AttributeError it catches for pre-4.0.2 wheels).
// NOTE(review): if the Python facade also injects "backend", this value is
// simply overwritten there — harmless either way; confirm against facade.
#if defined(OPENIMPALA_USE_CUDA)
info["backend"] = "cpp-cuda";
#elif defined(OPENIMPALA_USE_HIP)
info["backend"] = "cpp-hip";
#else
info["backend"] = "cpp-cpu";
#endif

#ifdef OPENIMPALA_USE_CUDA
info["cuda_enabled"] = true;
#else
info["cuda_enabled"] = false;
#endif

#ifdef OPENIMPALA_USE_HIP
info["hip_enabled"] = true;
#else
info["hip_enabled"] = false;
#endif

#ifdef OPENIMPALA_USE_GPU
info["gpu_enabled"] = true;
#else
info["gpu_enabled"] = false;
#endif

#ifdef _OPENMP
info["openmp_enabled"] = true;
info["openmp_max_threads"] = omp_get_max_threads();
#else
info["openmp_enabled"] = false;
info["openmp_max_threads"] = 1;
#endif

#ifdef AMREX_USE_MPI
info["mpi_enabled"] = true;
#else
info["mpi_enabled"] = false;
#endif

#ifdef AMREX_TINY_PROFILE
info["tiny_profile"] = true;
#else
info["tiny_profile"] = false;
#endif

#ifdef HYPRE_USING_CUDA
info["hypre_cuda"] = true;
#else
info["hypre_cuda"] = false;
#endif

#ifdef HYPRE_USING_HIP
info["hypre_hip"] = true;
#else
info["hypre_hip"] = false;
#endif

// Runtime GPU device count — only meaningful when the wheel has GPU support
// AND AMReX has been initialised (otherwise the query can segfault).
#ifdef AMREX_USE_GPU
if (amrex::Initialized()) {
    info["gpu_device_count"] = amrex::Gpu::Device::numDevicesUsed();
} else {
    info["gpu_device_count"] = -1; // unknown until init_amrex() called
}
#else
info["gpu_device_count"] = 0;
#endif

return info;
}

// =========================================================================
// NumPy → VoxelImage factory
// =========================================================================
Expand Down Expand Up @@ -128,6 +211,11 @@ PYBIND11_MODULE(_core, m) {
"Shut down the AMReX runtime (no-op if not initialised).");
m.def("amrex_initialized", &amrex_initialized,
"Return True if the AMReX runtime is currently active.");
m.def("build_info", &build_info,
"Return a dict of compile-time feature flags (cuda, openmp, mpi, tiny_profile, "
"hypre_cuda) plus runtime GPU device count. Use this to verify which wheel "
"is installed (CPU vs. CUDA) — critical for Colab users who need the GPU wheel "
"to actually use their T4/A100.");

// --- VoxelImage opaque handle ---
py::class_<OpenImpala::VoxelImage, std::shared_ptr<OpenImpala::VoxelImage>>(
Expand Down
12 changes: 5 additions & 7 deletions python/bindings/solvers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ void init_solvers(py::module_& m) {
.def(py::init([](std::shared_ptr<VoxelImage> img, amrex::Real vf, int phase,
OpenImpala::Direction dir, TortuosityHypre::SolverType solver_type,
const std::string& results_path, amrex::Real vlo, amrex::Real vhi,
int verbose, bool write_plotfile,
OpenImpala::PrecondType preconditioner) {
int verbose, bool write_plotfile, OpenImpala::PrecondType preconditioner) {
return new TortuosityHypre(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir,
solver_type, results_path, vlo, vhi, verbose,
write_plotfile, preconditioner);
Expand Down Expand Up @@ -98,14 +97,13 @@ void init_solvers(py::module_& m) {
amrex::Real vlo, amrex::Real vhi, int verbose, bool write_plotfile,
amrex::Real eps, int maxiter, int max_coarsening_level) {
return new TortuosityMLMG(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir,
results_path, vlo, vhi, verbose, write_plotfile,
eps, maxiter, max_coarsening_level);
results_path, vlo, vhi, verbose, write_plotfile, eps,
maxiter, max_coarsening_level);
}),
py::arg("img"), py::arg("vf"), py::arg("phase"), py::arg("dir"),
py::arg("results_path"), py::arg("vlo") = 0.0, py::arg("vhi") = 1.0,
py::arg("verbose") = 0, py::arg("write_plotfile") = false,
py::arg("eps") = 1.0e-9, py::arg("maxiter") = 200,
py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>())
py::arg("verbose") = 0, py::arg("write_plotfile") = false, py::arg("eps") = 1.0e-9,
py::arg("maxiter") = 200, py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>())

.def(
"value",
Expand Down
3 changes: 2 additions & 1 deletion python/openimpala/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __getattr__(name):
# Symbols that live in the facade module
_FACADE_ATTRS = {
"volume_fraction", "percolation_check", "tortuosity", "estimate_memory",
"read_image",
"read_image", "build_info",
}

if name in _CORE_ATTRS:
Expand Down Expand Up @@ -108,4 +108,5 @@ def __getattr__(name):
"tortuosity",
"estimate_memory",
"read_image",
"build_info",
]
Loading
Loading