diff --git a/notebooks/profiling_and_tuning.ipynb b/notebooks/profiling_and_tuning.ipynb
index 0116b0a..8542cb8 100644
--- a/notebooks/profiling_and_tuning.ipynb
+++ b/notebooks/profiling_and_tuning.ipynb
@@ -136,17 +136,30 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def detect_build_backend():\n",
-    "    \"\"\"Return ('cuda'|'hip'|'cpu', raw_init_banner).\"\"\"\n",
+    "# Use oi.build_info() when the wheel provides it (4.0.2+). On older\n",
+    "# wheels, fall back to the subprocess banner probe so the notebook still works.\n",
+    "backend = None\n",
+    "info = None\n",
+    "try:\n",
+    "    info = oi.build_info()\n",
+    "    backend = info[\"backend\"]  # \"cpp-cuda\", \"cpp-hip\", \"cpp-cpu\", \"pure-python\"\n",
+    "    print(f\"openimpala backend: {backend}\")\n",
+    "    print(f\"  version:        {info.get('version', 'unknown')}\")\n",
+    "    print(f\"  OpenMP threads: {info.get('openmp_max_threads', 1)}\")\n",
+    "    print(f\"  MPI enabled:    {info.get('mpi_enabled', False)}\")\n",
+    "    print(f\"  HYPRE CUDA:     {info.get('hypre_cuda', False)}\")\n",
+    "    print(f\"  TinyProfile:    {info.get('tiny_profile', False)}\")\n",
+    "    if info.get(\"gpu_device_count\", -1) > 0:\n",
+    "        print(f\"  GPU devices:    {info['gpu_device_count']}\")\n",
+    "except AttributeError:\n",
+    "    # Pre-4.0.2 wheel \u2014 no build_info(). Fall back to banner probe.\n",
     "    import subprocess, tempfile, os, textwrap\n",
     "    probe = textwrap.dedent(\"\"\"\n",
     "        import numpy as np, openimpala as oi\n",
-    "        arr = np.zeros((8, 8, 8), dtype=np.int32)  # uniform phase 0\n",
+    "        arr = np.zeros((8, 8, 8), dtype=np.int32)\n",
     "        with oi.Session():\n",
-    "            try:\n",
-    "                oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
-    "            except Exception as e:\n",
-    "                print(f'__ERR__ {e}')\n",
+    "            try: oi.tortuosity(arr, phase=0, direction='z', solver='pcg', verbose=1)\n",
+    "            except Exception as e: print(f'__ERR__ {e}')\n",
     "    \"\"\").strip()\n",
     "    tmp = tempfile.mkdtemp()\n",
     "    path = os.path.join(tmp, \"probe.py\")\n",
@@ -155,36 +168,37 @@
     "    r = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=90)\n",
     "    banner = (r.stdout or \"\") + (r.stderr or \"\")\n",
     "    low = banner.lower()\n",
-    "    if \"cuda\" in low or \"device 0\" in low or \"gpu\" in low and \"no gpu\" not in low:\n",
-    "        return \"cuda\", banner\n",
-    "    if \"hip\" in low and \"rocm\" in low:\n",
-    "        return \"hip\", banner\n",
-    "    return \"cpu\", banner\n",
+    "    if \"cuda\" in low or (\"gpu\" in low and \"no gpu\" not in low):\n",
+    "        backend = \"cpp-cuda\"\n",
+    "    elif \"hip\" in low and \"rocm\" in low:\n",
+    "        backend = \"cpp-hip\"\n",
+    "    else:\n",
+    "        backend = \"cpp-cpu\"\n",
+    "    print(f\"openimpala backend (banner probe): {backend}\")\n",
+    "    print(\"(install openimpala 4.0.2+ for a proper build_info() API)\")\n",
     "\n",
-    "backend, banner = detect_build_backend()\n",
-    "print(f\"openimpala backend: {backend.upper()}\")\n",
-    "print(\"\\nInit banner (first 600 chars):\")\n",
-    "print(banner[:600])\n",
+    "is_gpu_build = backend in (\"cpp-cuda\", \"cpp-hip\")\n",
     "\n",
-    "if gpu_name and backend == \"cpu\":\n",
-    "    print(\"\\n\" + \"=\" * 70)\n",
+    "if gpu_name and not is_gpu_build:\n",
+    "    print()\n",
+    "    print(\"=\" * 70)\n",
     "    print(\" \u26a0  CPU-ONLY BUILD DETECTED ON A GPU-EQUIPPED HOST\")\n",
     "    print(\"=\" * 70)\n",
-    "    print(f\" Host GPU:         {gpu_name}\")\n",
-    "    print(f\" openimpala build: CPU + OpenMP (no CUDA)\")\n",
+    "    print(f\" Host GPU:         {gpu_name}\")\n",
+    "    print(f\" openimpala build: {backend}\")\n",
     "    print()\n",
     "    print(\" Your GPU is idle. Every solve in this notebook is CPU-bound.\")\n",
-    "    print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build\")\n",
-    "    print(\" is typically 10-50x faster. This is almost certainly the\")\n",
-    "    print(\" dominant source of 'OpenImpala feels slow on Colab'.\")\n",
+    "    print(\" For single-phase 3D diffusion on 128^3+ grids, a CUDA build is\")\n",
+    "    print(\" typically 10-50x faster \u2014 this is the dominant source of\")\n",
+    "    print(\" 'OpenImpala feels slow on Colab'.\")\n",
     "    print()\n",
-    "    print(\" Next steps:\")\n",
-    "    print(\"   - Install a CUDA-enabled openimpala wheel if one exists, or\")\n",
-    "    print(\"   - Build from source with -DAMReX_GPU_BACKEND=CUDA, or\")\n",
-    "    print(\"   - Use an Apptainer image built with CUDA support.\")\n",
+    "    print(\" Fix: install the CUDA wheel instead.\")\n",
+    "    print(\"     !pip uninstall -y openimpala\")\n",
+    "    print(\"     !pip install openimpala-cuda\")\n",
+    "    print(\" Then Runtime -> Restart session and re-run from the top.\")\n",
     "    print(\"=\" * 70)\n",
-    "elif backend != \"cpu\":\n",
-    "    print(f\"\\n\u2713 Using {backend.upper()} backend on {gpu_name or 'accelerator'}\")\n",
+    "elif is_gpu_build:\n",
+    "    print(f\"\\n\u2713 {backend.upper()} build active on {gpu_name or 'accelerator'}\")\n",
     "else:\n",
     "    print(\"\\nCPU-only run (no accelerator detected).\")\n"
    ]
@@ -1064,6 +1078,119 @@
     "print(f\"\\nBest solver at 64\u00b3 by wall time: {best_solver}\")\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 9b. Scaling validation \u2014 does the multigrid preconditioner break super-linear PCG?\n",
+    "\n",
+    "\u00a73b showed plain PCG scales as `t(N) ~ N^1.76` \u2014 that's the root cause of\n",
+    "slow large-grid solves, not any Python/binding overhead. A multigrid\n",
+    "preconditioner should restore near-`O(N)` scaling. The issue #256\n",
+    "acceptance target is **p < 1.2** at 64\u00b3 \u2192 128\u00b3.\n",
+    "\n",
+    "The next cell fits `t = a\u00b7N^p` across 32\u00b3 / 64\u00b3 / 96\u00b3 / 128\u00b3 for three\n",
+    "representative choices: plain PCG, PCG+PFMG (Krylov + multigrid preconditioner),\n",
+    "and MLMG (AMReX-native geometric multigrid), using a uniform phase-0 block so\n",
+    "each solve is cheap and always percolates.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scaling_sizes = [32, 64, 96, 128]\n",
+    "scaling_combos = [\n",
+    "    (\"pcg\", {\"solver\": \"pcg\"}),\n",
+    "    (\"pcg+pfmg\", {\"solver\": \"pcg\", \"preconditioner\": \"pfmg\"}),\n",
+    "    (\"mlmg\", {\"solver\": \"mlmg\"}),\n",
+    "]\n",
+    "\n",
+    "scaling_records = []\n",
+    "for label, kw in scaling_combos:\n",
+    "    for N in scaling_sizes:\n",
+    "        arr = np.zeros((N, N, N), dtype=np.int32)  # uniform phase 0 \u2014 always percolates\n",
+    "        try:\n",
+    "            t0 = time.perf_counter()\n",
+    "            res = oi.tortuosity(arr, phase=0, direction=\"z\",\n",
+    "                                max_grid_size=min(32, N), verbose=0, **kw)\n",
+    "            dt = time.perf_counter() - t0\n",
+    "            scaling_records.append({\"solver\": label, \"N\": N, \"t\": dt,\n",
+    "                                    \"iters\": int(res.iterations),\n",
+    "                                    \"ok\": bool(res.solver_converged)})\n",
+    "            print(f\"  {label:10s} {N:3d}^3  t={dt:7.3f}s  iters={res.iterations:4d}\")\n",
+    "        except TypeError as e:\n",
+    "            # Wheel predates the preconditioner kwarg \u2014 skip only those rows.\n",
+    "            msg = str(e)\n",
+    "            if (\"preconditioner\" in msg) and (\"preconditioner\" in kw):\n",
+    "                print(f\"  {label:10s} {N:3d}^3  SKIP \u2014 wheel predates preconditioner kwarg\")\n",
+    "                scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
+    "                                        \"iters\": 0, \"ok\": False})\n",
+    "                continue\n",
+    "            raise\n",
+    "        except Exception as e:\n",
+    "            scaling_records.append({\"solver\": label, \"N\": N, \"t\": np.nan,\n",
+    "                                    \"iters\": 0, \"ok\": False})\n",
+    "            print(f\"  {label:10s} {N:3d}^3  FAILED \u2014 {e}\")\n",
+    "\n",
+    "df_scale = pd.DataFrame(scaling_records)\n",
+    "df_scale\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Fit t = a \u00b7 N^p in log-log per solver, then overlay the curves with an\n",
+    "# O(N) reference line and the O(N^1.76) baseline from \u00a73b.\n",
+    "fig, ax = plt.subplots(figsize=(9, 5.5))\n",
+    "palette = {\"pcg\": \"#e74c3c\", \"pcg+pfmg\": \"#2ecc71\", \"mlmg\": \"#8e44ad\"}\n",
+    "fit_results = {}\n",
+    "for label, _ in scaling_combos:\n",
+    "    sub = df_scale[(df_scale[\"solver\"] == label) & df_scale[\"ok\"]].dropna(subset=[\"t\"])\n",
+    "    if len(sub) < 2:\n",
+    "        fit_results[label] = np.nan\n",
+    "        continue\n",
+    "    p, loga = np.polyfit(np.log(sub[\"N\"].astype(float)), np.log(sub[\"t\"].astype(float)), 1)\n",
+    "    fit_results[label] = p\n",
+    "    ax.loglog(sub[\"N\"], sub[\"t\"], \"o-\", color=palette.get(label, \"#333333\"),\n",
+    "              markersize=8, linewidth=2, label=f\"{label} (p={p:.2f})\")\n",
+    "\n",
+    "# Reference lines \u2014 anchored at the first successful measurement\n",
+    "anchor = df_scale[df_scale[\"ok\"]].dropna(subset=[\"t\"]).sort_values(\"N\")\n",
+    "if not anchor.empty:\n",
+    "    N_anchor = float(anchor.iloc[0][\"N\"])\n",
+    "    t_anchor = float(anchor.iloc[0][\"t\"])\n",
+    "    Ns = np.array(scaling_sizes, dtype=float)\n",
+    "    ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.0,\n",
+    "              \"k--\", alpha=0.4, linewidth=1, label=\"O(N) reference\")\n",
+    "    ax.loglog(Ns, t_anchor * (Ns / N_anchor) ** 1.76,\n",
+    "              \"r:\", alpha=0.5, linewidth=1, label=\"O(N^1.76) (\u00a73b baseline)\")\n",
+    "\n",
+    "ax.set_xlabel(\"Grid size N (domain = N^3)\")\n",
+    "ax.set_ylabel(\"Wall time (s)\")\n",
+    "ax.set_title(\"Scaling validation \u2014 multigrid vs. plain Krylov\",\n",
+    "             fontweight=\"bold\")\n",
+    "ax.legend(loc=\"upper left\", framealpha=0.9)\n",
+    "ax.grid(True, which=\"both\", alpha=0.3)\n",
+    "plt.tight_layout()\n",
+    "plt.show()\n",
+    "\n",
+    "# Verdict against acceptance target (issue #256: p < 1.2)\n",
+    "print(\"\\nScaling exponents vs. acceptance target p < 1.2:\")\n",
+    "for label, p in fit_results.items():\n",
+    "    if np.isnan(p):\n",
+    "        print(f\"  {label:10s}  n/a (insufficient data)\")\n",
+    "    elif p < 1.2:\n",
+    "        print(f\"  {label:10s}  p = {p:.2f}  \u2713 near-linear \u2014 target met\")\n",
+    "    else:\n",
+    "        print(f\"  {label:10s}  p = {p:.2f}  \u2717 super-linear \u2014 target NOT met\")\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index 6b04863..08ad670 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -11,6 +11,7 @@
 #include
 #include
+#include
 #include
 #include
 
@@ -23,6 +24,15 @@
 #include
 #include
+#ifdef AMREX_USE_GPU
+#include <AMReX_GpuDevice.H>
+#endif
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include <HYPRE_config.h>
+
 #include "VoxelImage.H"
 
 namespace py = pybind11;
@@ -56,6 +66,79 @@
 static bool amrex_initialized() { return amrex::Initialized(); }
 
+// =========================================================================
+// Build info — lets users verify which wheel they installed (CPU vs CUDA),
+// whether TinyProfile is on, OpenMP thread count, and visible GPU devices.
+// Critical for Colab users: §1a of the profiling notebook runs this first.
+// =========================================================================
+static py::dict build_info() {
+    py::dict info;
+
+#ifdef OPENIMPALA_USE_CUDA
+    info["cuda_enabled"] = true;
+#else
+    info["cuda_enabled"] = false;
+#endif
+
+#ifdef OPENIMPALA_USE_HIP
+    info["hip_enabled"] = true;
+#else
+    info["hip_enabled"] = false;
+#endif
+
+#ifdef OPENIMPALA_USE_GPU
+    info["gpu_enabled"] = true;
+#else
+    info["gpu_enabled"] = false;
+#endif
+
+#ifdef _OPENMP
+    info["openmp_enabled"] = true;
+    info["openmp_max_threads"] = omp_get_max_threads();
+#else
+    info["openmp_enabled"] = false;
+    info["openmp_max_threads"] = 1;
+#endif
+
+#ifdef AMREX_USE_MPI
+    info["mpi_enabled"] = true;
+#else
+    info["mpi_enabled"] = false;
+#endif
+
+#ifdef AMREX_TINY_PROFILE
+    info["tiny_profile"] = true;
+#else
+    info["tiny_profile"] = false;
+#endif
+
+#ifdef HYPRE_USING_CUDA
+    info["hypre_cuda"] = true;
+#else
+    info["hypre_cuda"] = false;
+#endif
+
+#ifdef HYPRE_USING_HIP
+    info["hypre_hip"] = true;
+#else
+    info["hypre_hip"] = false;
+#endif
+
+    // Runtime GPU device count — only meaningful when the wheel has GPU support
+    // AND AMReX has been initialised (otherwise the query can segfault).
+#ifdef AMREX_USE_GPU
+    if (amrex::Initialized()) {
+        info["gpu_device_count"] = amrex::Gpu::Device::numDevicesUsed();
+    } else {
+        info["gpu_device_count"] = -1;  // unknown until init_amrex() called
+    }
+#else
+    info["gpu_device_count"] = 0;
+#endif
+
+    return info;
+}
+
 // =========================================================================
 // NumPy → VoxelImage factory
 // =========================================================================
@@ -128,6 +211,11 @@ PYBIND11_MODULE(_core, m) {
           "Shut down the AMReX runtime (no-op if not initialised).");
     m.def("amrex_initialized", &amrex_initialized,
           "Return True if the AMReX runtime is currently active.");
+    m.def("build_info", &build_info,
+          "Return a dict of compile-time feature flags (cuda, openmp, mpi, tiny_profile, "
+          "hypre_cuda) plus runtime GPU device count. Use this to verify which wheel "
Use this to verify which wheel " + "is installed (CPU vs. CUDA) — critical for Colab users who need the GPU wheel " + "to actually use their T4/A100."); // --- VoxelImage opaque handle --- py::class_>( diff --git a/python/bindings/solvers.cpp b/python/bindings/solvers.cpp index 7f9edb6..1ce9b7d 100644 --- a/python/bindings/solvers.cpp +++ b/python/bindings/solvers.cpp @@ -34,8 +34,7 @@ void init_solvers(py::module_& m) { .def(py::init([](std::shared_ptr img, amrex::Real vf, int phase, OpenImpala::Direction dir, TortuosityHypre::SolverType solver_type, const std::string& results_path, amrex::Real vlo, amrex::Real vhi, - int verbose, bool write_plotfile, - OpenImpala::PrecondType preconditioner) { + int verbose, bool write_plotfile, OpenImpala::PrecondType preconditioner) { return new TortuosityHypre(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir, solver_type, results_path, vlo, vhi, verbose, write_plotfile, preconditioner); @@ -98,14 +97,13 @@ void init_solvers(py::module_& m) { amrex::Real vlo, amrex::Real vhi, int verbose, bool write_plotfile, amrex::Real eps, int maxiter, int max_coarsening_level) { return new TortuosityMLMG(img->geom, img->ba, img->dm, *(img->mf), vf, phase, dir, - results_path, vlo, vhi, verbose, write_plotfile, - eps, maxiter, max_coarsening_level); + results_path, vlo, vhi, verbose, write_plotfile, eps, + maxiter, max_coarsening_level); }), py::arg("img"), py::arg("vf"), py::arg("phase"), py::arg("dir"), py::arg("results_path"), py::arg("vlo") = 0.0, py::arg("vhi") = 1.0, - py::arg("verbose") = 0, py::arg("write_plotfile") = false, - py::arg("eps") = 1.0e-9, py::arg("maxiter") = 200, - py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>()) + py::arg("verbose") = 0, py::arg("write_plotfile") = false, py::arg("eps") = 1.0e-9, + py::arg("maxiter") = 200, py::arg("max_coarsening_level") = 30, py::keep_alive<1, 2>()) .def( "value", diff --git a/python/openimpala/__init__.py b/python/openimpala/__init__.py index 229d776..f4ca2ed 100644 --- a/python/openimpala/__init__.py +++ b/python/openimpala/__init__.py @@ -65,7 +65,7 @@ def __getattr__(name): # Symbols that live in the facade module _FACADE_ATTRS = { "volume_fraction", "percolation_check", "tortuosity", "estimate_memory", - "read_image", + "read_image", "build_info", } if name in _CORE_ATTRS: @@ -108,4 +108,5 @@ def __getattr__(name): "tortuosity", "estimate_memory", "read_image", + "build_info", ] diff --git a/python/openimpala/facade.py b/python/openimpala/facade.py index f411568..84bf406 100644 --- a/python/openimpala/facade.py +++ b/python/openimpala/facade.py @@ -531,6 +531,63 @@ def estimate_memory( } +def build_info() -> dict: + """Return compile-time + runtime feature flags for the installed wheel. + + Critical for Colab users who need to verify they got the GPU wheel + (``pip install openimpala-cuda``) and not the default CPU wheel. + + Returns + ------- + dict + Keys: + + * ``backend``: ``"cpp-cuda"``, ``"cpp-hip"``, ``"cpp-cpu"``, or ``"pure-python"`` + * ``cuda_enabled`` / ``hip_enabled`` / ``gpu_enabled``: bool + * ``openmp_enabled``: bool; ``openmp_max_threads``: int + * ``mpi_enabled``: bool + * ``tiny_profile``: bool (BL_PROFILE regions emit a table at shutdown) + * ``hypre_cuda`` / ``hypre_hip``: bool (HYPRE solver device support) + * ``gpu_device_count``: int (runtime — ``-1`` if AMReX not yet initialised) + * ``version``: str (package version) + """ + from . import __version__ + + if _is_pure_python(): + # Pure-Python fallback (SciPy/CuPy) — no compiled backend at all. 
+        # CuPy CG is GPU-accelerated but has no OpenMP / HYPRE / TinyProfile.
+        try:
+            import cupy  # noqa: F401
+            has_cupy = True
+        except ImportError:
+            has_cupy = False
+        return {
+            "backend": "pure-python",
+            "cuda_enabled": has_cupy,
+            "hip_enabled": False,
+            "gpu_enabled": has_cupy,
+            "openmp_enabled": False,
+            "openmp_max_threads": 1,
+            "mpi_enabled": False,
+            "tiny_profile": False,
+            "hypre_cuda": False,
+            "hypre_hip": False,
+            "gpu_device_count": -1,
+            "version": __version__,
+        }
+
+    _core = _get_core()
+    info = dict(_core.build_info())
+    if info.get("cuda_enabled"):
+        info["backend"] = "cpp-cuda"
+    elif info.get("hip_enabled"):
+        info["backend"] = "cpp-hip"
+    else:
+        info["backend"] = "cpp-cpu"
+    info["version"] = __version__
+    return info
+
+
 def read_image(
     path: str,
     threshold: float = 0.5,
diff --git a/src/props/TortuosityHypre.H b/src/props/TortuosityHypre.H
index 6031878..f57968a 100644
--- a/src/props/TortuosityHypre.H
+++ b/src/props/TortuosityHypre.H
@@ -73,8 +73,7 @@ public:
                     const amrex::Real vf, const int phase, const OpenImpala::Direction dir,
                     const SolverType solvertype, const std::string& resultspath,
                     const amrex::Real vlo = 0.0, const amrex::Real vhi = 1.0, int verbose = 0,
-                    bool write_plotfile = false,
-                    const PrecondType precond_type = PrecondType::SMG);
+                    bool write_plotfile = false, const PrecondType precond_type = PrecondType::SMG);
 
     /** Destructor. Base class handles HYPRE resource cleanup. */
     virtual ~TortuosityHypre() override = default;
diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp
index e5bd090..c2afd86 100644
--- a/src/props/TortuosityHypre.cpp
+++ b/src/props/TortuosityHypre.cpp
@@ -93,10 +93,9 @@ OpenImpala::TortuosityHypre::TortuosityHypre(const amrex::Geometry& geom, const
       m_mf_phase(ba, dm, mf_phase_input.nComp(), mf_phase_input.nGrow()), m_phase(phase),
       m_vf(vf), m_dir(dir), m_vlo(vlo), m_vhi(vhi), m_resultspath(resultspath),
       m_write_plotfile(write_plotfile), m_precond_type(precond_type),
-      m_mf_phi(ba, dm, numComponentsPhi, 1),
-      m_mf_active_mask(ba, dm, 1, 1), m_mf_diff_coeff(ba, dm, 1, 1), m_active_vf(0.0),
-      m_first_call(true), m_value(std::numeric_limits<amrex::Real>::quiet_NaN()), m_flux_in(0.0),
-      m_flux_out(0.0) {
+      m_mf_phi(ba, dm, numComponentsPhi, 1), m_mf_active_mask(ba, dm, 1, 1),
+      m_mf_diff_coeff(ba, dm, 1, 1), m_active_vf(0.0), m_first_call(true),
+      m_value(std::numeric_limits<amrex::Real>::quiet_NaN()), m_flux_in(0.0), m_flux_out(0.0) {
     // Ensure HYPRE is initialised exactly once (thread-safe via std::call_once).
     // C++ tests call HYPRE_Init() in main(), but Python bindings have no main().
     static std::once_flag hypre_once;
diff --git a/src/props/TortuosityMLMG.H b/src/props/TortuosityMLMG.H
index a9d3502..8ed9e6e 100644
--- a/src/props/TortuosityMLMG.H
+++ b/src/props/TortuosityMLMG.H
@@ -71,8 +71,7 @@ public:
                    const amrex::Real vf, const int phase, const OpenImpala::Direction dir,
                    const std::string& resultspath, const amrex::Real vlo = 0.0,
                    const amrex::Real vhi = 1.0, int verbose = 0, bool write_plotfile = false,
-                   amrex::Real eps = 1.0e-9, int maxiter = 200,
-                   int max_coarsening_level = 30);
+                   amrex::Real eps = 1.0e-9, int maxiter = 200, int max_coarsening_level = 30);
 
     ~TortuosityMLMG() override = default;
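---
Reviewer note: a minimal smoke test for the new build_info() API (a sketch, not
part of the patch). It assumes a wheel built from this branch imports as
`openimpala`, and that `oi.Session()` initialises AMReX the way the notebook's
banner probe does; the key names follow the facade docstring above.

    import openimpala as oi

    info = oi.build_info()
    print(info)
    assert info["backend"] in ("cpp-cuda", "cpp-hip", "cpp-cpu", "pure-python")
    assert isinstance(info["openmp_max_threads"], int)

    # On compiled GPU wheels the device count stays -1 until AMReX is
    # initialised; inside a Session it should become a concrete count.
    with oi.Session():
        post = oi.build_info()
        if post["backend"] in ("cpp-cuda", "cpp-hip"):
            assert post["gpu_device_count"] >= 0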