BASE-Laboratory · jameslehoux · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026 · Apr 21, 2026
diff --git a/.github/workflows/pypi-wheels-cpu.yml b/.github/workflows/pypi-wheels-cpu.yml
@@ -41,7 +41,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: .cibw-deps-cache
-          key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v4
+          key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v5
 
       - name: Build wheels
         run: python -m cibuildwheel --output-dir wheelhouse
@@ -123,7 +123,6 @@ jobs:
             CMAKE_PREFIX_PATH="/usr/local"
             CMAKE_GENERATOR="Unix Makefiles"
             SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}"
-            SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON"
 
           CIBW_REPAIR_WHEEL_COMMAND_LINUX: >
             auditwheel repair -w {dest_dir} {wheel}

diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml
@@ -45,7 +45,7 @@ jobs:
         uses: actions/cache@v4
         with:
           path: .cibw-deps-cache
-          key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-tinyprof-v3
+          key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-nvtx3-v4
 
       - name: Build GPU wheels
         run: python -m cibuildwheel --output-dir wheelhouse
@@ -117,6 +117,7 @@ jobs:
             make install &&
             cd ../.. &&
             git clone --depth 1 --branch 25.03 https://github.com/AMReX-Codes/amrex.git /tmp/amrex &&
+            sed -i 's|CUDA::nvToolsExt|CUDA::nvtx3|g' /tmp/amrex/Tools/CMake/AMReXParallelBackends.cmake &&
             cmake -S /tmp/amrex -B /tmp/amrex/build
             -DCMAKE_INSTALL_PREFIX=/usr/local
             -DCMAKE_BUILD_TYPE=Release
@@ -138,11 +139,16 @@ jobs:
             tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ;
             fi
 
-          # Rename the package to openimpala-cuda for the GPU wheel.
+          # Rename the package to openimpala-cuda and uncomment the nvidia-*-cu12
+          # runtime deps (kept commented in pyproject.toml so they don't pollute
+          # the CPU wheel). auditwheel --exclude drops the CUDA .so's from the
+          # wheel payload, so without these PyPI deps the wheel breaks on any
+          # machine that doesn't already have the CUDA toolkit installed.
           # The import name stays 'openimpala' — only the PyPI distribution name changes.
           CIBW_BEFORE_BUILD: >
             pip install "cmake>=3.28,<4" &&
-            sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml
+            sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml &&
+            sed -i 's/^    #"nvidia-/    "nvidia-/' /project/pyproject.toml
 
           # Point to MPI, CUDA, and our compiled GPU dependencies.
           CIBW_ENVIRONMENT_LINUX: >
@@ -157,7 +163,6 @@ jobs:
             CMAKE_PREFIX_PATH="/usr/local"
             CMAKE_GENERATOR="Unix Makefiles"
             CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++"
-            SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON"
             SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}"
 
           # Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime,
@@ -200,12 +205,16 @@ jobs:
           name: cibw-wheels-gpu
           path: ./wheelhouse/*.whl
 
-  upload_to_github_release:
-    name: Upload GPU wheels to GitHub Release
+  publish_to_pypi:
+    name: Publish GPU wheels to PyPI (openimpala-cuda)
     needs: build_gpu_wheels
     runs-on: ubuntu-latest
+    # Only publish on release; workflow_dispatch leaves the artifact for manual
+    # inspection without touching PyPI.
+    if: github.event_name == 'release'
+    environment: pypi
     permissions:
-      contents: write
+      id-token: write
 
     steps:
       - name: Download wheel artifacts
@@ -214,12 +223,7 @@ jobs:
           name: cibw-wheels-gpu
           path: dist/
 
-      - name: Upload wheels to GitHub Release
-        if: github.event_name == 'release'
-        uses: softprops/action-gh-release@v2
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
         with:
-          files: dist/*.whl
-
-      - name: List wheels (workflow_dispatch — no release to upload to)
-        if: github.event_name == 'workflow_dispatch'
-        run: ls -lh dist/
+          skip-existing: true
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -161,19 +161,12 @@ endif()
 # ==============================================================================
 # TinyProfiler
 # ==============================================================================
-# AMReX's TinyProfiler emits a function-level timing table at AMReX::Finalize().
-# Useful for diagnosing C++ hotspots from the profiling notebook (§7).
-#
-# This option assumes AMReX was itself built with AMReX_TINY_PROFILE=ON — if not,
-# BL_PROFILE regions compile away and no table is emitted. The wheel CI builds
-# AMReX with this flag when OPENIMPALA_ENABLE_TINY_PROFILE=ON.
-option(OPENIMPALA_ENABLE_TINY_PROFILE
-       "Enable AMReX TinyProfiler instrumentation (BL_PROFILE regions)" OFF)
-if(OPENIMPALA_ENABLE_TINY_PROFILE)
-    add_compile_definitions(AMREX_TINY_PROFILE)
-    message(STATUS "AMReX TinyProfiler: ENABLED (AMReX must also be built with "
-                   "AMReX_TINY_PROFILE=ON for tables to appear)")
-endif()
+# AMReX emits a function-level BL_PROFILE timing table at AMReX::Finalize() when
+# AMReX itself is built with -DAMReX_TINY_PROFILE=ON. That flag is exported via
+# the AMReX::amrex target's INTERFACE_COMPILE_DEFINITIONS, so every target that
+# links against AMReX::amrex picks up AMREX_TINY_PROFILE automatically — no
+# OpenImpala-side option is needed. The wheel CI sets -DAMReX_TINY_PROFILE=ON
+# when building AMReX from source.
 
 # ==============================================================================
 # Library targets

diff --git a/README.md b/README.md
@@ -145,10 +145,12 @@ If CuPy is not available, OpenImpala falls back to SciPy on the CPU.
 CUDA support:
 
 ```bash
-pip install openimpala-cuda --find-links \
-  https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6
+pip install openimpala-cuda
 ```
 
+The `openimpala-cuda` wheel requires a working NVIDIA CUDA 12 runtime (driver
+and toolkit). On Colab, Kaggle, and most cluster nodes this is already present.
+
 To install with optional dependencies:
 
 ```bash

diff --git a/docs/getting-started.md b/docs/getting-started.md
@@ -29,27 +29,28 @@ For HPC clusters that need the compiled C++ HYPRE solvers, a separate package
 is available:
 
 ```bash
-pip install openimpala-cuda --find-links \
-  https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6
+pip install openimpala-cuda
 ```
 
 This package bundles AMReX + HYPRE compiled with CUDA and is a drop-in
-replacement for the pure-Python `openimpala` package.
+replacement for the pure-Python `openimpala` package. It requires a working
+NVIDIA CUDA 12 runtime (driver + toolkit) on the host, which is already
+present on Colab, Kaggle, and most GPU cluster nodes.
 
 ### Container (HPC)
 
 For HPC clusters, download the pre-built Apptainer/Singularity container from
 [GitHub Releases](https://github.com/BASE-Laboratory/OpenImpala/releases):
 
 ```bash
-# Download the latest .sif file
-wget https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6openimpala-v4.0.0.sif
+# Download the latest .sif file (replace vX.Y.Z with the release tag)
+wget https://github.com/BASE-Laboratory/OpenImpala/releases/download/vX.Y.Z/openimpala-vX.Y.Z.sif
 
 # Run interactively
-apptainer shell openimpala-v4.0.0.sif
+apptainer shell openimpala-vX.Y.Z.sif
 
 # Run a simulation
-apptainer exec openimpala-v4.0.0.sif /opt/OpenImpala/build/Diffusion3d inputs
+apptainer exec openimpala-vX.Y.Z.sif /opt/OpenImpala/build/Diffusion3d inputs
 ```
 
 ### From source (developers)

diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md
@@ -60,11 +60,10 @@ with oi.Session():
 
 For HPC clusters that need the compiled C++ HYPRE linear solvers with native
 CUDA support (AMReX + HYPRE compiled with CUDA), a separate package is
-available:
+available on PyPI:
 
 ```bash
-pip install openimpala-cuda --find-links \
-  https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6
+pip install openimpala-cuda
 ```
 
 The `openimpala-cuda` package is a drop-in replacement for `openimpala` and

diff --git a/notebooks/visualization_yt.ipynb b/notebooks/visualization_yt.ipynb
@@ -35,24 +35,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": [
-    "import subprocess, sys\n",
-    "\n",
-    "def _has_gpu():\n",
-    "    try:\n",
-    "        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n",
-    "        return True\n",
-    "    except (FileNotFoundError, subprocess.CalledProcessError):\n",
-    "        return False\n",
-    "\n",
-    "_extras = \"yt matplotlib porespy\"\n",
-    "if _has_gpu():\n",
-    "    print(\"GPU detected — installing openimpala-cuda\")\n",
-    "    !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_extras}\n",
-    "else:\n",
-    "    print(\"No GPU detected — installing openimpala (CPU)\")\n",
-    "    !pip install -q openimpala {_extras}"
-   ]
+   "source": "import subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_extras = \"yt matplotlib porespy\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda {_extras}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_extras}"
   },
   {
    "cell_type": "markdown",
@@ -413,4 +396,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
diff --git a/paper.md b/paper.md
@@ -83,7 +83,7 @@ with oi.Session():
     print(f"Tortuosity: {result.tortuosity:.4f}")
 ```
 
-A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are available via GitHub Releases (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/
+A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are also distributed via PyPI (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/
 
 ## Testing and Quality Assurance
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -16,6 +16,17 @@ requires-python = ">=3.8"
 dependencies = [
     "numpy",
     "scipy>=1.7",
+    # GPU-only deps: kept commented out so the CPU wheel stays lean.
+    # The openimpala-cuda build in .github/workflows/pypi-wheels-gpu.yml
+    # uncomments these via sed before scikit-build-core reads this file.
+    # Do not change the '    #"nvidia-' line prefix — the sed rule keys off it.
+    # cuda-deps-start
+    #"nvidia-cuda-runtime-cu12",
+    #"nvidia-cublas-cu12",
+    #"nvidia-cusparse-cu12",
+    #"nvidia-curand-cu12",
+    #"nvidia-nvjitlink-cu12",
+    # cuda-deps-end
 ]
 
 [project.optional-dependencies]

diff --git a/tutorials/02_digital_twin.ipynb b/tutorials/02_digital_twin.ipynb
@@ -10,7 +10,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
+   "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
   },
   {
    "cell_type": "code",

diff --git a/tutorials/04_multiphase_and_fields.ipynb b/tutorials/04_multiphase_and_fields.ipynb
@@ -10,7 +10,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
+   "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
   },
   {
    "cell_type": "code",

diff --git a/tutorials/07_hpc_scaling.ipynb b/tutorials/07_hpc_scaling.ipynb
@@ -242,7 +242,7 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
+   "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n    try:\n        subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n        return True\n    except (FileNotFoundError, subprocess.CalledProcessError):\n        return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n    print(\"GPU detected — installing openimpala-cuda\")\n    !pip install -q openimpala-cuda {_common}\nelse:\n    print(\"No GPU detected — installing openimpala (CPU)\")\n    !pip install -q openimpala {_common}"
   },
   {
    "cell_type": "code",