From 036374ac89bc85477a1a5a46ceeb2b1dfd9c9284 Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Tue, 21 Apr 2026 11:22:25 +0000 Subject: [PATCH 1/4] fix wheel builds: drop redundant CMake option, patch AMReX for CUDA 12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU wheel error ("gmake: *** No rule to make target '_core'") traced to the SKBUILD_CMAKE_ARGS env var interfering with scikit-build-core's cmake.args merge. The OPENIMPALA_ENABLE_TINY_PROFILE option was redundant anyway — when AMReX is built with AMReX_TINY_PROFILE=ON, it sets AMREX_TINY_PROFILE in its installed AMReX_Config.H header, which every file including AMReX.H picks up automatically. Removed the option and the env var; kept the AMReX-side build flag. GPU wheel error ("CUDA::nvToolsExt target not found") is AMReX 25.03 vs. CUDA 12 — libnvToolsExt was removed in CUDA 12 in favour of NVTX3 (header-only). Patch AMReX 25.03's CMake to use CUDA::nvtx3 instead, applied via sed before configure. CMake 3.25+ (we have 3.28) exposes CUDA::nvtx3 from CUDAToolkit, so this is drop-in. Cache keys bumped (CPU v5, GPU nvtx3-v4) to force a fresh dep rebuild. 
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- .github/workflows/pypi-wheels-cpu.yml | 3 +-- .github/workflows/pypi-wheels-gpu.yml | 4 ++-- CMakeLists.txt | 19 ++++++------------- 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/.github/workflows/pypi-wheels-cpu.yml b/.github/workflows/pypi-wheels-cpu.yml index fc03342..1463411 100644 --- a/.github/workflows/pypi-wheels-cpu.yml +++ b/.github/workflows/pypi-wheels-cpu.yml @@ -41,7 +41,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v4 + key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v5 - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -123,7 +123,6 @@ jobs: CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}" - SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON" CIBW_REPAIR_WHEEL_COMMAND_LINUX: > auditwheel repair -w {dest_dir} {wheel} diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index f9a58de..928b91e 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -45,7 +45,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-tinyprof-v3 + key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-nvtx3-v4 - name: Build GPU wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -117,6 +117,7 @@ jobs: make install && cd ../.. 
&& git clone --depth 1 --branch 25.03 https://github.com/AMReX-Codes/amrex.git /tmp/amrex && + sed -i 's|CUDA::nvToolsExt|CUDA::nvtx3|g' /tmp/amrex/Tools/CMake/AMReXParallelBackends.cmake && cmake -S /tmp/amrex -B /tmp/amrex/build -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_BUILD_TYPE=Release @@ -157,7 +158,6 @@ jobs: CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++" - SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON" SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}" # Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime, diff --git a/CMakeLists.txt b/CMakeLists.txt index 9144b35..04da7e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,19 +161,12 @@ endif() # ============================================================================== # TinyProfiler # ============================================================================== -# AMReX's TinyProfiler emits a function-level timing table at AMReX::Finalize(). -# Useful for diagnosing C++ hotspots from the profiling notebook (§7). -# -# This option assumes AMReX was itself built with AMReX_TINY_PROFILE=ON — if not, -# BL_PROFILE regions compile away and no table is emitted. The wheel CI builds -# AMReX with this flag when OPENIMPALA_ENABLE_TINY_PROFILE=ON. -option(OPENIMPALA_ENABLE_TINY_PROFILE - "Enable AMReX TinyProfiler instrumentation (BL_PROFILE regions)" OFF) -if(OPENIMPALA_ENABLE_TINY_PROFILE) - add_compile_definitions(AMREX_TINY_PROFILE) - message(STATUS "AMReX TinyProfiler: ENABLED (AMReX must also be built with " - "AMReX_TINY_PROFILE=ON for tables to appear)") -endif() +# AMReX emits a function-level BL_PROFILE timing table at AMReX::Finalize() when +# AMReX itself is built with -DAMReX_TINY_PROFILE=ON. 
That flag is exported via +# the AMReX::amrex target's INTERFACE_COMPILE_DEFINITIONS, so every target that +# links against AMReX::amrex picks up AMREX_TINY_PROFILE automatically — no +# OpenImpala-side option is needed. The wheel CI sets -DAMReX_TINY_PROFILE=ON +# when building AMReX from source. # ============================================================================== # Library targets From 72e4d6440a2840b5ef843ee025630ccaea568a67 Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Tue, 21 Apr 2026 11:26:02 +0000 Subject: [PATCH 2/4] publish GPU wheels to PyPI instead of GitHub Releases Now that openimpala-cuda has been granted the 320 MiB per-file PyPI limit, the GPU wheels fit and can be installed via `pip install openimpala-cuda` like any other package. Mirror the CPU workflow's publish job: use the pypi trusted-publisher flow (environment: pypi, id-token: write) via pypa/gh-action-pypi-publish. Gate on github.event_name == 'release' so workflow_dispatch runs still produce artifacts for manual inspection without touching the index. https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- .github/workflows/pypi-wheels-gpu.yml | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index 928b91e..5f9f4cf 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -200,12 +200,16 @@ jobs: name: cibw-wheels-gpu path: ./wheelhouse/*.whl - upload_to_github_release: - name: Upload GPU wheels to GitHub Release + publish_to_pypi: + name: Publish GPU wheels to PyPI (openimpala-cuda) needs: build_gpu_wheels runs-on: ubuntu-latest + # Only publish on release; workflow_dispatch leaves the artifact for manual + # inspection without touching PyPI. 
+ if: github.event_name == 'release' + environment: pypi permissions: - contents: write + id-token: write steps: - name: Download wheel artifacts @@ -214,12 +218,7 @@ jobs: name: cibw-wheels-gpu path: dist/ - - name: Upload wheels to GitHub Release - if: github.event_name == 'release' - uses: softprops/action-gh-release@v2 + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 with: - files: dist/*.whl - - - name: List wheels (workflow_dispatch — no release to upload to) - if: github.event_name == 'workflow_dispatch' - run: ls -lh dist/ + skip-existing: true From 109f24ee7c9d1a381d5b02ec369ddb3251641aa7 Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Tue, 21 Apr 2026 11:30:15 +0000 Subject: [PATCH 3/4] docs: drop GitHub-Releases find-links workaround for openimpala-cuda MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that openimpala-cuda is published to PyPI (previous commit switched the GPU wheel workflow), the install collapses from pip install openimpala-cuda --find-links \ https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 \ nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 \ nvidia-curand-cu12 down to plain pip install openimpala-cuda The nvidia-*-cu12 packages were only needed because the --find-links index didn't carry them; PyPI's resolver will pull whatever the wheel actually declares. 
Updates every call site that showed the old incantation: - README.md, docs/getting-started.md, docs/user-guide/gpu.md — advanced install sections - paper.md — corrects "via GitHub Releases" wording for the JOSS draft - notebooks/visualization_yt.ipynb — §0 install cell - tutorials/02_digital_twin.ipynb — install cell - tutorials/04_multiphase_and_fields.ipynb — install cell - tutorials/07_hpc_scaling.ipynb — §6 install cell Also fixes a malformed .sif wget URL in docs/getting-started.md (a stray concatenation of expanded_assets/v4.0.6 with the filename) by switching to a vX.Y.Z placeholder to match the pattern already used in tutorial 7. https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- README.md | 6 ++++-- docs/getting-started.md | 15 ++++++++------- docs/user-guide/gpu.md | 5 ++--- notebooks/visualization_yt.ipynb | 21 ++------------------- paper.md | 2 +- tutorials/02_digital_twin.ipynb | 2 +- tutorials/04_multiphase_and_fields.ipynb | 2 +- tutorials/07_hpc_scaling.ipynb | 2 +- 8 files changed, 20 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index c2eb9d3..66a58b4 100644 --- a/README.md +++ b/README.md @@ -145,10 +145,12 @@ If CuPy is not available, OpenImpala falls back to SciPy on the CPU. CUDA support: ```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` +The `openimpala-cuda` wheel requires a working NVIDIA CUDA 12 runtime (driver ++ toolkit). On Colab, Kaggle, and most cluster nodes this is already present. 
+ To install with optional dependencies: ```bash diff --git a/docs/getting-started.md b/docs/getting-started.md index 644d5e6..e1688f4 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -29,12 +29,13 @@ For HPC clusters that need the compiled C++ HYPRE solvers, a separate package is available: ```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` This package bundles AMReX + HYPRE compiled with CUDA and is a drop-in -replacement for the pure-Python `openimpala` package. +replacement for the pure-Python `openimpala` package. It requires a working +NVIDIA CUDA 12 runtime (driver + toolkit) on the host, which is already +present on Colab, Kaggle, and most GPU cluster nodes. ### Container (HPC) @@ -42,14 +43,14 @@ For HPC clusters, download the pre-built Apptainer/Singularity container from [GitHub Releases](https://github.com/BASE-Laboratory/OpenImpala/releases): ```bash -# Download the latest .sif file -wget https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6openimpala-v4.0.0.sif +# Download the latest .sif file (replace vX.Y.Z with the release tag) +wget https://github.com/BASE-Laboratory/OpenImpala/releases/download/vX.Y.Z/openimpala-vX.Y.Z.sif # Run interactively -apptainer shell openimpala-v4.0.0.sif +apptainer shell openimpala-vX.Y.Z.sif # Run a simulation -apptainer exec openimpala-v4.0.0.sif /opt/OpenImpala/build/Diffusion3d inputs +apptainer exec openimpala-vX.Y.Z.sif /opt/OpenImpala/build/Diffusion3d inputs ``` ### From source (developers) diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md index 95274a3..525633e 100644 --- a/docs/user-guide/gpu.md +++ b/docs/user-guide/gpu.md @@ -60,11 +60,10 @@ with oi.Session(): For HPC clusters that need the compiled C++ HYPRE linear solvers with native CUDA support (AMReX + HYPRE compiled with CUDA), a separate package is -available: +available on PyPI: 
```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` The `openimpala-cuda` package is a drop-in replacement for `openimpala` and diff --git a/notebooks/visualization_yt.ipynb b/notebooks/visualization_yt.ipynb index 7034eb6..bbfb89c 100644 --- a/notebooks/visualization_yt.ipynb +++ b/notebooks/visualization_yt.ipynb @@ -35,24 +35,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import subprocess, sys\n", - "\n", - "def _has_gpu():\n", - " try:\n", - " subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n", - " return True\n", - " except (FileNotFoundError, subprocess.CalledProcessError):\n", - " return False\n", - "\n", - "_extras = \"yt matplotlib porespy\"\n", - "if _has_gpu():\n", - " print(\"GPU detected — installing openimpala-cuda\")\n", - " !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_extras}\n", - "else:\n", - " print(\"No GPU detected — installing openimpala (CPU)\")\n", - " !pip install -q openimpala {_extras}" - ] + "source": "import subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_extras = \"yt matplotlib porespy\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_extras}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_extras}" }, { "cell_type": "markdown", @@ -413,4 +396,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/paper.md b/paper.md index 947135e..7aec22c 100644 --- a/paper.md +++ b/paper.md @@ -83,7 +83,7 @@ with oi.Session(): print(f"Tortuosity: 
{result.tortuosity:.4f}") ``` -A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are available via GitHub Releases (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/ +A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are also distributed via PyPI (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/ ## Testing and Quality Assurance diff --git a/tutorials/02_digital_twin.ipynb b/tutorials/02_digital_twin.ipynb index 8942498..2d8691b 100644 --- a/tutorials/02_digital_twin.ipynb +++ b/tutorials/02_digital_twin.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) 
we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code", diff --git a/tutorials/04_multiphase_and_fields.ipynb b/tutorials/04_multiphase_and_fields.ipynb index 8131c07..f664066 100644 --- a/tutorials/04_multiphase_and_fields.ipynb +++ b/tutorials/04_multiphase_and_fields.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], 
stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code", diff --git a/tutorials/07_hpc_scaling.ipynb b/tutorials/07_hpc_scaling.ipynb index bd0b783..e4c879e 100644 --- a/tutorials/07_hpc_scaling.ipynb +++ b/tutorials/07_hpc_scaling.ipynb @@ -242,7 +242,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest 
nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code", From 4e606f46341bfd447332c7c98c3b22b8c3a9004a Mon Sep 17 00:00:00 2001 From: James Le Houx Date: Tue, 21 Apr 2026 11:32:55 +0000 Subject: [PATCH 4/4] declare CUDA runtime deps for openimpala-cuda wheel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit auditwheel repair --exclude drops libcudart / libcublas / libcusparse / libcurand / libnvJitLink from the openimpala-cuda wheel payload, which means the wheel only works on machines that already have the CUDA 12 toolkit installed — driver-only Colab/Kaggle runtimes have the libraries, but a bare Python venv on an NVIDIA workstation does not. Declare the nvidia-*-cu12 PyPI packages as runtime deps so pip pulls them automatically. Keep them commented out in pyproject.toml with clear markers so the CPU wheel (which uses the same file) doesn't grow a 1-2 GB dep tree. The GPU workflow's existing sed step already rewrites `name = "openimpala"` to `"openimpala-cuda"`; a second sed uncomments the `#"nvidia-..."` lines in the same pass. 
Verified with python3 -m tomllib that both variants produce valid TOML and the expected dependency lists: CPU: ['numpy', 'scipy>=1.7'] GPU: ['numpy', 'scipy>=1.7', 'nvidia-cuda-runtime-cu12', 'nvidia-cublas-cu12', 'nvidia-cusparse-cu12', 'nvidia-curand-cu12', 'nvidia-nvjitlink-cu12'] https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf --- .github/workflows/pypi-wheels-gpu.yml | 9 +++++++-- pyproject.toml | 11 +++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index 5f9f4cf..e7f1ad3 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -139,11 +139,16 @@ jobs: tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ; fi - # Rename the package to openimpala-cuda for the GPU wheel. + # Rename the package to openimpala-cuda and uncomment the nvidia-*-cu12 + # runtime deps (kept commented in pyproject.toml so they don't pollute + # the CPU wheel). auditwheel --exclude drops the CUDA .so's from the + # wheel payload, so without these PyPI deps the wheel breaks on any + # machine that doesn't already have the CUDA toolkit installed. # The import name stays 'openimpala' — only the PyPI distribution name changes. CIBW_BEFORE_BUILD: > pip install "cmake>=3.28,<4" && - sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml + sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml && + sed -i 's/^ #"nvidia-/ "nvidia-/' /project/pyproject.toml # Point to MPI, CUDA, and our compiled GPU dependencies. CIBW_ENVIRONMENT_LINUX: > diff --git a/pyproject.toml b/pyproject.toml index 324c4bc..8ad4105 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,17 @@ requires-python = ">=3.8" dependencies = [ "numpy", "scipy>=1.7", + # GPU-only deps: kept commented out so the CPU wheel stays lean. 
+ # The openimpala-cuda build in .github/workflows/pypi-wheels-gpu.yml + # uncomments these via sed before scikit-build-core reads this file. + # Do not reformat the leading #"nvidia- prefix on these lines — the sed rule keys off it; the cuda-deps markers below just delimit the block for human readers. + # cuda-deps-start + #"nvidia-cuda-runtime-cu12", + #"nvidia-cublas-cu12", + #"nvidia-cusparse-cu12", + #"nvidia-curand-cu12", + #"nvidia-nvjitlink-cu12", + # cuda-deps-end ] [project.optional-dependencies]