diff --git a/.github/workflows/pypi-wheels-cpu.yml b/.github/workflows/pypi-wheels-cpu.yml index fc03342b..14634116 100644 --- a/.github/workflows/pypi-wheels-cpu.yml +++ b/.github/workflows/pypi-wheels-cpu.yml @@ -41,7 +41,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v4 + key: cibw-deps-manylinux_2_28-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-tinyprof-v5 - name: Build wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -123,7 +123,6 @@ jobs: CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}" - SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON" CIBW_REPAIR_WHEEL_COMMAND_LINUX: > auditwheel repair -w {dest_dir} {wheel} diff --git a/.github/workflows/pypi-wheels-gpu.yml b/.github/workflows/pypi-wheels-gpu.yml index f9a58deb..e7f1ad31 100644 --- a/.github/workflows/pypi-wheels-gpu.yml +++ b/.github/workflows/pypi-wheels-gpu.yml @@ -45,7 +45,7 @@ jobs: uses: actions/cache@v4 with: path: .cibw-deps-cache - key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-tinyprof-v3 + key: cibw-deps-gpu-cuda12.6-manylinux_2_34-x86_64-hdf5_1.14.6-tiff_4.6.0-hypre_2.31.0-amrex_25.03-gcc13-nvtx3-v4 - name: Build GPU wheels run: python -m cibuildwheel --output-dir wheelhouse @@ -117,6 +117,7 @@ jobs: make install && cd ../.. && git clone --depth 1 --branch 25.03 https://github.com/AMReX-Codes/amrex.git /tmp/amrex && + sed -i 's|CUDA::nvToolsExt|CUDA::nvtx3|g' /tmp/amrex/Tools/CMake/AMReXParallelBackends.cmake && cmake -S /tmp/amrex -B /tmp/amrex/build -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_BUILD_TYPE=Release @@ -138,11 +139,16 @@ jobs: tar czf /project/.cibw-deps-cache/deps.tar.gz /usr/local ; fi - # Rename the package to openimpala-cuda for the GPU wheel. 
+ # Rename the package to openimpala-cuda and uncomment the nvidia-*-cu12 + # runtime deps (kept commented in pyproject.toml so they don't pollute + # the CPU wheel). auditwheel --exclude drops the CUDA .so's from the + # wheel payload, so without these PyPI deps the wheel breaks on any + # machine that doesn't already have the CUDA toolkit installed. # The import name stays 'openimpala' — only the PyPI distribution name changes. CIBW_BEFORE_BUILD: > pip install "cmake>=3.28,<4" && - sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml + sed -i 's/^name = "openimpala"/name = "openimpala-cuda"/' /project/pyproject.toml && + sed -i 's/^ #"nvidia-/ "nvidia-/' /project/pyproject.toml # Point to MPI, CUDA, and our compiled GPU dependencies. CIBW_ENVIRONMENT_LINUX: > @@ -157,7 +163,6 @@ jobs: CMAKE_PREFIX_PATH="/usr/local" CMAKE_GENERATOR="Unix Makefiles" CMAKE_ARGS="-DGPU_BACKEND=CUDA '-DCMAKE_CUDA_ARCHITECTURES=60;70;75;80;86;89;90' -DCMAKE_CUDA_HOST_COMPILER=/opt/rh/gcc-toolset-13/root/usr/bin/g++" - SKBUILD_CMAKE_ARGS="-DOPENIMPALA_ENABLE_TINY_PROFILE=ON" SETUPTOOLS_SCM_PRETEND_VERSION="${{ steps.version.outputs.version }}" # Vendor libraries but exclude host-specific MPI, OpenMP, Fortran runtime, @@ -200,12 +205,16 @@ jobs: name: cibw-wheels-gpu path: ./wheelhouse/*.whl - upload_to_github_release: - name: Upload GPU wheels to GitHub Release + publish_to_pypi: + name: Publish GPU wheels to PyPI (openimpala-cuda) needs: build_gpu_wheels runs-on: ubuntu-latest + # Only publish on release; workflow_dispatch leaves the artifact for manual + # inspection without touching PyPI. 
+ if: github.event_name == 'release' + environment: pypi permissions: - contents: write + id-token: write steps: - name: Download wheel artifacts @@ -214,12 +223,7 @@ jobs: name: cibw-wheels-gpu path: dist/ - - name: Upload wheels to GitHub Release - if: github.event_name == 'release' - uses: softprops/action-gh-release@v2 + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 with: - files: dist/*.whl - - - name: List wheels (workflow_dispatch — no release to upload to) - if: github.event_name == 'workflow_dispatch' - run: ls -lh dist/ + skip-existing: true diff --git a/CMakeLists.txt b/CMakeLists.txt index 9144b351..04da7e01 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,19 +161,12 @@ endif() # ============================================================================== # TinyProfiler # ============================================================================== -# AMReX's TinyProfiler emits a function-level timing table at AMReX::Finalize(). -# Useful for diagnosing C++ hotspots from the profiling notebook (§7). -# -# This option assumes AMReX was itself built with AMReX_TINY_PROFILE=ON — if not, -# BL_PROFILE regions compile away and no table is emitted. The wheel CI builds -# AMReX with this flag when OPENIMPALA_ENABLE_TINY_PROFILE=ON. -option(OPENIMPALA_ENABLE_TINY_PROFILE - "Enable AMReX TinyProfiler instrumentation (BL_PROFILE regions)" OFF) -if(OPENIMPALA_ENABLE_TINY_PROFILE) - add_compile_definitions(AMREX_TINY_PROFILE) - message(STATUS "AMReX TinyProfiler: ENABLED (AMReX must also be built with " - "AMReX_TINY_PROFILE=ON for tables to appear)") -endif() +# AMReX emits a function-level BL_PROFILE timing table at AMReX::Finalize() when +# AMReX itself is built with -DAMReX_TINY_PROFILE=ON. That flag is exported via +# the AMReX::amrex target's INTERFACE_COMPILE_DEFINITIONS, so every target that +# links against AMReX::amrex picks up AMREX_TINY_PROFILE automatically — no +# OpenImpala-side option is needed. 
The wheel CI sets -DAMReX_TINY_PROFILE=ON +# when building AMReX from source. # ============================================================================== # Library targets diff --git a/README.md b/README.md index c2eb9d3f..66a58b4f 100644 --- a/README.md +++ b/README.md @@ -145,10 +145,12 @@ If CuPy is not available, OpenImpala falls back to SciPy on the CPU. CUDA support: ```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` +The `openimpala-cuda` wheel requires only a working NVIDIA CUDA 12 driver — the +runtime libraries come from PyPI. Colab, Kaggle, and most cluster nodes have one. + To install with optional dependencies: ```bash diff --git a/docs/getting-started.md b/docs/getting-started.md index 644d5e61..e1688f45 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -29,12 +29,13 @@ For HPC clusters that need the compiled C++ HYPRE solvers, a separate package is available: ```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` This package bundles AMReX + HYPRE compiled with CUDA and is a drop-in -replacement for the pure-Python `openimpala` package. +replacement for the pure-Python `openimpala` package. It requires a working +NVIDIA CUDA 12 driver on the host (already present on Colab, Kaggle, and +most GPU cluster nodes); the CUDA runtime libraries are installed from PyPI. 
### Container (HPC) @@ -42,14 +43,14 @@ For HPC clusters, download the pre-built Apptainer/Singularity container from [GitHub Releases](https://github.com/BASE-Laboratory/OpenImpala/releases): ```bash -# Download the latest .sif file -wget https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6openimpala-v4.0.0.sif +# Download the latest .sif file (replace vX.Y.Z with the release tag) +wget https://github.com/BASE-Laboratory/OpenImpala/releases/download/vX.Y.Z/openimpala-vX.Y.Z.sif # Run interactively -apptainer shell openimpala-v4.0.0.sif +apptainer shell openimpala-vX.Y.Z.sif # Run a simulation -apptainer exec openimpala-v4.0.0.sif /opt/OpenImpala/build/Diffusion3d inputs +apptainer exec openimpala-vX.Y.Z.sif /opt/OpenImpala/build/Diffusion3d inputs ``` ### From source (developers) diff --git a/docs/user-guide/gpu.md b/docs/user-guide/gpu.md index 95274a3b..525633e3 100644 --- a/docs/user-guide/gpu.md +++ b/docs/user-guide/gpu.md @@ -60,11 +60,10 @@ with oi.Session(): For HPC clusters that need the compiled C++ HYPRE linear solvers with native CUDA support (AMReX + HYPRE compiled with CUDA), a separate package is -available: +available on PyPI: ```bash -pip install openimpala-cuda --find-links \ - https://github.com/BASE-Laboratory/OpenImpala/releases/expanded_assets/v4.0.6 +pip install openimpala-cuda ``` The `openimpala-cuda` package is a drop-in replacement for `openimpala` and diff --git a/notebooks/visualization_yt.ipynb b/notebooks/visualization_yt.ipynb index 7034eb68..bbfb89c9 100644 --- a/notebooks/visualization_yt.ipynb +++ b/notebooks/visualization_yt.ipynb @@ -35,24 +35,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "import subprocess, sys\n", - "\n", - "def _has_gpu():\n", - " try:\n", - " subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n", - " return True\n", - " except (FileNotFoundError, subprocess.CalledProcessError):\n", - " return False\n", - "\n", - "_extras = \"yt 
matplotlib porespy\"\n", - "if _has_gpu():\n", - " print(\"GPU detected — installing openimpala-cuda\")\n", - " !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_extras}\n", - "else:\n", - " print(\"No GPU detected — installing openimpala (CPU)\")\n", - " !pip install -q openimpala {_extras}" - ] + "source": "import subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_extras = \"yt matplotlib porespy\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_extras}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_extras}" }, { "cell_type": "markdown", @@ -413,4 +396,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/paper.md b/paper.md index 947135ee..7aec22c5 100644 --- a/paper.md +++ b/paper.md @@ -83,7 +83,7 @@ with oi.Session(): print(f"Tortuosity: {result.tortuosity:.4f}") ``` -A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are available via GitHub Releases (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. 
API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/ +A pure-Python package is distributed via PyPI (`pip install openimpala`) with automatic GPU acceleration via CuPy when available, and compiled CUDA GPU wheels with HYPRE solvers are also distributed via PyPI (`pip install openimpala-cuda`) for HPC deployments. Interactive tutorial notebooks are provided for Google Colab, covering workflows from basic tortuosity computation to digital twin parameterisation with PyBaMM. API reference documentation, installation guides, and interactive tutorial notebooks are available at https://base-laboratory.github.io/OpenImpala/ ## Testing and Quality Assurance diff --git a/pyproject.toml b/pyproject.toml index 324c4bc6..8ad41057 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,17 @@ requires-python = ">=3.8" dependencies = [ "numpy", "scipy>=1.7", + # GPU-only deps: kept commented out so the CPU wheel stays lean. + # The openimpala-cuda build in .github/workflows/pypi-wheels-gpu.yml + # uncomments these via sed before scikit-build-core reads this file. + # Keep the leading #"nvidia- prefix intact — the sed rule keys off it. + # cuda-deps-start + #"nvidia-cuda-runtime-cu12", + #"nvidia-cublas-cu12", + #"nvidia-cusparse-cu12", + #"nvidia-curand-cu12", + #"nvidia-nvjitlink-cu12", + # cuda-deps-end ] [project.optional-dependencies] diff --git a/tutorials/02_digital_twin.ipynb b/tutorials/02_digital_twin.ipynb index 8942498c..2d8691b4 100644 --- a/tutorials/02_digital_twin.ipynb +++ b/tutorials/02_digital_twin.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) 
we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala and dependencies.\n# On GPU runtimes (T4, A100, etc.) we install the CUDA-accelerated wheel;\n# on CPU-only runtimes we fall back to the pure-Python/CPU package.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"pybamm bpx tifffile matplotlib yt\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code", diff --git a/tutorials/04_multiphase_and_fields.ipynb b/tutorials/04_multiphase_and_fields.ipynb index 8131c07c..f6640664 100644 --- a/tutorials/04_multiphase_and_fields.ipynb +++ b/tutorials/04_multiphase_and_fields.ipynb @@ -10,7 +10,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], 
stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala (compiled C++ backend needed for low-level API in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy yt matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code", diff --git a/tutorials/07_hpc_scaling.ipynb b/tutorials/07_hpc_scaling.ipynb index bd0b783d..e4c879e5 100644 --- a/tutorials/07_hpc_scaling.ipynb +++ b/tutorials/07_hpc_scaling.ipynb @@ -242,7 +242,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda --find-links https://github.com/BASE-Laboratory/OpenImpala/releases/latest 
nvidia-cuda-runtime-cu12 nvidia-cublas-cu12 nvidia-cusparse-cu12 nvidia-curand-cu12 {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" + "source": "# Install OpenImpala (compiled C++ backend needed for HPC features in this tutorial).\n# Auto-detect GPU vs CPU runtime.\nimport subprocess, sys\n\ndef _has_gpu():\n try:\n subprocess.check_output([\"nvidia-smi\"], stderr=subprocess.DEVNULL)\n return True\n except (FileNotFoundError, subprocess.CalledProcessError):\n return False\n\n_common = \"porespy matplotlib\"\nif _has_gpu():\n print(\"GPU detected — installing openimpala-cuda\")\n !pip install -q openimpala-cuda {_common}\nelse:\n print(\"No GPU detected — installing openimpala (CPU)\")\n !pip install -q openimpala {_common}" }, { "cell_type": "code",