From 5ea4878177c9c203fbf333387342010379f1943c Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 09:48:01 +0000
Subject: [PATCH 1/3] clang-format: rewrap "total" computation in
 voxelimage_from_numpy

The CI clang-format check tripped on a line introduced in 64306d4:

    python/bindings/module.cpp:173:30: error: code should be
    clang-formatted [-Wclang-format-violations]

The 100-column LLVM-based style preferred wrapping the assignment after
the `=`, so the three static_casts sit on a single continuation line
rather than being broken in the middle. Ran `clang-format -i` against
the file and confirmed it is now idempotent under the project's
.clang-format settings. No behavioural change.
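For reviewers, the reason the line is wide enough to need wrapping at
all is that each extent is widened before the multiply. Recapped as a
standalone sketch (illustrative only, not code from this repo):

    #include <cstddef>
    #include <cstdint>

    // For e.g. a 2048^3 volume, nx * ny * nz computed in 32-bit int
    // arithmetic overflows (2^33 > INT32_MAX). Casting each factor
    // first keeps the whole product in std::size_t.
    std::size_t total_voxels(std::int32_t nx, std::int32_t ny, std::int32_t nz)
    {
        return static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
               static_cast<std::size_t>(nz);
    }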
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 python/bindings/module.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index d151874..1fa3cd8 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -170,8 +170,8 @@ voxelimage_from_numpy(py::array_t
     mf = std::make_shared<amrex::iMultiFab>(img->ba, img->dm, 1, 1);
     const auto* host_ptr = static_cast<const int*>(buf.ptr);
-    const std::size_t total = static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
-                              static_cast<std::size_t>(nz);
+    const std::size_t total =
+        static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) * static_cast<std::size_t>(nz);
 
     // Stage the NumPy data in a buffer the kernel below can dereference safely.
     // On CPU builds this is just the host pointer (no copy). On CUDA builds the

From 61cf635c7dbccb0aaf1715dd02cf23f99ee62928 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 09:53:07 +0000
Subject: [PATCH 2/3] fix flood-fill seed planting segfault on GPU build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported on Colab T4 with openimpala-cuda 4.2.10: notebook §3 still
crashes the kernel, this time at PercolationCheck construction. The
crash is silent and dies inside parallelFloodFill, specifically in the
seed-planting phase 1, FloodFill.cpp:141-147:

    for (const auto& seed : seedPoints) {
        if (tileBox.contains(seed)) {
            if (phase_arr(seed) == phaseID) {
                mask_arr(seed, 0) = label; // <-- host-side write to
                                           //     device-resident memory
            }
        }
    }

Same bug pattern as VoxelImage.from_numpy in 64306d4: a host-side loop
writes through an Array4 view that points at iMultiFab data the AMReX
CUDA build keeps in device memory. Reads through the view also fault,
but it's the writes that consistently kill the kernel.

Fix: keep the host-side per-tile filter (seedPoints can have many
out-of-tile entries on multi-rank decompositions, so building the short
list is worth it), but stage the in-tile seeds in a Gpu::DeviceVector
and plant them via amrex::ParallelFor with an AMREX_GPU_DEVICE lambda.
On CPU builds the DeviceVector / copyAsync paths short out via the
#ifdef AMREX_USE_GPU guards and tile_seeds.data() is used directly, so
the change is a no-op for the CPU wheel.
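For the CPU-wheel claim, the mental model is that without AMREX_USE_GPU
the 1-D amrex::ParallelFor overload degenerates to a plain host loop
and AMREX_GPU_DEVICE expands to nothing. Roughly (a sketch of the
behaviour, not AMReX's actual implementation):

    // What amrex::ParallelFor(n_seeds, lambda) amounts to on a CPU-only
    // build; the real version lives in AMReX's GpuLaunch headers.
    template <typename F>
    void parallel_for_cpu_sketch(int n, F const& f)
    {
        for (int s = 0; s < n; ++s) {
            f(s); // same lambda body, executed serially on the host
        }
    }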
streamSynchronize() after the per-tile copyAsync matters for lifetime
as much as for ordering: tile_seeds is a host-side buffer that is
destroyed at the end of each MFIter iteration, so the asynchronous copy
out of it must have completed before the iteration ends. The trailing
global streamSynchronize ensures all planted seeds are visible before
phase 2 (FillBoundary + wavefront expansion) starts.
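The hazard the per-tile sync guards against, in miniature (hypothetical
snippet, not from this diff; make_seeds is an invented helper):

    {
        amrex::Vector<amrex::IntVect> host_buf = make_seeds(); // hypothetical
        amrex::Gpu::DeviceVector<amrex::IntVect> dev_buf(host_buf.size());
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_buf.begin(),
                              host_buf.end(), dev_buf.begin());
        // Without amrex::Gpu::streamSynchronize() here, the scope can close
        // while the DMA engine is still reading host_buf's storage.
    } // host_buf destroyed; an in-flight async copy would now read freed memory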
Also audited VolumeFraction by inspection — it already uses ReduceOps
with an AMREX_GPU_DEVICE lambda (no host-write pattern), so no change
was needed there.

Verified clang-format passes idempotently.

This needs another release tag (4.2.11) before the user can run
PercolationCheck / oi.tortuosity from Colab on a GPU runtime.

https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 src/props/FloodFill.cpp | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/src/props/FloodFill.cpp b/src/props/FloodFill.cpp
index 7cf2c49..8c97610 100644
--- a/src/props/FloodFill.cpp
+++ b/src/props/FloodFill.cpp
@@ -125,7 +125,12 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
     AMREX_ASSERT(label != FLOOD_INACTIVE); // label must differ from the "empty" marker
 
     // --- Phase 1: Plant seeds ---
-    // Seeds are a small host-side list; planting is cheap either way.
+    // Seeds arrive as a small host-side list. The mask_arr writes below need
+    // to run on the device that owns the iMultiFab data: on a CUDA build that
+    // is the GPU, and writing to mask_arr(iv, 0) from a host loop would
+    // segfault (T4 / A100 etc.). Filter seeds per tile on the host, stage
+    // them in a device buffer, then plant them via ParallelFor.
+    //
     // Note: we do NOT clear the mask here — callers that use multiple labels
     // (ConnectedComponents) may call this repeatedly on the same mask.
     // Callers doing a fresh fill should setVal(FLOOD_INACTIVE) beforehand.
@@ -138,14 +143,41 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
         const amrex::Box& tileBox = mfi.tilebox();
         auto mask_arr = reachabilityMask.array(mfi);
         const auto phase_arr = phaseFab.const_array(mfi, 0);
+
+        amrex::Vector<amrex::IntVect> tile_seeds;
         for (const auto& seed : seedPoints) {
             if (tileBox.contains(seed)) {
-                if (phase_arr(seed) == phaseID) {
-                    mask_arr(seed, 0) = label;
-                }
+                tile_seeds.push_back(seed);
             }
         }
+        if (tile_seeds.empty()) {
+            continue;
+        }
+
+        const int n_seeds = static_cast<int>(tile_seeds.size());
+        const int pid = phaseID;
+        const int lbl = label;
+
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<amrex::IntVect> d_seeds(n_seeds);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, tile_seeds.data(),
+                              tile_seeds.data() + n_seeds, d_seeds.data());
+        amrex::Gpu::streamSynchronize();
+        const amrex::IntVect* seed_ptr = d_seeds.data();
+#else
+        const amrex::IntVect* seed_ptr = tile_seeds.data();
+#endif
+
+        amrex::ParallelFor(n_seeds, [=] AMREX_GPU_DEVICE(int s) noexcept {
+            const amrex::IntVect iv = seed_ptr[s];
+            if (phase_arr(iv) == pid) {
+                mask_arr(iv, 0) = lbl;
+            }
+        });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     // --- Phase 2: Iterative wavefront expansion ---
     // Each iteration expands the reachable set by 1 cell in each direction.

From 6ea391c1b8c0a6bb30e554bb466c624c5558e069 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 15:01:56 +0000
Subject: [PATCH 3/3] fix all remaining host-write-to-device-memory bugs in
 HYPRE solvers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After patching VoxelImage.from_numpy (64306d4) and FloodFill seed
planting (61cf635), an audit of the rest of src/props/ surfaced four
more sites with the same pattern: host code writing through an Array4
view that points at device-resident MultiFab / iMultiFab data on CUDA
builds. All would have segfaulted on T4 / A100 / etc. once the user got
past the earlier crash points.

CRITICAL — fires on every solve in the oi.tortuosity hot path:

TortuosityHypre.cpp:1137 — flux-calc solution writeback. Every call to
solver.value() reads HYPRE's solution into a host buffer with
HYPRE_StructVectorGetBoxValues, then a LoopOnCpu copies it into
mf_soln_temp. On GPU the destination Array4 lives on device → segfault.

EffectiveDiffusivityHypre.cpp:721 — same pattern in getChiSolution.
Every oi.effective_diffusivity solve hits this on GPU.

EffectiveDiffusivityHypre.cpp:283 — generateActiveMask wrote
mask_arr(i,j,k,...) and read dc_arr(i,j,k,0) inside a LoopOnCpu, which
segfaults on both ends. Replaced with a ParallelFor for the mask write
plus a ReduceOps reduction for the three debug counters; the
std::atomic counters the LoopOnCpu was incrementing are unnecessary now
that the reduction is one-shot.

EffectiveDiffusivityHypre.cpp:532 — pin-cell search read mask_arr from
the host. Replaced with a Min reduction over the linearised cell index
of active cells (sentinel = LONG_MAX for inactive); the winning index
is unpacked back to (i,j,k) on the host (a worked round-trip of that
arithmetic follows the list below).

NON-CRITICAL — fires only on opt-in paths, fixed for completeness:

TortuosityHypre.cpp:660 — plotfile writeback (write_plotfile=True only)

TortuosityHypre.cpp:709 — failed-solve plotfile (write_plotfile=True
AND solver did not converge)
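For the pin-cell unpack, the round-trip is plain integer division. A
self-contained check (values invented for illustration; the real bounds
come from m_geom.Domain()):

    #include <cassert>

    // Round-trip of the linearisation used for the pin-cell search:
    // lin = (i - lo_x) + len_x * ((j - lo_y) + len_y * (k - lo_z)).
    int main()
    {
        const long lo_x = 0, lo_y = 0, lo_z = 0;
        const long len_x = 64, len_y = 48;
        const long i = 17, j = 33, k = 5;
        const long lin = (i - lo_x) + len_x * ((j - lo_y) + len_y * (k - lo_z));
        const long k_off = lin / (len_x * len_y);
        const long rem = lin - k_off * len_x * len_y;
        const long j_off = rem / len_x;
        const long i_off = rem - j_off * len_x;
        assert(i_off + lo_x == i && j_off + lo_y == j && k_off + lo_z == k);
        return 0;
    }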
Same staging recipe everywhere: if AMREX_USE_GPU, copy the host buffer
into a Gpu::DeviceVector first and ParallelFor with a manually-computed
linear index

    lin = (k - lo.z)*nx*ny + (j - lo.y)*nx + (i - lo.x);

else just point src_ptr at the host buffer, and the same ParallelFor
expands to a serial/OMP host loop.
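Distilled to a reusable shape (hypothetical helper for illustration;
the patch inlines this recipe at each site rather than adding it):

    // Returns a pointer the ParallelFor lambda may safely dereference:
    // device memory on GPU builds, the original host buffer otherwise.
    template <typename T>
    const T* stage_for_kernel(const T* host_ptr, std::size_t n,
                              amrex::Gpu::DeviceVector<T>& scratch)
    {
    #ifdef AMREX_USE_GPU
        scratch.resize(n);
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_ptr, host_ptr + n,
                              scratch.data());
        amrex::Gpu::streamSynchronize(); // host_ptr may not outlive the copy
        return scratch.data();
    #else
        amrex::ignore_unused(scratch);
        return host_ptr; // no staging; the ParallelFor runs on the host
    #endif
    }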
Trailing streamSynchronize after each MFIter loop ensures all writes
are complete before the next downstream operation (FillBoundary,
plotfile writer) starts.

Verified clang-format passes idempotently.

Together with 64306d4 (VoxelImage) and 61cf635 (FloodFill) this should
take the GPU build from "segfaults at every entry point" to "fully
functional": oi.tortuosity, oi.effective_diffusivity,
oi.percolation_check, and oi.volume_fraction should all run end-to-end
on Colab T4 once 4.2.12 is published.

Skipped intentionally:

TortuosityHypre.cpp:984 — checkMatrixProperties() is a debug-only
function that would need a host-side copy of the iMultiFab first; it is
not in any standard call path.

TortuosityDirect.cpp — legacy Forward Euler solver, deprecated path.

https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 src/props/EffectiveDiffusivityHypre.cpp | 174 +++++++++++++-----------
 src/props/TortuosityHypre.cpp           |  92 +++++++++----
 2 files changed, 162 insertions(+), 104 deletions(-)

diff --git a/src/props/EffectiveDiffusivityHypre.cpp b/src/props/EffectiveDiffusivityHypre.cpp
index 7218f7f..1f93c4c 100644
--- a/src/props/EffectiveDiffusivityHypre.cpp
+++ b/src/props/EffectiveDiffusivityHypre.cpp
@@ -253,64 +253,49 @@ void EffectiveDiffusivityHypre::generateActiveMask() {
         m_mf_phase_original.FillBoundary(m_geom.periodicity());
     }
 
-    std::atomic<long> cells_not_target_became_active(0); // Renamed for clarity
-    std::atomic<long> cells_target_became_active(0);     // Renamed
-    std::atomic<long> cells_target_became_inactive(0);   // Renamed
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
+    // Phase 1: write the active mask via a GPU-compatible ParallelFor. The
+    // mask_arr Array4 view points at device memory on CUDA builds, so the
+    // assignment must run on the device that owns the iMultiFab.
     for (amrex::MFIter mfi(m_mf_active_mask, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
         const amrex::Box& current_tile_box = mfi.tilebox();
         amrex::Array4<int> const mask_arr = m_mf_active_mask.array(mfi);
-        amrex::Array4<int const> const phase_arr = m_mf_phase_original.const_array(mfi);
-
-        const int local_m_phase_id = m_phase_id;
-
-        if (m_verbose > 2 && amrex::ParallelDescriptor::IOProcessor() && mfi.LocalIndex() == 0 &&
-            amrex::ParallelDescriptor::MyProc() ==
-                amrex::ParallelDescriptor::IOProcessorNumber()) { // Changed to verbose > 2
-            amrex::Print() << "  DEBUG HYPRE: generateActiveMask MFIter loop (verbose > 2) is "
-                              "using local_m_phase_id = "
-                           << local_m_phase_id << " (from m_phase_id = " << m_phase_id
-                           << ") for comparison." << std::endl;
-        }
-
-        const amrex::Box& valid_bx_for_debug = mfi.validbox();
         amrex::Array4<amrex::Real const> const dc_arr = m_mf_diff_coeff.const_array(mfi);
-
-        amrex::LoopOnCpu(
-            current_tile_box, [=, &cells_not_target_became_active, &cells_target_became_active,
-                               &cells_target_became_inactive](int i, int j, int k) noexcept {
-                // In multi-phase mode, mark active if D > 0.
-                // In single-phase mode, mark active if phase matches target.
-                bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
-
-                if (should_be_active) {
-                    mask_arr(i, j, k, MaskComp) = cell_active;
-                    if (valid_bx_for_debug.contains(i, j, k)) {
-                        cells_target_became_active++;
-                    }
-                } else {
-                    mask_arr(i, j, k, MaskComp) = cell_inactive;
-                }
-                // Separate check for errors in valid region
-                int original_phase_val = phase_arr(i, j, k, 0);
-                bool is_target_phase = (original_phase_val == local_m_phase_id);
-                if (valid_bx_for_debug.contains(i, j, k)) {
-                    if (!is_target_phase && mask_arr(i, j, k, MaskComp) == cell_active) {
-                        cells_not_target_became_active++;
-                    } else if (is_target_phase && mask_arr(i, j, k, MaskComp) == cell_inactive) {
-                        cells_target_became_inactive++;
-                    }
-                }
-            });
+        amrex::ParallelFor(current_tile_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            mask_arr(i, j, k, MaskComp) = (dc_arr(i, j, k, 0) > 0.0) ? cell_active : cell_inactive;
+        });
     }
 
-    long cells_not_target_became_active_val = cells_not_target_became_active.load();
-    long cells_target_became_active_val = cells_target_became_active.load();
-    long cells_target_became_inactive_val = cells_target_became_inactive.load();
+    // Phase 2: collect the debug counters via a GPU-compatible reduction over
+    // the valid region only. A single ReduceOps with three Sum components
+    // (one tuple-reduce) gathers all three counters in one pass per tile.
+    amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpSum, amrex::ReduceOpSum> debug_reduce_op;
+    amrex::ReduceData<long, long, long> debug_reduce_data(debug_reduce_op);
+    const int local_m_phase_id = m_phase_id;
+    for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
+        const amrex::Box& vbx = mfi.validbox();
+        amrex::Array4<int const> const mask_arr = m_mf_active_mask.const_array(mfi);
+        amrex::Array4<int const> const phase_arr = m_mf_phase_original.const_array(mfi);
+        amrex::Array4<amrex::Real const> const dc_arr = m_mf_diff_coeff.const_array(mfi);
+        debug_reduce_op.eval(
+            vbx, debug_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j,
+                                 int k) noexcept -> amrex::GpuTuple<long, long, long> {
+                const bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
+                const int classified = mask_arr(i, j, k, MaskComp);
+                const bool is_target_phase = (phase_arr(i, j, k, 0) == local_m_phase_id);
+                long n_target_active = (should_be_active && is_target_phase) ? 1L : 0L;
+                long n_not_target_became_active =
+                    (!is_target_phase && classified == cell_active) ? 1L : 0L;
+                long n_target_became_inactive =
+                    (is_target_phase && classified == cell_inactive) ? 1L : 0L;
+                return {n_not_target_became_active, n_target_active, n_target_became_inactive};
+            });
+    }
+    const auto debug_tuple = debug_reduce_data.value();
+    long cells_not_target_became_active_val = amrex::get<0>(debug_tuple);
+    long cells_target_became_active_val = amrex::get<1>(debug_tuple);
+    long cells_target_became_inactive_val = amrex::get<2>(debug_tuple);
 
     amrex::ParallelDescriptor::ReduceLongSum(cells_not_target_became_active_val);
     amrex::ParallelDescriptor::ReduceLongSum(cells_target_became_active_val);
@@ -524,27 +509,46 @@ void EffectiveDiffusivityHypre::setupMatrixEquation() {
     amrex::IntVect local_pin(-1, -1, -1);
     long local_pin_rank_idx = std::numeric_limits<long>::max(); // linearized index
 
+    // Find the lexicographically-first active cell on this rank via a
+    // GPU-compatible Min reduction over the linearised (i,j,k) index of
+    // active cells. Inactive cells contribute LONG_MAX so they never win.
+    // mask_arr is an Array4 view into device memory on CUDA builds —
+    // reading it from a host loop would segfault.
+    const amrex::Box& domain = m_geom.Domain();
+    const long dom_lo_x = domain.smallEnd(0);
+    const long dom_lo_y = domain.smallEnd(1);
+    const long dom_lo_z = domain.smallEnd(2);
+    const long dom_len_x = domain.length(0);
+    const long dom_len_y = domain.length(1);
+    const long sentinel = std::numeric_limits<long>::max();
+
+    amrex::ReduceOps<amrex::ReduceOpMin> pin_reduce_op;
+    amrex::ReduceData<long> pin_reduce_data(pin_reduce_op);
     for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
         const amrex::Box& vbx = mfi.validbox();
         auto mask_arr = m_mf_active_mask.const_array(mfi);
-        const auto& domain = m_geom.Domain();
-        bool found = false;
-        amrex::LoopOnCpu(vbx, [&](int i, int j, int k) {
-            if (!found && mask_arr(i, j, k, MaskComp) == cell_active) {
-                long lin = static_cast<long>(i - domain.smallEnd(0)) +
-                           static_cast<long>(domain.length(0)) *
-                               (static_cast<long>(j - domain.smallEnd(1)) +
-                                static_cast<long>(domain.length(1)) *
-                                    static_cast<long>(k - domain.smallEnd(2)));
-                if (lin < local_pin_rank_idx) {
-                    local_pin_rank_idx = lin;
-                    local_pin = amrex::IntVect(i, j, k);
-                    found = true;
-                }
-            }
-        });
-        if (found)
-            break;
+        pin_reduce_op.eval(
+            vbx, pin_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept -> amrex::GpuTuple<long> {
+                if (mask_arr(i, j, k, MaskComp) != cell_active) {
+                    return {sentinel};
+                }
+                const long lin = static_cast<long>(i - dom_lo_x) +
+                                 dom_len_x * (static_cast<long>(j - dom_lo_y) +
+                                              dom_len_y * static_cast<long>(k - dom_lo_z));
+                return {lin};
+            });
+    }
+    local_pin_rank_idx = amrex::get<0>(pin_reduce_data.value());
+    if (local_pin_rank_idx < sentinel) {
+        // Recover (i,j,k) from the linearised index.
+        const long k_off = local_pin_rank_idx / (dom_len_x * dom_len_y);
+        const long rem = local_pin_rank_idx - k_off * dom_len_x * dom_len_y;
+        const long j_off = rem / dom_len_x;
+        const long i_off = rem - j_off * dom_len_x;
+        local_pin = amrex::IntVect(static_cast<int>(i_off + dom_lo_x),
+                                   static_cast<int>(j_off + dom_lo_y),
+                                   static_cast<int>(k_off + dom_lo_z));
     }
 
     // Find the globally minimum linearized index across all ranks
@@ -718,19 +722,33 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) {
             continue;
         }
 
+        // Stage HYPRE's host soln_buffer in device memory; chi_arr is an
+        // Array4 view into chi_field, which on CUDA builds lives in device
+        // memory, so writing through it from a host loop segfaults.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const chi_arr = chi_field.array(mfi);
-        long long k_lin_idx = 0;
-
-        amrex::LoopOnCpu(bx_getsol, [=, &k_lin_idx](int i, int j, int k) noexcept {
-            if (k_lin_idx < npts) {
-                chi_arr(i, j, k, ChiComp) = soln_buffer[k_lin_idx];
-            }
-            k_lin_idx++;
+        const int chi_comp = ChiComp;
+        const auto lo = amrex::lbound(bx_getsol);
+        const int nx_box = bx_getsol.length(0);
+        const int ny_box = bx_getsol.length(1);
+        amrex::ParallelFor(bx_getsol, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            chi_arr(i, j, k, chi_comp) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-            k_lin_idx == npts,
-            "Point count mismatch during HYPRE GetBoxValues copy in getChiSolution");
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     if (chi_field.nGrow() > 0) {
         chi_field.FillBoundary(m_geom.periodicity());
diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp
index c2afd86..a9fa770 100644
--- a/src/props/TortuosityHypre.cpp
+++ b/src/props/TortuosityHypre.cpp
@@ -657,20 +657,31 @@ bool OpenImpala::TortuosityHypre::solve() {
             if (get_ierr != 0) {
                 amrex::Warning("HYPRE_StructVectorGetBoxValues failed during plotfile writing!");
             }
+            // Same host-buffer-into-device-Array4 pattern as the flux-calc
+            // writeback below; stage in a DeviceVector and ParallelFor.
+#ifdef AMREX_USE_GPU
+            amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                                  soln_buffer.data() + npts, d_soln_buffer.data());
+            amrex::Gpu::streamSynchronize();
+            const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+            const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
             amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-            long long k_lin_idx = 0;
-            amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-                if (k_lin_idx < npts) {
-                    soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-                } else {
-                    amrex::Warning("Buffer overrun detected during HYPRE GetBoxValues copy!");
-                }
-                k_lin_idx++;
+            const auto lo = amrex::lbound(bx);
+            const int nx_box = bx.length(0);
+            const int ny_box = bx.length(1);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+                const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                      static_cast<long long>(j - lo.y) * nx_box +
+                                      static_cast<long long>(i - lo.x);
+                soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
             });
-            if (k_lin_idx != npts) {
-                amrex::Warning("Point count mismatch during HYPRE GetBoxValues copy!");
-            }
         }
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::streamSynchronize();
+#endif
         amrex::MultiFab mf_mask_temp(m_ba, m_dm, 1, 0);
         amrex::Copy(mf_mask_temp, m_mf_active_mask, MaskComp, 0, 1, 0);
         amrex::MultiFab mf_phase_temp(m_ba, m_dm, 1, 0);
@@ -706,15 +717,29 @@ bool OpenImpala::TortuosityHypre::solve() {
             auto hypre_hi_f = OpenImpala::TortuosityHypre::hiV(bx);
             HYPRE_StructVectorGetBoxValues(m_x, hypre_lo_f.data(), hypre_hi_f.data(),
                                            soln_buf_fail.data());
+#ifdef AMREX_USE_GPU
+            amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buf_fail(npts);
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buf_fail.data(),
+                                  soln_buf_fail.data() + npts, d_soln_buf_fail.data());
+            amrex::Gpu::streamSynchronize();
+            const HYPRE_Real* src_ptr_fail = d_soln_buf_fail.data();
+#else
+            const HYPRE_Real* src_ptr_fail = soln_buf_fail.data();
+#endif
             amrex::Array4<amrex::Real> const soln_arr = mf_soln_fail.array(mfi);
-            long long idx = 0;
-            amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-                if (idx < npts) {
-                    soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buf_fail[idx]);
-                }
-                idx++;
+            const auto lo_f = amrex::lbound(bx);
+            const int nx_f = bx.length(0);
+            const int ny_f = bx.length(1);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+                const long long lin = static_cast<long long>(k - lo_f.z) * nx_f * ny_f +
+                                      static_cast<long long>(j - lo_f.y) * nx_f +
+                                      static_cast<long long>(i - lo_f.x);
+                soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr_fail[lin]);
             });
         }
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::streamSynchronize();
+#endif
         amrex::MultiFab mf_mask_fail(m_ba, m_dm, 1, 0);
         amrex::Copy(mf_mask_fail, m_mf_active_mask, MaskComp, 0, 1, 0);
         amrex::MultiFab mf_phase_fail(m_ba, m_dm, 1, 0);
@@ -1134,18 +1159,33 @@ void OpenImpala::TortuosityHypre::global_fluxes() {
         if (get_ierr != 0) {
             amrex::Warning("HYPRE_StructVectorGetBoxValues failed during flux calculation copy!");
         }
+        // Stage HYPRE's host-side soln_buffer in device memory before writing
+        // it through the MultiFab's Array4 view, which on CUDA builds points
+        // at GPU memory. A bare LoopOnCpu writing soln_arr(ii,jj,kk,0) = ...
+        // segfaults on T4 / A100 / etc.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-        long long k_lin_idx = 0;
-        amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-            if (k_lin_idx < npts) {
-                soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-            }
-            k_lin_idx++;
+        const auto lo = amrex::lbound(bx);
+        const int nx_box = bx.length(0);
+        const int ny_box = bx.length(1);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        if (k_lin_idx != npts) {
-            amrex::Warning("Point count mismatch during flux calc copy!");
-        }
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     mf_soln_temp.FillBoundary(m_geom.periodicity());
     m_mf_active_mask.FillBoundary(m_geom.periodicity());
     m_mf_diff_coeff.FillBoundary(m_geom.periodicity());