From 5ea4878177c9c203fbf333387342010379f1943c Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 09:48:01 +0000
Subject: [PATCH 1/3] clang-format: rewrap "total" computation in
 voxelimage_from_numpy

The CI clang-format check tripped on a line introduced in 64306d4:

    python/bindings/module.cpp:173:30: error: code should be
    clang-formatted [-Wclang-format-violations]

The 100-column LLVM-based style preferred wrapping the assignment after
the `=`, so the three static_casts sit on a single continuation line
rather than being broken in the middle. Ran `clang-format -i` against
the file and confirmed it is now idempotent under the project's
.clang-format settings. No behavioural change.
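For reviewers, the reason the line is wide enough to need wrapping at
all is that each extent is widened before the multiply. Recapped as a
standalone sketch (illustrative only, not code from this repo):

    #include <cstddef>
    #include <cstdint>

    // For e.g. a 2048^3 volume, nx * ny * nz computed in 32-bit int
    // arithmetic overflows (2^33 > INT32_MAX). Casting each factor
    // first keeps the whole product in std::size_t.
    std::size_t total_voxels(std::int32_t nx, std::int32_t ny, std::int32_t nz)
    {
        return static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
               static_cast<std::size_t>(nz);
    }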
https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 python/bindings/module.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index d151874..1fa3cd8 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -170,8 +170,8 @@ voxelimage_from_numpy(py::array_t
     mf = std::make_shared<amrex::iMultiFab>(img->ba, img->dm, 1, 1);
     const auto* host_ptr = static_cast<const int*>(buf.ptr);
-    const std::size_t total = static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
-                              static_cast<std::size_t>(nz);
+    const std::size_t total =
+        static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) * static_cast<std::size_t>(nz);
 
     // Stage the NumPy data in a buffer the kernel below can dereference safely.
     // On CPU builds this is just the host pointer (no copy). On CUDA builds the

From 61cf635c7dbccb0aaf1715dd02cf23f99ee62928 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 09:53:07 +0000
Subject: [PATCH 2/3] fix flood-fill seed planting segfault on GPU build
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reported on Colab T4 with openimpala-cuda 4.2.10: notebook §3 still
crashes the kernel, this time at PercolationCheck construction. The
crash is silent and dies inside parallelFloodFill, specifically in the
seed-planting phase 1, FloodFill.cpp:141-147:

    for (const auto& seed : seedPoints) {
        if (tileBox.contains(seed)) {
            if (phase_arr(seed) == phaseID) {
                mask_arr(seed, 0) = label; // <-- host-side write to
                                           //     device-resident memory
            }
        }
    }

Same bug pattern as VoxelImage.from_numpy in 64306d4: a host-side loop
writes through an Array4 view that points at iMultiFab data the AMReX
CUDA build keeps in device memory. Reads through the view also fault,
but it's the writes that consistently kill the kernel.

Fix: keep the host-side per-tile filter (seedPoints can have many
out-of-tile entries on multi-rank decompositions, so building the short
list is worth it), but stage the in-tile seeds in a Gpu::DeviceVector
and plant them via amrex::ParallelFor with an AMREX_GPU_DEVICE lambda.
On CPU builds the DeviceVector / copyAsync paths short out via the
#ifdef AMREX_USE_GPU guards and tile_seeds.data() is used directly, so
the change is a no-op for the CPU wheel.
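For the CPU-wheel claim, the mental model is that without AMREX_USE_GPU
the 1-D amrex::ParallelFor overload degenerates to a plain host loop
and AMREX_GPU_DEVICE expands to nothing. Roughly (a sketch of the
behaviour, not AMReX's actual implementation):

    // What amrex::ParallelFor(n_seeds, lambda) amounts to on a CPU-only
    // build; the real version lives in AMReX's GpuLaunch headers.
    template <typename F>
    void parallel_for_cpu_sketch(int n, F const& f)
    {
        for (int s = 0; s < n; ++s) {
            f(s); // same lambda body, executed serially on the host
        }
    }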
streamSynchronize() after the per-tile copyAsync matters for lifetime
as much as for ordering: tile_seeds is a host-side buffer that is
destroyed at the end of each MFIter iteration, so the asynchronous copy
out of it must have completed before the iteration ends. The trailing
global streamSynchronize ensures all planted seeds are visible before
phase 2 (FillBoundary + wavefront expansion) starts.
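The hazard the per-tile sync guards against, in miniature (hypothetical
snippet, not from this diff; make_seeds is an invented helper):

    {
        amrex::Vector<amrex::IntVect> host_buf = make_seeds(); // hypothetical
        amrex::Gpu::DeviceVector<amrex::IntVect> dev_buf(host_buf.size());
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_buf.begin(),
                              host_buf.end(), dev_buf.begin());
        // Without amrex::Gpu::streamSynchronize() here, the scope can close
        // while the DMA engine is still reading host_buf's storage.
    } // host_buf destroyed; an in-flight async copy would now read freed memory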
Also audited VolumeFraction by inspection — it already uses ReduceOps
with an AMREX_GPU_DEVICE lambda (no host-write pattern), so no change
was needed there.

Verified clang-format passes idempotently.

This needs another release tag (4.2.11) before the user can run
PercolationCheck / oi.tortuosity from Colab on a GPU runtime.

https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 src/props/FloodFill.cpp | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/src/props/FloodFill.cpp b/src/props/FloodFill.cpp
index 7cf2c49..8c97610 100644
--- a/src/props/FloodFill.cpp
+++ b/src/props/FloodFill.cpp
@@ -125,7 +125,12 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
     AMREX_ASSERT(label != FLOOD_INACTIVE); // label must differ from the "empty" marker
 
     // --- Phase 1: Plant seeds ---
-    // Seeds are a small host-side list; planting is cheap either way.
+    // Seeds arrive as a small host-side list. The mask_arr writes below need
+    // to run on the device that owns the iMultiFab data: on a CUDA build that
+    // is the GPU, and writing to mask_arr(iv, 0) from a host loop would
+    // segfault (T4 / A100 etc.). Filter seeds per tile on the host, stage
+    // them in a device buffer, then plant them via ParallelFor.
+    //
     // Note: we do NOT clear the mask here — callers that use multiple labels
     // (ConnectedComponents) may call this repeatedly on the same mask.
     // Callers doing a fresh fill should setVal(FLOOD_INACTIVE) beforehand.
@@ -138,14 +143,41 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
         const amrex::Box& tileBox = mfi.tilebox();
         auto mask_arr = reachabilityMask.array(mfi);
         const auto phase_arr = phaseFab.const_array(mfi, 0);
+
+        amrex::Vector<amrex::IntVect> tile_seeds;
         for (const auto& seed : seedPoints) {
             if (tileBox.contains(seed)) {
-                if (phase_arr(seed) == phaseID) {
-                    mask_arr(seed, 0) = label;
-                }
+                tile_seeds.push_back(seed);
             }
         }
+        if (tile_seeds.empty()) {
+            continue;
+        }
+
+        const int n_seeds = static_cast<int>(tile_seeds.size());
+        const int pid = phaseID;
+        const int lbl = label;
+
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<amrex::IntVect> d_seeds(n_seeds);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, tile_seeds.data(),
+                              tile_seeds.data() + n_seeds, d_seeds.data());
+        amrex::Gpu::streamSynchronize();
+        const amrex::IntVect* seed_ptr = d_seeds.data();
+#else
+        const amrex::IntVect* seed_ptr = tile_seeds.data();
+#endif
+
+        amrex::ParallelFor(n_seeds, [=] AMREX_GPU_DEVICE(int s) noexcept {
+            const amrex::IntVect iv = seed_ptr[s];
+            if (phase_arr(iv) == pid) {
+                mask_arr(iv, 0) = lbl;
+            }
+        });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     // --- Phase 2: Iterative wavefront expansion ---
     // Each iteration expands the reachable set by 1 cell in each direction.

From 6ea391c1b8c0a6bb30e554bb466c624c5558e069 Mon Sep 17 00:00:00 2001
From: James Le Houx
Date: Wed, 6 May 2026 15:01:56 +0000
Subject: [PATCH 3/3] fix all remaining host-write-to-device-memory bugs in
 HYPRE solvers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After patching VoxelImage.from_numpy (64306d4) and FloodFill seed
planting (61cf635), an audit of the rest of src/props/ surfaced four
more sites with the same pattern: host code writing through an Array4
view that points at device-resident MultiFab / iMultiFab data on CUDA
builds. All would have segfaulted on T4 / A100 / etc. once the user got
past the earlier crash points.

CRITICAL — fires on every solve in the oi.tortuosity hot path:

TortuosityHypre.cpp:1137 — flux-calc solution writeback. Every call to
solver.value() reads HYPRE's solution into a host buffer with
HYPRE_StructVectorGetBoxValues, then a LoopOnCpu copies it into
mf_soln_temp. On GPU the destination Array4 lives on device → segfault.

EffectiveDiffusivityHypre.cpp:721 — same pattern in getChiSolution.
Every oi.effective_diffusivity solve hits this on GPU.

EffectiveDiffusivityHypre.cpp:283 — generateActiveMask wrote
mask_arr(i,j,k,...) and read dc_arr(i,j,k,0) inside a LoopOnCpu, which
segfaults on both ends. Replaced with a ParallelFor for the mask write
plus a ReduceOps reduction for the three debug counters; the
std::atomic counters the LoopOnCpu was incrementing are unnecessary now
that the reduction is one-shot.

EffectiveDiffusivityHypre.cpp:532 — pin-cell search read mask_arr from
the host. Replaced with a Min reduction over the linearised cell index
of active cells (sentinel = LONG_MAX for inactive); the winning index
is unpacked back to (i,j,k) on the host (a worked round-trip of that
arithmetic follows the list below).

NON-CRITICAL — fires only on opt-in paths, fixed for completeness:

TortuosityHypre.cpp:660 — plotfile writeback (write_plotfile=True only)

TortuosityHypre.cpp:709 — failed-solve plotfile (write_plotfile=True
AND solver did not converge)
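For the pin-cell unpack, the round-trip is plain integer division. A
self-contained check (values invented for illustration; the real bounds
come from m_geom.Domain()):

    #include <cassert>

    // Round-trip of the linearisation used for the pin-cell search:
    // lin = (i - lo_x) + len_x * ((j - lo_y) + len_y * (k - lo_z)).
    int main()
    {
        const long lo_x = 0, lo_y = 0, lo_z = 0;
        const long len_x = 64, len_y = 48;
        const long i = 17, j = 33, k = 5;
        const long lin = (i - lo_x) + len_x * ((j - lo_y) + len_y * (k - lo_z));
        const long k_off = lin / (len_x * len_y);
        const long rem = lin - k_off * len_x * len_y;
        const long j_off = rem / len_x;
        const long i_off = rem - j_off * len_x;
        assert(i_off + lo_x == i && j_off + lo_y == j && k_off + lo_z == k);
        return 0;
    }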
Same staging recipe everywhere: if AMREX_USE_GPU, copy the host buffer
into a Gpu::DeviceVector first and ParallelFor with a manually-computed
linear index

    lin = (k - lo.z)*nx*ny + (j - lo.y)*nx + (i - lo.x);

else just point src_ptr at the host buffer, and the same ParallelFor
expands to a serial/OMP host loop.
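Distilled to a reusable shape (hypothetical helper for illustration;
the patch inlines this recipe at each site rather than adding it):

    // Returns a pointer the ParallelFor lambda may safely dereference:
    // device memory on GPU builds, the original host buffer otherwise.
    template <typename T>
    const T* stage_for_kernel(const T* host_ptr, std::size_t n,
                              amrex::Gpu::DeviceVector<T>& scratch)
    {
    #ifdef AMREX_USE_GPU
        scratch.resize(n);
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_ptr, host_ptr + n,
                              scratch.data());
        amrex::Gpu::streamSynchronize(); // host_ptr may not outlive the copy
        return scratch.data();
    #else
        amrex::ignore_unused(scratch);
        return host_ptr; // no staging; the ParallelFor runs on the host
    #endif
    }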
Trailing streamSynchronize after each MFIter loop ensures all writes
are complete before the next downstream operation (FillBoundary,
plotfile writer) starts.

Verified clang-format passes idempotently.

Together with 64306d4 (VoxelImage) and 61cf635 (FloodFill) this should
take the GPU build from "segfaults at every entry point" to "fully
functional": oi.tortuosity, oi.effective_diffusivity,
oi.percolation_check, and oi.volume_fraction should all run end-to-end
on Colab T4 once 4.2.12 is published.

Skipped intentionally:

TortuosityHypre.cpp:984 — checkMatrixProperties() is a debug-only
function that would need a host-side copy of the iMultiFab first; it is
not in any standard call path.

TortuosityDirect.cpp — legacy Forward Euler solver, deprecated path.

https://claude.ai/code/session_011dJ5Bwq4Tnr8wxH597XJFf
---
 src/props/EffectiveDiffusivityHypre.cpp | 174 +++++++++++++-----------
 src/props/TortuosityHypre.cpp           |  92 +++++++++----
 2 files changed, 162 insertions(+), 104 deletions(-)

diff --git a/src/props/EffectiveDiffusivityHypre.cpp b/src/props/EffectiveDiffusivityHypre.cpp
index 7218f7f..1f93c4c 100644
--- a/src/props/EffectiveDiffusivityHypre.cpp
+++ b/src/props/EffectiveDiffusivityHypre.cpp
@@ -253,64 +253,49 @@ void EffectiveDiffusivityHypre::generateActiveMask() {
         m_mf_phase_original.FillBoundary(m_geom.periodicity());
     }
 
-    std::atomic<long> cells_not_target_became_active(0); // Renamed for clarity
-    std::atomic<long> cells_target_became_active(0);     // Renamed
-    std::atomic<long> cells_target_became_inactive(0);   // Renamed
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
+    // Phase 1: write the active mask via a GPU-compatible ParallelFor. The
+    // mask_arr Array4 view points at device memory on CUDA builds, so the
+    // assignment must run on the device that owns the iMultiFab.
     for (amrex::MFIter mfi(m_mf_active_mask, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
         const amrex::Box& current_tile_box = mfi.tilebox();
         amrex::Array4<int> const mask_arr = m_mf_active_mask.array(mfi);
-        amrex::Array4<int const> const phase_arr = m_mf_phase_original.const_array(mfi);
-
-        const int local_m_phase_id = m_phase_id;
-
-        if (m_verbose > 2 && amrex::ParallelDescriptor::IOProcessor() && mfi.LocalIndex() == 0 &&
-            amrex::ParallelDescriptor::MyProc() ==
-                amrex::ParallelDescriptor::IOProcessorNumber()) { // Changed to verbose > 2
-            amrex::Print() << "  DEBUG HYPRE: generateActiveMask MFIter loop (verbose > 2) is "
-                              "using local_m_phase_id = "
-                           << local_m_phase_id << " (from m_phase_id = " << m_phase_id
-                           << ") for comparison." << std::endl;
-        }
-
-        const amrex::Box& valid_bx_for_debug = mfi.validbox();
         amrex::Array4<amrex::Real const> const dc_arr = m_mf_diff_coeff.const_array(mfi);
-
-        amrex::LoopOnCpu(
-            current_tile_box, [=, &cells_not_target_became_active, &cells_target_became_active,
-                               &cells_target_became_inactive](int i, int j, int k) noexcept {
-                // In multi-phase mode, mark active if D > 0.
-                // In single-phase mode, mark active if phase matches target.
-                bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
-
-                if (should_be_active) {
-                    mask_arr(i, j, k, MaskComp) = cell_active;
-                    if (valid_bx_for_debug.contains(i, j, k)) {
-                        cells_target_became_active++;
-                    }
-                } else {
-                    mask_arr(i, j, k, MaskComp) = cell_inactive;
-                }
-                // Separate check for errors in valid region
-                int original_phase_val = phase_arr(i, j, k, 0);
-                bool is_target_phase = (original_phase_val == local_m_phase_id);
-                if (valid_bx_for_debug.contains(i, j, k)) {
-                    if (!is_target_phase && mask_arr(i, j, k, MaskComp) == cell_active) {
-                        cells_not_target_became_active++;
-                    } else if (is_target_phase && mask_arr(i, j, k, MaskComp) == cell_inactive) {
-                        cells_target_became_inactive++;
-                    }
-                }
-            });
+        amrex::ParallelFor(current_tile_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            mask_arr(i, j, k, MaskComp) = (dc_arr(i, j, k, 0) > 0.0) ? cell_active : cell_inactive;
+        });
     }
 
-    long cells_not_target_became_active_val = cells_not_target_became_active.load();
-    long cells_target_became_active_val = cells_target_became_active.load();
-    long cells_target_became_inactive_val = cells_target_became_inactive.load();
+    // Phase 2: collect the debug counters via a GPU-compatible reduction over
+    // the valid region only. A single ReduceOps with three Sum components
+    // (one tuple-reduce) gathers all three counters in one pass per tile.
+    amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpSum, amrex::ReduceOpSum> debug_reduce_op;
+    amrex::ReduceData<long, long, long> debug_reduce_data(debug_reduce_op);
+    const int local_m_phase_id = m_phase_id;
+    for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
+        const amrex::Box& vbx = mfi.validbox();
+        amrex::Array4<int const> const mask_arr = m_mf_active_mask.const_array(mfi);
+        amrex::Array4<int const> const phase_arr = m_mf_phase_original.const_array(mfi);
+        amrex::Array4<amrex::Real const> const dc_arr = m_mf_diff_coeff.const_array(mfi);
+        debug_reduce_op.eval(
+            vbx, debug_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j,
+                                 int k) noexcept -> amrex::GpuTuple<long, long, long> {
+                const bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
+                const int classified = mask_arr(i, j, k, MaskComp);
+                const bool is_target_phase = (phase_arr(i, j, k, 0) == local_m_phase_id);
+                long n_target_active = (should_be_active && is_target_phase) ? 1L : 0L;
+                long n_not_target_became_active =
+                    (!is_target_phase && classified == cell_active) ? 1L : 0L;
+                long n_target_became_inactive =
+                    (is_target_phase && classified == cell_inactive) ? 1L : 0L;
+                return {n_not_target_became_active, n_target_active, n_target_became_inactive};
+            });
+    }
+    const auto debug_tuple = debug_reduce_data.value();
+    long cells_not_target_became_active_val = amrex::get<0>(debug_tuple);
+    long cells_target_became_active_val = amrex::get<1>(debug_tuple);
+    long cells_target_became_inactive_val = amrex::get<2>(debug_tuple);
 
     amrex::ParallelDescriptor::ReduceLongSum(cells_not_target_became_active_val);
     amrex::ParallelDescriptor::ReduceLongSum(cells_target_became_active_val);
@@ -524,27 +509,46 @@ void EffectiveDiffusivityHypre::setupMatrixEquation() {
     amrex::IntVect local_pin(-1, -1, -1);
     long local_pin_rank_idx = std::numeric_limits<long>::max(); // linearized index
 
+    // Find the lexicographically-first active cell on this rank via a
+    // GPU-compatible Min reduction over the linearised (i,j,k) index of
+    // active cells. Inactive cells contribute LONG_MAX so they never win.
+    // mask_arr is an Array4 view into device memory on CUDA builds —
+    // reading it from a host loop would segfault.
+    const amrex::Box& domain = m_geom.Domain();
+    const long dom_lo_x = domain.smallEnd(0);
+    const long dom_lo_y = domain.smallEnd(1);
+    const long dom_lo_z = domain.smallEnd(2);
+    const long dom_len_x = domain.length(0);
+    const long dom_len_y = domain.length(1);
+    const long sentinel = std::numeric_limits<long>::max();
+
+    amrex::ReduceOps<amrex::ReduceOpMin> pin_reduce_op;
+    amrex::ReduceData<long> pin_reduce_data(pin_reduce_op);
     for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
         const amrex::Box& vbx = mfi.validbox();
         auto mask_arr = m_mf_active_mask.const_array(mfi);
-        const auto& domain = m_geom.Domain();
-        bool found = false;
-        amrex::LoopOnCpu(vbx, [&](int i, int j, int k) {
-            if (!found && mask_arr(i, j, k, MaskComp) == cell_active) {
-                long lin = static_cast<long>(i - domain.smallEnd(0)) +
-                           static_cast<long>(domain.length(0)) *
-                               (static_cast<long>(j - domain.smallEnd(1)) +
-                                static_cast<long>(domain.length(1)) *
-                                    static_cast<long>(k - domain.smallEnd(2)));
-                if (lin < local_pin_rank_idx) {
-                    local_pin_rank_idx = lin;
-                    local_pin = amrex::IntVect(i, j, k);
-                    found = true;
-                }
-            }
-        });
-        if (found)
-            break;
+        pin_reduce_op.eval(
+            vbx, pin_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept -> amrex::GpuTuple<long> {
+                if (mask_arr(i, j, k, MaskComp) != cell_active) {
+                    return {sentinel};
+                }
+                const long lin = static_cast<long>(i - dom_lo_x) +
+                                 dom_len_x * (static_cast<long>(j - dom_lo_y) +
+                                              dom_len_y * static_cast<long>(k - dom_lo_z));
+                return {lin};
+            });
+    }
+    local_pin_rank_idx = amrex::get<0>(pin_reduce_data.value());
+    if (local_pin_rank_idx < sentinel) {
+        // Recover (i,j,k) from the linearised index.
+        const long k_off = local_pin_rank_idx / (dom_len_x * dom_len_y);
+        const long rem = local_pin_rank_idx - k_off * dom_len_x * dom_len_y;
+        const long j_off = rem / dom_len_x;
+        const long i_off = rem - j_off * dom_len_x;
+        local_pin = amrex::IntVect(static_cast<int>(i_off + dom_lo_x),
+                                   static_cast<int>(j_off + dom_lo_y),
+                                   static_cast<int>(k_off + dom_lo_z));
     }
 
     // Find the globally minimum linearized index across all ranks
@@ -718,19 +722,33 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) {
             continue;
         }
 
+        // Stage HYPRE's host soln_buffer in device memory; chi_arr is an
+        // Array4 view into chi_field, which on CUDA builds lives in device
+        // memory, so writing through it from a host loop segfaults.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const chi_arr = chi_field.array(mfi);
-        long long k_lin_idx = 0;
-
-        amrex::LoopOnCpu(bx_getsol, [=, &k_lin_idx](int i, int j, int k) noexcept {
-            if (k_lin_idx < npts) {
-                chi_arr(i, j, k, ChiComp) = soln_buffer[k_lin_idx];
-            }
-            k_lin_idx++;
+        const int chi_comp = ChiComp;
+        const auto lo = amrex::lbound(bx_getsol);
+        const int nx_box = bx_getsol.length(0);
+        const int ny_box = bx_getsol.length(1);
+        amrex::ParallelFor(bx_getsol, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            chi_arr(i, j, k, chi_comp) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-            k_lin_idx == npts,
-            "Point count mismatch during HYPRE GetBoxValues copy in getChiSolution");
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     if (chi_field.nGrow() > 0) {
         chi_field.FillBoundary(m_geom.periodicity());
diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp
index c2afd86..a9fa770 100644
--- a/src/props/TortuosityHypre.cpp
+++ b/src/props/TortuosityHypre.cpp
@@ -657,20 +657,31 @@ bool OpenImpala::TortuosityHypre::solve() {
             if (get_ierr != 0) {
                 amrex::Warning("HYPRE_StructVectorGetBoxValues failed during plotfile writing!");
             }
+            // Same host-buffer-into-device-Array4 pattern as the flux-calc
+            // writeback below; stage in a DeviceVector and ParallelFor.
+#ifdef AMREX_USE_GPU
+            amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                                  soln_buffer.data() + npts, d_soln_buffer.data());
+            amrex::Gpu::streamSynchronize();
+            const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+            const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
             amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-            long long k_lin_idx = 0;
-            amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-                if (k_lin_idx < npts) {
-                    soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-                } else {
-                    amrex::Warning("Buffer overrun detected during HYPRE GetBoxValues copy!");
-                }
-                k_lin_idx++;
+            const auto lo = amrex::lbound(bx);
+            const int nx_box = bx.length(0);
+            const int ny_box = bx.length(1);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+                const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                      static_cast<long long>(j - lo.y) * nx_box +
+                                      static_cast<long long>(i - lo.x);
+                soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
             });
-            if (k_lin_idx != npts) {
-                amrex::Warning("Point count mismatch during HYPRE GetBoxValues copy!");
-            }
         }
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::streamSynchronize();
+#endif
         amrex::MultiFab mf_mask_temp(m_ba, m_dm, 1, 0);
         amrex::Copy(mf_mask_temp, m_mf_active_mask, MaskComp, 0, 1, 0);
         amrex::MultiFab mf_phase_temp(m_ba, m_dm, 1, 0);
@@ -706,15 +717,29 @@ bool OpenImpala::TortuosityHypre::solve() {
             auto hypre_hi_f = OpenImpala::TortuosityHypre::hiV(bx);
             HYPRE_StructVectorGetBoxValues(m_x, hypre_lo_f.data(), hypre_hi_f.data(),
                                            soln_buf_fail.data());
+#ifdef AMREX_USE_GPU
+            amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buf_fail(npts);
+            amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buf_fail.data(),
+                                  soln_buf_fail.data() + npts, d_soln_buf_fail.data());
+            amrex::Gpu::streamSynchronize();
+            const HYPRE_Real* src_ptr_fail = d_soln_buf_fail.data();
+#else
+            const HYPRE_Real* src_ptr_fail = soln_buf_fail.data();
+#endif
             amrex::Array4<amrex::Real> const soln_arr = mf_soln_fail.array(mfi);
-            long long idx = 0;
-            amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-                if (idx < npts) {
-                    soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buf_fail[idx]);
-                }
-                idx++;
+            const auto lo_f = amrex::lbound(bx);
+            const int nx_f = bx.length(0);
+            const int ny_f = bx.length(1);
+            amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+                const long long lin = static_cast<long long>(k - lo_f.z) * nx_f * ny_f +
+                                      static_cast<long long>(j - lo_f.y) * nx_f +
+                                      static_cast<long long>(i - lo_f.x);
+                soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr_fail[lin]);
             });
         }
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::streamSynchronize();
+#endif
         amrex::MultiFab mf_mask_fail(m_ba, m_dm, 1, 0);
         amrex::Copy(mf_mask_fail, m_mf_active_mask, MaskComp, 0, 1, 0);
         amrex::MultiFab mf_phase_fail(m_ba, m_dm, 1, 0);
@@ -1134,18 +1159,33 @@ void OpenImpala::TortuosityHypre::global_fluxes() {
         if (get_ierr != 0) {
             amrex::Warning("HYPRE_StructVectorGetBoxValues failed during flux calculation copy!");
         }
+        // Stage HYPRE's host-side soln_buffer in device memory before writing
+        // it through the MultiFab's Array4 view, which on CUDA builds points
+        // at GPU memory. A bare LoopOnCpu writing soln_arr(ii,jj,kk,0) = ...
+        // segfaults on T4 / A100 / etc.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-        long long k_lin_idx = 0;
-        amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-            if (k_lin_idx < npts) {
-                soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-            }
-            k_lin_idx++;
+        const auto lo = amrex::lbound(bx);
+        const int nx_box = bx.length(0);
+        const int ny_box = bx.length(1);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        if (k_lin_idx != npts) {
-            amrex::Warning("Point count mismatch during flux calc copy!");
-        }
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     mf_soln_temp.FillBoundary(m_geom.periodicity());
     m_mf_active_mask.FillBoundary(m_geom.periodicity());
     m_mf_diff_coeff.FillBoundary(m_geom.periodicity());