diff --git a/python/bindings/module.cpp b/python/bindings/module.cpp
index d151874..1fa3cd8 100644
--- a/python/bindings/module.cpp
+++ b/python/bindings/module.cpp
@@ -170,8 +170,8 @@ voxelimage_from_numpy(py::array_t
     mf = std::make_shared<amrex::iMultiFab>(img->ba, img->dm, 1, 1);
     const auto* host_ptr = static_cast<const int*>(buf.ptr);
-    const std::size_t total = static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) *
-                              static_cast<std::size_t>(nz);
+    const std::size_t total =
+        static_cast<std::size_t>(nx) * static_cast<std::size_t>(ny) * static_cast<std::size_t>(nz);
 
     // Stage the NumPy data in a buffer the kernel below can dereference safely.
     // On CPU builds this is just the host pointer (no copy). On CUDA builds the
diff --git a/src/props/EffectiveDiffusivityHypre.cpp b/src/props/EffectiveDiffusivityHypre.cpp
index 7218f7f..1f93c4c 100644
--- a/src/props/EffectiveDiffusivityHypre.cpp
+++ b/src/props/EffectiveDiffusivityHypre.cpp
@@ -253,64 +253,49 @@ void EffectiveDiffusivityHypre::generateActiveMask() {
         m_mf_phase_original.FillBoundary(m_geom.periodicity());
     }
 
-    std::atomic<long> cells_not_target_became_active(0); // Renamed for clarity
-    std::atomic<long> cells_target_became_active(0);     // Renamed
-    std::atomic<long> cells_target_became_inactive(0);   // Renamed
-
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (amrex::Gpu::notInLaunchRegion())
-#endif
+    // Phase 1: write the active mask via a GPU-compatible ParallelFor. The
+    // mask_arr Array4 view points at device memory on CUDA builds, so the
+    // assignment must run on the device that owns the iMultiFab.
     for (amrex::MFIter mfi(m_mf_active_mask, amrex::TilingIfNotGPU()); mfi.isValid(); ++mfi) {
         const amrex::Box& current_tile_box = mfi.tilebox();
         amrex::Array4<int> const mask_arr = m_mf_active_mask.array(mfi);
-        amrex::Array4<const int> const phase_arr = m_mf_phase_original.const_array(mfi);
-
-        const int local_m_phase_id = m_phase_id;
-
-        if (m_verbose > 2 && amrex::ParallelDescriptor::IOProcessor() && mfi.LocalIndex() == 0 &&
-            amrex::ParallelDescriptor::MyProc() ==
-                amrex::ParallelDescriptor::IOProcessorNumber()) { // Changed to verbose > 2
-            amrex::Print() << " DEBUG HYPRE: generateActiveMask MFIter loop (verbose > 2) is "
-                              "using local_m_phase_id = "
-                           << local_m_phase_id << " (from m_phase_id = " << m_phase_id
-                           << ") for comparison." << std::endl;
-        }
-
-        const amrex::Box& valid_bx_for_debug = mfi.validbox();
         amrex::Array4<const amrex::Real> const dc_arr = m_mf_diff_coeff.const_array(mfi);
+        amrex::ParallelFor(current_tile_box, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            mask_arr(i, j, k, MaskComp) = (dc_arr(i, j, k, 0) > 0.0) ? cell_active : cell_inactive;
+        });
+    }
-
-        amrex::LoopOnCpu(
-            current_tile_box, [=, &cells_not_target_became_active, &cells_target_became_active,
-                               &cells_target_became_inactive](int i, int j, int k) noexcept {
-                // In multi-phase mode, mark active if D > 0.
-                // In single-phase mode, mark active if phase matches target.
-                bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
-
-                if (should_be_active) {
-                    mask_arr(i, j, k, MaskComp) = cell_active;
-                    if (valid_bx_for_debug.contains(i, j, k)) {
-                        cells_target_became_active++;
-                    }
-                } else {
-                    mask_arr(i, j, k, MaskComp) = cell_inactive;
-                }
-                // Separate check for errors in valid region
-                int original_phase_val = phase_arr(i, j, k, 0);
-                bool is_target_phase = (original_phase_val == local_m_phase_id);
-                if (valid_bx_for_debug.contains(i, j, k)) {
-                    if (!is_target_phase && mask_arr(i, j, k, MaskComp) == cell_active) {
-                        cells_not_target_became_active++;
-                    } else if (is_target_phase && mask_arr(i, j, k, MaskComp) == cell_inactive) {
-                        cells_target_became_inactive++;
-                    }
-                }
+    // Phase 2: collect debug counters via a GPU-compatible reduction over
+    // the valid region only. The three counters mean different things, so
+    // they stay separate components of a single fused tuple-reduce that
+    // gathers all of them in one pass over the data.
+    amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpSum, amrex::ReduceOpSum> debug_reduce_op;
+    amrex::ReduceData<long, long, long> debug_reduce_data(debug_reduce_op);
+    const int local_m_phase_id = m_phase_id;
+    for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
+        const amrex::Box& vbx = mfi.validbox();
+        amrex::Array4<const int> const mask_arr = m_mf_active_mask.const_array(mfi);
+        amrex::Array4<const int> const phase_arr = m_mf_phase_original.const_array(mfi);
+        amrex::Array4<const amrex::Real> const dc_arr = m_mf_diff_coeff.const_array(mfi);
+        debug_reduce_op.eval(
+            vbx, debug_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j,
+                                 int k) noexcept -> amrex::GpuTuple<long, long, long> {
+                const bool should_be_active = (dc_arr(i, j, k, 0) > 0.0);
+                const int classified = mask_arr(i, j, k, MaskComp);
+                const bool is_target_phase = (phase_arr(i, j, k, 0) == local_m_phase_id);
+                long n_target_active = (should_be_active && is_target_phase) ? 1L : 0L;
+                long n_not_target_became_active =
+                    (!is_target_phase && classified == cell_active) ? 1L : 0L;
+                long n_target_became_inactive =
+                    (is_target_phase && classified == cell_inactive) ? 1L : 0L;
+                return {n_not_target_became_active, n_target_active, n_target_became_inactive};
             });
     }
-
-    long cells_not_target_became_active_val = cells_not_target_became_active.load();
-    long cells_target_became_active_val = cells_target_became_active.load();
-    long cells_target_became_inactive_val = cells_target_became_inactive.load();
+    const auto debug_tuple = debug_reduce_data.value();
+    long cells_not_target_became_active_val = amrex::get<0>(debug_tuple);
+    long cells_target_became_active_val = amrex::get<1>(debug_tuple);
+    long cells_target_became_inactive_val = amrex::get<2>(debug_tuple);
 
     amrex::ParallelDescriptor::ReduceLongSum(cells_not_target_became_active_val);
     amrex::ParallelDescriptor::ReduceLongSum(cells_target_became_active_val);
@@ -524,27 +509,46 @@ void EffectiveDiffusivityHypre::setupMatrixEquation() {
     amrex::IntVect local_pin(-1, -1, -1);
     long local_pin_rank_idx = std::numeric_limits<long>::max(); // linearized index
 
+    // Find the lexicographically first active cell on this rank via a
+    // GPU-compatible Min reduction over the linearized (i,j,k) index of
+    // active cells. Inactive cells contribute LONG_MAX so they never win.
+    // mask_arr is an Array4 view into device memory on CUDA builds;
+    // reading it from a host loop would segfault.
+    const amrex::Box& domain = m_geom.Domain();
+    const long dom_lo_x = domain.smallEnd(0);
+    const long dom_lo_y = domain.smallEnd(1);
+    const long dom_lo_z = domain.smallEnd(2);
+    const long dom_len_x = domain.length(0);
+    const long dom_len_y = domain.length(1);
+    const long sentinel = std::numeric_limits<long>::max();
+
+    amrex::ReduceOps<amrex::ReduceOpMin> pin_reduce_op;
+    amrex::ReduceData<long> pin_reduce_data(pin_reduce_op);
     for (amrex::MFIter mfi(m_mf_active_mask); mfi.isValid(); ++mfi) {
         const amrex::Box& vbx = mfi.validbox();
         auto mask_arr = m_mf_active_mask.const_array(mfi);
-        const auto& domain = m_geom.Domain();
-        bool found = false;
-        amrex::LoopOnCpu(vbx, [&](int i, int j, int k) {
-            if (!found && mask_arr(i, j, k, MaskComp) == cell_active) {
-                long lin = static_cast<long>(i - domain.smallEnd(0)) +
-                           static_cast<long>(domain.length(0)) *
-                               (static_cast<long>(j - domain.smallEnd(1)) +
-                                static_cast<long>(domain.length(1)) *
-                                    static_cast<long>(k - domain.smallEnd(2)));
-                if (lin < local_pin_rank_idx) {
-                    local_pin_rank_idx = lin;
-                    local_pin = amrex::IntVect(i, j, k);
-                    found = true;
+        pin_reduce_op.eval(
+            vbx, pin_reduce_data,
+            [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept -> amrex::GpuTuple<long> {
+                if (mask_arr(i, j, k, MaskComp) != cell_active) {
+                    return {sentinel};
                 }
-            }
-        });
-        if (found)
-            break;
+                const long lin = static_cast<long>(i - dom_lo_x) +
+                                 dom_len_x * (static_cast<long>(j - dom_lo_y) +
                                              dom_len_y * static_cast<long>(k - dom_lo_z));
+                return {lin};
+            });
+    }
+    local_pin_rank_idx = amrex::get<0>(pin_reduce_data.value());
+    if (local_pin_rank_idx < sentinel) {
+        // Recover (i,j,k) from the linearized index.
+        const long k_off = local_pin_rank_idx / (dom_len_x * dom_len_y);
+        const long rem = local_pin_rank_idx - k_off * dom_len_x * dom_len_y;
+        const long j_off = rem / dom_len_x;
+        const long i_off = rem - j_off * dom_len_x;
+        local_pin = amrex::IntVect(static_cast<int>(i_off + dom_lo_x),
+                                   static_cast<int>(j_off + dom_lo_y),
+                                   static_cast<int>(k_off + dom_lo_z));
     }
 
     // Find the globally minimum linearized index across all ranks
@@ -718,19 +722,33 @@ void EffectiveDiffusivityHypre::getChiSolution(amrex::MultiFab& chi_field) {
             continue;
         }
 
+        // Stage HYPRE's host soln_buffer in device memory; chi_arr is an
+        // Array4 view into chi_field, which on CUDA builds lives in device
+        // memory, so writing through it from a host loop segfaults.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const chi_arr = chi_field.array(mfi);
-        long long k_lin_idx = 0;
-
-        amrex::LoopOnCpu(bx_getsol, [=, &k_lin_idx](int i, int j, int k) noexcept {
-            if (k_lin_idx < npts) {
-                chi_arr(i, j, k, ChiComp) = soln_buffer[k_lin_idx];
-            }
-            k_lin_idx++;
+        const int chi_comp = ChiComp;
+        const auto lo = amrex::lbound(bx_getsol);
+        const int nx_box = bx_getsol.length(0);
+        const int ny_box = bx_getsol.length(1);
+        amrex::ParallelFor(bx_getsol, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            chi_arr(i, j, k, chi_comp) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-            k_lin_idx == npts,
-            "Point count mismatch during HYPRE GetBoxValues copy in getChiSolution");
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     if (chi_field.nGrow() > 0) {
         chi_field.FillBoundary(m_geom.periodicity());
diff --git a/src/props/FloodFill.cpp b/src/props/FloodFill.cpp
index 7cf2c49..8c97610 100644
--- a/src/props/FloodFill.cpp
+++ b/src/props/FloodFill.cpp
@@ -125,7 +125,12 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
     AMREX_ASSERT(label != FLOOD_INACTIVE); // label must differ from the "empty" marker
 
     // --- Phase 1: Plant seeds ---
-    // Seeds are a small host-side list; planting is cheap either way.
+    // Seeds arrive as a small host-side list. The mask_arr writes below need
+    // to run on the device that owns the iMultiFab data: on a CUDA build that
+    // is the GPU, and writing to mask_arr(iv, 0) from a host loop would
+    // segfault (T4 / A100 etc.). Filter seeds per tile on the host, stage
+    // them in a device buffer, then plant them via ParallelFor.
+    //
     // Note: we do NOT clear the mask here — callers that use multiple labels
     // (ConnectedComponents) may call this repeatedly on the same mask.
     // Callers doing a fresh fill should setVal(FLOOD_INACTIVE) beforehand.
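The staging idiom the comment above describes recurs in every file this patch touches. For reference, here is a distilled, standalone sketch of it; it is not part of the patch, the helper name stage_and_write and the one-buffer-per-box layout are hypothetical, and it assumes the host buffer is in the x-fastest order the patch's writeback kernels expect:

#include <AMReX_Gpu.H>
#include <AMReX_MultiFab.H>

#include <vector>

// Hypothetical helper: stage a host buffer holding one value per cell of
// each valid box, then write it through the MultiFab's Array4 view on the
// device that owns the data.
void stage_and_write(amrex::MultiFab& mf, const std::vector<amrex::Real>& host_buf)
{
    for (amrex::MFIter mfi(mf); mfi.isValid(); ++mfi) {
        const amrex::Box& bx = mfi.validbox();
        AMREX_ALWAYS_ASSERT(static_cast<amrex::Long>(host_buf.size()) == bx.numPts());
        const auto lo = amrex::lbound(bx);
        const int nx = bx.length(0);
        const int ny = bx.length(1);

#ifdef AMREX_USE_GPU
        // mf.array(mfi) views device memory on CUDA builds; the source
        // buffer must be device-visible before the kernel dereferences it.
        amrex::Gpu::DeviceVector<amrex::Real> d_buf(host_buf.size());
        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, host_buf.begin(), host_buf.end(),
                              d_buf.begin());
        amrex::Gpu::streamSynchronize();
        const amrex::Real* src = d_buf.data();
#else
        const amrex::Real* src = host_buf.data();
#endif
        amrex::Array4<amrex::Real> const dst = mf.array(mfi);
        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
            // Linearize in x-fastest order, matching the source buffer.
            const long long lin = static_cast<long long>(k - lo.z) * nx * ny +
                                  static_cast<long long>(j - lo.y) * nx +
                                  static_cast<long long>(i - lo.x);
            dst(i, j, k, 0) = src[lin];
        });
#ifdef AMREX_USE_GPU
        // Keep d_buf alive until the kernel has consumed it, before the
        // DeviceVector's storage is released at the end of this iteration.
        amrex::Gpu::streamSynchronize();
#endif
    }
}

One difference from the patch: the patch synchronizes once after the MFIter loop, while this sketch synchronizes before each DeviceVector goes out of scope, which is the more conservative choice for the buffer's lifetime.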
@@ -138,14 +143,41 @@ void parallelFloodFill(amrex::iMultiFab& reachabilityMask, const amrex::iMultiFa
         const amrex::Box& tileBox = mfi.tilebox();
         auto mask_arr = reachabilityMask.array(mfi);
         const auto phase_arr = phaseFab.const_array(mfi, 0);
+
+        amrex::Vector<amrex::IntVect> tile_seeds;
         for (const auto& seed : seedPoints) {
             if (tileBox.contains(seed)) {
-                if (phase_arr(seed) == phaseID) {
-                    mask_arr(seed, 0) = label;
-                }
+                tile_seeds.push_back(seed);
             }
         }
+        if (tile_seeds.empty()) {
+            continue;
+        }
+
+        const int n_seeds = static_cast<int>(tile_seeds.size());
+        const int pid = phaseID;
+        const int lbl = label;
+
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<amrex::IntVect> d_seeds(n_seeds);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, tile_seeds.data(),
+                              tile_seeds.data() + n_seeds, d_seeds.data());
+        amrex::Gpu::streamSynchronize();
+        const amrex::IntVect* seed_ptr = d_seeds.data();
+#else
+        const amrex::IntVect* seed_ptr = tile_seeds.data();
+#endif
+
+        amrex::ParallelFor(n_seeds, [=] AMREX_GPU_DEVICE(int s) noexcept {
+            const amrex::IntVect iv = seed_ptr[s];
+            if (phase_arr(iv) == pid) {
+                mask_arr(iv, 0) = lbl;
+            }
+        });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
 
     // --- Phase 2: Iterative wavefront expansion ---
     // Each iteration expands the reachable set by 1 cell in each direction.
diff --git a/src/props/TortuosityHypre.cpp b/src/props/TortuosityHypre.cpp
index c2afd86..a9fa770 100644
--- a/src/props/TortuosityHypre.cpp
+++ b/src/props/TortuosityHypre.cpp
@@ -657,20 +657,31 @@ bool OpenImpala::TortuosityHypre::solve() {
         if (get_ierr != 0) {
             amrex::Warning("HYPRE_StructVectorGetBoxValues failed during plotfile writing!");
         }
+        // Same host-buffer-into-device-Array4 pattern as the flux-calc
+        // writeback below; stage in a DeviceVector and ParallelFor.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-        long long k_lin_idx = 0;
-        amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-            if (k_lin_idx < npts) {
-                soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-            } else {
-                amrex::Warning("Buffer overrun detected during HYPRE GetBoxValues copy!");
-            }
-            k_lin_idx++;
+        const auto lo = amrex::lbound(bx);
+        const int nx_box = bx.length(0);
+        const int ny_box = bx.length(1);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        if (k_lin_idx != npts) {
-            amrex::Warning("Point count mismatch during HYPRE GetBoxValues copy!");
-        }
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
     amrex::MultiFab mf_mask_temp(m_ba, m_dm, 1, 0);
     amrex::Copy(mf_mask_temp, m_mf_active_mask, MaskComp, 0, 1, 0);
     amrex::MultiFab mf_phase_temp(m_ba, m_dm, 1, 0);
@@ -706,15 +717,29 @@ bool OpenImpala::TortuosityHypre::solve() {
         auto hypre_hi_f = OpenImpala::TortuosityHypre::hiV(bx);
         HYPRE_StructVectorGetBoxValues(m_x, hypre_lo_f.data(), hypre_hi_f.data(),
                                        soln_buf_fail.data());
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buf_fail(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buf_fail.data(),
+                              soln_buf_fail.data() + npts, d_soln_buf_fail.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr_fail = d_soln_buf_fail.data();
+#else
+        const HYPRE_Real* src_ptr_fail = soln_buf_fail.data();
+#endif
         amrex::Array4<amrex::Real> const soln_arr = mf_soln_fail.array(mfi);
-        long long idx = 0;
-        amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-            if (idx < npts) {
-                soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buf_fail[idx]);
-            }
-            idx++;
+        const auto lo_f = amrex::lbound(bx);
+        const int nx_f = bx.length(0);
+        const int ny_f = bx.length(1);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo_f.z) * nx_f * ny_f +
+                                  static_cast<long long>(j - lo_f.y) * nx_f +
+                                  static_cast<long long>(i - lo_f.x);
+            soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr_fail[lin]);
         });
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
     amrex::MultiFab mf_mask_fail(m_ba, m_dm, 1, 0);
     amrex::Copy(mf_mask_fail, m_mf_active_mask, MaskComp, 0, 1, 0);
     amrex::MultiFab mf_phase_fail(m_ba, m_dm, 1, 0);
@@ -1134,18 +1159,33 @@ void OpenImpala::TortuosityHypre::global_fluxes() {
         if (get_ierr != 0) {
             amrex::Warning("HYPRE_StructVectorGetBoxValues failed during flux calculation copy!");
        }
+        // Stage HYPRE's host-side soln_buffer in device memory before writing
+        // it through the MultiFab's Array4 view, which on CUDA builds points
+        // at GPU memory. A bare LoopOnCpu writing soln_arr(ii,jj,kk,0) = ...
+        // segfaults on T4 / A100 / etc.
+#ifdef AMREX_USE_GPU
+        amrex::Gpu::DeviceVector<HYPRE_Real> d_soln_buffer(npts);
+        amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, soln_buffer.data(),
+                              soln_buffer.data() + npts, d_soln_buffer.data());
+        amrex::Gpu::streamSynchronize();
+        const HYPRE_Real* src_ptr = d_soln_buffer.data();
+#else
+        const HYPRE_Real* src_ptr = soln_buffer.data();
+#endif
         amrex::Array4<amrex::Real> const soln_arr = mf_soln_temp.array(mfi);
-        long long k_lin_idx = 0;
-        amrex::LoopOnCpu(bx, [&](int ii, int jj, int kk) {
-            if (k_lin_idx < npts) {
-                soln_arr(ii, jj, kk, 0) = static_cast<amrex::Real>(soln_buffer[k_lin_idx]);
-            }
-            k_lin_idx++;
+        const auto lo = amrex::lbound(bx);
+        const int nx_box = bx.length(0);
+        const int ny_box = bx.length(1);
+        amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
+            const long long lin = static_cast<long long>(k - lo.z) * nx_box * ny_box +
+                                  static_cast<long long>(j - lo.y) * nx_box +
+                                  static_cast<long long>(i - lo.x);
+            soln_arr(i, j, k, 0) = static_cast<amrex::Real>(src_ptr[lin]);
         });
-        if (k_lin_idx != npts) {
-            amrex::Warning("Point count mismatch during flux calc copy!");
-        }
     }
+#ifdef AMREX_USE_GPU
+    amrex::Gpu::streamSynchronize();
+#endif
     mf_soln_temp.FillBoundary(m_geom.periodicity());
     m_mf_active_mask.FillBoundary(m_geom.periodicity());
     m_mf_diff_coeff.FillBoundary(m_geom.periodicity());
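For reference, the ReduceOps/ReduceData idiom that generateActiveMask now uses, shown in isolation. This is a minimal sketch assuming a recent AMReX; count_mask_states and its two Sum components are hypothetical stand-ins for the patch's three counters:

#include <AMReX_ParallelDescriptor.H>
#include <AMReX_Print.H>
#include <AMReX_Reduce.H>
#include <AMReX_iMultiFab.H>

// Hypothetical helper: one device-side pass per box, one GpuTuple out,
// then an MPI sum so every rank sees the global totals.
void count_mask_states(const amrex::iMultiFab& mask, int cell_active, int cell_inactive)
{
    amrex::ReduceOps<amrex::ReduceOpSum, amrex::ReduceOpSum> reduce_op;
    amrex::ReduceData<long, long> reduce_data(reduce_op);

    for (amrex::MFIter mfi(mask); mfi.isValid(); ++mfi) {
        const amrex::Box& vbx = mfi.validbox();
        amrex::Array4<const int> const m = mask.const_array(mfi);
        reduce_op.eval(vbx, reduce_data,
                       [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
                           -> amrex::GpuTuple<long, long> {
                           const int v = m(i, j, k, 0);
                           return {v == cell_active ? 1L : 0L, v == cell_inactive ? 1L : 0L};
                       });
    }

    // value() finishes the device-side reduction; the result is still
    // rank-local until the ParallelDescriptor sums below.
    const auto tup = reduce_data.value();
    long n_active = amrex::get<0>(tup);
    long n_inactive = amrex::get<1>(tup);
    amrex::ParallelDescriptor::ReduceLongSum(n_active);
    amrex::ParallelDescriptor::ReduceLongSum(n_inactive);
    amrex::Print() << "active cells: " << n_active << ", inactive cells: " << n_inactive << "\n";
}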