Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions ext/CUDAExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -370,11 +370,13 @@ CuArray(H::Dagger.HaloArray) = convert(CuArray, H)
Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:CuArray} =
Dagger.HaloArray(C(H.center),
C.(H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
Adapt.adapt_structure(to::CUDA.KernelAdaptor, H::Dagger.HaloArray) =
Dagger.HaloArray(adapt(to, H.center),
adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
function Dagger.inner_stencil_proc!(::CuArrayDeviceProc, f, output, read_vars)
Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
return
Expand Down
6 changes: 4 additions & 2 deletions ext/IntelExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -322,11 +322,13 @@ oneArray(H::Dagger.HaloArray) = convert(oneArray, H)
Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:oneArray} =
Dagger.HaloArray(C(H.center),
C.(H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
Adapt.adapt_structure(to::oneAPI.KernelAdaptor, H::Dagger.HaloArray) =
Dagger.HaloArray(adapt(to, H.center),
adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
function Dagger.inner_stencil_proc!(::oneArrayDeviceProc, f, output, read_vars)
Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
return
Expand Down
6 changes: 4 additions & 2 deletions ext/MetalExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,13 @@ MtlArray(H::Dagger.HaloArray) = convert(MtlArray, H)
Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:MtlArray} =
Dagger.HaloArray(C(H.center),
C.(H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
Adapt.adapt_structure(to::Metal.Adaptor, H::Dagger.HaloArray) =
Dagger.HaloArray(adapt(to, H.center),
adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
function Dagger.inner_stencil_proc!(::MtlArrayDeviceProc, f, output, read_vars)
Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
return
Expand Down
6 changes: 4 additions & 2 deletions ext/OpenCLExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -320,11 +320,13 @@ CLArray(H::Dagger.HaloArray) = convert(CLArray, H)
Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:CLArray} =
Dagger.HaloArray(C(H.center),
C.(H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
Adapt.adapt_structure(to::OpenCL.KernelAdaptor, H::Dagger.HaloArray) =
Dagger.HaloArray(adapt(to, H.center),
adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
function Dagger.inner_stencil_proc!(::CLArrayDeviceProc, f, output, read_vars)
Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
return
Expand Down
6 changes: 4 additions & 2 deletions ext/ROCExt.jl
Original file line number Diff line number Diff line change
Expand Up @@ -343,11 +343,13 @@ ROCArray(H::Dagger.HaloArray) = convert(ROCArray, H)
Base.convert(::Type{C}, H::Dagger.HaloArray) where {C<:ROCArray} =
Dagger.HaloArray(C(H.center),
C.(H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
Adapt.adapt_structure(to::AMDGPU.Runtime.Adaptor, H::Dagger.HaloArray) =
Dagger.HaloArray(adapt(to, H.center),
adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)
function Dagger.inner_stencil_proc!(::ROCArrayDeviceProc, f, output, read_vars)
Dagger.Kernel(_inner_stencil!)(f, output, read_vars; ndrange=size(output))
return
Expand Down
135 changes: 127 additions & 8 deletions src/array/stencil.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,7 @@ function load_neighbor_region(arr, region_code::NTuple{N,Int}, neigh_dist) where
lastindex(arr, i)
end
end)
# FIXME: Don't collect
return move(task_processor(), collect(@view arr[start_idx:stop_idx]))
return move(task_processor(), copy(@view arr[start_idx:stop_idx]))
end

"""
    is_past_boundary(size, idx)

Return `true` if, for any dimension `d` in `1:length(size)`, `idx[d]` is less
than `1` or greater than `size[d]`; return `false` otherwise.
"""
function is_past_boundary(size, idx)
    # Evaluate the out-of-range test for every dimension, then reduce.
    outside = ntuple(length(size)) do d
        idx[d] < 1 || idx[d] > size[d]
    end
    return any(outside)
end
Expand Down Expand Up @@ -153,8 +152,9 @@ function load_boundary_region(pad::Pad, arr, region_code::NTuple{N,Int}, neigh_d
region_size = ntuple(N) do i
region_code[i] == 0 ? size(arr, i) : get_neigh_dist(neigh_dist, i)
end
# FIXME: return Fill(pad.padval, region_size)
return move(task_processor(), fill(pad.padval, region_size))
result = similar(arr, region_size...)
fill!(result, pad.padval)
return move(task_processor(), result)
end

# Use edge as source index (value will be overridden by apply_boundary_value)
Expand Down Expand Up @@ -420,7 +420,7 @@ function load_boundary_region(::Reflect{Symm}, arr, region_code::NTuple{N,Int},
end
end)

region = move(task_processor(), collect(@view arr[start_idx:stop_idx]))
region = move(task_processor(), copy(@view arr[start_idx:stop_idx]))

# Reverse only along dimensions that are actually being reflected
# (both non-zero in region_code AND past boundary)
Expand Down Expand Up @@ -568,6 +568,123 @@ end
# Chunk Selection and Halo Building
#############################################################################

"""
    load_neighborhood_halos(chunks, idx, neigh_dist, boundary)

Load every halo region around the chunk at `idx` and return them as a tuple.

With `N = ndims(chunks)`, all `3^N - 1` non-zero region codes (each component
in `-1:1`, first dimension varying fastest) are visited in order. For each code
the position one chunk away from `idx` in that direction is computed:

- if that position is inside `chunks`, the region comes from
  `load_neighbor_region` applied to the chunk stored there;
- otherwise the region comes from `load_boundary_region`, applied to the chunk
  at the index returned by `boundary_transition` when
  `boundary_has_transition(boundary)` is true, or to the chunk at `idx` itself
  when it is not.
"""
function load_neighborhood_halos(chunks, idx, neigh_dist, boundary)
    validate_neigh_dist(neigh_dist)

    N = ndims(chunks)
    chunk_dist = 1
    nhalos = 3^N - 1
    grid_size = size(chunks)
    halos = Any[]

    # `Iterators.product` advances its first range fastest, so the codes come
    # out in the same order as decoding the base-3 digits of 0:(3^N - 1).
    for region_code in Iterators.product(ntuple(_ -> -1:1, N)...)
        all(==(0), region_code) && continue

        neighbor_idx = idx + CartesianIndex(region_code .* chunk_dist)

        if !is_past_boundary(grid_size, neighbor_idx)
            neighbor = chunks[neighbor_idx]
            push!(halos, load_neighbor_region(neighbor, region_code, neigh_dist))
            continue
        end

        # Record which dimensions overflowed before the index is remapped.
        boundary_dims = ntuple(N) do d
            neighbor_idx[d] < 1 || neighbor_idx[d] > grid_size[d]
        end
        if boundary_has_transition(boundary)
            source_idx = boundary_transition(boundary, neighbor_idx, grid_size)
        else
            source_idx = idx
        end
        source = chunks[source_idx]
        push!(halos, load_boundary_region(boundary, source, region_code,
                                          neigh_dist, boundary_dims))
    end

    h = length(halos)
    @assert h == nhalos
    return Tuple(halos)
end

"""
    load_neighborhood_halos_from_deps(deps, idx, chunk_size, neigh_dist, boundary)

Build the tuple of halo regions for position `idx` from already-selected
dependencies.

With `N = length(chunk_size)`, the `3^N - 1` non-zero region codes (each
component in `-1:1`, first dimension varying fastest) are visited in order; the
`k`-th code reads its source from `deps[k + 1]`. The position one step from
`idx` in the code's direction is tested against `chunk_size`: if it falls
outside, the region is produced by `load_boundary_region`, otherwise by
`load_neighbor_region`.
"""
function load_neighborhood_halos_from_deps(deps, idx, chunk_size, neigh_dist, boundary)
    validate_neigh_dist(neigh_dist)

    N = length(chunk_size)
    chunk_dist = 1
    nhalos = 3^N - 1
    halos = Any[]

    # First range advances fastest: same order as decoding base-3 digits.
    for region_code in Iterators.product(ntuple(_ -> -1:1, N)...)
        all(==(0), region_code) && continue

        slot = length(halos) + 1
        neighbor_idx = idx + CartesianIndex(region_code .* chunk_dist)

        # Entry 1 of `deps` is skipped; halo `slot` uses entry `slot + 1`.
        chunk = deps[slot + 1]
        region = if is_past_boundary(chunk_size, neighbor_idx)
            boundary_dims = ntuple(N) do d
                neighbor_idx[d] < 1 || neighbor_idx[d] > chunk_size[d]
            end
            load_boundary_region(boundary, chunk, region_code, neigh_dist, boundary_dims)
        else
            load_neighbor_region(chunk, region_code, neigh_dist)
        end
        push!(halos, region)
    end

    h = length(halos)
    @assert h == nhalos
    return Tuple(halos)
end

"""
    select_neighborhood_chunk_deps(chunks, idx, neigh_dist, boundary)

Return a `Vector{Any}` of the `3^N` chunks (`N = ndims(chunks)`) that the halo
for position `idx` reads from.

The first entry is `chunks[idx]`. The remaining entries follow the non-zero
region codes (each component in `-1:1`, first dimension varying fastest): each
is the chunk one step from `idx` in that direction. When that position is
outside `chunks`, the entry is instead the chunk at the index returned by
`boundary_transition` if `boundary_has_transition(boundary)` is true, or
`chunks[idx]` if it is not.
"""
function select_neighborhood_chunk_deps(chunks, idx, neigh_dist, boundary)
    validate_neigh_dist(neigh_dist)

    N = ndims(chunks)
    chunk_dist = 1
    grid_size = size(chunks)

    accesses = Any[chunks[idx]]

    # First range advances fastest: same order as decoding base-3 digits.
    for region_code in Iterators.product(ntuple(_ -> -1:1, N)...)
        all(==(0), region_code) && continue

        source_idx = idx + CartesianIndex(region_code .* chunk_dist)
        if is_past_boundary(grid_size, source_idx)
            if boundary_has_transition(boundary)
                source_idx = boundary_transition(boundary, source_idx, grid_size)
            else
                source_idx = idx
            end
        end
        push!(accesses, chunks[source_idx])
    end

    @assert length(accesses) == 3^N
    return accesses
end

"""
    build_chunk_halo(neigh_dist, boundary, idx, chunk_size, own_center, read_deps...)

Assemble the halo for position `idx` from `read_deps`.

`read_deps[1]` is used as the center. The surrounding regions are obtained from
`load_neighborhood_halos_from_deps`, which receives the full `read_deps` tuple.
The center and regions are then handed to `build_halo`, with `own_center`
forwarded as a keyword argument.
"""
function build_chunk_halo(neigh_dist, boundary, idx, chunk_size, own_center::Bool, read_deps...)
    center_dep = read_deps[1]
    neighbor_regions = load_neighborhood_halos_from_deps(
        read_deps, idx, chunk_size, neigh_dist, boundary,
    )
    return build_halo(neigh_dist, boundary, center_dep, neighbor_regions...; own_center)
end

function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
validate_neigh_dist(neigh_dist)

Expand Down Expand Up @@ -615,11 +732,12 @@ function select_neighborhood_chunks(chunks, idx, neigh_dist, boundary)
return accesses
end

function build_halo(neigh_dist, boundary, center, all_halos...)
function build_halo(neigh_dist, boundary, center, all_halos...; own_center::Bool=false)
N = ndims(center)
expected_halos = 3^N - 1
@assert length(all_halos) == expected_halos "Halo mismatch: N=$N expected $expected_halos halos, got $(length(all_halos))"
return HaloArray(copy(center), (all_halos...,), ntuple(i->get_neigh_dist(neigh_dist, i), N))
center_data = own_center ? copy(center) : center
return HaloArray(center_data, (all_halos...,), ntuple(i->get_neigh_dist(neigh_dist, i), N); own_center)
end

function load_neighborhood(arr::HaloArray{T,N}, idx) where {T,N}
Expand Down Expand Up @@ -828,11 +946,12 @@ macro stencil(orig_ex)
for read_var in read_vars
if read_var in keys(neighborhoods)
neigh_dist, boundary = neighborhoods[read_var]
own_center = read_var == write_var
@gensym halo_tasks
push!(datadeps_body.args, :($halo_tasks = Array{$DTask}(undef, size($chunks($read_var)))))
push!(datadeps_body.args, quote
for $chunk_idx in $CartesianIndices($chunks($read_var))
$halo_tasks[$chunk_idx] = Dagger.@spawn name="stencil_build_halo" $build_halo($neigh_dist, $boundary, map($Read, $select_neighborhood_chunks($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)
$halo_tasks[$chunk_idx] = Dagger.@spawn name="stencil_build_halo" $build_chunk_halo($neigh_dist, $boundary, $chunk_idx, $size($chunks($read_var)), $own_center, map($Read, $select_neighborhood_chunk_deps($chunks($read_var), $chunk_idx, $neigh_dist, $boundary))...)
end
end)
push!(datadeps_body.args, :($halo_tasks_map[$(QuoteNode(read_var))] = $halo_tasks))
Expand Down
20 changes: 14 additions & 6 deletions src/utils/haloarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ struct HaloArray{T,N,A<:AbstractArray{T,N},H<:Tuple} <: AbstractArray{T,N}
center::A
halos::H # Tuple of 3^N - 1 arrays in canonical order
halo_width::NTuple{N,Int}
own_center::Bool
end

"""
    HaloArray(center, halos::Tuple, halo_width::NTuple{N,Int}; own_center=false)

Construct a `HaloArray`, deriving its type parameters from the arguments: the
element type is `eltype(center)`, the dimensionality `N` is the length of
`halo_width`, and the storage parameters are `typeof(center)` and
`typeof(halos)`. `own_center` is stored in the field of the same name.
"""
function HaloArray(center, halos::Tuple, halo_width::NTuple{N,Int};
                   own_center::Bool=false) where N
    return HaloArray{eltype(center),N,typeof(center),typeof(halos)}(
        center, halos, halo_width, own_center,
    )
end

# Number of halo regions for N dimensions
Expand Down Expand Up @@ -63,7 +69,7 @@ function HaloArray{T,N}(center_size::NTuple{N,Int}, halo_width::NTuple{N,Int}) w
Array{T,N}(undef, region_size...)
end

return HaloArray{T,N,typeof(center),typeof(halos)}(center, halos, halo_width)
return HaloArray(center, halos, halo_width; own_center=true)
end

Base.size(tile::HaloArray) = size(tile.center) .+ 2 .* tile.halo_width
Expand All @@ -83,7 +89,7 @@ function Base.copy(tile::HaloArray{T,N}) where {T,N}
center = copy(tile.center)
halos = ntuple(i -> copy(tile.halos[i]), length(tile.halos))
halo_width = tile.halo_width
return HaloArray(center, halos, halo_width)
return HaloArray(center, halos, halo_width; own_center=true)
end

# Compute the region code for a given index
Expand Down Expand Up @@ -182,7 +188,8 @@ end
Adapt.adapt_structure(to, H::Dagger.HaloArray) =
HaloArray(Adapt.adapt(to, H.center),
Adapt.adapt.(Ref(to), H.halos),
H.halo_width)
H.halo_width;
own_center=H.own_center)

function aliasing(A::HaloArray)
return CombinedAliasing([aliasing(A.center), map(aliasing, A.halos)...])
Expand All @@ -193,16 +200,17 @@ function move_rewrap(cache::AliasedObjectCache, from_proc::Processor, to_proc::P
center_chunk = move_rewrap(cache, from_proc, to_proc, from_space, to_space, A.center)
halo_chunks = ntuple(i -> move_rewrap(cache, from_proc, to_proc, from_space, to_space, A.halos[i]), length(A.halos))
halo_width = A.halo_width
own_center = A.own_center
to_w = root_worker_id(to_proc)
return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, center_chunk, halo_chunks, halo_width) do from_proc, to_proc, from_space, to_space, center_chunk, halo_chunks, halo_width
return remotecall_fetch(to_w, from_proc, to_proc, from_space, to_space, center_chunk, halo_chunks, halo_width, own_center) do from_proc, to_proc, from_space, to_space, center_chunk, halo_chunks, halo_width, own_center
center_new = unwrap(center_chunk)
halos_new = ntuple(i -> unwrap(halo_chunks[i]), length(halo_chunks))
return tochunk(HaloArray(center_new, halos_new, halo_width), to_proc)
return tochunk(HaloArray(center_new, halos_new, halo_width; own_center=own_center), to_proc)
end
end

# Free the storage referenced by a `HaloArray`: its center and each of its halos.
function Dagger.unsafe_free!(A::HaloArray)
# NOTE(review): as written, the center is freed unconditionally on the next line
# and then a second time when `A.own_center` is true. This text appears to come
# from a rendered diff, so the unconditional call is presumably the removed line
# and only the `own_center`-guarded call is meant to remain — confirm.
unsafe_free!(A.center)
A.own_center && unsafe_free!(A.center)
# Halos are freed regardless of `own_center`.
foreach(unsafe_free!, A.halos)
end

Expand Down
6 changes: 4 additions & 2 deletions test/array/stencil.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import Dagger: @stencil, Wrap, Pad, Reflect, Clamp, LinearExtrapolate

function test_stencil()
function test_stencil(; gpu::Bool=false)
@testset "Simple assignment" begin
A = zeros(Blocks(2, 2), Int, 4, 4)
@stencil A[idx] = 1
Expand Down Expand Up @@ -387,6 +387,8 @@ function test_stencil()

# From issue #669
for N in 3:4
# Fine-grained 4D GPU stencils require too many halo copies for typical GPU memory
gpu && N == 4 && continue
@testset "$(N)D array" begin
A = ones(Blocks(ntuple(_->1, N)...), Int, ntuple(_->3, N)...)
Dagger.allowscalar() do
Expand Down Expand Up @@ -497,7 +499,7 @@ end
kind == :oneAPI && continue
@testset "$kind" begin
Dagger.with_options(;scope) do
test_stencil()
test_stencil(; gpu=true)
end
end
end
Expand Down
Loading