diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8e3b41511..95e9b9877 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -167,6 +167,18 @@ steps: env: CI_USE_OPENCL: "1" + - label: Julia 1.11 (Finch) + timeout_in_minutes: 90 + <<: *test + plugins: + - JuliaCI/julia#v1: + version: "1.11" + - JuliaCI/julia-test#v1: ~ + - JuliaCI/julia-coverage#v1: + codecov: true + env: + CI_TEST_FINCH: "1" + - label: Julia 1 - TimespanLogging timeout_in_minutes: 20 <<: *test diff --git a/Project.toml b/Project.toml index ce49bf6d7..1668e9db1 100644 --- a/Project.toml +++ b/Project.toml @@ -24,7 +24,6 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df" ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383" -SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34" @@ -34,36 +33,47 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" +AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +Finch = "9177782c-1635-4eb9-9bfb-d9dfa25e6bce" +IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895" GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7" Metal = "dde4c033-4e86-420c-a63e-0dd931031962" OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" +SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [extensions] AbstractFFTsExt = "AbstractFFTs" +AlgebraicMultigridExt = ["AlgebraicMultigrid", "SparseArrays"] CUDAExt = "CUDA" DistributionsExt = "Distributions" +FinchExt = "Finch" GraphVizExt = "GraphViz" GraphVizSimpleExt = "Colors" +IncompleteLUExt = ["IncompleteLU", "SparseArrays"] IntelExt = "oneAPI" JSON3Ext = "JSON3" +KrylovExt = "Krylov" MetalExt = "Metal" OpenCLExt = "OpenCL" PlotsExt = ["DataFrames", "Plots"] PythonExt = "PythonCall" ROCExt = "AMDGPU" +SparseArraysExt = "SparseArrays" [compat] AMDGPU = "1, 2" AbstractFFTs = "1.5.0" Adapt = "4" +AlgebraicMultigrid = "1" CUDA = "3, 4, 5" Colors = "0.12, 0.13" DataFrames = "1" @@ -71,11 +81,14 @@ DataStructures = "0.18, 0.19" DistributedNext = "1.0.0" Distributions = "0.25" FillArrays = "1.13.0" +Finch = "1.2.9" GPUArraysCore = "0.2.0" GraphViz = "0.2" Graphs = "1" +IncompleteLU = "0.2" JSON3 = "1" KernelAbstractions = "0.9" +Krylov = "0.10" MacroTools = "0.5" MemPool = "0.4.12" Metal = "1.1" diff --git a/ext/AlgebraicMultigridExt.jl b/ext/AlgebraicMultigridExt.jl new file mode 100644 index 000000000..dd68bbcfa --- /dev/null +++ b/ext/AlgebraicMultigridExt.jl @@ -0,0 +1,45 @@ +module AlgebraicMultigridExt + +import AlgebraicMultigrid +import SparseArrays +import Dagger +import Dagger: DMatrix +import LinearAlgebra + +# Algebraic-multigrid preconditioner, built per diagonal tile on top of Dagger's +# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). The AMG +# hierarchy's expensive setup runs once per tile at construction; each apply runs +# one V-cycle (a sequence of SpMVs, smoother sweeps, and a coarse solve). + +_as_sparse(A::SparseArrays.SparseMatrixCSC) = A +_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A) + +function _amg_operator(tile, method::Symbol; kwargs...) + S = _as_sparse(Dagger._tile_matrix(tile)) + ml = if method === :ruge_stuben + AlgebraicMultigrid.ruge_stuben(S; kwargs...) + elseif method === :smoothed_aggregation + AlgebraicMultigrid.smoothed_aggregation(S; kwargs...) + else + throw(ArgumentError("AMGPreconditioner: unknown method $(method); use \ + :ruge_stuben or :smoothed_aggregation")) + end + return AlgebraicMultigrid.aspreconditioner(ml) +end + +function Dagger.AMGPreconditioner(A::DMatrix; method::Symbol=:ruge_stuben, kwargs...) + build = tile -> _amg_operator(tile, method; kwargs...) + return Dagger._build_block_preconditioner(Dagger.AMGPreconditioner, A, + "AMGPreconditioner", build) +end + +# An AMG preconditioner applies a V-cycle via `ldiv!` (`\` is not defined). +Dagger._block_apply!(y, p::AlgebraicMultigrid.Preconditioner, x) = + (LinearAlgebra.ldiv!(y, p, x); return nothing) + +# The hierarchy is read-only and pinned to its worker; it is never written and +# its internals (a `Vector{Level}`) aren't introspectable by datadeps, so treat +# it as non-aliasing rather than recursing into it. +Dagger.aliasing(::AlgebraicMultigrid.Preconditioner) = Dagger.NoAliasing() + +end # module AlgebraicMultigridExt diff --git a/ext/FinchExt.jl b/ext/FinchExt.jl new file mode 100644 index 000000000..8b43bdb05 --- /dev/null +++ b/ext/FinchExt.jl @@ -0,0 +1,184 @@ +module FinchExt + +import Finch +import LinearAlgebra +import Dagger +import Dagger: Blocks, AutoBlocks, BlocksOrAuto, AssignmentType, DSparseArray, DSparseMatrix + +Dagger._sparse_collect(A::Finch.Tensor) = Array(A) +# Finch's generic `similar` produces tensor formats that destabilize later +# `@finch`/`@einsum` kernels (observed as segfaults during tile moves), so +# allocate an empty COO-backed tile explicitly instead. +Dagger._sparse_similar(::Finch.Tensor, ::Type{T}, dims::Dims) where {T} = + Finch.fspzeros(T, dims...) +Dagger.maybe_wrap_tile(x::Finch.Tensor) = DSparseArray(x) + +function Finch.fspzeros(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) + d = Dagger.ArrayDomain(map(x->1:x, dims)) + a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fspzeros(T, _dims...)), false, d, Dagger.partition(p, d), p, assignment) + return Dagger._to_darray(a) +end +Finch.fspzeros(p::BlocksOrAuto, T::Type, dims::Integer...; assignment::AssignmentType = :arbitrary) = + Finch.fspzeros(p, T, dims; assignment) +Finch.fspzeros(p::BlocksOrAuto, dims::Integer...; assignment::AssignmentType = :arbitrary) = + Finch.fspzeros(p, Float64, dims; assignment) +Finch.fspzeros(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) = + Finch.fspzeros(p, Float64, dims; assignment) +Finch.fspzeros(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) = + Finch.fspzeros(Dagger.auto_blocks(dims), T, dims; assignment) + +function Finch.fsprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) + d = Dagger.ArrayDomain(map(x->1:x, dims)) + a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fsprand(T, _dims..., sparsity)), false, d, Dagger.partition(p, d), p, assignment) + return Dagger._to_darray(a) +end +Finch.fsprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = + Finch.fsprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) +Finch.fsprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = + Finch.fsprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) +Finch.fsprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = + Finch.fsprand(p, Float64, dims, sparsity; assignment) +Finch.fsprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = + Finch.fsprand(Dagger.auto_blocks(dims), T, dims, sparsity; assignment) + +# Materialize a (possibly lazy/`SwizzleArray`) result into a concrete sparse +# `Tensor`. `@einsum` returns a lazy `SwizzleArray` for transposed-output +# patterns, which is not itself a `Finch.Tensor` and corrupts subsequent +# accumulation steps; copying through `@finch` produces a clean `SparseCOO`. +function _finch_materialize(X, ::Type{T}, dims) where {T} + out = Finch.fspzeros(T, dims...) + Finch.@finch begin + out .= 0 + for j = _, i = _ + if X[i, j] != 0 + out[i, j] = X[i, j] + end + end + end + return out +end + +# Finch tensors do not define `Base.copy`; build a fresh equivalent tensor. +Dagger._sparse_copy(mat::Finch.Tensor) = _finch_materialize(mat, eltype(mat), size(mat)) + +# Finch tensors do not support `setindex!`, so copy-buffering writeback into a +# Finch tile densifies, applies the (partial-range) copy, then re-sparsifies. +function Dagger._sparse_copyto_view!(mat::Finch.Tensor, Brange, src) + dense = Array(mat) + copyto!(view(dense, Brange), src) + return _finch_materialize(dense, eltype(mat), size(mat)) +end + +# Compute `C_out += alpha * op(A) * op(B)` via Finch's `@einsum`, choosing the +# right index pattern for each transpose option. We deliberately use *explicit* +# index patterns (`A[k,i]`) rather than `Finch.swizzle`, because feeding a +# swizzled tensor into `@einsum` makes it return a lazy `SwizzleArray` (rather +# than a materialized `Tensor`), which breaks subsequent accumulation steps. +# Adjoint ('C') additionally applies `conj` inside the einsum. +function _finch_einsum_matmul!(C_out, A, B, transA::Char, transB::Char, alpha) + aT = transA == 'T' || transA == 'C' + aC = transA == 'C' + bT = transB == 'T' || transB == 'C' + bC = transB == 'C' + if !aT && !bT + bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[k,j]))) : + Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[k,j])) + elseif !aT && bT + bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[j,k]))) : + Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[j,k])) + elseif aT && !bT + aC ? Finch.@einsum(C_out[i,j] += alpha * (conj(A[k,i]) * B[k,j])) : + Finch.@einsum(C_out[i,j] += alpha * (A[k,i] * B[k,j])) + else # aT && bT + if aC && bC + Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * conj(B[j,k])) + elseif aC + Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * B[j,k]) + elseif bC + Finch.@einsum C_out[i,j] += alpha * (A[k,i] * conj(B[j,k])) + else + Finch.@einsum C_out[i,j] += alpha * (A[k,i] * B[j,k]) + end + end + return C_out +end + +function Dagger.matmatmul!( + C::DSparseMatrix, + transA::Char, + transB::Char, + A::Finch.Tensor, + B::Finch.Tensor, + alpha, + beta +) + if !isa(C.mat, Finch.Tensor) + # Not supported here, forward to generic matmatmul! + return Dagger.matmatmul!(C.mat, transA, transB, A, B, alpha, beta) + end + transA in ('N', 'T', 'C') || throw(ArgumentError("Invalid transA: $transA")) + transB in ('N', 'T', 'C') || throw(ArgumentError("Invalid transB: $transB")) + + # `@einsum` cannot write into an existing sparse tensor in place, so we build + # a fresh output tensor and (re)assign it to the wrapper. `DSparseMatrix` + # hides this reallocation from Datadeps. + # + # N.B. BEWARE: `@einsum Cm[i,j] = ...` (assignment into an existing tensor) + # does not work; use a fresh `C_out` with `+=`. + Cm = C.mat + C_out = Finch.fspzeros(eltype(Cm), size(Cm)...) + C_out = _finch_einsum_matmul!(C_out, A, B, transA, transB, alpha) + # `@einsum` may return a lazy `SwizzleArray`; materialize to a concrete + # sparse `Tensor` so accumulation and later iterations stay well-typed. + C_out = _finch_materialize(C_out, eltype(Cm), size(Cm)) + if beta != 0 + C_out = C_out + beta * Cm + end + C.mat = C_out + + return C +end + +# Sparse matrix-vector multiply tile kernel: `C = alpha*op(A)*B + beta*C` with a +# Finch tensor `A` and dense vectors `B`/`C`. Finch's `@einsum` does not reliably +# accumulate into a dense output, so we densify the single sparse tile and use a +# dense BLAS-backed `mul!`; this is a per-tile densification, not the whole matrix. +function Dagger.matvecmul!(C::AbstractVector, transA::Char, A::Finch.Tensor, B::AbstractVector, alpha, beta) + Ad = Array(A) + opA = transA == 'N' ? Ad : + transA == 'C' ? adjoint(Ad) : + transA == 'T' ? transpose(Ad) : + throw(ArgumentError("Invalid transA: $transA")) + LinearAlgebra.mul!(C, opA, B, alpha, beta) + return C +end + +# Tile transpose/symmetrization used by `copytri!`. Finch tensors don't support +# in-place `setindex!`, so we densify, build the result, and re-sparsify via +# `_finch_materialize`. +# - `uplo === nothing`: off-diagonal tile, return the conjugate transpose. +# - `uplo == 'U'`/`'L'`: diagonal tile, build the full Hermitian tile from the +# given triangle (matching the dense `copydiagtile!` semantics). +function Dagger.transpose_tile(B::Finch.Tensor, uplo=nothing) + _B = Array(B) + if uplo === nothing + C = adjoint(_B) + return _finch_materialize(C, eltype(_B), size(C)) + end + n = LinearAlgebra.checksquare(_B) + C = zeros(eltype(_B), n, n) + if uplo == 'U' + for j in 1:n, i in 1:n + C[i, j] = i <= j ? _B[i, j] : conj(_B[j, i]) + end + elseif uplo == 'L' + for j in 1:n, i in 1:n + C[i, j] = i >= j ? _B[i, j] : conj(_B[j, i]) + end + else + throw(ArgumentError("uplo must be 'U' or 'L', got $uplo")) + end + return _finch_materialize(C, eltype(_B), size(C)) +end + +end # module FinchExt \ No newline at end of file diff --git a/ext/IncompleteLUExt.jl b/ext/IncompleteLUExt.jl new file mode 100644 index 000000000..06ecab522 --- /dev/null +++ b/ext/IncompleteLUExt.jl @@ -0,0 +1,36 @@ +module IncompleteLUExt + +import IncompleteLU +import SparseArrays +import Dagger +import Dagger: DMatrix +import LinearAlgebra + +# Block incomplete-LU preconditioner, built per diagonal tile on top of Dagger's +# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). Each tile +# gets an ILU factorization (drop tolerance `τ`) once at construction; the apply +# is a forward/backward substitution per block. + +_as_sparse(A::SparseArrays.SparseMatrixCSC) = A +_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A) + +function _ilu_operator(tile; τ=0.001, kwargs...) + S = _as_sparse(Dagger._tile_matrix(tile)) + return IncompleteLU.ilu(S; τ=τ, kwargs...) +end + +function Dagger.BlockILUPreconditioner(A::DMatrix; kwargs...) + build = tile -> _ilu_operator(tile; kwargs...) + return Dagger._build_block_preconditioner(Dagger.BlockILUPreconditioner, A, + "BlockILUPreconditioner", build) +end + +# An ILU factorization applies via `ldiv!` (`\` is not defined). +Dagger._block_apply!(y, F::IncompleteLU.ILUFactorization, x) = + (LinearAlgebra.ldiv!(y, F, x); return nothing) + +# The factorization is read-only and pinned to its worker; treat it as +# non-aliasing rather than recursing into its internals. +Dagger.aliasing(::IncompleteLU.ILUFactorization) = Dagger.NoAliasing() + +end # module IncompleteLUExt diff --git a/ext/KrylovExt.jl b/ext/KrylovExt.jl new file mode 100644 index 000000000..5d36ba2fa --- /dev/null +++ b/ext/KrylovExt.jl @@ -0,0 +1,36 @@ +module KrylovExt + +import Krylov +import Dagger +import Dagger: DVector, DMatrix +import LinearAlgebra + +# Distributed iterative solvers, implemented on top of Krylov.jl. +# +# We never form `A⁻¹`: Krylov only needs `mul!(y, A, x)` (and `mul!(y, A', x)` +# for two-sided methods), which Dagger provides over `DVector`s via its +# distributed SpMV/`gemv_dagger!` path. The remaining requirements -- `dot`, +# `norm`, `axpy!`, `axpby!`, `rmul!`, `copyto!`, `fill!`, broadcasting -- are the +# distributed BLAS-1 ops in `Dagger`. +# +# Workspace vectors are allocated through a `KrylovConstructor` built from +# `similar(b)`, so every internal vector inherits `b`'s element type *and* +# partitioning (`similar` preserves both). This keeps all `mul!`/`dot`/`axpy!` +# operands on a compatible chunk layout, which the distributed kernels require. + +function _dagger_krylov(method::Symbol, A, b::DVector; kwargs...) + kc = Krylov.KrylovConstructor(similar(b)) + workspace = Krylov.krylov_workspace(Val(method), kc) + Krylov.krylov_solve!(workspace, A, b; kwargs...) + return Krylov.solution(workspace), Krylov.statistics(workspace) +end + +Dagger.krylov_solve(method::Symbol, A, b::DVector; kwargs...) = + _dagger_krylov(method, A, b; kwargs...) + +Dagger.cg(A, b::DVector; kwargs...) = _dagger_krylov(:cg, A, b; kwargs...) +Dagger.minres(A, b::DVector; kwargs...) = _dagger_krylov(:minres, A, b; kwargs...) +Dagger.gmres(A, b::DVector; kwargs...) = _dagger_krylov(:gmres, A, b; kwargs...) +Dagger.bicgstab(A, b::DVector; kwargs...) = _dagger_krylov(:bicgstab, A, b; kwargs...) + +end # module KrylovExt diff --git a/ext/SparseArraysExt.jl b/ext/SparseArraysExt.jl new file mode 100644 index 000000000..946c40be1 --- /dev/null +++ b/ext/SparseArraysExt.jl @@ -0,0 +1,111 @@ +module SparseArraysExt + +import SparseArrays +import SparseArrays: SparseMatrixCSC, SparseVector +import LinearAlgebra +import Dagger +import Dagger: Blocks, AutoBlocks, BlocksOrAuto, AssignmentType, DSparseArray, DSparseMatrix + +# Keep tiles sparse through `collect`/`cat`; the outer `collect` densifies. +Dagger._sparse_collect(M::SparseMatrixCSC) = copy(M) +# Wrap bare sparse tiles (e.g. from `distribute`) so Datadeps sees a stable container. +Dagger.maybe_wrap_tile(x::SparseMatrixCSC) = DSparseArray(x) +Dagger.maybe_wrap_tile(x::SparseVector) = DSparseArray(x) + +function SparseArrays.spzeros(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) + d = Dagger.ArrayDomain(map(x->1:x, dims)) + a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(SparseArrays.spzeros(T, _dims...)), false, d, Dagger.partition(p, d), p, assignment) + return Dagger._to_darray(a) +end +SparseArrays.spzeros(p::BlocksOrAuto, T::Type, dims::Integer...; assignment::AssignmentType = :arbitrary) = + SparseArrays.spzeros(p, T, dims; assignment) +SparseArrays.spzeros(p::BlocksOrAuto, dims::Integer...; assignment::AssignmentType = :arbitrary) = + SparseArrays.spzeros(p, Float64, dims; assignment) +SparseArrays.spzeros(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) = + SparseArrays.spzeros(p, Float64, dims; assignment) +SparseArrays.spzeros(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) = + SparseArrays.spzeros(Dagger.auto_blocks(dims), T, dims; assignment) + +function SparseArrays.sprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) + d = Dagger.ArrayDomain(map(x->1:x, dims)) + a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(SparseArrays.sprand(T, _dims..., sparsity)), false, d, Dagger.partition(p, d), p, assignment) + return Dagger._to_darray(a) +end +SparseArrays.sprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = + SparseArrays.sprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) +SparseArrays.sprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = + SparseArrays.sprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) +SparseArrays.sprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = + SparseArrays.sprand(p, Float64, dims, sparsity; assignment) +SparseArrays.sprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = + SparseArrays.sprand(Dagger.auto_blocks(dims), T, dims, sparsity; assignment) + +_apply_trans(X, t::Char) = + t == 'N' ? X : + t == 'T' ? transpose(X) : + t == 'C' ? adjoint(X) : + throw(ArgumentError("Invalid trans char: $t")) + +function Dagger.matmatmul!( + C::DSparseMatrix, + transA::Char, + transB::Char, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + alpha, + beta +) + opA = _apply_trans(A, transA) + opB = _apply_trans(B, transB) + # Sparse*sparse yields a freshly-allocated sparse matrix, which we reassign + # into the wrapper (`DSparseMatrix` hides this reallocation from Datadeps). + # `SparseArrays` provides no efficient 5-arg `mul!` into a sparse `C` -- the + # output sparsity pattern is determined by the product -- so we form the + # product out-of-place and apply only the alpha/beta scaling that is actually + # needed. The transposed-operand products dispatch to specialized SparseArrays + # methods, so `opA`/`opB` are not materialized. + AB = opA * opB + prod = isone(alpha) ? AB : alpha * AB + if iszero(beta) + C.mat = prod + elseif isone(beta) + C.mat = prod + C.mat + else + C.mat = prod + beta * C.mat + end + + return C +end + +# Sparse matrix-vector multiply tile kernel: `C = alpha*op(A)*B + beta*C` with a +# `SparseMatrixCSC` `A` and dense vectors `B`/`C`. SparseArrays provides an +# efficient 5-arg `mul!` (SpMV) into a dense output, including for transposed and +# adjoint operands, so this updates `C` in place with no allocation. +function Dagger.matvecmul!(C::AbstractVector, transA::Char, A::SparseMatrixCSC, B::AbstractVector, alpha, beta) + LinearAlgebra.mul!(C, _apply_trans(A, transA), B, alpha, beta) + return C +end + +# Off-diagonal tile copy in `copytri!`: produce the (conjugate) transpose tile. +function Dagger.transpose_tile(B::SparseMatrixCSC) + return SparseArrays.sparse(B') +end +# Diagonal tile symmetrization in `copytri!`: build the full Hermitian tile from +# its `uplo` triangle (matching the dense `copydiagtile!` semantics). +function Dagger.transpose_tile(B::SparseMatrixCSC, uplo::Char) + if uplo == 'U' + Bt = SparseArrays.triu(B) + elseif uplo == 'L' + Bt = SparseArrays.tril(B) + else + throw(ArgumentError("uplo must be 'U' or 'L', got $uplo")) + end + C = Bt + Bt' + # The shared diagonal was added twice; restore the original tile's diagonal. + for i in 1:LinearAlgebra.checksquare(B) + C[i, i] = B[i, i] + end + return C +end + +end # module SparseArraysExt diff --git a/src/Dagger.jl b/src/Dagger.jl index 2e757ebc5..aa82a3d66 100644 --- a/src/Dagger.jl +++ b/src/Dagger.jl @@ -2,7 +2,6 @@ module Dagger import Serialization import Serialization: AbstractSerializer, serialize, deserialize -import SparseArrays: sprand, SparseMatrixCSC import MemPool import MemPool: DRef, FileRef, poolget, poolset @@ -120,7 +119,7 @@ include("array/operators.jl") include("array/indexing.jl") include("array/setindex.jl") include("array/matrix.jl") -include("array/sparse_partition.jl") +include("array/sparse.jl") include("array/sort.jl") include("array/permute.jl") include("array/linalg.jl") @@ -129,6 +128,7 @@ include("array/cholesky.jl") include("array/trsm.jl") include("array/lu.jl") include("array/qr.jl") +include("array/iterativesolvers.jl") # GPU include("gpu.jl") diff --git a/src/array/alloc.jl b/src/array/alloc.jl index aa1050210..266f7acd5 100644 --- a/src/array/alloc.jl +++ b/src/array/alloc.jl @@ -81,6 +81,7 @@ end allocate_array_func(::Processor, f) = f function stage(ctx, A::AllocateArray) tasks = Array{DTask,ndims(A.domainchunks)}(undef, size(A.domainchunks)...) + f = A.f Dagger.spawn_datadeps() do default_scope = get_compute_scope() for I in CartesianIndices(A.domainchunks) @@ -93,10 +94,15 @@ function stage(ctx, A::AllocateArray) scope = ExactScope(A.procgrid[CartesianIndex(mod1.(Tuple(I), size(A.procgrid))...)]) end - if A.want_index - task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, i, size(x)) + if f isa DArray + chunk = f.chunks[I] + task = Dagger.@spawn compute_scope=scope similar(chunk, A.eltype, size(x)) else - task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, size(x)) + if A.want_index + task = Dagger.@spawn compute_scope=scope allocate_array(f, A.eltype, i, size(x)) + else + task = Dagger.@spawn compute_scope=scope allocate_array(f, A.eltype, size(x)) + end end tasks[i] = task end @@ -134,20 +140,6 @@ Base.randn(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) Base.randn(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) = randn(auto_blocks(dims), T, dims; assignment) -function sprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) - d = ArrayDomain(map(x->1:x, dims)) - a = AllocateArray(T, (T, _dims) -> sprand(T, _dims..., sparsity), false, d, partition(p, d), p, assignment) - return _to_darray(a) -end -sprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = - sprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) -sprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) = - sprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment) -sprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = - sprand(p, Float64, dims, sparsity; assignment) -sprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) = - sprand(auto_blocks(dims), T, dims, sparsity; assignment) - function Base.ones(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) d = ArrayDomain(map(x->1:x, dims)) a = AllocateArray(T, ones, false, d, partition(p, d), p, assignment) @@ -215,3 +207,14 @@ function unsafe_free!(A::DArray) end end end + +# Initializers + +function Base.fill!(A::DArray, x) + spawn_datadeps() do + for chunk in A.chunks + Dagger.@spawn fill!(chunk, x) + end + end + return A +end diff --git a/src/array/copy.jl b/src/array/copy.jl index 7d92566e6..d703f49d1 100644 --- a/src/array/copy.jl +++ b/src/array/copy.jl @@ -36,6 +36,13 @@ function allocate_copy_buffer(part::Blocks{N}, A::DArray{T,N}) where {T,N} return DArray{T}(undef, part, size(A)) end +to_range(x::UnitRange) = x +to_range(x::Integer) = x:x +to_range(x::Base.OneTo{Int}) = UnitRange(x) +to_range(x::Base.Slice{Base.OneTo{Int}}) = UnitRange(x) +to_range(::StepRange) = throw(ArgumentError("Cannot convert StepRange to UnitRange")) +to_range(x) = throw(ArgumentError("Cannot convert $(typeof(x)) to UnitRange")) + function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices(B), Ainds=parentindices(A)) where {TB,NB,TA,NA} Nmax = max(NA, NB) @@ -45,13 +52,6 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices( padNmax(x) = ntuple(i->pad1range(x, i), Nmax) padNmax(x::ArrayDomain) = padNmax(x.indexes) - to_range(x::UnitRange) = x - to_range(x::Integer) = x:x - to_range(x::Base.OneTo{Int}) = UnitRange(x) - to_range(x::Base.Slice{Base.OneTo{Int}}) = UnitRange(x) - to_range(::StepRange) = throw(ArgumentError("Non-continuous ranges are not yet supported for DArray copy")) - to_range(x) = throw(ArgumentError("Unsupported range type for DArray copy: $(typeof(x))")) - if any(x->x isa Vector, Binds) || any(x->x isa Vector, Ainds) # Split the copy into multiple copies dims_with_vector = findall(x->x[1] isa Vector || x[2] isa Vector, collect(zip(Binds, Ainds))) @@ -154,3 +154,13 @@ StridedDArray{T,N} = Union{<:DArray{T,N}, SubArray{T,N,<:DArray{T,NP}} where NP} Base.copyto!(B::StridedDArray, A::StridedDArray) = darray_copyto!(parent(B), parent(A), parentindices(B), parentindices(A)) +function Base.copyto!(B::Array, A::StridedDArray) + DB = view(B, AutoBlocks()) + darray_copyto!(DB, parent(A), parentindices(DB), parentindices(A)) + return B +end +function Base.copyto!(B::SubArray, A::StridedDArray) + DB = view(parent(B), AutoBlocks()) + darray_copyto!(DB, parent(A), parentindices(B), parentindices(A)) + return B +end diff --git a/src/array/darray.jl b/src/array/darray.jl index 32336f95d..904aa7587 100644 --- a/src/array/darray.jl +++ b/src/array/darray.jl @@ -197,7 +197,7 @@ function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N} if tree collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks))) else - collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks))) + collect(treereduce_nd(dimcatfuncs, map(collect, asyncmap(fetch, a.chunks)))) end end Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A)) @@ -321,8 +321,16 @@ function Base.isequal(x::ArrayOp, y::ArrayOp) x === y end -Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} = - DArray{S,N}(undef, D.partitioning, dims) +function Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} + if dims == size(D) + domain = ArrayDomain(map(x->1:x, dims)) + subdomains = partition(D.partitioning, domain) + a = AllocateArray(S, D, false, domain, subdomains, D.partitioning, nothing) + return _to_darray(a) + else + return DArray{S,N}(undef, D.partitioning, dims) + end +end Base.similar(D::DArray{T,N1} where T, ::Type{S}, dims::Dims{N2}) where {S,N1,N2} = DArray{S,N2}(undef, auto_blocks(dims), dims) @@ -480,7 +488,7 @@ function stage(ctx::Context, d::Distribute) else scope = ExactScope(d.procgrid[CartesianIndex(mod1.(Tuple(I), size(d.procgrid))...)]) end - Dagger.@spawn compute_scope=scope identity(d.data[c]) + Dagger.@spawn compute_scope=scope maybe_wrap_tile(d.data[c]) end end return DArray(eltype(d.data), diff --git a/src/array/iterativesolvers.jl b/src/array/iterativesolvers.jl new file mode 100644 index 000000000..e499d2a1c --- /dev/null +++ b/src/array/iterativesolvers.jl @@ -0,0 +1,308 @@ +# Distributed iterative (Krylov) linear solvers. +# +# The user-facing entry points (`Dagger.cg`, `Dagger.minres`, `Dagger.gmres`, +# `Dagger.bicgstab`, and the generic `Dagger.krylov_solve`) live here so the +# solver backend can evolve without changing user code. The actual solves are +# implemented in `ext/KrylovExt.jl`, which is loaded when `Krylov` is available +# (`using Krylov`). The matrix-free building blocks they rely on -- distributed +# `mul!`/SpMV and the BLAS-1 vector ops -- live in `mul.jl`/`linalg.jl`. + +""" + cg(A, b::DVector; M=I, atol, rtol, itmax, ...) -> (x::DVector, stats) + +Solve the symmetric positive-definite system `A x = b` with the conjugate +gradient method, distributed over `A`'s and `b`'s chunks. `A` may be a +`DMatrix` (dense or sparse-backed) or any object supporting `mul!(y, A, x)` over +`DVector`s (matrix-free). Requires `Krylov.jl` to be loaded. + +See also [`minres`](@ref), [`gmres`](@ref), [`bicgstab`](@ref), and +[`krylov_solve`](@ref). Keyword arguments are forwarded to `Krylov.cg!`. +""" +function cg end + +""" + minres(A, b::DVector; M=I, atol, rtol, itmax, ...) -> (x::DVector, stats) + +Solve the symmetric (possibly indefinite) system `A x = b` with MINRES. +Requires `Krylov.jl` to be loaded. See [`cg`](@ref). +""" +function minres end + +""" + gmres(A, b::DVector; M=I, N=I, restart=false, memory=20, ...) -> (x::DVector, stats) + +Solve the general (nonsymmetric) system `A x = b` with restarted GMRES. +`M`/`N` are left/right preconditioners. Requires `Krylov.jl` to be loaded. +See [`cg`](@ref). +""" +function gmres end + +""" + bicgstab(A, b::DVector; M=I, N=I, ...) -> (x::DVector, stats) + +Solve the general (nonsymmetric) system `A x = b` with BiCGStab (short +recurrence, low memory). Requires `Krylov.jl` to be loaded. See [`cg`](@ref). +""" +function bicgstab end + +""" + krylov_solve(method::Symbol, A, b::DVector; kwargs...) -> (x::DVector, stats) + +Generic entry point dispatching to the iterative `method` (`:cg`, `:minres`, +`:gmres`, `:bicgstab`). Requires `Krylov.jl` to be loaded. +""" +function krylov_solve end + +# Friendly fallbacks: these generic methods are shadowed by the more specific +# `(A, b::DVector)` methods added in `ext/KrylovExt.jl` once Krylov is loaded. +_krylov_required(name) = throw(ArgumentError( + "Dagger.$name requires Krylov.jl. Run `using Krylov` (or `import Krylov`) \ + to enable distributed iterative solvers.")) +cg(A, b; kwargs...) = _krylov_required(:cg) +minres(A, b; kwargs...) = _krylov_required(:minres) +gmres(A, b; kwargs...) = _krylov_required(:gmres) +bicgstab(A, b; kwargs...) = _krylov_required(:bicgstab) +krylov_solve(method::Symbol, A, b; kwargs...) = _krylov_required(:krylov_solve) + +# --- Preconditioners ------------------------------------------------------ +# A preconditioner object `P` represents the (approximate) *inverse* operator +# `M⁻¹`: applying it (`mul!(y, P, x)`) computes `y = M⁻¹ x`, which is exactly an +# ordinary matrix-vector product by the operator `P` stands for (no inversion +# happens at apply time -- any reciprocals/factorizations are precomputed once). +# This matches Krylov's `ldiv=false` convention, where the object passed as `M` +# is applied via `mul!(y, M, x)` to compute `y ← M⁻¹ x`, so these are passed +# straight through as `M=P`. +# +# These preconditioners are backend-agnostic: they operate on `DVector`s (and a +# `DMatrix`'s diagonal tiles) and need no solver backend, so they live in core. +# +# A note on the square-tile requirement below: the iterative solvers allocate +# *all* workspace vectors as `similar(b)` (one partitioning), and each must +# serve as both the length-`n` input and length-`n` output of `mul!(y, A, x)`. +# `gemv_dagger!` requires the input to match `A`'s column blocks and the output +# to match `A`'s row blocks, which is only simultaneously possible when those +# block sizes are equal. So the operator must use square tiles regardless of +# preconditioning; the preconditioners simply share that constraint. (Any +# uniform square tile size is fine, including a ragged final block.) + +""" + AbstractDaggerPreconditioner + +Supertype for Dagger's distributed preconditioners. A preconditioner `P` +represents an (approximate) inverse operator `M⁻¹` and applies it via +`mul!(y, P, x)` (`y = M⁻¹ x`) over `DVector`s, so it can be passed to the +solvers as `M=P` (with `ldiv=false`, the default). +""" +abstract type AbstractDaggerPreconditioner end + +""" + JacobiPreconditioner(A::DMatrix) + +Diagonal (Jacobi) preconditioner. The object represents the inverse-diagonal +operator `M⁻¹ = inv(Diagonal(A))`; it precomputes and stores the reciprocal +diagonal `dinv = 1 ./ diag(A)`, so applying it (`mul!(y, P, x)`) is a single +elementwise multiply `y = dinv .* x` per chunk -- an ordinary product by the +stored operator, not an inversion. Cheap to build (one diagonal extraction per +diagonal tile) and to apply. + +Requires square diagonal tiles (equal row/column block sizes); see the note +above on why the solver itself imposes this. Repartition `A` with equal block +sizes if needed. +""" +struct JacobiPreconditioner{V<:DVector} <: AbstractDaggerPreconditioner + dinv::V +end + +JacobiPreconditioner(A::DMatrix) = JacobiPreconditioner(_jacobi_dinv(A)) + +# Per-tile kernel: write the reciprocal diagonal of `tile` into `out`. +_set_inv_diag!(out, tile) = (out .= inv.(LinearAlgebra.diag(tile)); return nothing) + +# Validate that `A` has square diagonal tiles and return (n, Ac, mt, blocksize). +function _check_square_tiles(A::DMatrix, who) + n = LinearAlgebra.checksquare(A) + Ac = A.chunks + mt, nt = size(Ac) + mb, nb = A.partitioning.blocksize + # For a square matrix with square tiles, mt == nt follows automatically; we + # only need to reject non-square tiles (mb != nb). + mb == nb || throw(ArgumentError( + "$who requires square diagonal tiles (got block size $(mb)x$(nb)); \ + repartition A with equal block sizes (this matches the iterative \ + solver's own requirement on the operator)")) + return n, Ac, mt, mb +end + +function _jacobi_dinv(A::DMatrix{T}) where T + n, Ac, mt, mb = _check_square_tiles(A, "JacobiPreconditioner") + dinv = DVector{T}(undef, Blocks(mb), n) + dc = dinv.chunks + Dagger.spawn_datadeps() do + for i in 1:mt + Dagger.@spawn _set_inv_diag!(Out(dc[i]), In(Ac[i, i])) + end + end + return dinv +end + +function LinearAlgebra.mul!(y::DVector, P::JacobiPreconditioner, x::DVector) + part = P.dinv.partitioning + maybe_copy_buffered(P.dinv => part, x => part, y => part) do dinv, x, y + dc, xc, yc = dinv.chunks, x.chunks, y.chunks + Dagger.spawn_datadeps() do + for i in eachindex(yc) + Dagger.@spawn _jacobi_apply!(Out(yc[i]), In(dc[i]), In(xc[i])) + end + end + end + return y +end +_jacobi_apply!(y, dinv, x) = (y .= dinv .* x; return nothing) + +# --- Block-diagonal preconditioners --------------------------------------- +# A family of preconditioners of the form `M⁻¹ = blockdiag(op₁, …, op_k)`, where +# each `opⱼ` approximates the inverse of `A`'s `j`-th diagonal tile `Aⱼⱼ`. They +# share all machinery and differ only in how a per-tile operator is *built*: +# +# - BlockJacobiPreconditioner: exact tile solve via `lu` (dense or sparse). +# - BlockILUPreconditioner: incomplete LU per tile (needs IncompleteLU.jl). +# - AMGPreconditioner: an AMG hierarchy per tile (needs AlgebraicMultigrid.jl). +# +# Building per tile is embarrassingly parallel and a natural fit for the tiled +# layout. A single tile (`Blocks(n, n)`) makes any of these a *global* +# preconditioner over the whole matrix; many tiles make it a scalable +# block-Jacobi / additive-Schwarz variant that trades some convergence for +# parallelism. All require square diagonal tiles (see `JacobiPreconditioner`). +# +# A per-tile operator (`lu`/ILU factor, AMG hierarchy) generally cannot be moved +# between workers (dense `LU` has no `move!`; `UmfpackLU`/AMG hold process-bound +# resources). So each operator is built *once* and pinned (via +# `tochunk(..., ProcessScope)`) to the worker that owns its tile, and every apply +# for that block is scheduled there (`compute_scope`); datadeps then moves only +# the (movable) vector chunks to the operator, never the operator itself. + +""" + AbstractBlockPreconditioner <: AbstractDaggerPreconditioner + +Block-diagonal preconditioner: holds one per-diagonal-tile operator `opⱼ` +(`y ← opⱼ⁻¹ x`), pinned to its tile's worker, and applies them independently per +block. Concrete subtypes (`BlockJacobiPreconditioner`, `BlockILUPreconditioner`, +`AMGPreconditioner`) share these fields: `ops`, `scopes`, `part`, `n`. +""" +abstract type AbstractBlockPreconditioner <: AbstractDaggerPreconditioner end + +_tile_scope(c::Chunk) = ProcessScope(root_worker_id(c)) +_tile_scope(t::DTask) = ProcessScope(root_worker_id(fetch(t; raw=true))) + +# Build the per-tile operator on the current worker and pin the result there: the +# returned `Chunk` is process-scoped, so it can never be moved off this worker. +function _build_tile_pinned(build, tile) + op = build(tile) + proc = Dagger.task_processor() + return tochunk(op, proc, ProcessScope(root_worker_id(proc))) +end + +# Build a block preconditioner of type `Ctor` by applying `build` to each +# diagonal tile (pinned to the tile's worker). `build(tile) -> op` returns a +# per-tile operator supporting the block apply below. +function _build_block_preconditioner(Ctor, A::DMatrix, who, build) + n, Ac, mt, mb = _check_square_tiles(A, who) + ops = Vector{Any}(undef, mt) + scopes = Vector{ProcessScope}(undef, mt) + for i in 1:mt + tile = Ac[i, i] + scope = _tile_scope(tile) + scopes[i] = scope + ops[i] = Dagger.@spawn compute_scope=scope _build_tile_pinned(build, tile) + end + return Ctor(ops, scopes, Blocks(mb), n) +end + +function LinearAlgebra.mul!(y::DVector, P::AbstractBlockPreconditioner, x::DVector) + part = P.part + maybe_copy_buffered(x => part, y => part) do x, y + xc, yc = x.chunks, y.chunks + length(yc) == length(P.ops) || throw(DimensionMismatch( + "$(nameof(typeof(P))) has $(length(P.ops)) blocks but the vector has \ + $(length(yc)) chunks")) + Dagger.spawn_datadeps() do + for i in eachindex(yc) + # Pin the apply to the operator's worker; the operator is passed as + # an untracked arg (read-only, already-pinned) so datadeps never + # moves it -- only the vector chunks are moved to this worker. + Dagger.@spawn compute_scope=P.scopes[i] _block_apply!(Out(yc[i]), P.ops[i], In(xc[i])) + end + end + end + return y +end + +# Apply one block operator: `y = op⁻¹ x`. Default uses `\`; backends override for +# operator types where `\` is unavailable (e.g. ILU/AMG use `ldiv!`). +_block_apply!(y, op, x) = (y .= op \ x; return nothing) + +# Underlying matrix of a tile (overridden for `DSparseArray` in `sparse.jl`). +_tile_matrix(A) = A + +""" + BlockJacobiPreconditioner(A::DMatrix) + +Block-Jacobi preconditioner: `M⁻¹ = blockdiag(A₁₁, …, A_kk)⁻¹`. Each diagonal +tile is factorized *once* with `lu` (sparse or dense) and the apply solves +`Aᵢᵢ yᵢ = xᵢ`. Stronger than `JacobiPreconditioner` (captures intra-block +coupling); a single tile recovers an exact solve. See +[`AbstractBlockPreconditioner`](@ref). Requires square diagonal tiles. +""" +struct BlockJacobiPreconditioner{F,S} <: AbstractBlockPreconditioner + ops::F # cached per-tile factorizations (pinned to their workers) + scopes::S # the `ProcessScope` each operator/apply is pinned to + part::Blocks{1} # partitioning of the vectors it applies to + n::Int +end + +# Factorize a diagonal tile. Backends override for their storage (e.g. sparse +# tiles factorize the inner `SparseMatrixCSC`); the default is a dense LU factor. +_factorize_tile(A) = LinearAlgebra.lu(A) + +BlockJacobiPreconditioner(A::DMatrix) = + _build_block_preconditioner(BlockJacobiPreconditioner, A, "BlockJacobiPreconditioner", _factorize_tile) + +""" + BlockILUPreconditioner(A::DMatrix; τ=0.001, kwargs...) + +Block incomplete-LU preconditioner: an ILU factorization (with drop tolerance +`τ`) of each diagonal tile, applied per block. Cheaper setup than a full block +solve, good as a general-purpose preconditioner. Requires `IncompleteLU.jl` to +be loaded and sparse-backed tiles. See [`AbstractBlockPreconditioner`](@ref). +""" +struct BlockILUPreconditioner{F,S} <: AbstractBlockPreconditioner + ops::F + scopes::S + part::Blocks{1} + n::Int +end +# Friendly fallback (shadowed by the `::DMatrix` method added in `ext/IncompleteLUExt.jl`). +BlockILUPreconditioner(A; kwargs...) = throw(ArgumentError( + "Dagger.BlockILUPreconditioner requires IncompleteLU.jl. Run `using IncompleteLU` \ + to enable block incomplete-LU preconditioning.")) + +""" + AMGPreconditioner(A::DMatrix; method=:ruge_stuben, kwargs...) + +Algebraic-multigrid preconditioner: builds an AMG hierarchy (`method` is +`:ruge_stuben` or `:smoothed_aggregation`) for each diagonal tile and applies a +V-cycle per block. Near mesh-independent convergence for elliptic (Poisson-like) +operators. With one tile this is global AMG; with many tiles it is a scalable +block/additive-Schwarz AMG. Requires `AlgebraicMultigrid.jl` to be loaded and +sparse-backed tiles. See [`AbstractBlockPreconditioner`](@ref). +""" +struct AMGPreconditioner{F,S} <: AbstractBlockPreconditioner + ops::F + scopes::S + part::Blocks{1} + n::Int +end +# Friendly fallback (shadowed by the `::DMatrix` method added in `ext/AlgebraicMultigridExt.jl`). +AMGPreconditioner(A; kwargs...) = throw(ArgumentError( + "Dagger.AMGPreconditioner requires AlgebraicMultigrid.jl. Run \ + `using AlgebraicMultigrid` to enable algebraic-multigrid preconditioning.")) diff --git a/src/array/linalg.jl b/src/array/linalg.jl index c998ac752..6abae1590 100644 --- a/src/array/linalg.jl +++ b/src/array/linalg.jl @@ -4,6 +4,71 @@ function LinearAlgebra.norm2(A::DArray{T,N}) where {T,N} zeroRT = zero(real(T)) return sqrt(sum(map(norm->fetch(norm)::real(T), norms); init=zeroRT)) end + +# --- BLAS-1 vector operations --------------------------------------------- +# Efficient, distributed level-1 operations needed by iterative (Krylov) +# solvers. These run one BLAS-1 kernel per chunk (avoiding the generic +# scalar-indexing fallbacks, which are slow and trigger scalar access on GPUs). +# Operands may be partitioned differently: we align them with +# `maybe_copy_buffered` (a no-op when they already share a layout, and which +# copies results back to the originals for in-place operations), so users keep +# full control over their own (possibly sub-optimal) partitioning. + +function LinearAlgebra.dot(x::DArray{Tx,N}, y::DArray{Ty,N}) where {Tx,Ty,N} + size(x) == size(y) || throw(DimensionMismatch("dot: x has size $(size(x)), y has size $(size(y))")) + R = typeof(LinearAlgebra.dot(zero(Tx), zero(Ty))) + return maybe_copy_buffered(x => x.partitioning, y => x.partitioning) do x, y + xc, yc = x.chunks, y.chunks + parts = [Dagger.@spawn LinearAlgebra.dot(xc[i], yc[i]) for i in eachindex(xc)] + sum(fetch, parts; init=zero(R)) + end +end + +function LinearAlgebra.axpy!(a::Number, x::DArray{Tx,N}, y::DArray{Ty,N}) where {Tx,Ty,N} + size(x) == size(y) || throw(DimensionMismatch("axpy!: x has size $(size(x)), y has size $(size(y))")) + # Align the read-only `x` to `y`'s layout so only `x` may be buffered; `y` is + # updated in place (and copied back by `maybe_copy_buffered` if it was buffered). + maybe_copy_buffered(x => y.partitioning, y => y.partitioning) do x, y + xc, yc = x.chunks, y.chunks + Dagger.spawn_datadeps() do + for i in eachindex(xc) + Dagger.@spawn LinearAlgebra.axpy!(a, In(xc[i]), InOut(yc[i])) + end + end + end + return y +end + +function LinearAlgebra.axpby!(a::Number, x::DArray{Tx,N}, b::Number, y::DArray{Ty,N}) where {Tx,Ty,N} + size(x) == size(y) || throw(DimensionMismatch("axpby!: x has size $(size(x)), y has size $(size(y))")) + maybe_copy_buffered(x => y.partitioning, y => y.partitioning) do x, y + xc, yc = x.chunks, y.chunks + Dagger.spawn_datadeps() do + for i in eachindex(xc) + Dagger.@spawn LinearAlgebra.axpby!(a, In(xc[i]), b, InOut(yc[i])) + end + end + end + return y +end + +function LinearAlgebra.rmul!(x::DArray, a::Number) + Dagger.spawn_datadeps() do + for c in x.chunks + Dagger.@spawn LinearAlgebra.rmul!(InOut(c), a) + end + end + return x +end + +function LinearAlgebra.lmul!(a::Number, x::DArray) + Dagger.spawn_datadeps() do + for c in x.chunks + Dagger.@spawn LinearAlgebra.lmul!(a, InOut(c)) + end + end + return x +end function LinearAlgebra.norm2(A::UpperTriangular{T,<:DArray{T,2}}) where T Ac = parent(A).chunks Ac_upper = [] diff --git a/src/array/mul.jl b/src/array/mul.jl index 02b207641..794c0a781 100644 --- a/src/array/mul.jl +++ b/src/array/mul.jl @@ -1,3 +1,74 @@ +""" + matmatmul!(C, transA::Char, transB::Char, A, B, alpha, beta) + +A general-purpose matrix-matrix multiply, like `LinearAlgebra.generic_matmatmul!`, +but with extra functionality. May internally convert `A` and `B` to a type that +better matches `C` and provides optimal portability and, when possible, +better performance. The actual matrix multiply operation should happen in +`LinearAlgebra.generic_matmatmul!` or an equivalent call. + +The following automatic conversions are performed: +- If no `LinearAlgebra.generic_matmatmul!` method is available, convert `A` and `B` to dense Array-like +- If `C` is a `DSparseMatrix`, perform the operation out-of-place and then update `C` in-place +""" +function matmatmul!( + C, + transA::Char, + transB::Char, + A, + B, + alpha, + beta +) + EC = eltype(C) + EA = eltype(A) + EB = eltype(B) + + TC = typeof(C) + TA = typeof(A) + TB = typeof(B) + + mam = LinearAlgebra.MulAddMul(alpha, beta) + + # Check if C doesn't support in-place operations (e.g. DSparseMatrix) + # We'll get here if A and B don't have equivalent types + if isa(C, DSparseMatrix) + C.mat = alpha * A * B + beta * C.mat + return C + end + + # Check if the call will fail due to MethodError + sig = Tuple{TC, Char, Char, TA, TB, typeof(mam)} + if !hasmethod(LinearAlgebra.generic_matmatmul!, sig) + # Convert to Array-like + # FIXME: GPU support + C_new = C + A_new = collect(A) + B_new = collect(B) + alpha_new = alpha + beta_new = beta + # FIXME: Re-check hasmethod, and if no method, then convert and bounce C + @goto ready + end + + C_new = C + A_new = A + B_new = B + alpha_new = alpha + beta_new = beta + + @label ready + mam_new = LinearAlgebra.MulAddMul(alpha_new, beta_new) + return LinearAlgebra.generic_matmatmul!( + C_new, + transA, + transB, + A_new, + B_new, + mam_new + ) +end + function LinearAlgebra.generic_matmatmul!( C::DMatrix{T}, transA::Char, @@ -41,6 +112,7 @@ function LinearAlgebra.generic_matmatmul!( return gemm_dagger!(C, transA, transB, A, B, alpha, beta) end end +# FIXME: Mixed-precision methods function _repartition_matmatmul(C, A, B, transA::Char, transB::Char) partA = A.partitioning.blocksize partB = B.partitioning.blocksize @@ -136,28 +208,28 @@ function gemm_dagger!( # A: NoTrans / B: NoTrans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn matmatmul!( + InOut(Cc[m, n]), transA, transB, - alpha, In(Ac[m, k]), In(Bc[k, n]), + alpha, mzone, - InOut(Cc[m, n]), ) end else # A: NoTrans / B: [Conj]Trans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn matmatmul!( + InOut(Cc[m, n]), transA, transB, - alpha, In(Ac[m, k]), In(Bc[n, k]), + alpha, mzone, - InOut(Cc[m, n]), ) end end @@ -166,28 +238,28 @@ function gemm_dagger!( # A: [Conj]Trans / B: NoTrans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn matmatmul!( + InOut(Cc[m, n]), transA, transB, - alpha, In(Ac[k, m]), In(Bc[k, n]), + alpha, mzone, - InOut(Cc[m, n]), ) end else # A: [Conj]Trans / B: [Conj]Trans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemm!( + Dagger.@spawn matmatmul!( + InOut(Cc[m, n]), transA, transB, - alpha, In(Ac[k, m]), In(Bc[n, k]), + alpha, mzone, - InOut(Cc[m, n]), ) end end @@ -235,6 +307,7 @@ function syrk_dagger!( iscomplex = T <: Complex transs = iscomplex ? 'C' : 'T' + anti_transs = trans == 'N' ? transs : 'N' Dagger.spawn_datadeps() do for n in range(1, Cnt) @@ -242,116 +315,63 @@ function syrk_dagger!( # NoTrans for k in range(1, Ant) mzone = k == 1 ? real(beta) : one(real(T)) - if iscomplex - Dagger.@spawn BLAS.herk!( - uplo, + _alpha = iscomplex ? real(alpha) : alpha + Dagger.@spawn matmatmul!( + InOut(Cc[n, n]), + trans, + anti_transs, + In(Ac[n, k]), + In(Ac[n, k]), + _alpha, + mzone, + ) + end + # NoTrans / Upper + for m in range(n + 1, Cmt) + for k in range(1, Ant) + mzone = k == 1 ? beta : one(T) + Dagger.@spawn matmatmul!( + InOut(Cc[n, m]), trans, - real(alpha), + transs, In(Ac[n, k]), - mzone, - InOut(Cc[n, n]), - ) - else - Dagger.@spawn BLAS.syrk!( - uplo, - trans, + In(Ac[m, k]), alpha, - In(Ac[n, k]), mzone, - InOut(Cc[n, n]), ) end end - if uplo == 'L' - # NoTrans / Lower - for m in range(n + 1, Cmt) - for k in range(1, Ant) - mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( - trans, - transs, - alpha, - In(Ac[m, k]), - In(Ac[n, k]), - mzone, - InOut(Cc[m, n]), - ) - end - end - else - # NoTrans / Upper - for m in range(n + 1, Cmt) - for k in range(1, Ant) - mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( - trans, - transs, - alpha, - In(Ac[n, k]), - In(Ac[m, k]), - mzone, - InOut(Cc[n, m]), - ) - end - end - end else # [Conj]Trans for k in range(1, Amt) mzone = k == 1 ? real(beta) : one(real(T)) - if iscomplex - Dagger.@spawn BLAS.herk!( - uplo, + _alpha = iscomplex ? real(alpha) : alpha + _trans = iscomplex ? transs : trans + Dagger.@spawn matmatmul!( + InOut(Cc[n, n]), + _trans, + anti_transs, + In(Ac[k, n]), + In(Ac[k, n]), + _alpha, + mzone, + ) + end + # [Conj]Trans / Upper + for m in range(n + 1, Cmt) + for k in range(1, Amt) + mzone = k == 1 ? beta : one(T) + Dagger.@spawn matmatmul!( + InOut(Cc[n, m]), transs, - real(alpha), + 'N', In(Ac[k, n]), - mzone, - InOut(Cc[n, n]), - ) - else - Dagger.@spawn BLAS.syrk!( - uplo, - trans, + In(Ac[k, m]), alpha, - In(Ac[k, n]), mzone, - InOut(Cc[n, n]), ) end end - if uplo == 'L' - # [Conj]Trans / Lower - for m in range(n + 1, Cmt) - for k in range(1, Amt) - mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( - transs, - 'N', - alpha, - In(Ac[k, m]), - In(Ac[k, n]), - mzone, - InOut(Cc[m, n]), - ) - end - end - else - # [Conj]Trans / Upper - for m in range(n + 1, Cmt) - for k in range(1, Amt) - mzone = k == 1 ? beta : one(T) - Dagger.@spawn BLAS.gemm!( - transs, - 'N', - alpha, - In(Ac[k, n]), - In(Ac[k, m]), - mzone, - InOut(Cc[n, m]), - ) - end - end - end end end end @@ -393,7 +413,7 @@ end return A end -@inline function copytile!(A, B) +function copytile!(A, B) m, n = size(A) C = B' @@ -402,15 +422,14 @@ end end end -@inline function copydiagtile!(A, uplo) +function copydiagtile!(A, uplo) m, n = size(A) - Acpy = copy(A) if uplo == 'U' - C = UpperTriangular(Acpy)' + UpperTriangular(Acpy) + C = UpperTriangular(A)' + UpperTriangular(A) C[diagind(C)] .= A[diagind(A)] elseif uplo == 'L' - C = LowerTriangular(Acpy)' + Acpy - UpperTriangular(Acpy) + C = LowerTriangular(A)' + A - UpperTriangular(A) C[diagind(C)] .= A[diagind(A)] end @@ -418,6 +437,7 @@ end A[i, j] = C[i, j] end end + function LinearAlgebra.generic_matvecmul!( C::DVector{T}, transA::Char, @@ -464,6 +484,21 @@ function _repartition_matvecmul(C, A, B, transA::Char) partC = (dimA,) return Blocks(partC...), Blocks(partA...), Blocks(partB...) end +""" + matvecmul!(C, transA::Char, A, B, alpha, beta) + +Tile-level matrix-vector multiply computing `C = alpha*op(A)*B + beta*C` in +place on the (dense) output vector `C`, where `op` is determined by `transA` +(`'N'`, `'T'`, `'C'`). Dispatches on the tile types: dense tiles use BLAS, while +sparse tiles (e.g. `DSparseArray`) provide their own method (in a package +extension) using a sparse matrix-vector product. This is the matvec analogue of +[`matmatmul!`](@ref). +""" +function matvecmul!(C, transA::Char, A, B, alpha, beta) + BLAS.gemv!(transA, alpha, A, B, beta, C) + return C +end + function gemv_dagger!( C::DVector{T}, transA::Char, @@ -495,26 +530,26 @@ function gemv_dagger!( # A: NoTrans for k in range(1, Ant) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemv!( + Dagger.@spawn matvecmul!( + InOut(Cc[m]), transA, - alpha, In(Ac[m, k]), In(Bc[k]), + alpha, mzone, - InOut(Cc[m]), ) end else # A: [Conj]Trans for k in range(1, Amt) mzone = k == 1 ? beta : T(1.0) - Dagger.@spawn BLAS.gemv!( + Dagger.@spawn matvecmul!( + InOut(Cc[m]), transA, - alpha, In(Ac[k, m]), In(Bc[k]), + alpha, mzone, - InOut(Cc[m]), ) end end @@ -523,3 +558,53 @@ function gemv_dagger!( return C end + +wrap_as_darray(A::DArray) = A +wrap_as_darray(A::Array) = view(A, AutoBlocks()) +function wrap_as_darray(A::SubArray{T,Nv,Array{T,Na}}) where {T,Nv,Na} + Ap = parent(A) + part = auto_blocks(map(last, parentindices(Ap))) + partsize = part.blocksize + inds = parentindices(A) + inds_ranges_parent = ntuple(i->to_range(inds[i]), Val(Na)) + inds_ranges_view = ntuple(i->to_range(inds[i]), Val(Nv)) + subdomains = partition(part, ArrayDomain(inds_ranges_parent)) + nparts = size(subdomains) + chunks = Array{Any,Na}(undef, nparts...) + for idx in CartesianIndices(nparts) + subdomain_view = subdomains[idx] + subdomain_parent = ArrayDomain(ntuple(i->Nv >= i ? subdomain_view.indexes[i] : inds_ranges_parent[i], Val(Na))) + subinds = ntuple(i->subdomain_parent.indexes[i], Val(Na)) + subA = view(Ap, subinds...) + chunks[idx] = tochunk(subA) + end + return DArray(T, ArrayDomain(inds_ranges_parent), subdomains, chunks, part) +end + +# Generate generic_matvecmul! methods for all combinations of DArray, Array, and SubArray +for CT in (DVector, Vector, SubArray{<:Any,1,<:Array}), + AT in (DMatrix, Matrix, SubArray{<:Any,2,<:Array}), + BT in (DVector, Vector, SubArray{<:Any,1,<:Array}) + + # Don't commit type piracy + CT isa DArray || AT isa DArray || BT isa DArray || continue + + @eval function LinearAlgebra.generic_matvecmul!( + C::$(CT), + transA::Char, + A::$(AT), + B::$(BT), + _add::LinearAlgebra.MulAddMul, + ) + new_C = wrap_as_darray(C) + new_A = wrap_as_darray(A) + new_B = wrap_as_darray(B) + return LinearAlgebra.generic_matvecmul!( + new_C, + transA, + new_A, + new_B, + _add, + ) + end +end \ No newline at end of file diff --git a/src/array/sparse.jl b/src/array/sparse.jl new file mode 100644 index 000000000..3e56c37b0 --- /dev/null +++ b/src/array/sparse.jl @@ -0,0 +1,152 @@ +""" + DSparseArray{T,N} <: AbstractArray{T,N} + +A sparse array container, for which the contained array may be replaced with a +new one to support in-place operations. Designed to work well with Datadeps +algorithms: writes that reallocate (and grow/shrink) the inner sparse storage +are hidden behind the wrapper's stable identity, so Datadeps aliasing tracking +remains valid (see `aliases_as_whole`). + +`DSparseVector{T}` and `DSparseMatrix{T}` are aliases for the 1- and 2-dimensional +cases. The wrapper is general over `N` so it can hold sparse vectors, matrices, +and (eventually) higher-order sparse tensors (e.g. Finch tensors). +""" +mutable struct DSparseArray{T,N} <: AbstractArray{T,N} + mat +end +const DSparseVector{T} = DSparseArray{T,1} +const DSparseMatrix{T} = DSparseArray{T,2} + +# Convenience constructor inferring element type and dimensionality from `x`. +DSparseArray(x) = DSparseArray{eltype(x),ndims(x)}(x) + +_sparse_collect(M) = collect(M) +# Allocate a destination tile matching the inner storage's sparse backend. The +# default delegates to the storage type's own `similar`, which carries the +# concrete backend (e.g. `SparseMatrixCSC`) forward to the destination -- this is +# how `similar(::DArray)` propagates a tile's sparse type to a newly-allocated +# result DArray. Backends whose `similar` is unsuitable (e.g. Finch, whose generic +# `similar` yields tensor formats that destabilize later kernels) override this. +_sparse_similar(mat, ::Type{T}, dims::Dims) where {T} = similar(mat, T, dims) + +Base.eltype(M::DSparseArray{T}) where T = T +Base.size(M::DSparseArray) = size(M.mat) +Base.iterate(M::DSparseArray) = iterate(M.mat) +Base.iterate(M::DSparseArray, state) = iterate(M.mat, state) +Base.similar(M::DSparseArray, ::Type{T}, dims::Dims) where T = + DSparseArray{T,length(dims)}(_sparse_similar(M.mat, T, dims)) +Base.collect(M::DSparseArray) = _sparse_collect(M.mat) + +# N.B. hash and aliasing shouldn't change even if M.mat changes. +# This is the key property that makes `DSparseArray` safe for Datadeps: +# the aliasing identity is tied to the (stable) wrapper object, not to the +# inner storage, which may be reallocated (and grow/shrink) on writes. +Base.hash(M::DSparseArray, h::UInt) = hash(objectid(M), hash(DSparseArray, h)) + +# Sparse containers must alias as a single, indivisible unit. Their storage is +# reallocated (and may grow/shrink) on writes, so it is unsafe to alias any +# *part* of the container independently. We therefore force *all* access to a +# `DSparseArray` to resolve to the container's stable whole-object +# `ObjectAliasing` -- including access via views, transposes, adjoints, and +# reshapes -- rather than e.g. a `StridedAliasing` over storage that may have +# since moved. `aliases_as_whole` opts the type into this behavior, and Datadeps' +# `aliasing_root` resolves any wrapper of a `DSparseArray` back to the container +# before computing aliasing. Note: this must return a *bare* `ObjectAliasing`, as +# `aliases_as_whole(::ObjectAliasing)` relies on it to drive whole-object copies. +aliasing(M::DSparseArray, _=identity) = ObjectAliasing(M) +aliases_as_whole(::DSparseArray) = true + +# A `DSparseArray` has no meaningful raw data pointer (its storage may be +# reallocated/resized). This trap ensures that if aliasing ever tries to treat a +# wrapper of a `DSparseArray` as strided memory -- e.g. a new array-wrapper type +# that `aliasing_root` failed to resolve via `Base.parent` -- it fails loudly +# instead of silently corrupting Datadeps' aliasing tracking. +function Base.pointer(::DSparseArray) + throw(ArgumentError("`pointer(::DSparseArray)` is intentionally unsupported: \ + a DSparseArray may reallocate its storage, so it must be aliased as a \ + whole object via `Dagger.aliasing`. If you reached here through Datadeps \ + aliasing of an array wrapper, ensure that wrapper implements `Base.parent` \ + so that `Dagger.aliasing_root` can resolve it to the DSparseArray.")) +end + +# Forward indexing to the inner array. Ranges/colons are supported so that +# views (used by copy-buffering between mismatched partitionings) work. +Base.getindex(M::DSparseArray, I...) = getindex(M.mat, I...) +Base.setindex!(M::DSparseArray, v, I...) = setindex!(M.mat, v, I...) + +# Whole-tile copy. This is how Datadeps moves a tile between workers via +# `move!`: we *reallocate* the inner storage rather than mutating in place, +# since sparse storage cannot generally be updated in place. The wrapper's +# identity (and thus its aliasing) is preserved. +function Base.copyto!(dst::DSparseArray, src::DSparseArray) + dst.mat = _sparse_copy(src.mat) + return dst +end +# Backend-overridable deep copy of the inner storage (Finch tensors don't define +# `Base.copy`). +_sparse_copy(mat) = copy(mat) + +# Wrapping hook used when materializing tiles (e.g. in `distribute`). Backends +# overload this (e.g. in package extensions) to wrap freshly-created tiles in a +# container that Datadeps can track (such as `DSparseArray` for sparse tiles); +# everything else is left untouched. +maybe_wrap_tile(x) = x + +# Partial-range tile copy used by copy-buffering (e.g. when matmul operands need +# to be repartitioned). Some backends (Finch) forbid `setindex!`, so we route +# through a backend hook that returns the (possibly reallocated) inner storage. +# `DSparseArray` hides the reallocation from Datadeps. +function copyto_view!(Bpart::DSparseArray, Brange, Apart, Arange) + Bpart.mat = _sparse_copyto_view!(Bpart.mat, Brange, view(Apart, Arange)) + return +end +# Default: in-place element-wise copy (works for `setindex!`-capable backends +# such as `SparseArrays`). May reallocate and return new storage for backends +# that can't update in place, so callers must use the returned value. +function _sparse_copyto_view!(mat, Brange, src) + copyto!(view(mat, Brange), src) + return mat +end + +#Base.BroadcastStyle(::Type{<:DSparseArray}) = FinchStyle() + +LinearAlgebra.norm2(M::DSparseArray) = LinearAlgebra.norm2(M.mat) + +# Matrix-specific operations dispatch on the 2-D alias `DSparseMatrix`. +function matmatmul!( + C::DSparseMatrix, + transA::Char, + transB::Char, + A::DSparseMatrix, + B::DSparseMatrix, + alpha, + beta +) + # Forward to sparse matmatmul!, but allow in-place update of C + return matmatmul!(C, transA, transB, A.mat, B.mat, alpha, beta) +end + +# Sparse matrix-vector multiply: `C = alpha*op(A)*B + beta*C` with a sparse +# matrix tile `A` and dense vectors `B`/`C`. Unwrap to the inner storage so each +# backend can provide an efficient (in-place on dense `C`) method. +function matvecmul!(C, transA::Char, A::DSparseMatrix, B, alpha, beta) + return matvecmul!(C, transA, A.mat, B, alpha, beta) +end + +# Factorize the inner sparse storage (e.g. sparse LU/UMFPACK) for block-Jacobi, +# rather than the `DSparseArray` wrapper (whose generic `lu` would densify). +_factorize_tile(M::DSparseArray) = LinearAlgebra.lu(M.mat) + +# Expose the inner sparse storage so preconditioner backends (ILU, AMG) build on +# the actual `SparseMatrixCSC` rather than the `DSparseArray` wrapper. +_tile_matrix(M::DSparseArray) = M.mat + +function transpose_tile end +function copytile!(A::DSparseMatrix, B::DSparseMatrix) + A.mat = transpose_tile(B.mat) + return +end +function copydiagtile!(A::DSparseMatrix, uplo) + A.mat = transpose_tile(A.mat, uplo) + return +end \ No newline at end of file diff --git a/src/array/sparse_partition.jl b/src/array/sparse_partition.jl deleted file mode 100644 index 26efe3e6a..000000000 --- a/src/array/sparse_partition.jl +++ /dev/null @@ -1,37 +0,0 @@ -export partition_sparse - -function partition_sparse(colptr, nnz, sz, nparts) - nnz_per_chunk = nnz/nparts - dom = ArrayDomain(map(x->1:x, sz)) - - nxt = nnz_per_chunk - colstart = 1 - splits = UnitRange[] - - for i=1:nparts-1 - idxs = searchsorted(colptr, nxt) - if length(idxs) == 0 - # the exact index was not found. latch on to the nearest column boundary - idx = first(idxs) - if abs(colptr[idx]-nxt) < abs(colptr[idx-1]-nxt) - nextidx = idx - else - nextidx = idx-1 - end - else - nextidx = last(idxs) - end - push!(splits, colstart:nextidx) - colstart=nextidx+1 - nxt=nxt + nnz_per_chunk - end - push!(splits, colstart:(length(colptr)-1)) - - - cumlength = cumsum(map(length, splits)) - subdomains = DomainBlocks((1,1), ([size(dom, 1)], cumlength)) - DomainSplit(dom, subdomains) -end - -partition_sparse(S::SparseMatrixCSC, nparts) = - partition_sparse(S.colptr, length(S.nzval), size(S), nparts) diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl index c3e0ed20b..bd49107aa 100644 --- a/src/datadeps/aliasing.jl +++ b/src/datadeps/aliasing.jl @@ -567,8 +567,12 @@ function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::Argum return state.ainfo_cache[remote_arg_w] end - # Calculate the ainfo - ainfo = AliasingWrapper(aliasing(remote_arg, arg_w.dep_mod)) + # Calculate the ainfo. `aliasing_unwrapped` resolves any wrapper of a + # whole-object container (e.g. a view of a `DSparseArray`) to the container + # itself, so aliasing is computed on the container -- never on a partial view + # of storage that may have been reallocated. `remote_arg` is local here (for + # `Chunk`s the unwrap+aliasing happen remotely inside `aliasing(::Chunk, ...)`). + ainfo = AliasingWrapper(aliasing_unwrapped(remote_arg, arg_w.dep_mod)) # Cache the result state.ainfo_cache[remote_arg_w] = ainfo diff --git a/src/datadeps/chunkview.jl b/src/datadeps/chunkview.jl index 1c2aa600f..c769f4b37 100644 --- a/src/datadeps/chunkview.jl +++ b/src/datadeps/chunkview.jl @@ -35,7 +35,10 @@ function aliasing(x::ChunkView{N}) where N return remotecall_fetch(root_worker_id(x.chunk.processor), x.chunk, x.slices) do x, slices x = unwrap(x) v = view(x, slices...) - return aliasing(v) + # A view of a whole-object container (e.g. `DSparseArray`) must alias the + # entire container; `aliasing_unwrapped` resolves that (otherwise it just + # aliases the view), and crucially does so here where `v` lives. + return aliasing_unwrapped(v) end end memory_space(x::ChunkView) = memory_space(x.chunk) diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl index 2c2c49920..8a74d3839 100644 --- a/src/datadeps/remainders.jl +++ b/src/datadeps/remainders.jl @@ -43,6 +43,7 @@ memory_spans(mra::MultiRemainderAliasing) = vcat(memory_spans.(mra.remainders).. Base.hash(mra::MultiRemainderAliasing, h::UInt) = hash(mra.remainders, hash(MultiRemainderAliasing, h)) Base.:(==)(mra1::MultiRemainderAliasing, mra2::MultiRemainderAliasing) = mra1.remainders == mra2.remainders +# A whole-object copy from the argument's recorded owner space. struct FullCopy end """ @@ -87,9 +88,34 @@ function compute_remainder_for_arg!(state::DataDepsState, target_space::MemorySpace, arg_w::ArgumentWrapper, write_num::Int; compute_syncdeps::Bool=true) + owner_space = state.arg_owner[arg_w] + + # Whole-object containers (e.g. `DSparseArray`) reallocate their entire + # storage on every write, so they can never be *partially* current in a + # memory space -- the whole object is current in exactly one space: the most + # recent writer (or the origin owner if never written). There is thus never a + # meaningful span "remainder" to compute, so we short-circuit to a whole-object + # `FullCopy` from the owner space (or `NoAliasing` if the target already holds + # it). This avoids the span-diff computation and the construction + transfer of + # a `RemainderAliasing` entirely. We detect this from the aliasing info, which + # is a bare `ObjectAliasing` for such containers (see `aliases_as_whole`). + # + # `arg_owner` is the most-recent *direct* writer of `arg_w`, while the merged + # `arg_history` records the latest write through *any* overlapping argument. + # For every path Datadeps currently exercises a whole-object container is only + # ever reached through a single argument, so these agree -- the assertion below + # guards that invariant. They could only diverge under cross-argument aliasing + # of a whole-object container (the same container written through one argument + # and read through a different, overlapping one); supporting that would require + # sourcing the copy from `history[end].space` instead of `arg_owner`. + if aliases_as_whole(aliasing!(state, target_space, arg_w)) + history = state.arg_history[arg_w] + @assert isempty(history) || history[end].space == owner_space "whole-object container reached through a different overlapping argument than its owner; cross-argument aliasing of whole-object containers is unsupported" + return owner_space == target_space ? (NoAliasing(), 0) : (FullCopy(), 0) + end + spaces_set = Set{MemorySpace}() push!(spaces_set, target_space) - owner_space = state.arg_owner[arg_w] push!(spaces_set, owner_space) @label restart @@ -419,6 +445,9 @@ end # Main copy function for RemainderAliasing function move!(dep_mod::RemainderAliasing{S}, to_space::MemorySpace, from_space::MemorySpace, to::Chunk, from::Chunk) where S # TODO: Support direct copy between GPU memory spaces + # N.B. Whole-object containers (e.g. `DSparseArray`) never reach here: their + # storage reallocates on write, so they can't be span-copied. They are routed + # to a whole-object `FullCopy` in `compute_remainder_for_arg!` instead. # Copy the data from the source object copies = remotecall_fetch(root_worker_id(from_space), from_space, dep_mod, from) do from_space, dep_mod, from @@ -523,18 +552,4 @@ function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to::Base else write_remainder!(copies, copies_offset, to[], to_ptr, n) end -end - -function find_object_holding_ptr(A::SparseMatrixCSC, ptr::UInt64) - span = LocalMemorySpan(pointer(A.nzval), length(A.nzval)*sizeof(eltype(A.nzval))) - if span_start(span) <= ptr <= span_end(span) - return A.nzval - end - span = LocalMemorySpan(pointer(A.colptr), length(A.colptr)*sizeof(eltype(A.colptr))) - if span_start(span) <= ptr <= span_end(span) - return A.colptr - end - span = LocalMemorySpan(pointer(A.rowval), length(A.rowval)*sizeof(eltype(A.rowval))) - @assert span_start(span) <= ptr <= span_end(span) "Pointer $ptr not found in SparseMatrixCSC" - return A.rowval end \ No newline at end of file diff --git a/src/file-io.jl b/src/file-io.jl index 1407fbced..15405d049 100644 --- a/src/file-io.jl +++ b/src/file-io.jl @@ -1,5 +1,4 @@ export save, load -using SparseArrays import MemPool: GenericFileDevice, FileRef struct File diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl index eb4f7ad5b..5d06edd64 100644 --- a/src/memory-spaces.jl +++ b/src/memory-spaces.jl @@ -390,17 +390,90 @@ end aliasing(::String) = NoAliasing() # FIXME: Not necessarily true aliasing(::Symbol) = NoAliasing() aliasing(::Type) = NoAliasing() + +""" + aliases_as_whole(x) -> Bool + +Whether `x` -- either an object, or an already-computed aliasing -- must be +treated as a single, indivisible unit, i.e. it is never safe to alias (or to +consider current) only *part* of it. + +For an object: containers whose backing storage may be reallocated or resized on +write (such as `DSparseArray`) return `true`. This causes [`aliasing_root`](@ref) +to resolve any view/wrapper of them to the whole container, so all access funnels +through the container's own `aliasing`. By contract, such a container's `aliasing` +must be a bare [`ObjectAliasing`](@ref). + +For an aliasing: a bare `ObjectAliasing` (the aliasing of a whole-object +container) reports `true`. Datadeps uses this to copy such arguments as a whole +(a `FullCopy`) rather than computing per-span remainders, since they can never be +*partially* current in a memory space. +""" +aliases_as_whole(@nospecialize x) = false +aliases_as_whole(ainfo::AliasingWrapper) = aliases_as_whole(ainfo.inner) +aliases_as_whole(::ObjectAliasing) = true + +""" + aliasing_root(x) + +Resolve `x` down to the whole-object container it wraps: peel array wrappers +(views, transposes, adjoints, reshapes, permutations, ...) off of `x` until +reaching a container that must alias as a whole (see [`aliases_as_whole`](@ref)), +and return that container. If no such container is wrapped, return `x` unchanged. + +This relies solely on the standard `Base.parent` interface, so it transparently +handles *any* array wrapper without per-wrapper `aliasing` methods: a new wrapper +type needs no special-casing here, and a new whole-object container only needs to +define [`aliases_as_whole`](@ref) (plus its own `aliasing` method, which must +return a bare `ObjectAliasing`). A trapping `Base.pointer` on such containers +guards against a wrapper slipping through and being misinterpreted as strided +memory. + +!!! warning + The resolved object's *identity* defines its aliasing, so this is only + meaningful in the memory space where `x` physically resides. Never return its + result across a worker boundary and *then* compute `aliasing` -- the transfer + copies the object and changes its aliasing. Use [`aliasing_unwrapped`](@ref), + which fuses the two operations so they always run together. +""" +aliasing_root(@nospecialize x) = x +function aliasing_root(x::AbstractArray) + aliases_as_whole(x) && return x + p = parent(x) + # `Base.parent` returns the argument itself for non-wrapper arrays, which + # terminates the recursion. + p === x && return x + root = aliasing_root(p) + return aliases_as_whole(root) ? root : x +end + +""" + aliasing_unwrapped(x[, dep_mod]) + +Compute the `aliasing` of `x`, first resolving (via [`aliasing_root`](@ref)) any +whole-object container that `x` wraps. This is the entry point Datadeps uses to +compute the aliasing of a (possibly wrapped) argument. + +Unwrapping and `aliasing` are intentionally fused into a single call so they +always execute in the same place. It MUST be evaluated where `x` physically lives +-- e.g. *inside* a `Chunk`'s `remotecall_fetch` block -- because the unwrapped +object's identity determines its aliasing; returning the unwrapped object across +a worker boundary first would copy it and silently change the result. +""" +aliasing_unwrapped(x) = aliasing(aliasing_root(x)) +aliasing_unwrapped(x, dep_mod) = aliasing(aliasing_root(x), dep_mod) + function aliasing(x::Chunk, T) @assert x.handle isa DRef if root_worker_id(x.processor) == myid() - return aliasing(unwrap(x), T) + return aliasing_unwrapped(unwrap(x), T) end return remotecall_fetch(root_worker_id(x.processor), x, T) do x, T - aliasing(unwrap(x), T) + aliasing_unwrapped(unwrap(x), T) end end aliasing(x::Chunk) = remotecall_fetch(root_worker_id(x.processor), x) do x - aliasing(unwrap(x)) + aliasing_unwrapped(unwrap(x)) end aliasing(x::DTask, T) = aliasing(fetch(x; raw=true), T) aliasing(x::DTask) = aliasing(fetch(x; raw=true)) diff --git a/test/Project.toml b/test/Project.toml index af2d30b14..53cd012ac 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c" ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab" @@ -9,8 +10,10 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" +IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94" OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e" diff --git a/test/array/linalg/core.jl b/test/array/linalg/core.jl index 4c66ac34a..5de6fefd1 100644 --- a/test/array/linalg/core.jl +++ b/test/array/linalg/core.jl @@ -23,3 +23,57 @@ end DA = DArray(A) @test isapprox(norm(A), norm(DA)) end + +@testset "BLAS-1 vector ops" begin + @testset "T=$T part=$(part.blocksize)" for T in (Float64, ComplexF64), part in (Blocks(16), Blocks(4)) + n = 16 + x = rand(T, n) + y = rand(T, n) + a = T(2) + b = T(3) + Dx = distribute(x, part) + + # dot (conjugates first arg for complex) + @test dot(Dx, distribute(y, part)) ≈ dot(x, y) + + # axpy!: y += a*x + Dy = distribute(copy(y), part) + axpy!(a, Dx, Dy) + @test collect(Dy) ≈ a .* x .+ y + + # axpby!: y = a*x + b*y + Dy = distribute(copy(y), part) + axpby!(a, Dx, b, Dy) + @test collect(Dy) ≈ a .* x .+ b .* y + + # rmul!/lmul!: scale in place + Dz = distribute(copy(x), part) + rmul!(Dz, a) + @test collect(Dz) ≈ a .* x + Dz = distribute(copy(x), part) + lmul!(a, Dz) + @test collect(Dz) ≈ a .* x + end + + # Operands with *different* partitionings must still work (aligned internally + # via maybe_copy_buffered), and the in-place result keeps its own layout. + @testset "mismatched partitionings T=$T" for T in (Float64, ComplexF64) + n = 16 + x = rand(T, n) + y = rand(T, n) + a, b = T(2), T(3) + Dx = distribute(x, Blocks(4)) + Dy = distribute(y, Blocks(8)) + + @test dot(Dx, Dy) ≈ dot(x, y) + + Dy2 = distribute(copy(y), Blocks(8)) + axpy!(a, Dx, Dy2) + @test collect(Dy2) ≈ a .* x .+ y + @test Dy2.partitioning == Blocks(8) # output layout preserved + + Dy3 = distribute(copy(y), Blocks(8)) + axpby!(a, Dx, b, Dy3) + @test collect(Dy3) ≈ a .* x .+ b .* y + end +end diff --git a/test/array/linalg/iterativesolvers.jl b/test/array/linalg/iterativesolvers.jl new file mode 100644 index 000000000..ca7865f29 --- /dev/null +++ b/test/array/linalg/iterativesolvers.jl @@ -0,0 +1,206 @@ +# Distributed iterative (Krylov) linear-solver tests. +# +# Exercises the matrix-free Krylov integration (`Dagger.cg`/`minres`/`gmres`/ +# `bicgstab` + the generic `krylov_solve`) over both dense and sparse-backed +# `DMatrix` operators, plus the Jacobi preconditioner. Reference solutions come +# from a dense direct solve. +# +# julia test/runtests.jl --test array/linalg/iterativesolvers + +using Krylov +using AlgebraicMultigrid +using IncompleteLU + +# Strongly diagonally-dominant tridiagonal SPD matrix. The large diagonal keeps +# the condition number small so the Krylov methods converge in a handful of +# iterations (keeping the distributed test fast). `inv(diag) == 1/4`. +const SPD_DIAG = 4.0 + +laplacian_1d(T, n) = SparseArrays.spdiagm( + -1 => fill(-one(T), n - 1), + 0 => fill(T(SPD_DIAG), n), + 1 => fill(-one(T), n - 1), +) + +# Add a first-order advection term -> nonsymmetric, still well-conditioned. +function advection_diffusion_1d(T, n) + return laplacian_1d(T, n) + SparseArrays.spdiagm( + -1 => fill(T(-3) / 10, n - 1), + 1 => fill(T(3) / 10, n - 1), + ) +end + +@testset "Iterative solvers (Krylov)" begin + n = 64 + k = 16 + Db_part = Blocks(k) + A_part = Blocks(k, k) + + @testset "SPD operator ($(backend))" for backend in (:dense, :sparse) + Asp = laplacian_1d(Float64, n) + Adense = Matrix(Asp) + b = rand(n) + xref = Adense \ b + + DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + @testset "$(nameof(solver))" for solver in (Dagger.cg, Dagger.minres, Dagger.gmres, Dagger.bicgstab) + x, stats = solver(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test x isa Dagger.DVector + @test collect(x) ≈ xref rtol = 1e-6 + end + + # Generic entry point. + x, stats = Dagger.krylov_solve(:cg, DA, Db; atol = 1e-12, rtol = 1e-10) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + + @testset "nonsymmetric operator ($(backend))" for backend in (:dense, :sparse) + Asp = advection_diffusion_1d(Float64, n) + b = rand(n) + xref = Matrix(Asp) \ b + + DA = backend === :dense ? distribute(Matrix(Asp), A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + @testset "$(nameof(solver))" for solver in (Dagger.gmres, Dagger.bicgstab) + x, stats = solver(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + end + + @testset "complex SPD (Hermitian) operator" begin + # Real SPD tridiagonal is Hermitian as a complex matrix. + Asp = SparseArrays.spdiagm( + -1 => fill(ComplexF64(-1), n - 1), + 0 => fill(ComplexF64(SPD_DIAG), n), + 1 => fill(ComplexF64(-1), n - 1), + ) + b = rand(ComplexF64, n) + xref = Matrix(Asp) \ b + DA = distribute(Asp, A_part) + Db = distribute(b, Db_part) + x, stats = Dagger.cg(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + + @testset "Jacobi preconditioner" begin + Asp = laplacian_1d(Float64, n) + b = rand(n) + xref = Matrix(Asp) \ b + + @testset "build + apply ($(backend))" for backend in (:dense, :sparse) + DA = backend === :dense ? distribute(Matrix(Asp), A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + P = Dagger.JacobiPreconditioner(DA) + @test collect(P.dinv) ≈ fill(1 / SPD_DIAG, n) # 1/diag + + # Apply: y = M⁻¹ x = dinv .* x. + y = similar(Db) + mul!(y, P, Db) + @test collect(y) ≈ (1 / SPD_DIAG) .* b + + x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + + # Non-square block grid must be rejected with a helpful error. + DA_ragged = distribute(Matrix(Asp), Blocks(k, k ÷ 2)) + @test_throws ArgumentError Dagger.JacobiPreconditioner(DA_ragged) + end + + @testset "block-Jacobi preconditioner" begin + Asp = laplacian_1d(Float64, n) + Adense = Matrix(Asp) + b = rand(n) + xref = Adense \ b + + # Reference: apply the exact block-diagonal inverse. + yref = similar(b) + for s in 1:k:n + r = s:min(s + k - 1, n) + yref[r] = Adense[r, r] \ b[r] + end + + @testset "build + apply ($(backend))" for backend in (:dense, :sparse) + DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + P = Dagger.BlockJacobiPreconditioner(DA) + y = similar(Db) + mul!(y, P, Db) + @test collect(y) ≈ yref + + x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + + # A single tile makes block-Jacobi an *exact* solve, so PCG converges + # essentially immediately. + DA1 = distribute(Adense, Blocks(n, n)) + Db1 = distribute(b, Blocks(n)) + P1 = Dagger.BlockJacobiPreconditioner(DA1) + x1, s1 = Dagger.cg(DA1, Db1; M = P1, atol = 1e-12, rtol = 1e-10, itmax = 500) + @test s1.solved + @test s1.niter <= 2 + @test collect(x1) ≈ xref rtol = 1e-8 + + @test_throws ArgumentError Dagger.BlockJacobiPreconditioner(distribute(Adense, Blocks(k, k ÷ 2))) + end + + @testset "AMG preconditioner ($(method))" for method in (:ruge_stuben, :smoothed_aggregation) + Asp = laplacian_1d(Float64, n) + Adense = Matrix(Asp) + b = rand(n) + xref = Adense \ b + + @testset "$(backend)" for backend in (:dense, :sparse) + DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + P = Dagger.AMGPreconditioner(DA; method = method) + # Apply is a V-cycle approximating M⁻¹ x; just check it runs + is finite + # (and repeatable, exercising the cached, pinned hierarchy). + y1 = similar(Db); mul!(y1, P, Db) + y2 = similar(Db); mul!(y2, P, Db) + @test all(isfinite, collect(y1)) + @test collect(y1) ≈ collect(y2) + + x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + end + + @testset "block-ILU preconditioner" begin + Asp = laplacian_1d(Float64, n) + Adense = Matrix(Asp) + b = rand(n) + xref = Adense \ b + + @testset "build + apply ($(backend))" for backend in (:dense, :sparse) + DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part) + Db = distribute(b, Db_part) + + P = Dagger.BlockILUPreconditioner(DA; τ = 0.01) + y1 = similar(Db); mul!(y1, P, Db) + y2 = similar(Db); mul!(y2, P, Db) + @test all(isfinite, collect(y1)) + @test collect(y1) ≈ collect(y2) + + x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500) + @test stats.solved + @test collect(x) ≈ xref rtol = 1e-6 + end + + @test_throws ArgumentError Dagger.BlockILUPreconditioner(distribute(Adense, Blocks(k, k ÷ 2)); τ = 0.01) + end +end diff --git a/test/array/linalg/matmul.jl b/test/array/linalg/matmul.jl index ebc9d8ddc..f8b090d8a 100644 --- a/test/array/linalg/matmul.jl +++ b/test/array/linalg/matmul.jl @@ -29,36 +29,71 @@ function test_gemm!(T, szA, szB, partA, partB) DA = distribute(A, partA) DB = distribute(B, partB) + SA = sprand(T, szA..., 0.1) + SB = sprand(T, szA..., 0.1) + + DSA = distribute(SA, partA) + DSB = distribute(SB, partB) + ## Out-of-place gemm # No transA, No transB + # Dense DC = DA * DB C = A * B @test collect(DC) ≈ C + # Sparse + DSC = DSA * DSB + SC = SA * SB + @test collect(DSC) ≈ SC if szA == szB # No transA, transB + # Dense DC = DA * DB' C = A * B' @test collect(DC) ≈ C + # Sparse + DSC = DSA * DSB' + SC = SA * SB' + @test collect(DSC) ≈ SC # transA, No transB + # Dense DC = DA' * DB C = A' * B @test collect(DC) ≈ C + # Sparse + DSC = DSA' * DSB + SC = SA' * SB + @test collect(DSC) ≈ SC end # transA, transB + # Dense DC = DA' * DB' C = A' * B' @test collect(DC) ≈ C + #= Sparse + DSC = DSA' * DSB' + SC = SA' * SB' + @test collect(DSC) ≈ SC + =# ## In-place gemm # No transA, No transB + # Dense C = zeros(T, szC...) DC = distribute(C, partC) mul!(C, A, B) mul!(DC, DA, DB) @test collect(DC) ≈ C + #= Sparse + SC = zeros(T, szC...) + DSC = distribute(SC, partC) + mul!(SC, SA, SB) + mul!(DSC, DSA, DSB) + @test collect(DSC) ≈ SC + =# if szA == szB # No transA, transB diff --git a/test/array/linalg/matmul_finch.jl b/test/array/linalg/matmul_finch.jl new file mode 100644 index 000000000..061874f1e --- /dev/null +++ b/test/array/linalg/matmul_finch.jl @@ -0,0 +1,105 @@ +# Fast, focused sparse matrix-multiply tests for the Finch backend. +# +# Mirrors `matmul_sparse.jl` (SparseArrays) but exercises Finch tensor tiles via +# `DSparseMatrix`. Finch is an optional (weak) dependency, so this test loads it +# on all workers and skips gracefully if it cannot be loaded. +# +# Run with: +# +# julia test/runtests.jl --test array/linalg/matmul_finch +# +# N.B. Finch precompilation is heavy (several minutes); this test is intended for +# local validation rather than fast CI. + +# Make the test-project environment (which provides Finch) resolvable on the +# workers, which are launched with Dagger's project. +const _FINCH_TEST_ENV = abspath(joinpath(@__DIR__, "..", "..")) +@everywhere pushfirst!(LOAD_PATH, $_FINCH_TEST_ENV) + +const FINCH_AVAILABLE = try + @eval using Finch + @everywhere using Finch + true +catch err + @warn "Finch unavailable; skipping Finch matmul tests" exception=err + false +end + +if !FINCH_AVAILABLE + @testset "Finch GEMM (quick)" begin + @test_skip false + end +else + const FINCH_DENSITY = 0.3 + + # Self-consistent: build distributed Finch operands, take the dense + # `collect` as the reference, and compare against the distributed result. + function test_finch_gemm!(T, sz, partA, partB) + partC = Blocks(partA.blocksize[1], partB.blocksize[2]) + + DSA = Finch.fsprand(partA, T, sz, FINCH_DENSITY) + DSB = Finch.fsprand(partB, T, sz, FINCH_DENSITY) + A = collect(DSA) + B = collect(DSB) + + ## Out-of-place + @test collect(DSA * DSB) ≈ A * B # N / N + @test collect(DSA * DSB') ≈ A * B' # N / C + @test collect(DSA' * DSB) ≈ A' * B # C / N + @test collect(DSA' * DSB') ≈ A' * B' # C / C + + ## Symmetric rank-k (syrk path: A === B with one operand transposed) + @test collect(DSA' * DSA) ≈ A' * A + @test collect(DSA * DSA') ≈ A * A' + + ## In-place + DSC = Finch.fspzeros(partC, T, sz) + mul!(DSC, DSA, DSB) + @test collect(DSC) ≈ A * B + end + + const FINCH_QUICK_CASES = [ + ((8, 8), Blocks(4, 4), Blocks(4, 4)), + ((8, 8), Blocks(2, 4), Blocks(4, 2)), + ] + + @testset "Finch GEMM (quick)" begin + @testset "size=$sz part=$(partA.blocksize)/$(partB.blocksize)" for (sz, partA, partB) in FINCH_QUICK_CASES + @testset "T=$T" for T in (Float64, ComplexF64) + test_finch_gemm!(T, sz, partA, partB) + end + end + end + + # Sparse matrix-vector multiply (SpMV): Finch sparse matrix tiles, dense vectors. + function test_finch_spmv!(T, n, part) + bs = part.blocksize[1] + DSA = Finch.fsprand(part, T, (n, n), FINCH_DENSITY) + A = collect(DSA) + x = rand(T, n) + Dx = distribute(x, Blocks(bs)) + + @test collect(DSA * Dx) ≈ A * x + @test collect(transpose(DSA) * Dx) ≈ transpose(A) * x + @test collect(DSA' * Dx) ≈ A' * x + + y = rand(T, n) + Dy = distribute(copy(y), Blocks(bs)) + alpha, beta = T(2), T(3) + mul!(Dy, DSA, Dx, alpha, beta) + @test collect(Dy) ≈ alpha * (A * x) + beta * y + end + + const FINCH_SPMV_CASES = [ + (8, Blocks(4, 4)), + (8, Blocks(2, 2)), + ] + + @testset "Finch SpMV (quick)" begin + @testset "n=$n part=$(part.blocksize)" for (n, part) in FINCH_SPMV_CASES + @testset "T=$T" for T in (Float64, ComplexF64) + test_finch_spmv!(T, n, part) + end + end + end +end diff --git a/test/array/linalg/matmul_sparse.jl b/test/array/linalg/matmul_sparse.jl new file mode 100644 index 000000000..cc2e8274e --- /dev/null +++ b/test/array/linalg/matmul_sparse.jl @@ -0,0 +1,139 @@ +# Fast, focused sparse matrix-multiply tests. +# +# This is intentionally a *small* subset of the full GEMM suite in `matmul.jl`, +# meant for quick iteration while developing sparse-array support (the full +# suite takes ~1hr). Run with: +# +# julia test/runtests.jl --test array/linalg/matmul_sparse +# +# Keep this fast: only a couple of sizes/partitionings, just enough to exercise +# the out-of-place and in-place paths (incl. transposes) for both real and +# complex element types. + +const SPARSE_DENSITY = 0.3 + +function test_sparse_gemm!(T, sz, partA, partB) + rows, cols = sz + @assert rows == cols "sparse quick-test uses square matrices so transposes line up" + partC = Blocks(partA.blocksize[1], partB.blocksize[2]) + + SA = sprand(T, sz..., SPARSE_DENSITY) + SB = sprand(T, sz..., SPARSE_DENSITY) + + DSA = distribute(SA, partA) + DSB = distribute(SB, partB) + + ## Out-of-place + @test collect(DSA * DSB) ≈ SA * SB # N / N + @test collect(DSA * DSB') ≈ SA * SB' # N / T + @test collect(DSA' * DSB) ≈ SA' * SB # T / N + @test collect(DSA' * DSB') ≈ SA' * SB' # T / T + + ## Symmetric rank-k (syrk path: A === B with one operand transposed) + @test collect(DSA' * DSA) ≈ Array(SA)' * Array(SA) + @test collect(DSA * DSA') ≈ Array(SA) * Array(SA)' + + ## In-place + SC = SA * SB + DSC = distribute(sparse(zeros(T, sz...)), partC) + mul!(DSC, DSA, DSB) + @test collect(DSC) ≈ SC +end + +const SPARSE_QUICK_CASES = [ + ((8, 8), Blocks(4, 4), Blocks(4, 4)), + ((8, 8), Blocks(2, 4), Blocks(4, 2)), + ((8, 8), Blocks(4, 2), Blocks(2, 4)), +] + +@testset "Sparse GEMM (quick)" begin + @testset "size=$sz part=$(partA.blocksize)/$(partB.blocksize)" for (sz, partA, partB) in SPARSE_QUICK_CASES + @testset "T=$T" for T in (Float64, ComplexF64) + test_sparse_gemm!(T, sz, partA, partB) + end + end +end + +# Sparse matrix-vector multiply (SpMV): sparse matrix tiles, dense vectors. This +# is the core kernel for distributed iterative (Krylov) solvers. +function test_sparse_spmv!(T, n, part) + bs = part.blocksize[1] + SA = sprand(T, n, n, SPARSE_DENSITY) + x = rand(T, n) + + DSA = distribute(SA, part) + Dx = distribute(x, Blocks(bs)) + + ## Out-of-place (N / T / C) + @test collect(DSA * Dx) ≈ SA * x + @test collect(transpose(DSA) * Dx) ≈ transpose(SA) * x + @test collect(DSA' * Dx) ≈ SA' * x + + ## In-place 5-arg mul!: y = alpha*A*x + beta*y + y = rand(T, n) + Dy = distribute(copy(y), Blocks(bs)) + alpha, beta = T(2), T(3) + mul!(Dy, DSA, Dx, alpha, beta) + @test collect(Dy) ≈ alpha * (SA * x) + beta * y +end + +const SPARSE_SPMV_CASES = [ + (8, Blocks(4, 4)), + (8, Blocks(2, 2)), +] + +@testset "Sparse SpMV (quick)" begin + @testset "n=$n part=$(part.blocksize)" for (n, part) in SPARSE_SPMV_CASES + @testset "T=$T" for T in (Float64, ComplexF64) + test_sparse_spmv!(T, n, part) + end + end +end + +# Any *partial* or *reinterpreted* access to a sparse container (views, +# transposes, adjoints, reshapes, and combinations thereof) must alias the +# *entire* container, so that Datadeps never tracks stale sub-spans of storage +# that may have been reallocated on write. This is enforced by `aliasing_root`, +# which resolves any such wrapper back to the container itself (via `Base.parent`) +# -- exposed to Datadeps through the fused `aliasing_unwrapped` -- plus a trapping +# `Base.pointer` that fires if a wrapper ever slips through to the strided path. +@testset "Sparse whole-container aliasing" begin + M = Dagger.DSparseMatrix{Float64}(sprand(Float64, 6, 6, 0.4)) + + @test Dagger.aliases_as_whole(M) + @test !Dagger.aliases_as_whole(rand(Float64, 6, 6)) + # The container's aliasing is a bare `ObjectAliasing`, which also reports + # whole-object (drives the Datadeps whole-object copy short-circuit). + @test Dagger.aliasing(M) isa Dagger.ObjectAliasing + @test Dagger.aliases_as_whole(Dagger.aliasing(M)) + @test !Dagger.aliases_as_whole(Dagger.aliasing(rand(Float64, 6, 6))) + + # `aliasing_root` peels any array wrapper back to the sparse container. + @test Dagger.aliasing_root(M) === M + @test Dagger.aliasing_root(view(M, 1:3, :)) === M + @test Dagger.aliasing_root(view(M, 4:6, 2:4)) === M + @test Dagger.aliasing_root(transpose(M)) === M + @test Dagger.aliasing_root(M') === M + @test Dagger.aliasing_root(reshape(M, 36)) === M + @test Dagger.aliasing_root(view(transpose(M), 1:2, :)) === M + + # After unwrapping, all wrappers alias the same whole container. + a_full = Dagger.aliasing(M) + @test Dagger.will_alias(a_full, Dagger.aliasing_unwrapped(view(M, 1:3, :))) + @test Dagger.will_alias(Dagger.aliasing_unwrapped(view(M, 1:3, :)), + Dagger.aliasing_unwrapped(view(M, 4:6, :))) + + # Distinct containers must not alias. + M2 = Dagger.DSparseMatrix{Float64}(sprand(Float64, 6, 6, 0.4)) + @test !Dagger.will_alias(a_full, Dagger.aliasing(M2)) + + # Dense arrays are untouched: not whole-object, views stay strided, and + # `aliasing_root` returns them unchanged. + A = rand(Float64, 6, 6) + Av = view(A, 1:3, :) + @test Dagger.aliasing_root(Av) === Av + @test Dagger.aliasing_unwrapped(Av) isa Dagger.StridedAliasing + + # The `pointer` trap fires if a sparse container is treated as raw memory. + @test_throws ArgumentError pointer(M) +end diff --git a/test/runtests.jl b/test/runtests.jl index bc4505fb3..205985b12 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -7,6 +7,11 @@ USE_METAL = parse(Bool, get(ENV, "CI_USE_METAL", "0")) USE_OPENCL = parse(Bool, get(ENV, "CI_USE_OPENCL", "0")) USE_GPU = USE_CUDA || USE_ROCM || USE_ONEAPI || USE_METAL || USE_OPENCL +# Finch is a heavy optional dependency (precompilation takes minutes), so it is +# not a default test dependency. The dedicated Finch CI job signals via this +# variable to `Pkg.add` Finch and run only the Finch-backed tests. +USE_FINCH = parse(Bool, get(ENV, "CI_TEST_FINCH", "0")) + tests = [ ("Thunk", "thunk.jl"), ("Scheduler", "scheduler.jl"), @@ -31,10 +36,13 @@ tests = [ ("Array - LinearAlgebra - Core", "array/linalg/core.jl"), ("Array - LinearAlgebra - Arithmetic", "array/linalg/arithmetic.jl"), ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl"), + ("Array - LinearAlgebra - Matmul (Sparse, quick)", "array/linalg/matmul_sparse.jl"), + ("Array - LinearAlgebra - Matmul (Finch, quick)", "array/linalg/matmul_finch.jl"), ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"), ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"), ("Array - LinearAlgebra - Solve", "array/linalg/solve.jl"), ("Array - LinearAlgebra - QR", "array/linalg/qr.jl"), + ("Array - LinearAlgebra - Iterative Solvers", "array/linalg/iterativesolvers.jl"), ("Array - Permute", "array/permute.jl"), ("Array - Random", "array/random.jl"), ("Array - Stencils", "array/stencil.jl"), @@ -55,8 +63,22 @@ if USE_GPU ("Array - Stencils", "array/stencil.jl"), ] end +if USE_FINCH + # Only run the Finch-backed tests in the dedicated Finch CI job. + tests = [ + ("Array - LinearAlgebra - Matmul (Finch)", "array/linalg/matmul_finch.jl"), + ] +end all_test_names = map(test -> replace(last(test), ".jl"=>""), tests) +# Tests excluded from default runs; they only run when explicitly requested via +# `--test` or enabled by an environment signal. Finch's precompilation is very +# heavy, so its tests are opt-in (except in the dedicated Finch job, where we +# explicitly want them to run). +optin_test_names = USE_FINCH ? String[] : String[ + "array/linalg/matmul_finch", +] + additional_workers::Int = 3 if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ @@ -68,6 +90,10 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ Pkg.instantiate() catch end + if USE_FINCH + # Finch is not a default test dependency; add it on demand when signaled. + Pkg.add("Finch") + end using ArgParse s = ArgParseSettings(description = "Dagger Testsuite") @@ -102,7 +128,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ parsed_args = parse_args(s) to_test = String[] if isempty(parsed_args["test"]) - to_test = copy(all_test_names) + to_test = filter(!in(optin_test_names), copy(all_test_names)) else for _test in parsed_args["test"] test = only(_test) @@ -158,7 +184,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__ Pkg.offline(true) end else - to_test = all_test_names + to_test = filter(!in(optin_test_names), all_test_names) @info "Running all tests" end