JuliaParallel · jpsamaroo · Oct 21, 2025 · Apr 22, 2024 · Oct 28, 2025 · Nov 26, 2025
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -167,6 +167,18 @@ steps:
     env:
       CI_USE_OPENCL: "1"
 
+  - label: Julia 1.11 (Finch)
+    timeout_in_minutes: 90
+    <<: *test
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+    env:
+      CI_TEST_FINCH: "1"
+
   - label: Julia 1 - TimespanLogging
     timeout_in_minutes: 20
     <<: *test

diff --git a/Project.toml b/Project.toml
@@ -24,7 +24,6 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"
@@ -34,48 +33,62 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Finch = "9177782c-1635-4eb9-9bfb-d9dfa25e6bce"
+IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [extensions]
 AbstractFFTsExt = "AbstractFFTs"
+AlgebraicMultigridExt = ["AlgebraicMultigrid", "SparseArrays"]
 CUDAExt = "CUDA"
 DistributionsExt = "Distributions"
+FinchExt = "Finch"
 GraphVizExt = "GraphViz"
 GraphVizSimpleExt = "Colors"
+IncompleteLUExt = ["IncompleteLU", "SparseArrays"]
 IntelExt = "oneAPI"
 JSON3Ext = "JSON3"
+KrylovExt = "Krylov"
 MetalExt = "Metal"
 OpenCLExt = "OpenCL"
 PlotsExt = ["DataFrames", "Plots"]
 PythonExt = "PythonCall"
 ROCExt = "AMDGPU"
+SparseArraysExt = "SparseArrays"
 
 [compat]
 AMDGPU = "1, 2"
 AbstractFFTs = "1.5.0"
 Adapt = "4"
+AlgebraicMultigrid = "1"
 CUDA = "3, 4, 5"
 Colors = "0.12, 0.13"
 DataFrames = "1"
 DataStructures = "0.18, 0.19"
 DistributedNext = "1.0.0"
 Distributions = "0.25"
 FillArrays = "1.13.0"
+Finch = "1.2.9"
 GPUArraysCore = "0.2.0"
 GraphViz = "0.2"
 Graphs = "1"
+IncompleteLU = "0.2"
 JSON3 = "1"
 KernelAbstractions = "0.9"
+Krylov = "0.10"
 MacroTools = "0.5"
 MemPool = "0.4.12"
 Metal = "1.1"

diff --git a/ext/AlgebraicMultigridExt.jl b/ext/AlgebraicMultigridExt.jl
@@ -0,0 +1,45 @@
+module AlgebraicMultigridExt
+
+import AlgebraicMultigrid
+import SparseArrays
+import Dagger
+import Dagger: DMatrix
+import LinearAlgebra
+
+# Algebraic-multigrid preconditioner, built per diagonal tile on top of Dagger's
+# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). The AMG
+# hierarchy's expensive setup runs once per tile at construction; each apply runs
+# one V-cycle (a sequence of SpMVs, smoother sweeps, and a coarse solve).
+
+_as_sparse(A::SparseArrays.SparseMatrixCSC) = A
+_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A)
+
+function _amg_operator(tile, method::Symbol; kwargs...)
+    S = _as_sparse(Dagger._tile_matrix(tile))
+    ml = if method === :ruge_stuben
+        AlgebraicMultigrid.ruge_stuben(S; kwargs...)
+    elseif method === :smoothed_aggregation
+        AlgebraicMultigrid.smoothed_aggregation(S; kwargs...)
+    else
+        throw(ArgumentError("AMGPreconditioner: unknown method $(method); use \
+            :ruge_stuben or :smoothed_aggregation"))
+    end
+    return AlgebraicMultigrid.aspreconditioner(ml)
+end
+
+function Dagger.AMGPreconditioner(A::DMatrix; method::Symbol=:ruge_stuben, kwargs...)
+    build = tile -> _amg_operator(tile, method; kwargs...)
+    return Dagger._build_block_preconditioner(Dagger.AMGPreconditioner, A,
+                                              "AMGPreconditioner", build)
+end
+
+# An AMG preconditioner applies a V-cycle via `ldiv!` (`\` is not defined).
+Dagger._block_apply!(y, p::AlgebraicMultigrid.Preconditioner, x) =
+    (LinearAlgebra.ldiv!(y, p, x); return nothing)
+
+# The hierarchy is read-only and pinned to its worker; it is never written and
+# its internals (a `Vector{Level}`) aren't introspectable by datadeps, so treat
+# it as non-aliasing rather than recursing into it.
+Dagger.aliasing(::AlgebraicMultigrid.Preconditioner) = Dagger.NoAliasing()
+
+end # module AlgebraicMultigridExt
diff --git a/ext/FinchExt.jl b/ext/FinchExt.jl
@@ -0,0 +1,184 @@
+module FinchExt
+
+import Finch
+import LinearAlgebra
+import Dagger
+import Dagger: Blocks, AutoBlocks, BlocksOrAuto, AssignmentType, DSparseArray, DSparseMatrix
+
+Dagger._sparse_collect(A::Finch.Tensor) = Array(A)
+# Finch's generic `similar` produces tensor formats that destabilize later
+# `@finch`/`@einsum` kernels (observed as segfaults during tile moves), so
+# allocate an empty COO-backed tile explicitly instead.
+Dagger._sparse_similar(::Finch.Tensor, ::Type{T}, dims::Dims) where {T} =
+    Finch.fspzeros(T, dims...)
+Dagger.maybe_wrap_tile(x::Finch.Tensor) = DSparseArray(x)
+
+function Finch.fspzeros(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fspzeros(T, _dims...)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+Finch.fspzeros(p::BlocksOrAuto, T::Type, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, T, dims; assignment)
+Finch.fspzeros(p::BlocksOrAuto, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, Float64, dims; assignment)
+Finch.fspzeros(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, Float64, dims; assignment)
+Finch.fspzeros(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(Dagger.auto_blocks(dims), T, dims; assignment)
+
+function Finch.fsprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fsprand(T, _dims..., sparsity)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+Finch.fsprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+Finch.fsprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+Finch.fsprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, Float64, dims, sparsity; assignment)
+Finch.fsprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(Dagger.auto_blocks(dims), T, dims, sparsity; assignment)
+
+# Materialize a (possibly lazy/`SwizzleArray`) result into a concrete sparse
+# `Tensor`. `@einsum` returns a lazy `SwizzleArray` for transposed-output
+# patterns, which is not itself a `Finch.Tensor` and corrupts subsequent
+# accumulation steps; copying through `@finch` produces a clean `SparseCOO`.
+function _finch_materialize(X, ::Type{T}, dims) where {T}
+    out = Finch.fspzeros(T, dims...)
+    Finch.@finch begin
+        out .= 0
+        for j = _, i = _
+            if X[i, j] != 0
+                out[i, j] = X[i, j]
+            end
+        end
+    end
+    return out
+end
+
+# Finch tensors do not define `Base.copy`; build a fresh equivalent tensor.
+Dagger._sparse_copy(mat::Finch.Tensor) = _finch_materialize(mat, eltype(mat), size(mat))
+
+# Finch tensors do not support `setindex!`, so copy-buffering writeback into a
+# Finch tile densifies, applies the (partial-range) copy, then re-sparsifies.
+function Dagger._sparse_copyto_view!(mat::Finch.Tensor, Brange, src)
+    dense = Array(mat)
+    copyto!(view(dense, Brange), src)
+    return _finch_materialize(dense, eltype(mat), size(mat))
+end
+
+# Compute `C_out += alpha * op(A) * op(B)` via Finch's `@einsum`, choosing the
+# right index pattern for each transpose option. We deliberately use *explicit*
+# index patterns (`A[k,i]`) rather than `Finch.swizzle`, because feeding a
+# swizzled tensor into `@einsum` makes it return a lazy `SwizzleArray` (rather
+# than a materialized `Tensor`), which breaks subsequent accumulation steps.
+# Adjoint ('C') additionally applies `conj` inside the einsum.
+function _finch_einsum_matmul!(C_out, A, B, transA::Char, transB::Char, alpha)
+    aT = transA == 'T' || transA == 'C'
+    aC = transA == 'C'
+    bT = transB == 'T' || transB == 'C'
+    bC = transB == 'C'
+    if !aT && !bT
+        bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[k,j]))) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[k,j]))
+    elseif !aT && bT
+        bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[j,k]))) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[j,k]))
+    elseif aT && !bT
+        aC ? Finch.@einsum(C_out[i,j] += alpha * (conj(A[k,i]) * B[k,j])) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[k,i] * B[k,j]))
+    else # aT && bT
+        if aC && bC
+            Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * conj(B[j,k]))
+        elseif aC
+            Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * B[j,k])
+        elseif bC
+            Finch.@einsum C_out[i,j] += alpha * (A[k,i] * conj(B[j,k]))
+        else
+            Finch.@einsum C_out[i,j] += alpha * (A[k,i] * B[j,k])
+        end
+    end
+    return C_out
+end
+
+function Dagger.matmatmul!(
+    C::DSparseMatrix,
+    transA::Char,
+    transB::Char,
+    A::Finch.Tensor,
+    B::Finch.Tensor,
+    alpha,
+    beta
+)
+    if !isa(C.mat, Finch.Tensor)
+        # Not supported here, forward to generic matmatmul!
+        return Dagger.matmatmul!(C.mat, transA, transB, A, B, alpha, beta)
+    end
+    transA in ('N', 'T', 'C') || throw(ArgumentError("Invalid transA: $transA"))
+    transB in ('N', 'T', 'C') || throw(ArgumentError("Invalid transB: $transB"))
+
+    # `@einsum` cannot write into an existing sparse tensor in place, so we build
+    # a fresh output tensor and (re)assign it to the wrapper. `DSparseMatrix`
+    # hides this reallocation from Datadeps.
+    #
+    # N.B. BEWARE: `@einsum Cm[i,j] = ...` (assignment into an existing tensor)
+    # does not work; use a fresh `C_out` with `+=`.
+    Cm = C.mat
+    C_out = Finch.fspzeros(eltype(Cm), size(Cm)...)
+    C_out = _finch_einsum_matmul!(C_out, A, B, transA, transB, alpha)
+    # `@einsum` may return a lazy `SwizzleArray`; materialize to a concrete
+    # sparse `Tensor` so accumulation and later iterations stay well-typed.
+    C_out = _finch_materialize(C_out, eltype(Cm), size(Cm))
+    if beta != 0
+        C_out = C_out + beta * Cm
+    end
+    C.mat = C_out
+
+    return C
+end
+
+# Sparse matrix-vector multiply tile kernel: `C = alpha*op(A)*B + beta*C` with a
+# Finch tensor `A` and dense vectors `B`/`C`. Finch's `@einsum` does not reliably
+# accumulate into a dense output, so we densify the single sparse tile and use a
+# dense BLAS-backed `mul!`; this is a per-tile densification, not the whole matrix.
+function Dagger.matvecmul!(C::AbstractVector, transA::Char, A::Finch.Tensor, B::AbstractVector, alpha, beta)
+    Ad = Array(A)
+    opA = transA == 'N' ? Ad :
+          transA == 'C' ? adjoint(Ad) :
+          transA == 'T' ? transpose(Ad) :
+          throw(ArgumentError("Invalid transA: $transA"))
+    LinearAlgebra.mul!(C, opA, B, alpha, beta)
+    return C
+end
+
+# Tile transpose/symmetrization used by `copytri!`. Finch tensors don't support
+# in-place `setindex!`, so we densify, build the result, and re-sparsify via
+# `_finch_materialize`.
+# - `uplo === nothing`: off-diagonal tile, return the conjugate transpose.
+# - `uplo == 'U'`/`'L'`: diagonal tile, build the full Hermitian tile from the
+#   given triangle (matching the dense `copydiagtile!` semantics).
+function Dagger.transpose_tile(B::Finch.Tensor, uplo=nothing)
+    _B = Array(B)
+    if uplo === nothing
+        C = adjoint(_B)
+        return _finch_materialize(C, eltype(_B), size(C))
+    end
+    n = LinearAlgebra.checksquare(_B)
+    C = zeros(eltype(_B), n, n)
+    if uplo == 'U'
+        for j in 1:n, i in 1:n
+            C[i, j] = i <= j ? _B[i, j] : conj(_B[j, i])
+        end
+    elseif uplo == 'L'
+        for j in 1:n, i in 1:n
+            C[i, j] = i >= j ? _B[i, j] : conj(_B[j, i])
+        end
+    else
+        throw(ArgumentError("uplo must be 'U' or 'L', got $uplo"))
+    end
+    return _finch_materialize(C, eltype(_B), size(C))
+end
+
+end # module FinchExt
diff --git a/ext/IncompleteLUExt.jl b/ext/IncompleteLUExt.jl
@@ -0,0 +1,36 @@
+module IncompleteLUExt
+
+import IncompleteLU
+import SparseArrays
+import Dagger
+import Dagger: DMatrix
+import LinearAlgebra
+
+# Block incomplete-LU preconditioner, built per diagonal tile on top of Dagger's
+# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). Each tile
+# gets an ILU factorization (drop tolerance `τ`) once at construction; the apply
+# is a forward/backward substitution per block.
+
+_as_sparse(A::SparseArrays.SparseMatrixCSC) = A
+_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A)
+
+function _ilu_operator(tile; τ=0.001, kwargs...)
+    S = _as_sparse(Dagger._tile_matrix(tile))
+    return IncompleteLU.ilu(S; τ=τ, kwargs...)
+end
+
+function Dagger.BlockILUPreconditioner(A::DMatrix; kwargs...)
+    build = tile -> _ilu_operator(tile; kwargs...)
+    return Dagger._build_block_preconditioner(Dagger.BlockILUPreconditioner, A,
+                                              "BlockILUPreconditioner", build)
+end
+
+# An ILU factorization applies via `ldiv!` (`\` is not defined).
+Dagger._block_apply!(y, F::IncompleteLU.ILUFactorization, x) =
+    (LinearAlgebra.ldiv!(y, F, x); return nothing)
+
+# The factorization is read-only and pinned to its worker; treat it as
+# non-aliasing rather than recursing into its internals.
+Dagger.aliasing(::IncompleteLU.ILUFactorization) = Dagger.NoAliasing()
+
+end # module IncompleteLUExt
diff --git a/ext/KrylovExt.jl b/ext/KrylovExt.jl
@@ -0,0 +1,36 @@
+module KrylovExt
+
+import Krylov
+import Dagger
+import Dagger: DVector, DMatrix
+import LinearAlgebra
+
+# Distributed iterative solvers, implemented on top of Krylov.jl.
+#
+# We never form `A⁻¹`: Krylov only needs `mul!(y, A, x)` (and `mul!(y, A', x)`
+# for two-sided methods), which Dagger provides over `DVector`s via its
+# distributed SpMV/`gemv_dagger!` path. The remaining requirements -- `dot`,
+# `norm`, `axpy!`, `axpby!`, `rmul!`, `copyto!`, `fill!`, broadcasting -- are the
+# distributed BLAS-1 ops in `Dagger`.
+#
+# Workspace vectors are allocated through a `KrylovConstructor` built from
+# `similar(b)`, so every internal vector inherits `b`'s element type *and*
+# partitioning (`similar` preserves both). This keeps all `mul!`/`dot`/`axpy!`
+# operands on a compatible chunk layout, which the distributed kernels require.
+
+function _dagger_krylov(method::Symbol, A, b::DVector; kwargs...)
+    kc = Krylov.KrylovConstructor(similar(b))
+    workspace = Krylov.krylov_workspace(Val(method), kc)
+    Krylov.krylov_solve!(workspace, A, b; kwargs...)
+    return Krylov.solution(workspace), Krylov.statistics(workspace)
+end
+
+Dagger.krylov_solve(method::Symbol, A, b::DVector; kwargs...) =
+    _dagger_krylov(method, A, b; kwargs...)
+
+Dagger.cg(A, b::DVector; kwargs...)       = _dagger_krylov(:cg, A, b; kwargs...)
+Dagger.minres(A, b::DVector; kwargs...)   = _dagger_krylov(:minres, A, b; kwargs...)
+Dagger.gmres(A, b::DVector; kwargs...)    = _dagger_krylov(:gmres, A, b; kwargs...)
+Dagger.bicgstab(A, b::DVector; kwargs...) = _dagger_krylov(:bicgstab, A, b; kwargs...)
+
+end # module KrylovExt