diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8e3b41511..95e9b9877 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -167,6 +167,18 @@ steps:
     env:
       CI_USE_OPENCL: "1"
 
+  - label: Julia 1.11 (Finch)
+    timeout_in_minutes: 90
+    <<: *test
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+      - JuliaCI/julia-test#v1: ~
+      - JuliaCI/julia-coverage#v1:
+          codecov: true
+    env:
+      CI_TEST_FINCH: "1"
+
   - label: Julia 1 - TimespanLogging
     timeout_in_minutes: 20
     <<: *test
diff --git a/Project.toml b/Project.toml
index ce49bf6d7..1668e9db1 100644
--- a/Project.toml
+++ b/Project.toml
@@ -24,7 +24,6 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
-SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TaskLocalValues = "ed4db957-447d-4319-bfb6-7fa9ae7ecf34"
@@ -34,36 +33,47 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 [weakdeps]
 AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
+AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
+Finch = "9177782c-1635-4eb9-9bfb-d9dfa25e6bce"
+IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
+Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
+SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"
 
 [extensions]
 AbstractFFTsExt = "AbstractFFTs"
+AlgebraicMultigridExt = ["AlgebraicMultigrid", "SparseArrays"]
 CUDAExt = "CUDA"
 DistributionsExt = "Distributions"
+FinchExt = "Finch"
 GraphVizExt = "GraphViz"
 GraphVizSimpleExt = "Colors"
+IncompleteLUExt = ["IncompleteLU", "SparseArrays"]
 IntelExt = "oneAPI"
 JSON3Ext = "JSON3"
+KrylovExt = "Krylov"
 MetalExt = "Metal"
 OpenCLExt = "OpenCL"
 PlotsExt = ["DataFrames", "Plots"]
 PythonExt = "PythonCall"
 ROCExt = "AMDGPU"
+SparseArraysExt = "SparseArrays"
 
 [compat]
 AMDGPU = "1, 2"
 AbstractFFTs = "1.5.0"
 Adapt = "4"
+AlgebraicMultigrid = "1"
 CUDA = "3, 4, 5"
 Colors = "0.12, 0.13"
 DataFrames = "1"
@@ -71,11 +81,14 @@ DataStructures = "0.18, 0.19"
 DistributedNext = "1.0.0"
 Distributions = "0.25"
 FillArrays = "1.13.0"
+Finch = "1.2.9"
 GPUArraysCore = "0.2.0"
 GraphViz = "0.2"
 Graphs = "1"
+IncompleteLU = "0.2"
 JSON3 = "1"
 KernelAbstractions = "0.9"
+Krylov = "0.10"
 MacroTools = "0.5"
 MemPool = "0.4.12"
 Metal = "1.1"
diff --git a/ext/AlgebraicMultigridExt.jl b/ext/AlgebraicMultigridExt.jl
new file mode 100644
index 000000000..dd68bbcfa
--- /dev/null
+++ b/ext/AlgebraicMultigridExt.jl
@@ -0,0 +1,45 @@
+module AlgebraicMultigridExt
+
+import AlgebraicMultigrid
+import SparseArrays
+import Dagger
+import Dagger: DMatrix
+import LinearAlgebra
+
+# Algebraic-multigrid preconditioner, built per diagonal tile on top of Dagger's
+# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). The AMG
+# hierarchy's expensive setup runs once per tile at construction; each apply runs
+# one V-cycle (a sequence of SpMVs, smoother sweeps, and a coarse solve).
+
+_as_sparse(A::SparseArrays.SparseMatrixCSC) = A
+_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A)
+
+function _amg_operator(tile, method::Symbol; kwargs...)
+    S = _as_sparse(Dagger._tile_matrix(tile))
+    ml = if method === :ruge_stuben
+        AlgebraicMultigrid.ruge_stuben(S; kwargs...)
+    elseif method === :smoothed_aggregation
+        AlgebraicMultigrid.smoothed_aggregation(S; kwargs...)
+    else
+        throw(ArgumentError("AMGPreconditioner: unknown method $(method); use \
+            :ruge_stuben or :smoothed_aggregation"))
+    end
+    return AlgebraicMultigrid.aspreconditioner(ml)
+end
+
+function Dagger.AMGPreconditioner(A::DMatrix; method::Symbol=:ruge_stuben, kwargs...)
+    build = tile -> _amg_operator(tile, method; kwargs...)
+    return Dagger._build_block_preconditioner(Dagger.AMGPreconditioner, A,
+                                              "AMGPreconditioner", build)
+end
+
+# An AMG preconditioner applies a V-cycle via `ldiv!` (`\` is not defined).
+Dagger._block_apply!(y, p::AlgebraicMultigrid.Preconditioner, x) =
+    (LinearAlgebra.ldiv!(y, p, x); return nothing)
+
+# The hierarchy is read-only and pinned to its worker; it is never written and
+# its internals (a `Vector{Level}`) aren't introspectable by datadeps, so treat
+# it as non-aliasing rather than recursing into it.
+Dagger.aliasing(::AlgebraicMultigrid.Preconditioner) = Dagger.NoAliasing()
+
+end # module AlgebraicMultigridExt
diff --git a/ext/FinchExt.jl b/ext/FinchExt.jl
new file mode 100644
index 000000000..8b43bdb05
--- /dev/null
+++ b/ext/FinchExt.jl
@@ -0,0 +1,184 @@
+module FinchExt
+
+import Finch
+import LinearAlgebra
+import Dagger
+import Dagger: Blocks, AutoBlocks, BlocksOrAuto, AssignmentType, DSparseArray, DSparseMatrix
+
+Dagger._sparse_collect(A::Finch.Tensor) = Array(A)
+# Finch's generic `similar` produces tensor formats that destabilize later
+# `@finch`/`@einsum` kernels (observed as segfaults during tile moves), so
+# allocate an empty COO-backed tile explicitly instead.
+Dagger._sparse_similar(::Finch.Tensor, ::Type{T}, dims::Dims) where {T} =
+    Finch.fspzeros(T, dims...)
+Dagger.maybe_wrap_tile(x::Finch.Tensor) = DSparseArray(x)
+
+function Finch.fspzeros(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fspzeros(T, _dims...)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+Finch.fspzeros(p::BlocksOrAuto, T::Type, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, T, dims; assignment)
+Finch.fspzeros(p::BlocksOrAuto, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, Float64, dims; assignment)
+Finch.fspzeros(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(p, Float64, dims; assignment)
+Finch.fspzeros(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    Finch.fspzeros(Dagger.auto_blocks(dims), T, dims; assignment)
+
+function Finch.fsprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(Finch.fsprand(T, _dims..., sparsity)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+Finch.fsprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+Finch.fsprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+Finch.fsprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(p, Float64, dims, sparsity; assignment)
+Finch.fsprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    Finch.fsprand(Dagger.auto_blocks(dims), T, dims, sparsity; assignment)
+
+# Materialize a (possibly lazy/`SwizzleArray`) result into a concrete sparse
+# `Tensor`. `@einsum` returns a lazy `SwizzleArray` for transposed-output
+# patterns, which is not itself a `Finch.Tensor` and corrupts subsequent
+# accumulation steps; copying through `@finch` produces a clean `SparseCOO`.
+function _finch_materialize(X, ::Type{T}, dims) where {T}
+    out = Finch.fspzeros(T, dims...)
+    Finch.@finch begin
+        out .= 0
+        for j = _, i = _
+            if X[i, j] != 0
+                out[i, j] = X[i, j]
+            end
+        end
+    end
+    return out
+end
+
+# Finch tensors do not define `Base.copy`; build a fresh equivalent tensor.
+Dagger._sparse_copy(mat::Finch.Tensor) = _finch_materialize(mat, eltype(mat), size(mat))
+
+# Finch tensors do not support `setindex!`, so copy-buffering writeback into a
+# Finch tile densifies, applies the (partial-range) copy, then re-sparsifies.
+function Dagger._sparse_copyto_view!(mat::Finch.Tensor, Brange, src)
+    dense = Array(mat)
+    copyto!(view(dense, Brange), src)
+    return _finch_materialize(dense, eltype(mat), size(mat))
+end
+
+# Compute `C_out += alpha * op(A) * op(B)` via Finch's `@einsum`, choosing the
+# right index pattern for each transpose option. We deliberately use *explicit*
+# index patterns (`A[k,i]`) rather than `Finch.swizzle`, because feeding a
+# swizzled tensor into `@einsum` makes it return a lazy `SwizzleArray` (rather
+# than a materialized `Tensor`), which breaks subsequent accumulation steps.
+# Adjoint ('C') additionally applies `conj` inside the einsum.
+function _finch_einsum_matmul!(C_out, A, B, transA::Char, transB::Char, alpha)
+    aT = transA == 'T' || transA == 'C'
+    aC = transA == 'C'
+    bT = transB == 'T' || transB == 'C'
+    bC = transB == 'C'
+    if !aT && !bT
+        bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[k,j]))) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[k,j]))
+    elseif !aT && bT
+        bC ? Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * conj(B[j,k]))) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[i,k] * B[j,k]))
+    elseif aT && !bT
+        aC ? Finch.@einsum(C_out[i,j] += alpha * (conj(A[k,i]) * B[k,j])) :
+             Finch.@einsum(C_out[i,j] += alpha * (A[k,i] * B[k,j]))
+    else # aT && bT
+        if aC && bC
+            Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * conj(B[j,k]))
+        elseif aC
+            Finch.@einsum C_out[i,j] += alpha * (conj(A[k,i]) * B[j,k])
+        elseif bC
+            Finch.@einsum C_out[i,j] += alpha * (A[k,i] * conj(B[j,k]))
+        else
+            Finch.@einsum C_out[i,j] += alpha * (A[k,i] * B[j,k])
+        end
+    end
+    return C_out
+end
+
+function Dagger.matmatmul!(
+    C::DSparseMatrix,
+    transA::Char,
+    transB::Char,
+    A::Finch.Tensor,
+    B::Finch.Tensor,
+    alpha,
+    beta
+)
+    if !isa(C.mat, Finch.Tensor)
+        # Not supported here, forward to generic matmatmul!
+        return Dagger.matmatmul!(C.mat, transA, transB, A, B, alpha, beta)
+    end
+    transA in ('N', 'T', 'C') || throw(ArgumentError("Invalid transA: $transA"))
+    transB in ('N', 'T', 'C') || throw(ArgumentError("Invalid transB: $transB"))
+
+    # `@einsum` cannot write into an existing sparse tensor in place, so we build
+    # a fresh output tensor and (re)assign it to the wrapper. `DSparseMatrix`
+    # hides this reallocation from Datadeps.
+    #
+    # N.B. BEWARE: `@einsum Cm[i,j] = ...` (assignment into an existing tensor)
+    # does not work; use a fresh `C_out` with `+=`.
+    Cm = C.mat
+    C_out = Finch.fspzeros(eltype(Cm), size(Cm)...)
+    C_out = _finch_einsum_matmul!(C_out, A, B, transA, transB, alpha)
+    # `@einsum` may return a lazy `SwizzleArray`; materialize to a concrete
+    # sparse `Tensor` so accumulation and later iterations stay well-typed.
+    C_out = _finch_materialize(C_out, eltype(Cm), size(Cm))
+    if beta != 0
+        C_out = C_out + beta * Cm
+    end
+    C.mat = C_out
+
+    return C
+end
+
+# Sparse matrix-vector multiply tile kernel: `C = alpha*op(A)*B + beta*C` with a
+# Finch tensor `A` and dense vectors `B`/`C`. Finch's `@einsum` does not reliably
+# accumulate into a dense output, so we densify the single sparse tile and use a
+# dense BLAS-backed `mul!`; this is a per-tile densification, not the whole matrix.
+function Dagger.matvecmul!(C::AbstractVector, transA::Char, A::Finch.Tensor, B::AbstractVector, alpha, beta)
+    Ad = Array(A)
+    opA = transA == 'N' ? Ad :
+          transA == 'C' ? adjoint(Ad) :
+          transA == 'T' ? transpose(Ad) :
+          throw(ArgumentError("Invalid transA: $transA"))
+    LinearAlgebra.mul!(C, opA, B, alpha, beta)
+    return C
+end
+
+# Tile transpose/symmetrization used by `copytri!`. Finch tensors don't support
+# in-place `setindex!`, so we densify, build the result, and re-sparsify via
+# `_finch_materialize`.
+# - `uplo === nothing`: off-diagonal tile, return the conjugate transpose.
+# - `uplo == 'U'`/`'L'`: diagonal tile, build the full Hermitian tile from the
+#   given triangle (matching the dense `copydiagtile!` semantics).
+function Dagger.transpose_tile(B::Finch.Tensor, uplo=nothing)
+    _B = Array(B)
+    if uplo === nothing
+        C = adjoint(_B)
+        return _finch_materialize(C, eltype(_B), size(C))
+    end
+    n = LinearAlgebra.checksquare(_B)
+    C = zeros(eltype(_B), n, n)
+    if uplo == 'U'
+        for j in 1:n, i in 1:n
+            C[i, j] = i <= j ? _B[i, j] : conj(_B[j, i])
+        end
+    elseif uplo == 'L'
+        for j in 1:n, i in 1:n
+            C[i, j] = i >= j ? _B[i, j] : conj(_B[j, i])
+        end
+    else
+        throw(ArgumentError("uplo must be 'U' or 'L', got $uplo"))
+    end
+    return _finch_materialize(C, eltype(_B), size(C))
+end
+
+end # module FinchExt
\ No newline at end of file
diff --git a/ext/IncompleteLUExt.jl b/ext/IncompleteLUExt.jl
new file mode 100644
index 000000000..06ecab522
--- /dev/null
+++ b/ext/IncompleteLUExt.jl
@@ -0,0 +1,36 @@
+module IncompleteLUExt
+
+import IncompleteLU
+import SparseArrays
+import Dagger
+import Dagger: DMatrix
+import LinearAlgebra
+
+# Block incomplete-LU preconditioner, built per diagonal tile on top of Dagger's
+# block-preconditioner machinery (see `src/array/iterativesolvers.jl`). Each tile
+# gets an ILU factorization (drop tolerance `τ`) once at construction; the apply
+# is a forward/backward substitution per block.
+
+_as_sparse(A::SparseArrays.SparseMatrixCSC) = A
+_as_sparse(A::AbstractMatrix) = SparseArrays.sparse(A)
+
+function _ilu_operator(tile; τ=0.001, kwargs...)
+    S = _as_sparse(Dagger._tile_matrix(tile))
+    return IncompleteLU.ilu(S; τ=τ, kwargs...)
+end
+
+function Dagger.BlockILUPreconditioner(A::DMatrix; kwargs...)
+    build = tile -> _ilu_operator(tile; kwargs...)
+    return Dagger._build_block_preconditioner(Dagger.BlockILUPreconditioner, A,
+                                              "BlockILUPreconditioner", build)
+end
+
+# An ILU factorization applies via `ldiv!` (`\` is not defined).
+Dagger._block_apply!(y, F::IncompleteLU.ILUFactorization, x) =
+    (LinearAlgebra.ldiv!(y, F, x); return nothing)
+
+# The factorization is read-only and pinned to its worker; treat it as
+# non-aliasing rather than recursing into its internals.
+Dagger.aliasing(::IncompleteLU.ILUFactorization) = Dagger.NoAliasing()
+
+end # module IncompleteLUExt
diff --git a/ext/KrylovExt.jl b/ext/KrylovExt.jl
new file mode 100644
index 000000000..5d36ba2fa
--- /dev/null
+++ b/ext/KrylovExt.jl
@@ -0,0 +1,36 @@
+module KrylovExt
+
+import Krylov
+import Dagger
+import Dagger: DVector, DMatrix
+import LinearAlgebra
+
+# Distributed iterative solvers, implemented on top of Krylov.jl.
+#
+# We never form `A⁻¹`: Krylov only needs `mul!(y, A, x)` (and `mul!(y, A', x)`
+# for two-sided methods), which Dagger provides over `DVector`s via its
+# distributed SpMV/`gemv_dagger!` path. The remaining requirements -- `dot`,
+# `norm`, `axpy!`, `axpby!`, `rmul!`, `copyto!`, `fill!`, broadcasting -- are the
+# distributed BLAS-1 ops in `Dagger`.
+#
+# Workspace vectors are allocated through a `KrylovConstructor` built from
+# `similar(b)`, so every internal vector inherits `b`'s element type *and*
+# partitioning (`similar` preserves both). This keeps all `mul!`/`dot`/`axpy!`
+# operands on a compatible chunk layout, which the distributed kernels require.
+
+function _dagger_krylov(method::Symbol, A, b::DVector; kwargs...)
+    kc = Krylov.KrylovConstructor(similar(b))
+    workspace = Krylov.krylov_workspace(Val(method), kc)
+    Krylov.krylov_solve!(workspace, A, b; kwargs...)
+    return Krylov.solution(workspace), Krylov.statistics(workspace)
+end
+
+Dagger.krylov_solve(method::Symbol, A, b::DVector; kwargs...) =
+    _dagger_krylov(method, A, b; kwargs...)
+
+Dagger.cg(A, b::DVector; kwargs...)       = _dagger_krylov(:cg, A, b; kwargs...)
+Dagger.minres(A, b::DVector; kwargs...)   = _dagger_krylov(:minres, A, b; kwargs...)
+Dagger.gmres(A, b::DVector; kwargs...)    = _dagger_krylov(:gmres, A, b; kwargs...)
+Dagger.bicgstab(A, b::DVector; kwargs...) = _dagger_krylov(:bicgstab, A, b; kwargs...)
+
+end # module KrylovExt
diff --git a/ext/SparseArraysExt.jl b/ext/SparseArraysExt.jl
new file mode 100644
index 000000000..946c40be1
--- /dev/null
+++ b/ext/SparseArraysExt.jl
@@ -0,0 +1,111 @@
+module SparseArraysExt
+
+import SparseArrays
+import SparseArrays: SparseMatrixCSC, SparseVector
+import LinearAlgebra
+import Dagger
+import Dagger: Blocks, AutoBlocks, BlocksOrAuto, AssignmentType, DSparseArray, DSparseMatrix
+
+# Keep tiles sparse through `collect`/`cat`; the outer `collect` densifies.
+Dagger._sparse_collect(M::SparseMatrixCSC) = copy(M)
+# Wrap bare sparse tiles (e.g. from `distribute`) so Datadeps sees a stable container.
+Dagger.maybe_wrap_tile(x::SparseMatrixCSC) = DSparseArray(x)
+Dagger.maybe_wrap_tile(x::SparseVector) = DSparseArray(x)
+
+function SparseArrays.spzeros(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(SparseArrays.spzeros(T, _dims...)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+SparseArrays.spzeros(p::BlocksOrAuto, T::Type, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.spzeros(p, T, dims; assignment)
+SparseArrays.spzeros(p::BlocksOrAuto, dims::Integer...; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.spzeros(p, Float64, dims; assignment)
+SparseArrays.spzeros(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.spzeros(p, Float64, dims; assignment)
+SparseArrays.spzeros(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.spzeros(Dagger.auto_blocks(dims), T, dims; assignment)
+
+function SparseArrays.sprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary)
+    d = Dagger.ArrayDomain(map(x->1:x, dims))
+    a = Dagger.AllocateArray(T, (T, _dims) -> DSparseMatrix{T}(SparseArrays.sprand(T, _dims..., sparsity)), false, d, Dagger.partition(p, d), p, assignment)
+    return Dagger._to_darray(a)
+end
+SparseArrays.sprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.sprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+SparseArrays.sprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.sprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
+SparseArrays.sprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.sprand(p, Float64, dims, sparsity; assignment)
+SparseArrays.sprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
+    SparseArrays.sprand(Dagger.auto_blocks(dims), T, dims, sparsity; assignment)
+
+_apply_trans(X, t::Char) =
+    t == 'N' ? X :
+    t == 'T' ? transpose(X) :
+    t == 'C' ? adjoint(X) :
+    throw(ArgumentError("Invalid trans char: $t"))
+
+function Dagger.matmatmul!(
+    C::DSparseMatrix,
+    transA::Char,
+    transB::Char,
+    A::SparseMatrixCSC,
+    B::SparseMatrixCSC,
+    alpha,
+    beta
+)
+    opA = _apply_trans(A, transA)
+    opB = _apply_trans(B, transB)
+    # Sparse*sparse yields a freshly-allocated sparse matrix, which we reassign
+    # into the wrapper (`DSparseMatrix` hides this reallocation from Datadeps).
+    # `SparseArrays` provides no efficient 5-arg `mul!` into a sparse `C` -- the
+    # output sparsity pattern is determined by the product -- so we form the
+    # product out-of-place and apply only the alpha/beta scaling that is actually
+    # needed. The transposed-operand products dispatch to specialized SparseArrays
+    # methods, so `opA`/`opB` are not materialized.
+    AB = opA * opB
+    prod = isone(alpha) ? AB : alpha * AB
+    if iszero(beta)
+        C.mat = prod
+    elseif isone(beta)
+        C.mat = prod + C.mat
+    else
+        C.mat = prod + beta * C.mat
+    end
+
+    return C
+end
+
+# Sparse matrix-vector multiply tile kernel: `C = alpha*op(A)*B + beta*C` with a
+# `SparseMatrixCSC` `A` and dense vectors `B`/`C`. SparseArrays provides an
+# efficient 5-arg `mul!` (SpMV) into a dense output, including for transposed and
+# adjoint operands, so this updates `C` in place with no allocation.
+function Dagger.matvecmul!(C::AbstractVector, transA::Char, A::SparseMatrixCSC, B::AbstractVector, alpha, beta)
+    LinearAlgebra.mul!(C, _apply_trans(A, transA), B, alpha, beta)
+    return C
+end
+
+# Off-diagonal tile copy in `copytri!`: produce the (conjugate) transpose tile.
+function Dagger.transpose_tile(B::SparseMatrixCSC)
+    return SparseArrays.sparse(B')
+end
+# Diagonal tile symmetrization in `copytri!`: build the full Hermitian tile from
+# its `uplo` triangle (matching the dense `copydiagtile!` semantics).
+function Dagger.transpose_tile(B::SparseMatrixCSC, uplo::Char)
+    if uplo == 'U'
+        Bt = SparseArrays.triu(B)
+    elseif uplo == 'L'
+        Bt = SparseArrays.tril(B)
+    else
+        throw(ArgumentError("uplo must be 'U' or 'L', got $uplo"))
+    end
+    C = Bt + Bt'
+    # The shared diagonal was added twice; restore the original tile's diagonal.
+    for i in 1:LinearAlgebra.checksquare(B)
+        C[i, i] = B[i, i]
+    end
+    return C
+end
+
+end # module SparseArraysExt
diff --git a/src/Dagger.jl b/src/Dagger.jl
index 2e757ebc5..aa82a3d66 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -2,7 +2,6 @@ module Dagger
 
 import Serialization
 import Serialization: AbstractSerializer, serialize, deserialize
-import SparseArrays: sprand, SparseMatrixCSC
 
 import MemPool
 import MemPool: DRef, FileRef, poolget, poolset
@@ -120,7 +119,7 @@ include("array/operators.jl")
 include("array/indexing.jl")
 include("array/setindex.jl")
 include("array/matrix.jl")
-include("array/sparse_partition.jl")
+include("array/sparse.jl")
 include("array/sort.jl")
 include("array/permute.jl")
 include("array/linalg.jl")
@@ -129,6 +128,7 @@ include("array/cholesky.jl")
 include("array/trsm.jl")
 include("array/lu.jl")
 include("array/qr.jl")
+include("array/iterativesolvers.jl")
 
 # GPU
 include("gpu.jl")
diff --git a/src/array/alloc.jl b/src/array/alloc.jl
index aa1050210..266f7acd5 100644
--- a/src/array/alloc.jl
+++ b/src/array/alloc.jl
@@ -81,6 +81,7 @@ end
 allocate_array_func(::Processor, f) = f
 function stage(ctx, A::AllocateArray)
     tasks = Array{DTask,ndims(A.domainchunks)}(undef, size(A.domainchunks)...)
+    f = A.f
     Dagger.spawn_datadeps() do
         default_scope = get_compute_scope()
         for I in CartesianIndices(A.domainchunks)
@@ -93,10 +94,15 @@ function stage(ctx, A::AllocateArray)
                 scope = ExactScope(A.procgrid[CartesianIndex(mod1.(Tuple(I), size(A.procgrid))...)])
             end
 
-            if A.want_index
-                task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, i, size(x))
+            if f isa DArray
+                chunk = f.chunks[I]
+                task = Dagger.@spawn compute_scope=scope similar(chunk, A.eltype, size(x))
             else
-                task = Dagger.@spawn compute_scope=scope allocate_array(A.f, A.eltype, size(x))
+                if A.want_index
+                    task = Dagger.@spawn compute_scope=scope allocate_array(f, A.eltype, i, size(x))
+                else
+                    task = Dagger.@spawn compute_scope=scope allocate_array(f, A.eltype, size(x))
+                end
             end
             tasks[i] = task
         end
@@ -134,20 +140,6 @@ Base.randn(p::BlocksOrAuto, dims::Dims; assignment::AssignmentType = :arbitrary)
 Base.randn(::AutoBlocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary) =
     randn(auto_blocks(dims), T, dims; assignment)
 
-function sprand(p::Blocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary)
-    d = ArrayDomain(map(x->1:x, dims))
-    a = AllocateArray(T, (T, _dims) -> sprand(T, _dims..., sparsity), false, d, partition(p, d), p, assignment)
-    return _to_darray(a)
-end
-sprand(p::BlocksOrAuto, T::Type, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
-    sprand(p, T, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
-sprand(p::BlocksOrAuto, dims_and_sparsity::Real...; assignment::AssignmentType = :arbitrary) =
-    sprand(p, Float64, dims_and_sparsity[1:end-1], dims_and_sparsity[end]; assignment)
-sprand(p::BlocksOrAuto, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
-    sprand(p, Float64, dims, sparsity; assignment)
-sprand(::AutoBlocks, T::Type, dims::Dims, sparsity::AbstractFloat; assignment::AssignmentType = :arbitrary) =
-    sprand(auto_blocks(dims), T, dims, sparsity; assignment)
-
 function Base.ones(p::Blocks, T::Type, dims::Dims; assignment::AssignmentType = :arbitrary)
     d = ArrayDomain(map(x->1:x, dims))
     a = AllocateArray(T, ones, false, d, partition(p, d), p, assignment)
@@ -215,3 +207,14 @@ function unsafe_free!(A::DArray)
         end
     end
 end
+
+# Initializers
+
+function Base.fill!(A::DArray, x)
+    spawn_datadeps() do
+        for chunk in A.chunks
+            Dagger.@spawn fill!(chunk, x)
+        end
+    end
+    return A
+end
diff --git a/src/array/copy.jl b/src/array/copy.jl
index 7d92566e6..d703f49d1 100644
--- a/src/array/copy.jl
+++ b/src/array/copy.jl
@@ -36,6 +36,13 @@ function allocate_copy_buffer(part::Blocks{N}, A::DArray{T,N}) where {T,N}
     return DArray{T}(undef, part, size(A))
 end
 
+to_range(x::UnitRange) = x
+to_range(x::Integer) = x:x
+to_range(x::Base.OneTo{Int}) = UnitRange(x)
+to_range(x::Base.Slice{Base.OneTo{Int}}) = UnitRange(x)
+to_range(::StepRange) = throw(ArgumentError("Cannot convert StepRange to UnitRange"))
+to_range(x) = throw(ArgumentError("Cannot convert $(typeof(x)) to UnitRange"))
+
 function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices(B), Ainds=parentindices(A)) where {TB,NB,TA,NA}
     Nmax = max(NA, NB)
 
@@ -45,13 +52,6 @@ function darray_copyto!(B::DArray{TB,NB}, A::DArray{TA,NA}, Binds=parentindices(
     padNmax(x) = ntuple(i->pad1range(x, i), Nmax)
     padNmax(x::ArrayDomain) = padNmax(x.indexes)
 
-    to_range(x::UnitRange) = x
-    to_range(x::Integer) = x:x
-    to_range(x::Base.OneTo{Int}) = UnitRange(x)
-    to_range(x::Base.Slice{Base.OneTo{Int}}) = UnitRange(x)
-    to_range(::StepRange) = throw(ArgumentError("Non-continuous ranges are not yet supported for DArray copy"))
-    to_range(x) = throw(ArgumentError("Unsupported range type for DArray copy: $(typeof(x))"))
-
     if any(x->x isa Vector, Binds) || any(x->x isa Vector, Ainds)
         # Split the copy into multiple copies
         dims_with_vector = findall(x->x[1] isa Vector || x[2] isa Vector, collect(zip(Binds, Ainds)))
@@ -154,3 +154,13 @@ StridedDArray{T,N} = Union{<:DArray{T,N}, SubArray{T,N,<:DArray{T,NP}} where NP}
 
 Base.copyto!(B::StridedDArray, A::StridedDArray) =
     darray_copyto!(parent(B), parent(A), parentindices(B), parentindices(A))
+function Base.copyto!(B::Array, A::StridedDArray)
+    DB = view(B, AutoBlocks())
+    darray_copyto!(DB, parent(A), parentindices(DB), parentindices(A))
+    return B
+end
+function Base.copyto!(B::SubArray, A::StridedDArray)
+    DB = view(parent(B), AutoBlocks())
+    darray_copyto!(DB, parent(A), parentindices(B), parentindices(A))
+    return B
+end
diff --git a/src/array/darray.jl b/src/array/darray.jl
index 32336f95d..904aa7587 100644
--- a/src/array/darray.jl
+++ b/src/array/darray.jl
@@ -197,7 +197,7 @@ function Base.collect(d::DArray{T,N}; tree=false, copyto=false) where {T,N}
     if tree
         collect(fetch(treereduce_nd(map(x -> ((args...,) -> Dagger.@spawn x(args...)) , dimcatfuncs), a.chunks)))
     else
-        collect(treereduce_nd(dimcatfuncs, asyncmap(fetch, a.chunks)))
+        collect(treereduce_nd(dimcatfuncs, map(collect, asyncmap(fetch, a.chunks))))
     end
 end
 Array{T,N}(A::DArray{S,N}) where {T,N,S} = convert(Array{T,N}, collect(A))
@@ -321,8 +321,16 @@ function Base.isequal(x::ArrayOp, y::ArrayOp)
     x === y
 end
 
-Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N} =
-    DArray{S,N}(undef, D.partitioning, dims)
+function Base.similar(D::DArray{T,N} where T, ::Type{S}, dims::Dims{N}) where {S,N}
+    if dims == size(D)
+        domain = ArrayDomain(map(x->1:x, dims))
+        subdomains = partition(D.partitioning, domain)
+        a = AllocateArray(S, D, false, domain, subdomains, D.partitioning, nothing)
+        return _to_darray(a)
+    else
+        return DArray{S,N}(undef, D.partitioning, dims)
+    end
+end
 Base.similar(D::DArray{T,N1} where T, ::Type{S}, dims::Dims{N2}) where {S,N1,N2} =
     DArray{S,N2}(undef, auto_blocks(dims), dims)
 
@@ -480,7 +488,7 @@ function stage(ctx::Context, d::Distribute)
             else
                 scope = ExactScope(d.procgrid[CartesianIndex(mod1.(Tuple(I), size(d.procgrid))...)])
             end
-            Dagger.@spawn compute_scope=scope identity(d.data[c])
+            Dagger.@spawn compute_scope=scope maybe_wrap_tile(d.data[c])
         end
     end
     return DArray(eltype(d.data),
diff --git a/src/array/iterativesolvers.jl b/src/array/iterativesolvers.jl
new file mode 100644
index 000000000..e499d2a1c
--- /dev/null
+++ b/src/array/iterativesolvers.jl
@@ -0,0 +1,308 @@
+# Distributed iterative (Krylov) linear solvers.
+#
+# The user-facing entry points (`Dagger.cg`, `Dagger.minres`, `Dagger.gmres`,
+# `Dagger.bicgstab`, and the generic `Dagger.krylov_solve`) live here so the
+# solver backend can evolve without changing user code. The actual solves are
+# implemented in `ext/KrylovExt.jl`, which is loaded when `Krylov` is available
+# (`using Krylov`). The matrix-free building blocks they rely on -- distributed
+# `mul!`/SpMV and the BLAS-1 vector ops -- live in `mul.jl`/`linalg.jl`.
+
+"""
+    cg(A, b::DVector; M=I, atol, rtol, itmax, ...) -> (x::DVector, stats)
+
+Solve the symmetric positive-definite system `A x = b` with the conjugate
+gradient method, distributed over `A`'s and `b`'s chunks. `A` may be a
+`DMatrix` (dense or sparse-backed) or any object supporting `mul!(y, A, x)` over
+`DVector`s (matrix-free). Requires `Krylov.jl` to be loaded.
+
+See also [`minres`](@ref), [`gmres`](@ref), [`bicgstab`](@ref), and
+[`krylov_solve`](@ref). Keyword arguments are forwarded to `Krylov.cg!`.
+"""
+function cg end
+
+"""
+    minres(A, b::DVector; M=I, atol, rtol, itmax, ...) -> (x::DVector, stats)
+
+Solve the symmetric (possibly indefinite) system `A x = b` with MINRES.
+Requires `Krylov.jl` to be loaded. See [`cg`](@ref).
+"""
+function minres end
+
+"""
+    gmres(A, b::DVector; M=I, N=I, restart=false, memory=20, ...) -> (x::DVector, stats)
+
+Solve the general (nonsymmetric) system `A x = b` with restarted GMRES.
+`M`/`N` are left/right preconditioners. Requires `Krylov.jl` to be loaded.
+See [`cg`](@ref).
+"""
+function gmres end
+
+"""
+    bicgstab(A, b::DVector; M=I, N=I, ...) -> (x::DVector, stats)
+
+Solve the general (nonsymmetric) system `A x = b` with BiCGStab (short
+recurrence, low memory). Requires `Krylov.jl` to be loaded. See [`cg`](@ref).
+"""
+function bicgstab end
+
+"""
+    krylov_solve(method::Symbol, A, b::DVector; kwargs...) -> (x::DVector, stats)
+
+Generic entry point dispatching to the iterative `method` (`:cg`, `:minres`,
+`:gmres`, `:bicgstab`). Requires `Krylov.jl` to be loaded.
+"""
+function krylov_solve end
+
+# Friendly fallbacks: these generic methods are shadowed by the more specific
+# `(A, b::DVector)` methods added in `ext/KrylovExt.jl` once Krylov is loaded.
+_krylov_required(name) = throw(ArgumentError(
+    "Dagger.$name requires Krylov.jl. Run `using Krylov` (or `import Krylov`) \
+    to enable distributed iterative solvers."))
+cg(A, b; kwargs...) = _krylov_required(:cg)
+minres(A, b; kwargs...) = _krylov_required(:minres)
+gmres(A, b; kwargs...) = _krylov_required(:gmres)
+bicgstab(A, b; kwargs...) = _krylov_required(:bicgstab)
+krylov_solve(method::Symbol, A, b; kwargs...) = _krylov_required(:krylov_solve)
+
+# --- Preconditioners ------------------------------------------------------
+# A preconditioner object `P` represents the (approximate) *inverse* operator
+# `M⁻¹`: applying it (`mul!(y, P, x)`) computes `y = M⁻¹ x`, which is exactly an
+# ordinary matrix-vector product by the operator `P` stands for (no inversion
+# happens at apply time -- any reciprocals/factorizations are precomputed once).
+# This matches Krylov's `ldiv=false` convention, where the object passed as `M`
+# is applied via `mul!(y, M, x)` to compute `y ← M⁻¹ x`, so these are passed
+# straight through as `M=P`.
+#
+# These preconditioners are backend-agnostic: they operate on `DVector`s (and a
+# `DMatrix`'s diagonal tiles) and need no solver backend, so they live in core.
+#
+# A note on the square-tile requirement below: the iterative solvers allocate
+# *all* workspace vectors as `similar(b)` (one partitioning), and each must
+# serve as both the length-`n` input and length-`n` output of `mul!(y, A, x)`.
+# `gemv_dagger!` requires the input to match `A`'s column blocks and the output
+# to match `A`'s row blocks, which is only simultaneously possible when those
+# block sizes are equal. So the operator must use square tiles regardless of
+# preconditioning; the preconditioners simply share that constraint. (Any
+# uniform square tile size is fine, including a ragged final block.)
+
+"""
+    AbstractDaggerPreconditioner
+
+Supertype for Dagger's distributed preconditioners. A preconditioner `P`
+represents an (approximate) inverse operator `M⁻¹` and applies it via
+`mul!(y, P, x)` (`y = M⁻¹ x`) over `DVector`s, so it can be passed to the
+solvers as `M=P` (with `ldiv=false`, the default).
+"""
+abstract type AbstractDaggerPreconditioner end
+
+"""
+    JacobiPreconditioner(A::DMatrix)
+
+Diagonal (Jacobi) preconditioner. The object represents the inverse-diagonal
+operator `M⁻¹ = inv(Diagonal(A))`; it precomputes and stores the reciprocal
+diagonal `dinv = 1 ./ diag(A)`, so applying it (`mul!(y, P, x)`) is a single
+elementwise multiply `y = dinv .* x` per chunk -- an ordinary product by the
+stored operator, not an inversion. Cheap to build (one diagonal extraction per
+diagonal tile) and to apply.
+
+Requires square diagonal tiles (equal row/column block sizes); see the note
+above on why the solver itself imposes this. Repartition `A` with equal block
+sizes if needed.
+"""
+struct JacobiPreconditioner{V<:DVector} <: AbstractDaggerPreconditioner
+    dinv::V
+end
+
+JacobiPreconditioner(A::DMatrix) = JacobiPreconditioner(_jacobi_dinv(A))
+
+# Per-tile kernel: write the reciprocal diagonal of `tile` into `out`.
+_set_inv_diag!(out, tile) = (out .= inv.(LinearAlgebra.diag(tile)); return nothing)
+
+# Validate that `A` has square diagonal tiles and return (n, Ac, mt, blocksize).
+function _check_square_tiles(A::DMatrix, who)
+    n = LinearAlgebra.checksquare(A)
+    Ac = A.chunks
+    mt, nt = size(Ac)
+    mb, nb = A.partitioning.blocksize
+    # For a square matrix with square tiles, mt == nt follows automatically; we
+    # only need to reject non-square tiles (mb != nb).
+    mb == nb || throw(ArgumentError(
+        "$who requires square diagonal tiles (got block size $(mb)x$(nb)); \
+        repartition A with equal block sizes (this matches the iterative \
+        solver's own requirement on the operator)"))
+    return n, Ac, mt, mb
+end
+
+function _jacobi_dinv(A::DMatrix{T}) where T
+    n, Ac, mt, mb = _check_square_tiles(A, "JacobiPreconditioner")
+    dinv = DVector{T}(undef, Blocks(mb), n)
+    dc = dinv.chunks
+    Dagger.spawn_datadeps() do
+        for i in 1:mt
+            Dagger.@spawn _set_inv_diag!(Out(dc[i]), In(Ac[i, i]))
+        end
+    end
+    return dinv
+end
+
+function LinearAlgebra.mul!(y::DVector, P::JacobiPreconditioner, x::DVector)
+    part = P.dinv.partitioning
+    maybe_copy_buffered(P.dinv => part, x => part, y => part) do dinv, x, y
+        dc, xc, yc = dinv.chunks, x.chunks, y.chunks
+        Dagger.spawn_datadeps() do
+            for i in eachindex(yc)
+                Dagger.@spawn _jacobi_apply!(Out(yc[i]), In(dc[i]), In(xc[i]))
+            end
+        end
+    end
+    return y
+end
+_jacobi_apply!(y, dinv, x) = (y .= dinv .* x; return nothing)
+
+# --- Block-diagonal preconditioners ---------------------------------------
+# A family of preconditioners of the form `M⁻¹ = blockdiag(op₁, …, op_k)`, where
+# each `opⱼ` approximates the inverse of `A`'s `j`-th diagonal tile `Aⱼⱼ`. They
+# share all machinery and differ only in how a per-tile operator is *built*:
+#
+#   - BlockJacobiPreconditioner: exact tile solve via `lu` (dense or sparse).
+#   - BlockILUPreconditioner:    incomplete LU per tile (needs IncompleteLU.jl).
+#   - AMGPreconditioner:         an AMG hierarchy per tile (needs AlgebraicMultigrid.jl).
+#
+# Building per tile is embarrassingly parallel and a natural fit for the tiled
+# layout. A single tile (`Blocks(n, n)`) makes any of these a *global*
+# preconditioner over the whole matrix; many tiles make it a scalable
+# block-Jacobi / additive-Schwarz variant that trades some convergence for
+# parallelism. All require square diagonal tiles (see `JacobiPreconditioner`).
+#
+# A per-tile operator (`lu`/ILU factor, AMG hierarchy) generally cannot be moved
+# between workers (dense `LU` has no `move!`; `UmfpackLU`/AMG hold process-bound
+# resources). So each operator is built *once* and pinned (via
+# `tochunk(..., ProcessScope)`) to the worker that owns its tile, and every apply
+# for that block is scheduled there (`compute_scope`); datadeps then moves only
+# the (movable) vector chunks to the operator, never the operator itself.
+
+"""
+    AbstractBlockPreconditioner <: AbstractDaggerPreconditioner
+
+Block-diagonal preconditioner: holds one per-diagonal-tile operator `opⱼ`
+(`y ← opⱼ⁻¹ x`), pinned to its tile's worker, and applies them independently per
+block. Concrete subtypes (`BlockJacobiPreconditioner`, `BlockILUPreconditioner`,
+`AMGPreconditioner`) share these fields: `ops`, `scopes`, `part`, `n`.
+"""
+abstract type AbstractBlockPreconditioner <: AbstractDaggerPreconditioner end
+
+_tile_scope(c::Chunk) = ProcessScope(root_worker_id(c))
+_tile_scope(t::DTask) = ProcessScope(root_worker_id(fetch(t; raw=true)))
+
+# Build the per-tile operator on the current worker and pin the result there: the
+# returned `Chunk` is process-scoped, so it can never be moved off this worker.
+function _build_tile_pinned(build, tile)
+    op = build(tile)
+    proc = Dagger.task_processor()
+    return tochunk(op, proc, ProcessScope(root_worker_id(proc)))
+end
+
+# Build a block preconditioner of type `Ctor` by applying `build` to each
+# diagonal tile (pinned to the tile's worker). `build(tile) -> op` returns a
+# per-tile operator supporting the block apply below.
+function _build_block_preconditioner(Ctor, A::DMatrix, who, build)
+    n, Ac, mt, mb = _check_square_tiles(A, who)
+    ops = Vector{Any}(undef, mt)
+    scopes = Vector{ProcessScope}(undef, mt)
+    for i in 1:mt
+        tile = Ac[i, i]
+        scope = _tile_scope(tile)
+        scopes[i] = scope
+        ops[i] = Dagger.@spawn compute_scope=scope _build_tile_pinned(build, tile)
+    end
+    return Ctor(ops, scopes, Blocks(mb), n)
+end
+
+function LinearAlgebra.mul!(y::DVector, P::AbstractBlockPreconditioner, x::DVector)
+    part = P.part
+    maybe_copy_buffered(x => part, y => part) do x, y
+        xc, yc = x.chunks, y.chunks
+        length(yc) == length(P.ops) || throw(DimensionMismatch(
+            "$(nameof(typeof(P))) has $(length(P.ops)) blocks but the vector has \
+            $(length(yc)) chunks"))
+        Dagger.spawn_datadeps() do
+            for i in eachindex(yc)
+                # Pin the apply to the operator's worker; the operator is passed as
+                # an untracked arg (read-only, already-pinned) so datadeps never
+                # moves it -- only the vector chunks are moved to this worker.
+                Dagger.@spawn compute_scope=P.scopes[i] _block_apply!(Out(yc[i]), P.ops[i], In(xc[i]))
+            end
+        end
+    end
+    return y
+end
+
+# Apply one block operator: `y = op⁻¹ x`. Default uses `\`; backends override for
+# operator types where `\` is unavailable (e.g. ILU/AMG use `ldiv!`).
+_block_apply!(y, op, x) = (y .= op \ x; return nothing)
+
+# Underlying matrix of a tile (overridden for `DSparseArray` in `sparse.jl`).
+_tile_matrix(A) = A
+
+"""
+    BlockJacobiPreconditioner(A::DMatrix)
+
+Block-Jacobi preconditioner: `M⁻¹ = blockdiag(A₁₁, …, A_kk)⁻¹`. Each diagonal
+tile is factorized *once* with `lu` (sparse or dense) and the apply solves
+`Aᵢᵢ yᵢ = xᵢ`. Stronger than `JacobiPreconditioner` (captures intra-block
+coupling); a single tile recovers an exact solve. See
+[`AbstractBlockPreconditioner`](@ref). Requires square diagonal tiles.
+"""
+struct BlockJacobiPreconditioner{F,S} <: AbstractBlockPreconditioner
+    ops::F            # cached per-tile factorizations (pinned to their workers)
+    scopes::S         # the `ProcessScope` each operator/apply is pinned to
+    part::Blocks{1}   # partitioning of the vectors it applies to
+    n::Int
+end
+
+# Factorize a diagonal tile. Backends override for their storage (e.g. sparse
+# tiles factorize the inner `SparseMatrixCSC`); the default is a dense LU factor.
+_factorize_tile(A) = LinearAlgebra.lu(A)
+
+BlockJacobiPreconditioner(A::DMatrix) =
+    _build_block_preconditioner(BlockJacobiPreconditioner, A, "BlockJacobiPreconditioner", _factorize_tile)
+
+"""
+    BlockILUPreconditioner(A::DMatrix; τ=0.001, kwargs...)
+
+Block incomplete-LU preconditioner: an ILU factorization (with drop tolerance
+`τ`) of each diagonal tile, applied per block. Cheaper setup than a full block
+solve, good as a general-purpose preconditioner. Requires `IncompleteLU.jl` to
+be loaded and sparse-backed tiles. See [`AbstractBlockPreconditioner`](@ref).
+"""
+struct BlockILUPreconditioner{F,S} <: AbstractBlockPreconditioner
+    ops::F
+    scopes::S
+    part::Blocks{1}
+    n::Int
+end
+# Friendly fallback (shadowed by the `::DMatrix` method added in `ext/IncompleteLUExt.jl`).
+BlockILUPreconditioner(A; kwargs...) = throw(ArgumentError(
+    "Dagger.BlockILUPreconditioner requires IncompleteLU.jl. Run `using IncompleteLU` \
+    to enable block incomplete-LU preconditioning."))
+
+"""
+    AMGPreconditioner(A::DMatrix; method=:ruge_stuben, kwargs...)
+
+Algebraic-multigrid preconditioner: builds an AMG hierarchy (`method` is
+`:ruge_stuben` or `:smoothed_aggregation`) for each diagonal tile and applies a
+V-cycle per block. Near mesh-independent convergence for elliptic (Poisson-like)
+operators. With one tile this is global AMG; with many tiles it is a scalable
+block/additive-Schwarz AMG. Requires `AlgebraicMultigrid.jl` to be loaded and
+sparse-backed tiles. See [`AbstractBlockPreconditioner`](@ref).
+"""
+struct AMGPreconditioner{F,S} <: AbstractBlockPreconditioner
+    ops::F
+    scopes::S
+    part::Blocks{1}
+    n::Int
+end
+# Friendly fallback (shadowed by the `::DMatrix` method added in `ext/AlgebraicMultigridExt.jl`).
+AMGPreconditioner(A; kwargs...) = throw(ArgumentError(
+    "Dagger.AMGPreconditioner requires AlgebraicMultigrid.jl. Run \
+    `using AlgebraicMultigrid` to enable algebraic-multigrid preconditioning."))
diff --git a/src/array/linalg.jl b/src/array/linalg.jl
index c998ac752..6abae1590 100644
--- a/src/array/linalg.jl
+++ b/src/array/linalg.jl
@@ -4,6 +4,71 @@ function LinearAlgebra.norm2(A::DArray{T,N}) where {T,N}
     zeroRT = zero(real(T))
     return sqrt(sum(map(norm->fetch(norm)::real(T), norms); init=zeroRT))
 end
+
+# --- BLAS-1 vector operations ---------------------------------------------
+# Efficient, distributed level-1 operations needed by iterative (Krylov)
+# solvers. These run one BLAS-1 kernel per chunk (avoiding the generic
+# scalar-indexing fallbacks, which are slow and trigger scalar access on GPUs).
+# Operands may be partitioned differently: we align them with
+# `maybe_copy_buffered` (a no-op when they already share a layout, and which
+# copies results back to the originals for in-place operations), so users keep
+# full control over their own (possibly sub-optimal) partitioning.
+
+function LinearAlgebra.dot(x::DArray{Tx,N}, y::DArray{Ty,N}) where {Tx,Ty,N}
+    size(x) == size(y) || throw(DimensionMismatch("dot: x has size $(size(x)), y has size $(size(y))"))
+    R = typeof(LinearAlgebra.dot(zero(Tx), zero(Ty)))
+    return maybe_copy_buffered(x => x.partitioning, y => x.partitioning) do x, y
+        xc, yc = x.chunks, y.chunks
+        parts = [Dagger.@spawn LinearAlgebra.dot(xc[i], yc[i]) for i in eachindex(xc)]
+        sum(fetch, parts; init=zero(R))
+    end
+end
+
+function LinearAlgebra.axpy!(a::Number, x::DArray{Tx,N}, y::DArray{Ty,N}) where {Tx,Ty,N}
+    size(x) == size(y) || throw(DimensionMismatch("axpy!: x has size $(size(x)), y has size $(size(y))"))
+    # Align the read-only `x` to `y`'s layout so only `x` may be buffered; `y` is
+    # updated in place (and copied back by `maybe_copy_buffered` if it was buffered).
+    maybe_copy_buffered(x => y.partitioning, y => y.partitioning) do x, y
+        xc, yc = x.chunks, y.chunks
+        Dagger.spawn_datadeps() do
+            for i in eachindex(xc)
+                Dagger.@spawn LinearAlgebra.axpy!(a, In(xc[i]), InOut(yc[i]))
+            end
+        end
+    end
+    return y
+end
+
+function LinearAlgebra.axpby!(a::Number, x::DArray{Tx,N}, b::Number, y::DArray{Ty,N}) where {Tx,Ty,N}
+    size(x) == size(y) || throw(DimensionMismatch("axpby!: x has size $(size(x)), y has size $(size(y))"))
+    maybe_copy_buffered(x => y.partitioning, y => y.partitioning) do x, y
+        xc, yc = x.chunks, y.chunks
+        Dagger.spawn_datadeps() do
+            for i in eachindex(xc)
+                Dagger.@spawn LinearAlgebra.axpby!(a, In(xc[i]), b, InOut(yc[i]))
+            end
+        end
+    end
+    return y
+end
+
+function LinearAlgebra.rmul!(x::DArray, a::Number)
+    Dagger.spawn_datadeps() do
+        for c in x.chunks
+            Dagger.@spawn LinearAlgebra.rmul!(InOut(c), a)
+        end
+    end
+    return x
+end
+
+function LinearAlgebra.lmul!(a::Number, x::DArray)
+    Dagger.spawn_datadeps() do
+        for c in x.chunks
+            Dagger.@spawn LinearAlgebra.lmul!(a, InOut(c))
+        end
+    end
+    return x
+end
 function LinearAlgebra.norm2(A::UpperTriangular{T,<:DArray{T,2}}) where T
     Ac = parent(A).chunks
     Ac_upper = []
diff --git a/src/array/mul.jl b/src/array/mul.jl
index 02b207641..794c0a781 100644
--- a/src/array/mul.jl
+++ b/src/array/mul.jl
@@ -1,3 +1,74 @@
+"""
+    matmatmul!(C, transA::Char, transB::Char, A, B, alpha, beta)
+
+A general-purpose matrix-matrix multiply, like `LinearAlgebra.generic_matmatmul!`,
+but with extra functionality. May internally convert `A` and `B` to a type that
+better matches `C` and provides optimal portability and, when possible,
+better performance. The actual matrix multiply operation should happen in
+`LinearAlgebra.generic_matmatmul!` or an equivalent call.
+
+The following automatic conversions are performed:
+- If no `LinearAlgebra.generic_matmatmul!` method is available, convert `A` and `B` to dense Array-like
+- If `C` is a `DSparseMatrix`, perform the operation out-of-place and then update `C` in-place
+"""
+function matmatmul!(
+    C,
+    transA::Char,
+    transB::Char,
+    A,
+    B,
+    alpha,
+    beta
+)
+    EC = eltype(C)
+    EA = eltype(A)
+    EB = eltype(B)
+
+    TC = typeof(C)
+    TA = typeof(A)
+    TB = typeof(B)
+
+    mam = LinearAlgebra.MulAddMul(alpha, beta)
+
+    # Check if C doesn't support in-place operations (e.g. DSparseMatrix)
+    # We'll get here if A and B don't have equivalent types
+    if isa(C, DSparseMatrix)
+        C.mat = alpha * A * B + beta * C.mat
+        return C
+    end
+
+    # Check if the call will fail due to MethodError
+    sig = Tuple{TC, Char, Char, TA, TB, typeof(mam)}
+    if !hasmethod(LinearAlgebra.generic_matmatmul!, sig)
+        # Convert to Array-like
+        # FIXME: GPU support
+        C_new = C
+        A_new = collect(A)
+        B_new = collect(B)
+        alpha_new = alpha
+        beta_new = beta
+        # FIXME: Re-check hasmethod, and if no method, then convert and bounce C
+        @goto ready
+    end
+
+    C_new = C
+    A_new = A
+    B_new = B
+    alpha_new = alpha
+    beta_new = beta
+
+    @label ready
+    mam_new = LinearAlgebra.MulAddMul(alpha_new, beta_new)
+    return LinearAlgebra.generic_matmatmul!(
+        C_new,
+        transA,
+        transB,
+        A_new,
+        B_new,
+        mam_new
+    )
+end
+
 function LinearAlgebra.generic_matmatmul!(
     C::DMatrix{T},
     transA::Char,
@@ -41,6 +112,7 @@ function LinearAlgebra.generic_matmatmul!(
         return gemm_dagger!(C, transA, transB, A, B, alpha, beta)
     end
 end
+# FIXME: Mixed-precision methods
 function _repartition_matmatmul(C, A, B, transA::Char, transB::Char)
     partA = A.partitioning.blocksize
     partB = B.partitioning.blocksize
@@ -136,28 +208,28 @@ function gemm_dagger!(
                         # A: NoTrans / B: NoTrans
                         for k in range(1, Ant)
                             mzone = k == 1 ? beta : T(1.0)
-                            Dagger.@spawn BLAS.gemm!(
+                            Dagger.@spawn matmatmul!(
+                                InOut(Cc[m, n]),
                                 transA,
                                 transB,
-                                alpha,
                                 In(Ac[m, k]),
                                 In(Bc[k, n]),
+                                alpha,
                                 mzone,
-                                InOut(Cc[m, n]),
                             )
                         end
                     else
                         # A: NoTrans / B: [Conj]Trans
                         for k in range(1, Ant)
                             mzone = k == 1 ? beta : T(1.0)
-                            Dagger.@spawn BLAS.gemm!(
+                            Dagger.@spawn matmatmul!(
+                                InOut(Cc[m, n]),
                                 transA,
                                 transB,
-                                alpha,
                                 In(Ac[m, k]),
                                 In(Bc[n, k]),
+                                alpha,
                                 mzone,
-                                InOut(Cc[m, n]),
                             )
                         end
                     end
@@ -166,28 +238,28 @@ function gemm_dagger!(
                         # A: [Conj]Trans / B: NoTrans
                         for k in range(1, Amt)
                             mzone = k == 1 ? beta : T(1.0)
-                            Dagger.@spawn BLAS.gemm!(
+                            Dagger.@spawn matmatmul!(
+                                InOut(Cc[m, n]),
                                 transA,
                                 transB,
-                                alpha,
                                 In(Ac[k, m]),
                                 In(Bc[k, n]),
+                                alpha,
                                 mzone,
-                                InOut(Cc[m, n]),
                             )
                         end
                     else
                         # A: [Conj]Trans / B: [Conj]Trans
                         for k in range(1, Amt)
                             mzone = k == 1 ? beta : T(1.0)
-                            Dagger.@spawn BLAS.gemm!(
+                            Dagger.@spawn matmatmul!(
+                                InOut(Cc[m, n]),
                                 transA,
                                 transB,
-                                alpha,
                                 In(Ac[k, m]),
                                 In(Bc[n, k]),
+                                alpha,
                                 mzone,
-                                InOut(Cc[m, n]),
                             )
                         end
                     end
@@ -235,6 +307,7 @@ function syrk_dagger!(
 
     iscomplex = T <: Complex
     transs = iscomplex ? 'C' : 'T'
+    anti_transs = trans == 'N' ? transs : 'N'
 
     Dagger.spawn_datadeps() do
         for n in range(1, Cnt)
@@ -242,116 +315,63 @@ function syrk_dagger!(
                 # NoTrans
                 for k in range(1, Ant)
                     mzone = k == 1 ? real(beta) : one(real(T))
-                    if iscomplex
-                        Dagger.@spawn BLAS.herk!(
-                            uplo,
+                    _alpha = iscomplex ? real(alpha) : alpha
+                    Dagger.@spawn matmatmul!(
+                        InOut(Cc[n, n]),
+                        trans,
+                        anti_transs,
+                        In(Ac[n, k]),
+                        In(Ac[n, k]),
+                        _alpha,
+                        mzone,
+                    )
+                end
+                # NoTrans / Upper
+                for m in range(n + 1, Cmt)
+                    for k in range(1, Ant)
+                        mzone = k == 1 ? beta : one(T)
+                        Dagger.@spawn matmatmul!(
+                            InOut(Cc[n, m]),
                             trans,
-                            real(alpha),
+                            transs,
                             In(Ac[n, k]),
-                            mzone,
-                            InOut(Cc[n, n]),
-                        )
-                    else
-                        Dagger.@spawn BLAS.syrk!(
-                            uplo,
-                            trans,
+                            In(Ac[m, k]),
                             alpha,
-                            In(Ac[n, k]),
                             mzone,
-                            InOut(Cc[n, n]),
                         )
                     end
                 end
-                if uplo == 'L'
-                    # NoTrans / Lower
-                    for m in range(n + 1, Cmt)
-                        for k in range(1, Ant)
-                            mzone = k == 1 ? beta : one(T)
-                            Dagger.@spawn BLAS.gemm!(
-                                trans,
-                                transs,
-                                alpha,
-                                In(Ac[m, k]),
-                                In(Ac[n, k]),
-                                mzone,
-                                InOut(Cc[m, n]),
-                            )
-                        end
-                    end
-                else
-                    # NoTrans / Upper
-                    for m in range(n + 1, Cmt)
-                        for k in range(1, Ant)
-                            mzone = k == 1 ? beta : one(T)
-                            Dagger.@spawn BLAS.gemm!(
-                                trans,
-                                transs,
-                                alpha,
-                                In(Ac[n, k]),
-                                In(Ac[m, k]),
-                                mzone,
-                                InOut(Cc[n, m]),
-                            )
-                        end
-                    end
-                end
             else
                 # [Conj]Trans
                 for k in range(1, Amt)
                     mzone = k == 1 ? real(beta) : one(real(T))
-                    if iscomplex
-                        Dagger.@spawn BLAS.herk!(
-                            uplo,
+                    _alpha = iscomplex ? real(alpha) : alpha
+                    _trans = iscomplex ? transs : trans
+                    Dagger.@spawn matmatmul!(
+                        InOut(Cc[n, n]),
+                        _trans,
+                        anti_transs,
+                        In(Ac[k, n]),
+                        In(Ac[k, n]),
+                        _alpha,
+                        mzone,
+                    )
+                end
+                # [Conj]Trans / Upper
+                for m in range(n + 1, Cmt)
+                    for k in range(1, Amt)
+                        mzone = k == 1 ? beta : one(T)
+                        Dagger.@spawn matmatmul!(
+                            InOut(Cc[n, m]),
                             transs,
-                            real(alpha),
+                            'N',
                             In(Ac[k, n]),
-                            mzone,
-                            InOut(Cc[n, n]),
-                        )
-                    else
-                        Dagger.@spawn BLAS.syrk!(
-                            uplo,
-                            trans,
+                            In(Ac[k, m]),
                             alpha,
-                            In(Ac[k, n]),
                             mzone,
-                            InOut(Cc[n, n]),
                         )
                     end
                 end
-                if uplo == 'L'
-                    # [Conj]Trans / Lower
-                    for m in range(n + 1, Cmt)
-                        for k in range(1, Amt)
-                            mzone = k == 1 ? beta : one(T)
-                            Dagger.@spawn BLAS.gemm!(
-                                transs,
-                                'N',
-                                alpha,
-                                In(Ac[k, m]),
-                                In(Ac[k, n]),
-                                mzone,
-                                InOut(Cc[m, n]),
-                            )
-                        end
-                    end
-                else
-                    # [Conj]Trans / Upper
-                    for m in range(n + 1, Cmt)
-                        for k in range(1, Amt)
-                            mzone = k == 1 ? beta : one(T)
-                            Dagger.@spawn BLAS.gemm!(
-                                transs,
-                                'N',
-                                alpha,
-                                In(Ac[k, n]),
-                                In(Ac[k, m]),
-                                mzone,
-                                InOut(Cc[n, m]),
-                            )
-                        end
-                    end
-                end
             end
         end
     end
@@ -393,7 +413,7 @@ end
     return A
 end
 
-@inline function copytile!(A, B)
+function copytile!(A, B)
     m, n = size(A)
     C = B'
 
@@ -402,15 +422,14 @@ end
     end
 end
 
-@inline function copydiagtile!(A, uplo)
+function copydiagtile!(A, uplo)
     m, n = size(A)
-    Acpy = copy(A)
 
     if uplo == 'U'
-        C = UpperTriangular(Acpy)' + UpperTriangular(Acpy)
+        C = UpperTriangular(A)' + UpperTriangular(A)
         C[diagind(C)] .= A[diagind(A)]
     elseif uplo == 'L'
-        C = LowerTriangular(Acpy)' + Acpy - UpperTriangular(Acpy)
+        C = LowerTriangular(A)' + A - UpperTriangular(A)
         C[diagind(C)] .= A[diagind(A)]
     end
 
@@ -418,6 +437,7 @@ end
         A[i, j] = C[i, j]
     end
 end
+
 function LinearAlgebra.generic_matvecmul!(
     C::DVector{T},
     transA::Char,
@@ -464,6 +484,21 @@ function _repartition_matvecmul(C, A, B, transA::Char)
     partC = (dimA,)
     return Blocks(partC...), Blocks(partA...), Blocks(partB...)
 end
+"""
+    matvecmul!(C, transA::Char, A, B, alpha, beta)
+
+Tile-level matrix-vector multiply computing `C = alpha*op(A)*B + beta*C` in
+place on the (dense) output vector `C`, where `op` is determined by `transA`
+(`'N'`, `'T'`, `'C'`). Dispatches on the tile types: dense tiles use BLAS, while
+sparse tiles (e.g. `DSparseArray`) provide their own method (in a package
+extension) using a sparse matrix-vector product. This is the matvec analogue of
+[`matmatmul!`](@ref).
+"""
+function matvecmul!(C, transA::Char, A, B, alpha, beta)
+    BLAS.gemv!(transA, alpha, A, B, beta, C)
+    return C
+end
+
 function gemv_dagger!(
     C::DVector{T},
     transA::Char,
@@ -495,26 +530,26 @@ function gemv_dagger!(
                 # A: NoTrans
                 for k in range(1, Ant)
                     mzone = k == 1 ? beta : T(1.0)
-                    Dagger.@spawn BLAS.gemv!(
+                    Dagger.@spawn matvecmul!(
+                        InOut(Cc[m]),
                         transA,
-                        alpha,
                         In(Ac[m, k]),
                         In(Bc[k]),
+                        alpha,
                         mzone,
-                        InOut(Cc[m]),
                     )
                 end
             else
                 # A: [Conj]Trans
                 for k in range(1, Amt)
                     mzone = k == 1 ? beta : T(1.0)
-                    Dagger.@spawn BLAS.gemv!(
+                    Dagger.@spawn matvecmul!(
+                        InOut(Cc[m]),
                         transA,
-                        alpha,
                         In(Ac[k, m]),
                         In(Bc[k]),
+                        alpha,
                         mzone,
-                        InOut(Cc[m]),
                     )
                 end
             end
@@ -523,3 +558,53 @@ function gemv_dagger!(
 
     return C
 end
+
+wrap_as_darray(A::DArray) = A
+wrap_as_darray(A::Array) = view(A, AutoBlocks())
+function wrap_as_darray(A::SubArray{T,Nv,Array{T,Na}}) where {T,Nv,Na}
+    Ap = parent(A)
+    part = auto_blocks(map(last, parentindices(Ap)))
+    partsize = part.blocksize
+    inds = parentindices(A)
+    inds_ranges_parent = ntuple(i->to_range(inds[i]), Val(Na))
+    inds_ranges_view = ntuple(i->to_range(inds[i]), Val(Nv))
+    subdomains = partition(part, ArrayDomain(inds_ranges_parent))
+    nparts = size(subdomains)
+    chunks = Array{Any,Na}(undef, nparts...)
+    for idx in CartesianIndices(nparts)
+        subdomain_view = subdomains[idx]
+        subdomain_parent = ArrayDomain(ntuple(i->Nv >= i ? subdomain_view.indexes[i] : inds_ranges_parent[i], Val(Na)))
+        subinds = ntuple(i->subdomain_parent.indexes[i], Val(Na))
+        subA = view(Ap, subinds...)
+        chunks[idx] = tochunk(subA)
+    end
+    return DArray(T, ArrayDomain(inds_ranges_parent), subdomains, chunks, part)
+end
+
+# Generate generic_matvecmul! methods for all combinations of DArray, Array, and SubArray
+for CT in (DVector, Vector, SubArray{<:Any,1,<:Array}),
+    AT in (DMatrix, Matrix, SubArray{<:Any,2,<:Array}),
+    BT in (DVector, Vector, SubArray{<:Any,1,<:Array})
+
+    # Don't commit type piracy
+    CT isa DArray || AT isa DArray || BT isa DArray || continue
+
+    @eval function LinearAlgebra.generic_matvecmul!(
+        C::$(CT),
+        transA::Char,
+        A::$(AT),
+        B::$(BT),
+        _add::LinearAlgebra.MulAddMul,
+    )
+        new_C = wrap_as_darray(C)
+        new_A = wrap_as_darray(A)
+        new_B = wrap_as_darray(B)
+        return LinearAlgebra.generic_matvecmul!(
+            new_C,
+            transA,
+            new_A,
+            new_B,
+            _add,
+        )
+    end
+end
\ No newline at end of file
diff --git a/src/array/sparse.jl b/src/array/sparse.jl
new file mode 100644
index 000000000..3e56c37b0
--- /dev/null
+++ b/src/array/sparse.jl
@@ -0,0 +1,152 @@
+"""
+    DSparseArray{T,N} <: AbstractArray{T,N}
+
+A sparse array container, for which the contained array may be replaced with a
+new one to support in-place operations. Designed to work well with Datadeps
+algorithms: writes that reallocate (and grow/shrink) the inner sparse storage
+are hidden behind the wrapper's stable identity, so Datadeps aliasing tracking
+remains valid (see `aliases_as_whole`).
+
+`DSparseVector{T}` and `DSparseMatrix{T}` are aliases for the 1- and 2-dimensional
+cases. The wrapper is general over `N` so it can hold sparse vectors, matrices,
+and (eventually) higher-order sparse tensors (e.g. Finch tensors).
+"""
+mutable struct DSparseArray{T,N} <: AbstractArray{T,N}
+    mat
+end
+const DSparseVector{T} = DSparseArray{T,1}
+const DSparseMatrix{T} = DSparseArray{T,2}
+
+# Convenience constructor inferring element type and dimensionality from `x`.
+DSparseArray(x) = DSparseArray{eltype(x),ndims(x)}(x)
+
+_sparse_collect(M) = collect(M)
+# Allocate a destination tile matching the inner storage's sparse backend. The
+# default delegates to the storage type's own `similar`, which carries the
+# concrete backend (e.g. `SparseMatrixCSC`) forward to the destination -- this is
+# how `similar(::DArray)` propagates a tile's sparse type to a newly-allocated
+# result DArray. Backends whose `similar` is unsuitable (e.g. Finch, whose generic
+# `similar` yields tensor formats that destabilize later kernels) override this.
+_sparse_similar(mat, ::Type{T}, dims::Dims) where {T} = similar(mat, T, dims)
+
+Base.eltype(M::DSparseArray{T}) where T = T
+Base.size(M::DSparseArray) = size(M.mat)
+Base.iterate(M::DSparseArray) = iterate(M.mat)
+Base.iterate(M::DSparseArray, state) = iterate(M.mat, state)
+Base.similar(M::DSparseArray, ::Type{T}, dims::Dims) where T =
+    DSparseArray{T,length(dims)}(_sparse_similar(M.mat, T, dims))
+Base.collect(M::DSparseArray) = _sparse_collect(M.mat)
+
+# N.B. hash and aliasing shouldn't change even if M.mat changes.
+# This is the key property that makes `DSparseArray` safe for Datadeps:
+# the aliasing identity is tied to the (stable) wrapper object, not to the
+# inner storage, which may be reallocated (and grow/shrink) on writes.
+Base.hash(M::DSparseArray, h::UInt) = hash(objectid(M), hash(DSparseArray, h))
+
+# Sparse containers must alias as a single, indivisible unit. Their storage is
+# reallocated (and may grow/shrink) on writes, so it is unsafe to alias any
+# *part* of the container independently. We therefore force *all* access to a
+# `DSparseArray` to resolve to the container's stable whole-object
+# `ObjectAliasing` -- including access via views, transposes, adjoints, and
+# reshapes -- rather than e.g. a `StridedAliasing` over storage that may have
+# since moved. `aliases_as_whole` opts the type into this behavior, and Datadeps'
+# `aliasing_root` resolves any wrapper of a `DSparseArray` back to the container
+# before computing aliasing. Note: this must return a *bare* `ObjectAliasing`, as
+# `aliases_as_whole(::ObjectAliasing)` relies on it to drive whole-object copies.
+aliasing(M::DSparseArray, _=identity) = ObjectAliasing(M)
+aliases_as_whole(::DSparseArray) = true
+
+# A `DSparseArray` has no meaningful raw data pointer (its storage may be
+# reallocated/resized). This trap ensures that if aliasing ever tries to treat a
+# wrapper of a `DSparseArray` as strided memory -- e.g. a new array-wrapper type
+# that `aliasing_root` failed to resolve via `Base.parent` -- it fails loudly
+# instead of silently corrupting Datadeps' aliasing tracking.
+function Base.pointer(::DSparseArray)
+    throw(ArgumentError("`pointer(::DSparseArray)` is intentionally unsupported: \
+        a DSparseArray may reallocate its storage, so it must be aliased as a \
+        whole object via `Dagger.aliasing`. If you reached here through Datadeps \
+        aliasing of an array wrapper, ensure that wrapper implements `Base.parent` \
+        so that `Dagger.aliasing_root` can resolve it to the DSparseArray."))
+end
+
+# Forward indexing to the inner array. Ranges/colons are supported so that
+# views (used by copy-buffering between mismatched partitionings) work.
+Base.getindex(M::DSparseArray, I...) = getindex(M.mat, I...)
+Base.setindex!(M::DSparseArray, v, I...) = setindex!(M.mat, v, I...)
+
+# Whole-tile copy. This is how Datadeps moves a tile between workers via
+# `move!`: we *reallocate* the inner storage rather than mutating in place,
+# since sparse storage cannot generally be updated in place. The wrapper's
+# identity (and thus its aliasing) is preserved.
+function Base.copyto!(dst::DSparseArray, src::DSparseArray)
+    dst.mat = _sparse_copy(src.mat)
+    return dst
+end
+# Backend-overridable deep copy of the inner storage (Finch tensors don't define
+# `Base.copy`).
+_sparse_copy(mat) = copy(mat)
+
+# Wrapping hook used when materializing tiles (e.g. in `distribute`). Backends
+# overload this (e.g. in package extensions) to wrap freshly-created tiles in a
+# container that Datadeps can track (such as `DSparseArray` for sparse tiles);
+# everything else is left untouched.
+maybe_wrap_tile(x) = x
+
+# Partial-range tile copy used by copy-buffering (e.g. when matmul operands need
+# to be repartitioned). Some backends (Finch) forbid `setindex!`, so we route
+# through a backend hook that returns the (possibly reallocated) inner storage.
+# `DSparseArray` hides the reallocation from Datadeps.
+function copyto_view!(Bpart::DSparseArray, Brange, Apart, Arange)
+    Bpart.mat = _sparse_copyto_view!(Bpart.mat, Brange, view(Apart, Arange))
+    return
+end
+# Default: in-place element-wise copy (works for `setindex!`-capable backends
+# such as `SparseArrays`). May reallocate and return new storage for backends
+# that can't update in place, so callers must use the returned value.
+function _sparse_copyto_view!(mat, Brange, src)
+    copyto!(view(mat, Brange), src)
+    return mat
+end
+
+#Base.BroadcastStyle(::Type{<:DSparseArray}) = FinchStyle()
+
+LinearAlgebra.norm2(M::DSparseArray) = LinearAlgebra.norm2(M.mat)
+
+# Matrix-specific operations dispatch on the 2-D alias `DSparseMatrix`.
+function matmatmul!(
+    C::DSparseMatrix,
+    transA::Char,
+    transB::Char,
+    A::DSparseMatrix,
+    B::DSparseMatrix,
+    alpha,
+    beta
+)
+    # Forward to sparse matmatmul!, but allow in-place update of C
+    return matmatmul!(C, transA, transB, A.mat, B.mat, alpha, beta)
+end
+
+# Sparse matrix-vector multiply: `C = alpha*op(A)*B + beta*C` with a sparse
+# matrix tile `A` and dense vectors `B`/`C`. Unwrap to the inner storage so each
+# backend can provide an efficient (in-place on dense `C`) method.
+function matvecmul!(C, transA::Char, A::DSparseMatrix, B, alpha, beta)
+    return matvecmul!(C, transA, A.mat, B, alpha, beta)
+end
+
+# Factorize the inner sparse storage (e.g. sparse LU/UMFPACK) for block-Jacobi,
+# rather than the `DSparseArray` wrapper (whose generic `lu` would densify).
+_factorize_tile(M::DSparseArray) = LinearAlgebra.lu(M.mat)
+
+# Expose the inner sparse storage so preconditioner backends (ILU, AMG) build on
+# the actual `SparseMatrixCSC` rather than the `DSparseArray` wrapper.
+_tile_matrix(M::DSparseArray) = M.mat
+
+function transpose_tile end
+function copytile!(A::DSparseMatrix, B::DSparseMatrix)
+    A.mat = transpose_tile(B.mat)
+    return
+end
+function copydiagtile!(A::DSparseMatrix, uplo)
+    A.mat = transpose_tile(A.mat, uplo)
+    return
+end
\ No newline at end of file
diff --git a/src/array/sparse_partition.jl b/src/array/sparse_partition.jl
deleted file mode 100644
index 26efe3e6a..000000000
--- a/src/array/sparse_partition.jl
+++ /dev/null
@@ -1,37 +0,0 @@
-export partition_sparse
-
-function partition_sparse(colptr, nnz, sz, nparts)
-    nnz_per_chunk = nnz/nparts
-    dom = ArrayDomain(map(x->1:x, sz))
-
-    nxt = nnz_per_chunk
-    colstart = 1
-    splits = UnitRange[]
-
-    for i=1:nparts-1
-       idxs = searchsorted(colptr, nxt)
-       if length(idxs) == 0
-           # the exact index was not found. latch on to the nearest column boundary
-           idx = first(idxs)
-           if abs(colptr[idx]-nxt) < abs(colptr[idx-1]-nxt)
-               nextidx = idx
-           else
-               nextidx = idx-1
-           end
-       else
-           nextidx = last(idxs)
-       end
-       push!(splits, colstart:nextidx)
-       colstart=nextidx+1
-       nxt=nxt + nnz_per_chunk
-    end
-    push!(splits, colstart:(length(colptr)-1))
-
-
-    cumlength = cumsum(map(length, splits))
-    subdomains = DomainBlocks((1,1), ([size(dom, 1)], cumlength))
-    DomainSplit(dom, subdomains)
-end
-
-partition_sparse(S::SparseMatrixCSC, nparts) =
-    partition_sparse(S.colptr, length(S.nzval), size(S), nparts)
diff --git a/src/datadeps/aliasing.jl b/src/datadeps/aliasing.jl
index c3e0ed20b..bd49107aa 100644
--- a/src/datadeps/aliasing.jl
+++ b/src/datadeps/aliasing.jl
@@ -567,8 +567,12 @@ function aliasing!(state::DataDepsState, target_space::MemorySpace, arg_w::Argum
         return state.ainfo_cache[remote_arg_w]
     end
 
-    # Calculate the ainfo
-    ainfo = AliasingWrapper(aliasing(remote_arg, arg_w.dep_mod))
+    # Calculate the ainfo. `aliasing_unwrapped` resolves any wrapper of a
+    # whole-object container (e.g. a view of a `DSparseArray`) to the container
+    # itself, so aliasing is computed on the container -- never on a partial view
+    # of storage that may have been reallocated. `remote_arg` is local here (for
+    # `Chunk`s the unwrap+aliasing happen remotely inside `aliasing(::Chunk, ...)`).
+    ainfo = AliasingWrapper(aliasing_unwrapped(remote_arg, arg_w.dep_mod))
 
     # Cache the result
     state.ainfo_cache[remote_arg_w] = ainfo
diff --git a/src/datadeps/chunkview.jl b/src/datadeps/chunkview.jl
index 1c2aa600f..c769f4b37 100644
--- a/src/datadeps/chunkview.jl
+++ b/src/datadeps/chunkview.jl
@@ -35,7 +35,10 @@ function aliasing(x::ChunkView{N}) where N
     return remotecall_fetch(root_worker_id(x.chunk.processor), x.chunk, x.slices) do x, slices
         x = unwrap(x)
         v = view(x, slices...)
-        return aliasing(v)
+        # A view of a whole-object container (e.g. `DSparseArray`) must alias the
+        # entire container; `aliasing_unwrapped` resolves that (otherwise it just
+        # aliases the view), and crucially does so here where `v` lives.
+        return aliasing_unwrapped(v)
     end
 end
 memory_space(x::ChunkView) = memory_space(x.chunk)
diff --git a/src/datadeps/remainders.jl b/src/datadeps/remainders.jl
index 2c2c49920..8a74d3839 100644
--- a/src/datadeps/remainders.jl
+++ b/src/datadeps/remainders.jl
@@ -43,6 +43,7 @@ memory_spans(mra::MultiRemainderAliasing) = vcat(memory_spans.(mra.remainders)..
 Base.hash(mra::MultiRemainderAliasing, h::UInt) = hash(mra.remainders, hash(MultiRemainderAliasing, h))
 Base.:(==)(mra1::MultiRemainderAliasing, mra2::MultiRemainderAliasing) = mra1.remainders == mra2.remainders
 
+# A whole-object copy from the argument's recorded owner space.
 struct FullCopy end
 
 """
@@ -87,9 +88,34 @@ function compute_remainder_for_arg!(state::DataDepsState,
                                     target_space::MemorySpace,
                                     arg_w::ArgumentWrapper,
                                     write_num::Int; compute_syncdeps::Bool=true)
+    owner_space = state.arg_owner[arg_w]
+
+    # Whole-object containers (e.g. `DSparseArray`) reallocate their entire
+    # storage on every write, so they can never be *partially* current in a
+    # memory space -- the whole object is current in exactly one space: the most
+    # recent writer (or the origin owner if never written). There is thus never a
+    # meaningful span "remainder" to compute, so we short-circuit to a whole-object
+    # `FullCopy` from the owner space (or `NoAliasing` if the target already holds
+    # it). This avoids the span-diff computation and the construction + transfer of
+    # a `RemainderAliasing` entirely. We detect this from the aliasing info, which
+    # is a bare `ObjectAliasing` for such containers (see `aliases_as_whole`).
+    #
+    # `arg_owner` is the most-recent *direct* writer of `arg_w`, while the merged
+    # `arg_history` records the latest write through *any* overlapping argument.
+    # For every path Datadeps currently exercises a whole-object container is only
+    # ever reached through a single argument, so these agree -- the assertion below
+    # guards that invariant. They could only diverge under cross-argument aliasing
+    # of a whole-object container (the same container written through one argument
+    # and read through a different, overlapping one); supporting that would require
+    # sourcing the copy from `history[end].space` instead of `arg_owner`.
+    if aliases_as_whole(aliasing!(state, target_space, arg_w))
+        history = state.arg_history[arg_w]
+        @assert isempty(history) || history[end].space == owner_space "whole-object container reached through a different overlapping argument than its owner; cross-argument aliasing of whole-object containers is unsupported"
+        return owner_space == target_space ? (NoAliasing(), 0) : (FullCopy(), 0)
+    end
+
     spaces_set = Set{MemorySpace}()
     push!(spaces_set, target_space)
-    owner_space = state.arg_owner[arg_w]
     push!(spaces_set, owner_space)
 
     @label restart
@@ -419,6 +445,9 @@ end
 # Main copy function for RemainderAliasing
 function move!(dep_mod::RemainderAliasing{S}, to_space::MemorySpace, from_space::MemorySpace, to::Chunk, from::Chunk) where S
     # TODO: Support direct copy between GPU memory spaces
+    # N.B. Whole-object containers (e.g. `DSparseArray`) never reach here: their
+    # storage reallocates on write, so they can't be span-copied. They are routed
+    # to a whole-object `FullCopy` in `compute_remainder_for_arg!` instead.
 
     # Copy the data from the source object
     copies = remotecall_fetch(root_worker_id(from_space), from_space, dep_mod, from) do from_space, dep_mod, from
@@ -523,18 +552,4 @@ function write_remainder!(copies::Vector{UInt8}, copies_offset::UInt64, to::Base
     else
         write_remainder!(copies, copies_offset, to[], to_ptr, n)
     end
-end
-
-function find_object_holding_ptr(A::SparseMatrixCSC, ptr::UInt64)
-    span = LocalMemorySpan(pointer(A.nzval), length(A.nzval)*sizeof(eltype(A.nzval)))
-    if span_start(span) <= ptr <= span_end(span)
-        return A.nzval
-    end
-    span = LocalMemorySpan(pointer(A.colptr), length(A.colptr)*sizeof(eltype(A.colptr)))
-    if span_start(span) <= ptr <= span_end(span)
-        return A.colptr
-    end
-    span = LocalMemorySpan(pointer(A.rowval), length(A.rowval)*sizeof(eltype(A.rowval)))
-    @assert span_start(span) <= ptr <= span_end(span) "Pointer $ptr not found in SparseMatrixCSC"
-    return A.rowval
 end
\ No newline at end of file
diff --git a/src/file-io.jl b/src/file-io.jl
index 1407fbced..15405d049 100644
--- a/src/file-io.jl
+++ b/src/file-io.jl
@@ -1,5 +1,4 @@
 export save, load
-using SparseArrays
 import MemPool: GenericFileDevice, FileRef
 
 struct File
diff --git a/src/memory-spaces.jl b/src/memory-spaces.jl
index eb4f7ad5b..5d06edd64 100644
--- a/src/memory-spaces.jl
+++ b/src/memory-spaces.jl
@@ -390,17 +390,90 @@ end
 aliasing(::String) = NoAliasing() # FIXME: Not necessarily true
 aliasing(::Symbol) = NoAliasing()
 aliasing(::Type) = NoAliasing()
+
+"""
+    aliases_as_whole(x) -> Bool
+
+Whether `x` -- either an object, or an already-computed aliasing -- must be
+treated as a single, indivisible unit, i.e. it is never safe to alias (or to
+consider current) only *part* of it.
+
+For an object: containers whose backing storage may be reallocated or resized on
+write (such as `DSparseArray`) return `true`. This causes [`aliasing_root`](@ref)
+to resolve any view/wrapper of them to the whole container, so all access funnels
+through the container's own `aliasing`. By contract, such a container's `aliasing`
+must be a bare [`ObjectAliasing`](@ref).
+
+For an aliasing: a bare `ObjectAliasing` (the aliasing of a whole-object
+container) reports `true`. Datadeps uses this to copy such arguments as a whole
+(a `FullCopy`) rather than computing per-span remainders, since they can never be
+*partially* current in a memory space.
+"""
+aliases_as_whole(@nospecialize x) = false
+aliases_as_whole(ainfo::AliasingWrapper) = aliases_as_whole(ainfo.inner)
+aliases_as_whole(::ObjectAliasing) = true
+
+"""
+    aliasing_root(x)
+
+Resolve `x` down to the whole-object container it wraps: peel array wrappers
+(views, transposes, adjoints, reshapes, permutations, ...) off of `x` until
+reaching a container that must alias as a whole (see [`aliases_as_whole`](@ref)),
+and return that container. If no such container is wrapped, return `x` unchanged.
+
+This relies solely on the standard `Base.parent` interface, so it transparently
+handles *any* array wrapper without per-wrapper `aliasing` methods: a new wrapper
+type needs no special-casing here, and a new whole-object container only needs to
+define [`aliases_as_whole`](@ref) (plus its own `aliasing` method, which must
+return a bare `ObjectAliasing`). A trapping `Base.pointer` on such containers
+guards against a wrapper slipping through and being misinterpreted as strided
+memory.
+
+!!! warning
+    The resolved object's *identity* defines its aliasing, so this is only
+    meaningful in the memory space where `x` physically resides. Never return its
+    result across a worker boundary and *then* compute `aliasing` -- the transfer
+    copies the object and changes its aliasing. Use [`aliasing_unwrapped`](@ref),
+    which fuses the two operations so they always run together.
+"""
+aliasing_root(@nospecialize x) = x
+function aliasing_root(x::AbstractArray)
+    aliases_as_whole(x) && return x
+    p = parent(x)
+    # `Base.parent` returns the argument itself for non-wrapper arrays, which
+    # terminates the recursion.
+    p === x && return x
+    root = aliasing_root(p)
+    return aliases_as_whole(root) ? root : x
+end
+
+"""
+    aliasing_unwrapped(x[, dep_mod])
+
+Compute the `aliasing` of `x`, first resolving (via [`aliasing_root`](@ref)) any
+whole-object container that `x` wraps. This is the entry point Datadeps uses to
+compute the aliasing of a (possibly wrapped) argument.
+
+Unwrapping and `aliasing` are intentionally fused into a single call so they
+always execute in the same place. It MUST be evaluated where `x` physically lives
+-- e.g. *inside* a `Chunk`'s `remotecall_fetch` block -- because the unwrapped
+object's identity determines its aliasing; returning the unwrapped object across
+a worker boundary first would copy it and silently change the result.
+"""
+aliasing_unwrapped(x) = aliasing(aliasing_root(x))
+aliasing_unwrapped(x, dep_mod) = aliasing(aliasing_root(x), dep_mod)
+
 function aliasing(x::Chunk, T)
     @assert x.handle isa DRef
     if root_worker_id(x.processor) == myid()
-        return aliasing(unwrap(x), T)
+        return aliasing_unwrapped(unwrap(x), T)
     end
     return remotecall_fetch(root_worker_id(x.processor), x, T) do x, T
-        aliasing(unwrap(x), T)
+        aliasing_unwrapped(unwrap(x), T)
     end
 end
 aliasing(x::Chunk) = remotecall_fetch(root_worker_id(x.processor), x) do x
-    aliasing(unwrap(x))
+    aliasing_unwrapped(unwrap(x))
 end
 aliasing(x::DTask, T) = aliasing(fetch(x; raw=true), T)
 aliasing(x::DTask) = aliasing(fetch(x; raw=true))
diff --git a/test/Project.toml b/test/Project.toml
index af2d30b14..53cd012ac 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,4 +1,5 @@
 [deps]
+AlgebraicMultigrid = "2169fc97-5a83-5252-b627-83903c6c433c"
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"
 Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
 CondaPkg = "992eb4ea-22a4-4c89-a5bb-47a3300528ab"
@@ -9,8 +10,10 @@ FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0"
 Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
+IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895"
 JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
+Krylov = "ba0b0d4f-ebba-5204-a429-3ac8c609bfb7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
 OnlineStats = "a15396b6-48d5-5d58-9928-6d29437db91e"
diff --git a/test/array/linalg/core.jl b/test/array/linalg/core.jl
index 4c66ac34a..5de6fefd1 100644
--- a/test/array/linalg/core.jl
+++ b/test/array/linalg/core.jl
@@ -23,3 +23,57 @@ end
     DA = DArray(A)
     @test isapprox(norm(A), norm(DA))
 end
+
+@testset "BLAS-1 vector ops" begin
+    @testset "T=$T part=$(part.blocksize)" for T in (Float64, ComplexF64), part in (Blocks(16), Blocks(4))
+        n = 16
+        x = rand(T, n)
+        y = rand(T, n)
+        a = T(2)
+        b = T(3)
+        Dx = distribute(x, part)
+
+        # dot (conjugates first arg for complex)
+        @test dot(Dx, distribute(y, part)) ≈ dot(x, y)
+
+        # axpy!: y += a*x
+        Dy = distribute(copy(y), part)
+        axpy!(a, Dx, Dy)
+        @test collect(Dy) ≈ a .* x .+ y
+
+        # axpby!: y = a*x + b*y
+        Dy = distribute(copy(y), part)
+        axpby!(a, Dx, b, Dy)
+        @test collect(Dy) ≈ a .* x .+ b .* y
+
+        # rmul!/lmul!: scale in place
+        Dz = distribute(copy(x), part)
+        rmul!(Dz, a)
+        @test collect(Dz) ≈ a .* x
+        Dz = distribute(copy(x), part)
+        lmul!(a, Dz)
+        @test collect(Dz) ≈ a .* x
+    end
+
+    # Operands with *different* partitionings must still work (aligned internally
+    # via maybe_copy_buffered), and the in-place result keeps its own layout.
+    @testset "mismatched partitionings T=$T" for T in (Float64, ComplexF64)
+        n = 16
+        x = rand(T, n)
+        y = rand(T, n)
+        a, b = T(2), T(3)
+        Dx = distribute(x, Blocks(4))
+        Dy = distribute(y, Blocks(8))
+
+        @test dot(Dx, Dy) ≈ dot(x, y)
+
+        Dy2 = distribute(copy(y), Blocks(8))
+        axpy!(a, Dx, Dy2)
+        @test collect(Dy2) ≈ a .* x .+ y
+        @test Dy2.partitioning == Blocks(8)   # output layout preserved
+
+        Dy3 = distribute(copy(y), Blocks(8))
+        axpby!(a, Dx, b, Dy3)
+        @test collect(Dy3) ≈ a .* x .+ b .* y
+    end
+end
diff --git a/test/array/linalg/iterativesolvers.jl b/test/array/linalg/iterativesolvers.jl
new file mode 100644
index 000000000..ca7865f29
--- /dev/null
+++ b/test/array/linalg/iterativesolvers.jl
@@ -0,0 +1,206 @@
+# Distributed iterative (Krylov) linear-solver tests.
+#
+# Exercises the matrix-free Krylov integration (`Dagger.cg`/`minres`/`gmres`/
+# `bicgstab` + the generic `krylov_solve`) over both dense and sparse-backed
+# `DMatrix` operators, plus the Jacobi preconditioner. Reference solutions come
+# from a dense direct solve.
+#
+#     julia test/runtests.jl --test array/linalg/iterativesolvers
+
+using Krylov
+using AlgebraicMultigrid
+using IncompleteLU
+
+# Strongly diagonally-dominant tridiagonal SPD matrix. The large diagonal keeps
+# the condition number small so the Krylov methods converge in a handful of
+# iterations (keeping the distributed test fast). `inv(diag) == 1/4`.
+const SPD_DIAG = 4.0
+
+laplacian_1d(T, n) = SparseArrays.spdiagm(
+    -1 => fill(-one(T), n - 1),
+     0 => fill(T(SPD_DIAG), n),
+     1 => fill(-one(T), n - 1),
+)
+
+# Add a first-order advection term -> nonsymmetric, still well-conditioned.
+function advection_diffusion_1d(T, n)
+    return laplacian_1d(T, n) + SparseArrays.spdiagm(
+        -1 => fill(T(-3) / 10, n - 1),
+         1 => fill(T(3) / 10, n - 1),
+    )
+end
+
+@testset "Iterative solvers (Krylov)" begin
+    n = 64
+    k = 16
+    Db_part = Blocks(k)
+    A_part = Blocks(k, k)
+
+    @testset "SPD operator ($(backend))" for backend in (:dense, :sparse)
+        Asp = laplacian_1d(Float64, n)
+        Adense = Matrix(Asp)
+        b = rand(n)
+        xref = Adense \ b
+
+        DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part)
+        Db = distribute(b, Db_part)
+
+        @testset "$(nameof(solver))" for solver in (Dagger.cg, Dagger.minres, Dagger.gmres, Dagger.bicgstab)
+            x, stats = solver(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test x isa Dagger.DVector
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+
+        # Generic entry point.
+        x, stats = Dagger.krylov_solve(:cg, DA, Db; atol = 1e-12, rtol = 1e-10)
+        @test stats.solved
+        @test collect(x) ≈ xref rtol = 1e-6
+    end
+
+    @testset "nonsymmetric operator ($(backend))" for backend in (:dense, :sparse)
+        Asp = advection_diffusion_1d(Float64, n)
+        b = rand(n)
+        xref = Matrix(Asp) \ b
+
+        DA = backend === :dense ? distribute(Matrix(Asp), A_part) : distribute(Asp, A_part)
+        Db = distribute(b, Db_part)
+
+        @testset "$(nameof(solver))" for solver in (Dagger.gmres, Dagger.bicgstab)
+            x, stats = solver(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+    end
+
+    @testset "complex SPD (Hermitian) operator" begin
+        # Real SPD tridiagonal is Hermitian as a complex matrix.
+        Asp = SparseArrays.spdiagm(
+            -1 => fill(ComplexF64(-1), n - 1),
+             0 => fill(ComplexF64(SPD_DIAG), n),
+             1 => fill(ComplexF64(-1), n - 1),
+        )
+        b = rand(ComplexF64, n)
+        xref = Matrix(Asp) \ b
+        DA = distribute(Asp, A_part)
+        Db = distribute(b, Db_part)
+        x, stats = Dagger.cg(DA, Db; atol = 1e-12, rtol = 1e-10, itmax = 500)
+        @test stats.solved
+        @test collect(x) ≈ xref rtol = 1e-6
+    end
+
+    @testset "Jacobi preconditioner" begin
+        Asp = laplacian_1d(Float64, n)
+        b = rand(n)
+        xref = Matrix(Asp) \ b
+
+        @testset "build + apply ($(backend))" for backend in (:dense, :sparse)
+            DA = backend === :dense ? distribute(Matrix(Asp), A_part) : distribute(Asp, A_part)
+            Db = distribute(b, Db_part)
+
+            P = Dagger.JacobiPreconditioner(DA)
+            @test collect(P.dinv) ≈ fill(1 / SPD_DIAG, n)   # 1/diag
+
+            # Apply: y = M⁻¹ x = dinv .* x.
+            y = similar(Db)
+            mul!(y, P, Db)
+            @test collect(y) ≈ (1 / SPD_DIAG) .* b
+
+            x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+
+        # Non-square block grid must be rejected with a helpful error.
+        DA_ragged = distribute(Matrix(Asp), Blocks(k, k ÷ 2))
+        @test_throws ArgumentError Dagger.JacobiPreconditioner(DA_ragged)
+    end
+
+    @testset "block-Jacobi preconditioner" begin
+        Asp = laplacian_1d(Float64, n)
+        Adense = Matrix(Asp)
+        b = rand(n)
+        xref = Adense \ b
+
+        # Reference: apply the exact block-diagonal inverse.
+        yref = similar(b)
+        for s in 1:k:n
+            r = s:min(s + k - 1, n)
+            yref[r] = Adense[r, r] \ b[r]
+        end
+
+        @testset "build + apply ($(backend))" for backend in (:dense, :sparse)
+            DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part)
+            Db = distribute(b, Db_part)
+
+            P = Dagger.BlockJacobiPreconditioner(DA)
+            y = similar(Db)
+            mul!(y, P, Db)
+            @test collect(y) ≈ yref
+
+            x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+
+        # A single tile makes block-Jacobi an *exact* solve, so PCG converges
+        # essentially immediately.
+        DA1 = distribute(Adense, Blocks(n, n))
+        Db1 = distribute(b, Blocks(n))
+        P1 = Dagger.BlockJacobiPreconditioner(DA1)
+        x1, s1 = Dagger.cg(DA1, Db1; M = P1, atol = 1e-12, rtol = 1e-10, itmax = 500)
+        @test s1.solved
+        @test s1.niter <= 2
+        @test collect(x1) ≈ xref rtol = 1e-8
+
+        @test_throws ArgumentError Dagger.BlockJacobiPreconditioner(distribute(Adense, Blocks(k, k ÷ 2)))
+    end
+
+    @testset "AMG preconditioner ($(method))" for method in (:ruge_stuben, :smoothed_aggregation)
+        Asp = laplacian_1d(Float64, n)
+        Adense = Matrix(Asp)
+        b = rand(n)
+        xref = Adense \ b
+
+        @testset "$(backend)" for backend in (:dense, :sparse)
+            DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part)
+            Db = distribute(b, Db_part)
+
+            P = Dagger.AMGPreconditioner(DA; method = method)
+            # Apply is a V-cycle approximating M⁻¹ x; just check it runs + is finite
+            # (and repeatable, exercising the cached, pinned hierarchy).
+            y1 = similar(Db); mul!(y1, P, Db)
+            y2 = similar(Db); mul!(y2, P, Db)
+            @test all(isfinite, collect(y1))
+            @test collect(y1) ≈ collect(y2)
+
+            x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+    end
+
+    @testset "block-ILU preconditioner" begin
+        Asp = laplacian_1d(Float64, n)
+        Adense = Matrix(Asp)
+        b = rand(n)
+        xref = Adense \ b
+
+        @testset "build + apply ($(backend))" for backend in (:dense, :sparse)
+            DA = backend === :dense ? distribute(Adense, A_part) : distribute(Asp, A_part)
+            Db = distribute(b, Db_part)
+
+            P = Dagger.BlockILUPreconditioner(DA; τ = 0.01)
+            y1 = similar(Db); mul!(y1, P, Db)
+            y2 = similar(Db); mul!(y2, P, Db)
+            @test all(isfinite, collect(y1))
+            @test collect(y1) ≈ collect(y2)
+
+            x, stats = Dagger.cg(DA, Db; M = P, atol = 1e-12, rtol = 1e-10, itmax = 500)
+            @test stats.solved
+            @test collect(x) ≈ xref rtol = 1e-6
+        end
+
+        @test_throws ArgumentError Dagger.BlockILUPreconditioner(distribute(Adense, Blocks(k, k ÷ 2)); τ = 0.01)
+    end
+end
diff --git a/test/array/linalg/matmul.jl b/test/array/linalg/matmul.jl
index ebc9d8ddc..f8b090d8a 100644
--- a/test/array/linalg/matmul.jl
+++ b/test/array/linalg/matmul.jl
@@ -29,36 +29,71 @@ function test_gemm!(T, szA, szB, partA, partB)
     DA = distribute(A, partA)
     DB = distribute(B, partB)
 
+    SA = sprand(T, szA..., 0.1)
+    SB = sprand(T, szA..., 0.1)
+
+    DSA = distribute(SA, partA)
+    DSB = distribute(SB, partB)
+
     ## Out-of-place gemm
     # No transA, No transB
+    # Dense
     DC = DA * DB
     C = A * B
     @test collect(DC) ≈ C
+    # Sparse
+    DSC = DSA * DSB
+    SC = SA * SB
+    @test collect(DSC) ≈ SC
 
     if szA == szB
         # No transA, transB
+        # Dense
         DC = DA * DB'
         C = A * B'
         @test collect(DC) ≈ C
+        # Sparse
+        DSC = DSA * DSB'
+        SC = SA * SB'
+        @test collect(DSC) ≈ SC
 
         # transA, No transB
+        # Dense
         DC = DA' * DB
         C = A' * B
         @test collect(DC) ≈ C
+        # Sparse
+        DSC = DSA' * DSB
+        SC = SA' * SB
+        @test collect(DSC) ≈ SC
     end
 
     # transA, transB
+    # Dense
     DC = DA' * DB'
     C = A' * B'
     @test collect(DC) ≈ C
+    #= Sparse
+    DSC = DSA' * DSB'
+    SC = SA' * SB'
+    @test collect(DSC) ≈ SC
+    =#
 
     ## In-place gemm
     # No transA, No transB
+    # Dense
     C = zeros(T, szC...)
     DC = distribute(C, partC)
     mul!(C, A, B)
     mul!(DC, DA, DB)
     @test collect(DC) ≈ C
+    #= Sparse
+    SC = zeros(T, szC...)
+    DSC = distribute(SC, partC)
+    mul!(SC, SA, SB)
+    mul!(DSC, DSA, DSB)
+    @test collect(DSC) ≈ SC
+    =#
 
     if szA == szB
         # No transA, transB
diff --git a/test/array/linalg/matmul_finch.jl b/test/array/linalg/matmul_finch.jl
new file mode 100644
index 000000000..061874f1e
--- /dev/null
+++ b/test/array/linalg/matmul_finch.jl
@@ -0,0 +1,105 @@
+# Fast, focused sparse matrix-multiply tests for the Finch backend.
+#
+# Mirrors `matmul_sparse.jl` (SparseArrays) but exercises Finch tensor tiles via
+# `DSparseMatrix`. Finch is an optional (weak) dependency, so this test loads it
+# on all workers and skips gracefully if it cannot be loaded.
+#
+# Run with:
+#
+#     julia test/runtests.jl --test array/linalg/matmul_finch
+#
+# N.B. Finch precompilation is heavy (several minutes); this test is intended for
+# local validation rather than fast CI.
+
+# Make the test-project environment (which provides Finch) resolvable on the
+# workers, which are launched with Dagger's project.
+const _FINCH_TEST_ENV = abspath(joinpath(@__DIR__, "..", ".."))
+@everywhere pushfirst!(LOAD_PATH, $_FINCH_TEST_ENV)
+
+const FINCH_AVAILABLE = try
+    @eval using Finch
+    @everywhere using Finch
+    true
+catch err
+    @warn "Finch unavailable; skipping Finch matmul tests" exception=err
+    false
+end
+
+if !FINCH_AVAILABLE
+    @testset "Finch GEMM (quick)" begin
+        @test_skip false
+    end
+else
+    const FINCH_DENSITY = 0.3
+
+    # Self-consistent: build distributed Finch operands, take the dense
+    # `collect` as the reference, and compare against the distributed result.
+    function test_finch_gemm!(T, sz, partA, partB)
+        partC = Blocks(partA.blocksize[1], partB.blocksize[2])
+
+        DSA = Finch.fsprand(partA, T, sz, FINCH_DENSITY)
+        DSB = Finch.fsprand(partB, T, sz, FINCH_DENSITY)
+        A = collect(DSA)
+        B = collect(DSB)
+
+        ## Out-of-place
+        @test collect(DSA * DSB)   ≈ A * B     # N / N
+        @test collect(DSA * DSB')  ≈ A * B'    # N / C
+        @test collect(DSA' * DSB)  ≈ A' * B    # C / N
+        @test collect(DSA' * DSB') ≈ A' * B'   # C / C
+
+        ## Symmetric rank-k (syrk path: A === B with one operand transposed)
+        @test collect(DSA' * DSA) ≈ A' * A
+        @test collect(DSA * DSA') ≈ A * A'
+
+        ## In-place
+        DSC = Finch.fspzeros(partC, T, sz)
+        mul!(DSC, DSA, DSB)
+        @test collect(DSC) ≈ A * B
+    end
+
+    const FINCH_QUICK_CASES = [
+        ((8, 8), Blocks(4, 4), Blocks(4, 4)),
+        ((8, 8), Blocks(2, 4), Blocks(4, 2)),
+    ]
+
+    @testset "Finch GEMM (quick)" begin
+        @testset "size=$sz part=$(partA.blocksize)/$(partB.blocksize)" for (sz, partA, partB) in FINCH_QUICK_CASES
+            @testset "T=$T" for T in (Float64, ComplexF64)
+                test_finch_gemm!(T, sz, partA, partB)
+            end
+        end
+    end
+
+    # Sparse matrix-vector multiply (SpMV): Finch sparse matrix tiles, dense vectors.
+    function test_finch_spmv!(T, n, part)
+        bs = part.blocksize[1]
+        DSA = Finch.fsprand(part, T, (n, n), FINCH_DENSITY)
+        A = collect(DSA)
+        x = rand(T, n)
+        Dx = distribute(x, Blocks(bs))
+
+        @test collect(DSA * Dx)            ≈ A * x
+        @test collect(transpose(DSA) * Dx) ≈ transpose(A) * x
+        @test collect(DSA' * Dx)           ≈ A' * x
+
+        y = rand(T, n)
+        Dy = distribute(copy(y), Blocks(bs))
+        alpha, beta = T(2), T(3)
+        mul!(Dy, DSA, Dx, alpha, beta)
+        @test collect(Dy) ≈ alpha * (A * x) + beta * y
+    end
+
+    const FINCH_SPMV_CASES = [
+        (8, Blocks(4, 4)),
+        (8, Blocks(2, 2)),
+    ]
+
+    @testset "Finch SpMV (quick)" begin
+        @testset "n=$n part=$(part.blocksize)" for (n, part) in FINCH_SPMV_CASES
+            @testset "T=$T" for T in (Float64, ComplexF64)
+                test_finch_spmv!(T, n, part)
+            end
+        end
+    end
+end
diff --git a/test/array/linalg/matmul_sparse.jl b/test/array/linalg/matmul_sparse.jl
new file mode 100644
index 000000000..cc2e8274e
--- /dev/null
+++ b/test/array/linalg/matmul_sparse.jl
@@ -0,0 +1,139 @@
+# Fast, focused sparse matrix-multiply tests.
+#
+# This is intentionally a *small* subset of the full GEMM suite in `matmul.jl`,
+# meant for quick iteration while developing sparse-array support (the full
+# suite takes ~1hr). Run with:
+#
+#     julia test/runtests.jl --test array/linalg/matmul_sparse
+#
+# Keep this fast: only a couple of sizes/partitionings, just enough to exercise
+# the out-of-place and in-place paths (incl. transposes) for both real and
+# complex element types.
+
+const SPARSE_DENSITY = 0.3
+
+function test_sparse_gemm!(T, sz, partA, partB)
+    rows, cols = sz
+    @assert rows == cols "sparse quick-test uses square matrices so transposes line up"
+    partC = Blocks(partA.blocksize[1], partB.blocksize[2])
+
+    SA = sprand(T, sz..., SPARSE_DENSITY)
+    SB = sprand(T, sz..., SPARSE_DENSITY)
+
+    DSA = distribute(SA, partA)
+    DSB = distribute(SB, partB)
+
+    ## Out-of-place
+    @test collect(DSA * DSB)   ≈ SA * SB     # N / N
+    @test collect(DSA * DSB')  ≈ SA * SB'    # N / T
+    @test collect(DSA' * DSB)  ≈ SA' * SB    # T / N
+    @test collect(DSA' * DSB') ≈ SA' * SB'   # T / T
+
+    ## Symmetric rank-k (syrk path: A === B with one operand transposed)
+    @test collect(DSA' * DSA) ≈ Array(SA)' * Array(SA)
+    @test collect(DSA * DSA') ≈ Array(SA) * Array(SA)'
+
+    ## In-place
+    SC = SA * SB
+    DSC = distribute(sparse(zeros(T, sz...)), partC)
+    mul!(DSC, DSA, DSB)
+    @test collect(DSC) ≈ SC
+end
+
+const SPARSE_QUICK_CASES = [
+    ((8, 8), Blocks(4, 4), Blocks(4, 4)),
+    ((8, 8), Blocks(2, 4), Blocks(4, 2)),
+    ((8, 8), Blocks(4, 2), Blocks(2, 4)),
+]
+
+@testset "Sparse GEMM (quick)" begin
+    @testset "size=$sz part=$(partA.blocksize)/$(partB.blocksize)" for (sz, partA, partB) in SPARSE_QUICK_CASES
+        @testset "T=$T" for T in (Float64, ComplexF64)
+            test_sparse_gemm!(T, sz, partA, partB)
+        end
+    end
+end
+
+# Sparse matrix-vector multiply (SpMV): sparse matrix tiles, dense vectors. This
+# is the core kernel for distributed iterative (Krylov) solvers.
+function test_sparse_spmv!(T, n, part)
+    bs = part.blocksize[1]
+    SA = sprand(T, n, n, SPARSE_DENSITY)
+    x = rand(T, n)
+
+    DSA = distribute(SA, part)
+    Dx = distribute(x, Blocks(bs))
+
+    ## Out-of-place (N / T / C)
+    @test collect(DSA * Dx)            ≈ SA * x
+    @test collect(transpose(DSA) * Dx) ≈ transpose(SA) * x
+    @test collect(DSA' * Dx)           ≈ SA' * x
+
+    ## In-place 5-arg mul!: y = alpha*A*x + beta*y
+    y = rand(T, n)
+    Dy = distribute(copy(y), Blocks(bs))
+    alpha, beta = T(2), T(3)
+    mul!(Dy, DSA, Dx, alpha, beta)
+    @test collect(Dy) ≈ alpha * (SA * x) + beta * y
+end
+
+const SPARSE_SPMV_CASES = [
+    (8, Blocks(4, 4)),
+    (8, Blocks(2, 2)),
+]
+
+@testset "Sparse SpMV (quick)" begin
+    @testset "n=$n part=$(part.blocksize)" for (n, part) in SPARSE_SPMV_CASES
+        @testset "T=$T" for T in (Float64, ComplexF64)
+            test_sparse_spmv!(T, n, part)
+        end
+    end
+end
+
+# Any *partial* or *reinterpreted* access to a sparse container (views,
+# transposes, adjoints, reshapes, and combinations thereof) must alias the
+# *entire* container, so that Datadeps never tracks stale sub-spans of storage
+# that may have been reallocated on write. This is enforced by `aliasing_root`,
+# which resolves any such wrapper back to the container itself (via `Base.parent`)
+# -- exposed to Datadeps through the fused `aliasing_unwrapped` -- plus a trapping
+# `Base.pointer` that fires if a wrapper ever slips through to the strided path.
+@testset "Sparse whole-container aliasing" begin
+    M = Dagger.DSparseMatrix{Float64}(sprand(Float64, 6, 6, 0.4))
+
+    @test Dagger.aliases_as_whole(M)
+    @test !Dagger.aliases_as_whole(rand(Float64, 6, 6))
+    # The container's aliasing is a bare `ObjectAliasing`, which also reports
+    # whole-object (drives the Datadeps whole-object copy short-circuit).
+    @test Dagger.aliasing(M) isa Dagger.ObjectAliasing
+    @test Dagger.aliases_as_whole(Dagger.aliasing(M))
+    @test !Dagger.aliases_as_whole(Dagger.aliasing(rand(Float64, 6, 6)))
+
+    # `aliasing_root` peels any array wrapper back to the sparse container.
+    @test Dagger.aliasing_root(M) === M
+    @test Dagger.aliasing_root(view(M, 1:3, :)) === M
+    @test Dagger.aliasing_root(view(M, 4:6, 2:4)) === M
+    @test Dagger.aliasing_root(transpose(M)) === M
+    @test Dagger.aliasing_root(M') === M
+    @test Dagger.aliasing_root(reshape(M, 36)) === M
+    @test Dagger.aliasing_root(view(transpose(M), 1:2, :)) === M
+
+    # After unwrapping, all wrappers alias the same whole container.
+    a_full = Dagger.aliasing(M)
+    @test Dagger.will_alias(a_full, Dagger.aliasing_unwrapped(view(M, 1:3, :)))
+    @test Dagger.will_alias(Dagger.aliasing_unwrapped(view(M, 1:3, :)),
+                            Dagger.aliasing_unwrapped(view(M, 4:6, :)))
+
+    # Distinct containers must not alias.
+    M2 = Dagger.DSparseMatrix{Float64}(sprand(Float64, 6, 6, 0.4))
+    @test !Dagger.will_alias(a_full, Dagger.aliasing(M2))
+
+    # Dense arrays are untouched: not whole-object, views stay strided, and
+    # `aliasing_root` returns them unchanged.
+    A = rand(Float64, 6, 6)
+    Av = view(A, 1:3, :)
+    @test Dagger.aliasing_root(Av) === Av
+    @test Dagger.aliasing_unwrapped(Av) isa Dagger.StridedAliasing
+
+    # The `pointer` trap fires if a sparse container is treated as raw memory.
+    @test_throws ArgumentError pointer(M)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index bc4505fb3..205985b12 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -7,6 +7,11 @@ USE_METAL = parse(Bool, get(ENV, "CI_USE_METAL", "0"))
 USE_OPENCL = parse(Bool, get(ENV, "CI_USE_OPENCL", "0"))
 USE_GPU = USE_CUDA || USE_ROCM || USE_ONEAPI || USE_METAL || USE_OPENCL
 
+# Finch is a heavy optional dependency (precompilation takes minutes), so it is
+# not a default test dependency. The dedicated Finch CI job signals via this
+# variable to `Pkg.add` Finch and run only the Finch-backed tests.
+USE_FINCH = parse(Bool, get(ENV, "CI_TEST_FINCH", "0"))
+
 tests = [
     ("Thunk", "thunk.jl"),
     ("Scheduler", "scheduler.jl"),
@@ -31,10 +36,13 @@ tests = [
     ("Array - LinearAlgebra - Core", "array/linalg/core.jl"),
     ("Array - LinearAlgebra - Arithmetic", "array/linalg/arithmetic.jl"),
     ("Array - LinearAlgebra - Matmul", "array/linalg/matmul.jl"),
+    ("Array - LinearAlgebra - Matmul (Sparse, quick)", "array/linalg/matmul_sparse.jl"),
+    ("Array - LinearAlgebra - Matmul (Finch, quick)", "array/linalg/matmul_finch.jl"),
     ("Array - LinearAlgebra - Cholesky", "array/linalg/cholesky.jl"),
     ("Array - LinearAlgebra - LU", "array/linalg/lu.jl"),
     ("Array - LinearAlgebra - Solve", "array/linalg/solve.jl"),
     ("Array - LinearAlgebra - QR", "array/linalg/qr.jl"),
+    ("Array - LinearAlgebra - Iterative Solvers", "array/linalg/iterativesolvers.jl"),
     ("Array - Permute", "array/permute.jl"),
     ("Array - Random", "array/random.jl"),
     ("Array - Stencils", "array/stencil.jl"),
@@ -55,8 +63,22 @@ if USE_GPU
         ("Array - Stencils", "array/stencil.jl"),
     ]
 end
+if USE_FINCH
+    # Only run the Finch-backed tests in the dedicated Finch CI job.
+    tests = [
+        ("Array - LinearAlgebra - Matmul (Finch)", "array/linalg/matmul_finch.jl"),
+    ]
+end
 all_test_names = map(test -> replace(last(test), ".jl"=>""), tests)
 
+# Tests excluded from default runs; they only run when explicitly requested via
+# `--test` or enabled by an environment signal. Finch's precompilation is very
+# heavy, so its tests are opt-in (except in the dedicated Finch job, where we
+# explicitly want them to run).
+optin_test_names = USE_FINCH ? String[] : String[
+    "array/linalg/matmul_finch",
+]
+
 additional_workers::Int = 3
 
 if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
@@ -68,6 +90,10 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
         Pkg.instantiate()
     catch
     end
+    if USE_FINCH
+        # Finch is not a default test dependency; add it on demand when signaled.
+        Pkg.add("Finch")
+    end
 
     using ArgParse
     s = ArgParseSettings(description = "Dagger Testsuite")
@@ -102,7 +128,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
     parsed_args = parse_args(s)
     to_test = String[]
     if isempty(parsed_args["test"])
-        to_test = copy(all_test_names)
+        to_test = filter(!in(optin_test_names), copy(all_test_names))
     else
         for _test in parsed_args["test"]
             test = only(_test)
@@ -158,7 +184,7 @@ if PROGRAM_FILE != "" && realpath(PROGRAM_FILE) == @__FILE__
         Pkg.offline(true)
     end
 else
-    to_test = all_test_names
+    to_test = filter(!in(optin_test_names), all_test_names)
     @info "Running all tests"
 end