From 254dd23df12fa1254dfc12854c6c537e1536b4a6 Mon Sep 17 00:00:00 2001
From: Alberto Mercurio <alberto.mercurio96@gmail.com>
Date: Sun, 1 Feb 2026 12:46:55 +0100
Subject: [PATCH] Introduce `droptzeros!` and apply it at the end of + methods

---
 .github/copilot-instructions.md      |   8 +-
 src/GenericSparseArrays.jl           |   2 +-
 src/conversions/conversions.jl       |  18 ++++
 src/core.jl                          |   3 +
 src/matrix_coo/matrix_coo.jl         | 119 ++++++++++++++++++++++++++-
 src/matrix_csc/matrix_csc.jl         |  99 +++++++++++++++++++++-
 src/matrix_csc/matrix_csc_kernels.jl |  36 ++++++++
 src/matrix_csr/matrix_csr.jl         |  99 +++++++++++++++++++++-
 src/matrix_csr/matrix_csr_kernels.jl |  36 ++++++++
 test/shared/matrix_coo.jl            |   5 ++
 test/shared/matrix_csc.jl            |   5 ++
 test/shared/matrix_csr.jl            |   5 ++
 12 files changed, 427 insertions(+), 8 deletions(-)

diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index e423770..898541d 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -13,7 +13,9 @@ These guidelines give AI coding agents the minimum project-specific context to b
 - `docs/` (Documenter): `make.jl` sets up docs + doctests. `docs/src/index.md` auto-docs the module; adding docstrings automatically surfaces them.
 
 ## 3. Development Workflows
-- Run tests locally: `make test` from the repo root folder.
+- Run tests locally: `make test` from the repo root folder. Important: this runs with the latest stable Julia version, and might fail in some cases like code quality checks (Aqua, JET).
+- The tests are divided into GROUPs defined in `runtests.jl`. For example, to run only the tests for the CUDA backend, use `GROUP=CUDA make test`.
+- Avoid to run generic tests like `make test`. Instead, run tests for specific backends depending on the available hardware on your machine. For example prefer running `GROUP=CUDA make test` on a machine with an NVIDIA GPU or `GROUP=Metal make test` on an Apple Silicon machine.
 - Add a dependency: `julia --project -e 'using Pkg; Pkg.add("PackageName")'` from the repo root folder, then update `[compat]` manually with a bounded version.
 - Build docs locally: `make docs`.
 - Doctests: Any code block in docstrings marked for execution must pass CI doctest phase.
@@ -21,7 +23,7 @@ These guidelines give AI coding agents the minimum project-specific context to b
 - Always check in which directory you are running commands. Change directory to the repo root if needed.
 
 ## 4. Coding Conventions
-- Public API: add docstrings starting with a concise one-line summary, then details (Documenter picks them up).
+- Public API: add docstrings starting with a concise one-line summary, then details (Documenter picks them up). Write docstrings *only* for functions/types that are part of the package's public API, and not methods of other Modules (e.g., Base, LinearAlgebra, SparseArrays).
 - Keep internal helpers non-exported; prefix with an underscore only if truly private.
 - Avoid type piracy: only extend Base / external methods for types you own OR clearly justify in comments.
 - Prefer parametric methods that remain type-stable (JET will flag instability; address before committing).
@@ -32,7 +34,7 @@ These guidelines give AI coding agents the minimum project-specific context to b
 - Always test the same functionality on all the supported backends (CPU, GPU, etc.) to ensure consistent behavior and avoid repetitions when generating the problem. Make tests reusable across backends when possible using shared functions in the `test/shared` folder.
 - The test for different CPU/GPU architectures should be in a separate Julia environment, as each machine may not have all the backends available.
 - When you run tests locally, only tests for the CPU and CUDA backends, as the others are not always available.
-- All added code must pass Aqua + JET. If JET warns about type instability, refactor or add an inline comment explaining any intentional dynamic behavior.
+- All added code must pass Aqua + JET. If JET warns about type instability, refactor or add an inline comment explaining any intentional dynamic behavior. Usually code quality tests fail on the latest stable Julia version; prefer to run them on the latest LTS Julia version.
 
 ## 6. Documentation Patterns
 
diff --git a/src/GenericSparseArrays.jl b/src/GenericSparseArrays.jl
index aeae93f..621a78d 100644
--- a/src/GenericSparseArrays.jl
+++ b/src/GenericSparseArrays.jl
@@ -3,7 +3,7 @@ module GenericSparseArrays
 using LinearAlgebra
 import LinearAlgebra: wrap, copymutable_oftype, __normalize!, kron, Diagonal, issymmetric, ishermitian
 using SparseArrays
-import SparseArrays: SparseVector, SparseMatrixCSC
+import SparseArrays: SparseVector, SparseMatrixCSC, dropzeros, dropzeros!
 import SparseArrays: getcolptr, getrowval, getnzval, nonzeroinds
 import SparseArrays: _show_with_braille_patterns
 
diff --git a/src/conversions/conversions.jl b/src/conversions/conversions.jl
index 54b0096..e29e565 100644
--- a/src/conversions/conversions.jl
+++ b/src/conversions/conversions.jl
@@ -174,6 +174,15 @@ function GenericSparseMatrixCSC(A::GenericSparseMatrixCOO{Tv, Ti}) where {Tv, Ti
 
     backend = get_backend(A.nzval)
 
+    # Handle empty matrix case
+    if nnz_count == 0
+        colptr = similar(A.rowind, Ti, n + 1)
+        fill!(colptr, one(Ti))
+        rowind = similar(A.rowind, Ti, 0)
+        nzval = similar(A.nzval, Tv, 0)
+        return GenericSparseMatrixCSC(m, n, colptr, rowind, nzval)
+    end
+
     # Create keys for sorting: column first, then row
     keys = similar(A.rowind, Ti, nnz_count)
 
@@ -276,6 +285,15 @@ function GenericSparseMatrixCSR(A::GenericSparseMatrixCOO{Tv, Ti}) where {Tv, Ti
 
     backend = get_backend(A.nzval)
 
+    # Handle empty matrix case
+    if nnz_count == 0
+        rowptr = similar(A.rowind, Ti, m + 1)
+        fill!(rowptr, one(Ti))
+        colind = similar(A.rowind, Ti, 0)
+        nzval = similar(A.nzval, Tv, 0)
+        return GenericSparseMatrixCSR(m, n, rowptr, colind, nzval)
+    end
+
     # Create keys for sorting: row first, then column
     keys = similar(A.rowind, Ti, nnz_count)
 
diff --git a/src/core.jl b/src/core.jl
index 41ed659..3582d04 100644
--- a/src/core.jl
+++ b/src/core.jl
@@ -83,6 +83,9 @@ end
 
 Base.:+(B::DenseMatrix, A::AbstractGenericSparseMatrix) = A + B
 
+Base.:-(A::AbstractGenericSparseMatrix, B::Union{DenseMatrix, AbstractGenericSparseMatrix}) = A + (-B)
+Base.:-(B::DenseMatrix, A::AbstractGenericSparseMatrix) = B + (-A)
+
 LinearAlgebra.issymmetric(A::Transpose{<:Any, <:AbstractGenericSparseMatrix}) = issymmetric(parent(A))
 LinearAlgebra.issymmetric(A::Adjoint{<:Any, <:AbstractGenericSparseMatrix}) = issymmetric(parent(A))
 LinearAlgebra.ishermitian(A::Transpose{<:Any, <:AbstractGenericSparseMatrix}) = ishermitian(parent(A))
diff --git a/src/matrix_coo/matrix_coo.jl b/src/matrix_coo/matrix_coo.jl
index 458b049..a6e4232 100644
--- a/src/matrix_coo/matrix_coo.jl
+++ b/src/matrix_coo/matrix_coo.jl
@@ -391,7 +391,9 @@ function Base.:+(A::GenericSparseMatrixCOO, B::GenericSparseMatrixCOO)
         ndrange = (nnz_concat,),
     )
 
-    return GenericSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
+    C = GenericSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
+    dropzeros!(C)
+    return C
 end
 
 # Addition with transpose/adjoint support
@@ -508,7 +510,13 @@ for (wrapa, transa, conja, unwrapa, whereT1) in trans_adj_wrappers(:GenericSpars
                 ndrange = (nnz_concat,),
             )
 
-            return GenericSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
+            C = GenericSparseMatrixCOO(m, n, rowind_C, colind_C, nzval_C)
+            dropzeros!(C)
+            return C
+        end
+
+        @eval function Base.:-(A::$TypeA, B::$TypeB) where {$(whereT1(:T1)), $(whereT2(:T2))}
+            return A + (-B)
         end
     end
 end
@@ -751,3 +759,110 @@ function LinearAlgebra.ishermitian(A::GenericSparseMatrixCOO)
     m == n || return false
     return ishermitian(GenericSparseMatrixCSC(A))
 end
+
+# Helper function to filter COO entries by mask
+function _filter_by_mask!(new_rowind, new_colind, new_nzval, rowind, colind, nzval, mask)
+    backend = get_backend(nzval)
+
+    # Compute prefix sum of mask to get write indices
+    mask_int = similar(rowind, Int)
+    mask_int .= mask
+    write_indices = _cumsum_AK(mask_int)
+
+    kernel! = _kernel_filter_coo!(backend)
+    kernel!(new_rowind, new_colind, new_nzval, rowind, colind, nzval, mask, write_indices; ndrange = length(nzval))
+    return nothing
+end
+
+@kernel inbounds = true function _kernel_filter_coo!(
+        new_rowind,
+        new_colind,
+        new_nzval,
+        @Const(rowind),
+        @Const(colind),
+        @Const(nzval),
+        @Const(mask),
+        @Const(write_indices),
+    )
+    i = @index(Global)
+    if mask[i]
+        idx = write_indices[i]
+        new_rowind[idx] = rowind[i]
+        new_colind[idx] = colind[i]
+        new_nzval[idx] = nzval[i]
+    end
+end
+
+# Internal implementation that writes into the provided arrays
+function _dropzeros_impl_coo!(new_rowind, new_colind, new_nzval, A::GenericSparseMatrixCOO, mask)
+    rowind = getrowind(A)
+    colind = getcolind(A)
+    nzval = nonzeros(A)
+
+    # Copy non-zero entries using mask
+    _filter_by_mask!(new_rowind, new_colind, new_nzval, rowind, colind, nzval, mask)
+    return nothing
+end
+
+function SparseArrays.dropzeros!(A::GenericSparseMatrixCOO)
+    nzval = nonzeros(A)
+    rowind = getrowind(A)
+    colind = getcolind(A)
+
+    # Find non-zero mask
+    mask = nzval .!= zero(eltype(nzval))
+
+    # Count non-zeros
+    total_nnz = sum(mask)
+
+    if total_nnz == length(nzval)
+        # No zeros to drop
+        return A
+    end
+
+    # Allocate temporary arrays for new data
+    new_rowind = similar(rowind, total_nnz)
+    new_colind = similar(colind, total_nnz)
+    new_nzval = similar(nzval, total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_coo!(new_rowind, new_colind, new_nzval, A, mask)
+
+    # Copy back to original arrays
+    resize!(A.rowind, total_nnz)
+    copyto!(A.rowind, new_rowind)
+    resize!(A.colind, total_nnz)
+    copyto!(A.colind, new_colind)
+    resize!(A.nzval, total_nnz)
+    copyto!(A.nzval, new_nzval)
+
+    return A
+end
+
+function SparseArrays.dropzeros(A::GenericSparseMatrixCOO)
+    m, n = size(A)
+    nzval = nonzeros(A)
+    rowind = getrowind(A)
+    colind = getcolind(A)
+
+    # Find non-zero mask
+    mask = nzval .!= zero(eltype(nzval))
+
+    # Count non-zeros
+    total_nnz = sum(mask)
+
+    if total_nnz == length(nzval)
+        # No zeros to drop, return a copy
+        return copy(A)
+    end
+
+    # Allocate new arrays
+    new_rowind = similar(rowind, total_nnz)
+    new_colind = similar(colind, total_nnz)
+    new_nzval = similar(nzval, total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_coo!(new_rowind, new_colind, new_nzval, A, mask)
+
+    return GenericSparseMatrixCOO(m, n, new_rowind, new_colind, new_nzval)
+end
diff --git a/src/matrix_csc/matrix_csc.jl b/src/matrix_csc/matrix_csc.jl
index d7468bb..33a9ada 100644
--- a/src/matrix_csc/matrix_csc.jl
+++ b/src/matrix_csc/matrix_csc.jl
@@ -364,7 +364,9 @@ function Base.:+(A::GenericSparseMatrixCSC, B::GenericSparseMatrixCSC)
         ndrange = (n,),
     )
 
-    return GenericSparseMatrixCSC(m, n, colptr_C, rowval_C, nzval_C)
+    C = GenericSparseMatrixCSC(m, n, colptr_C, rowval_C, nzval_C)
+    dropzeros!(C)
+    return C
 end
 
 # Addition with transpose/adjoint support
@@ -394,6 +396,10 @@ for (wrapa, transa, conja, unwrapa, whereT1) in trans_adj_wrappers(:GenericSpars
             # Convert back to CSC
             return GenericSparseMatrixCSC(result_csr)
         end
+
+        @eval function Base.:-(A::$TypeA, B::$TypeB) where {$(whereT1(:T1)), $(whereT2(:T2))}
+            return A + (-B)
+        end
     end
 end
 
@@ -606,3 +612,94 @@ function LinearAlgebra.ishermitian(A::GenericSparseMatrixCSC)
 
     return @allowscalar result[1]
 end
+
+# Internal implementation that writes into the provided arrays
+function _dropzeros_impl_csc!(new_colptr, new_rowval, new_nzval, A::GenericSparseMatrixCSC)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getcolptr(A))
+
+    # Count non-zeros per column
+    nnz_per_col = similar(getcolptr(A), n)
+    kernel_count! = kernel_count_nonzeros_csc!(backend)
+    kernel_count!(nnz_per_col, getcolptr(A), nonzeros(A); ndrange = (n,))
+
+    # Build new colptr
+    cumsum_nnz = _cumsum_AK(nnz_per_col)
+    fill!(view(new_colptr, 1:1), one(Ti))
+    view(new_colptr, 2:(n + 1)) .= cumsum_nnz .+ one(Ti)
+
+    # Get total number of non-zeros
+    total_nnz = @allowscalar new_colptr[end] - one(Ti)
+
+    if total_nnz > 0
+        # Copy non-zero entries
+        kernel_drop! = kernel_dropzeros_csc!(backend)
+        kernel_drop!(
+            new_rowval,
+            new_nzval,
+            new_colptr,
+            getcolptr(A),
+            rowvals(A),
+            nonzeros(A);
+            ndrange = (n,),
+        )
+    end
+
+    return total_nnz
+end
+
+function SparseArrays.dropzeros!(A::GenericSparseMatrixCSC)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getcolptr(A))
+
+    # Count non-zeros per column to determine new size
+    nnz_per_col = similar(getcolptr(A), n)
+    kernel_count! = kernel_count_nonzeros_csc!(backend)
+    kernel_count!(nnz_per_col, getcolptr(A), nonzeros(A); ndrange = (n,))
+
+    cumsum_nnz = _cumsum_AK(nnz_per_col)
+    total_nnz = @allowscalar cumsum_nnz[end]
+
+    # Allocate temporary arrays for new data
+    new_colptr = similar(getcolptr(A))
+    new_rowval = similar(rowvals(A), total_nnz)
+    new_nzval = similar(nonzeros(A), total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_csc!(new_colptr, new_rowval, new_nzval, A)
+
+    # Copy back to original arrays
+    copyto!(A.colptr, new_colptr)
+    resize!(A.rowval, total_nnz)
+    copyto!(A.rowval, new_rowval)
+    resize!(A.nzval, total_nnz)
+    copyto!(A.nzval, new_nzval)
+
+    return A
+end
+
+function SparseArrays.dropzeros(A::GenericSparseMatrixCSC)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getcolptr(A))
+
+    # Count non-zeros per column to determine new size
+    nnz_per_col = similar(getcolptr(A), n)
+    kernel_count! = kernel_count_nonzeros_csc!(backend)
+    kernel_count!(nnz_per_col, getcolptr(A), nonzeros(A); ndrange = (n,))
+
+    cumsum_nnz = _cumsum_AK(nnz_per_col)
+    total_nnz = @allowscalar cumsum_nnz[end]
+
+    # Allocate new arrays
+    new_colptr = similar(getcolptr(A))
+    new_rowval = similar(rowvals(A), total_nnz)
+    new_nzval = similar(nonzeros(A), total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_csc!(new_colptr, new_rowval, new_nzval, A)
+
+    return GenericSparseMatrixCSC(m, n, new_colptr, new_rowval, new_nzval)
+end
diff --git a/src/matrix_csc/matrix_csc_kernels.jl b/src/matrix_csc/matrix_csc_kernels.jl
index ab3a66f..11f2a77 100644
--- a/src/matrix_csc/matrix_csc_kernels.jl
+++ b/src/matrix_csc/matrix_csc_kernels.jl
@@ -407,3 +407,39 @@ end
         result[1] = false
     end
 end
+
+# Kernel to count non-zeros per column after filtering zeros
+@kernel inbounds = true function kernel_count_nonzeros_csc!(
+        nnz_per_col,
+        @Const(colptr),
+        @Const(nzval),
+    )
+    col = @index(Global)
+    count = 0
+    for j in colptr[col]:(colptr[col + 1] - 1)
+        if !iszero(nzval[j])
+            count += 1
+        end
+    end
+    nnz_per_col[col] = count
+end
+
+# Kernel to copy non-zero entries only (dropping zeros)
+@kernel inbounds = true function kernel_dropzeros_csc!(
+        new_rowval,
+        new_nzval,
+        @Const(new_colptr),
+        @Const(old_colptr),
+        @Const(old_rowval),
+        @Const(old_nzval),
+    )
+    col = @index(Global)
+    write_idx = new_colptr[col]
+    for j in old_colptr[col]:(old_colptr[col + 1] - 1)
+        if !iszero(old_nzval[j])
+            new_rowval[write_idx] = old_rowval[j]
+            new_nzval[write_idx] = old_nzval[j]
+            write_idx += 1
+        end
+    end
+end
diff --git a/src/matrix_csr/matrix_csr.jl b/src/matrix_csr/matrix_csr.jl
index 1c52daa..f728cc1 100644
--- a/src/matrix_csr/matrix_csr.jl
+++ b/src/matrix_csr/matrix_csr.jl
@@ -362,7 +362,9 @@ function Base.:+(A::GenericSparseMatrixCSR, B::GenericSparseMatrixCSR)
         ndrange = (m,),
     )
 
-    return GenericSparseMatrixCSR(m, n, rowptr_C, colval_C, nzval_C)
+    C = GenericSparseMatrixCSR(m, n, rowptr_C, colval_C, nzval_C)
+    dropzeros!(C)
+    return C
 end
 
 # Addition with transpose/adjoint support
@@ -392,6 +394,10 @@ for (wrapa, transa, conja, unwrapa, whereT1) in trans_adj_wrappers(:GenericSpars
             # Convert back to CSR
             return GenericSparseMatrixCSR(result_csc)
         end
+
+        @eval function Base.:-(A::$TypeA, B::$TypeB) where {$(whereT1(:T1)), $(whereT2(:T2))}
+            return A + (-B)
+        end
     end
 end
 
@@ -601,3 +607,94 @@ function LinearAlgebra.ishermitian(A::GenericSparseMatrixCSR)
 
     return @allowscalar result[1]
 end
+
+# Internal implementation that writes into the provided arrays
+function _dropzeros_impl_csr!(new_rowptr, new_colval, new_nzval, A::GenericSparseMatrixCSR)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getrowptr(A))
+
+    # Count non-zeros per row
+    nnz_per_row = similar(getrowptr(A), m)
+    kernel_count! = kernel_count_nonzeros_csr!(backend)
+    kernel_count!(nnz_per_row, getrowptr(A), nonzeros(A); ndrange = (m,))
+
+    # Build new rowptr
+    cumsum_nnz = _cumsum_AK(nnz_per_row)
+    fill!(view(new_rowptr, 1:1), one(Ti))
+    view(new_rowptr, 2:(m + 1)) .= cumsum_nnz .+ one(Ti)
+
+    # Get total number of non-zeros
+    total_nnz = @allowscalar new_rowptr[end] - one(Ti)
+
+    if total_nnz > 0
+        # Copy non-zero entries
+        kernel_drop! = kernel_dropzeros_csr!(backend)
+        kernel_drop!(
+            new_colval,
+            new_nzval,
+            new_rowptr,
+            getrowptr(A),
+            colvals(A),
+            nonzeros(A);
+            ndrange = (m,),
+        )
+    end
+
+    return total_nnz
+end
+
+function SparseArrays.dropzeros!(A::GenericSparseMatrixCSR)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getrowptr(A))
+
+    # Count non-zeros per row to determine new size
+    nnz_per_row = similar(getrowptr(A), m)
+    kernel_count! = kernel_count_nonzeros_csr!(backend)
+    kernel_count!(nnz_per_row, getrowptr(A), nonzeros(A); ndrange = (m,))
+
+    cumsum_nnz = _cumsum_AK(nnz_per_row)
+    total_nnz = @allowscalar cumsum_nnz[end]
+
+    # Allocate temporary arrays for new data
+    new_rowptr = similar(getrowptr(A))
+    new_colval = similar(colvals(A), total_nnz)
+    new_nzval = similar(nonzeros(A), total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_csr!(new_rowptr, new_colval, new_nzval, A)
+
+    # Copy back to original arrays
+    copyto!(A.rowptr, new_rowptr)
+    resize!(A.colval, total_nnz)
+    copyto!(A.colval, new_colval)
+    resize!(A.nzval, total_nnz)
+    copyto!(A.nzval, new_nzval)
+
+    return A
+end
+
+function SparseArrays.dropzeros(A::GenericSparseMatrixCSR)
+    m, n = size(A)
+    backend = get_backend(A)
+    Ti = eltype(getrowptr(A))
+
+    # Count non-zeros per row to determine new size
+    nnz_per_row = similar(getrowptr(A), m)
+    kernel_count! = kernel_count_nonzeros_csr!(backend)
+    kernel_count!(nnz_per_row, getrowptr(A), nonzeros(A); ndrange = (m,))
+
+    cumsum_nnz = _cumsum_AK(nnz_per_row)
+    total_nnz = @allowscalar cumsum_nnz[end]
+
+    # Allocate new arrays
+    new_rowptr = similar(getrowptr(A))
+    new_colval = similar(colvals(A), total_nnz)
+    new_nzval = similar(nonzeros(A), total_nnz)
+
+    # Fill the new arrays
+    _dropzeros_impl_csr!(new_rowptr, new_colval, new_nzval, A)
+
+    return GenericSparseMatrixCSR(m, n, new_rowptr, new_colval, new_nzval)
+end
diff --git a/src/matrix_csr/matrix_csr_kernels.jl b/src/matrix_csr/matrix_csr_kernels.jl
index ef18f5a..d849730 100644
--- a/src/matrix_csr/matrix_csr_kernels.jl
+++ b/src/matrix_csr/matrix_csr_kernels.jl
@@ -405,3 +405,39 @@ end
         result[1] = false
     end
 end
+
+# Kernel to count non-zeros per row after filtering zeros
+@kernel inbounds = true function kernel_count_nonzeros_csr!(
+        nnz_per_row,
+        @Const(rowptr),
+        @Const(nzval),
+    )
+    row = @index(Global)
+    count = 0
+    for j in rowptr[row]:(rowptr[row + 1] - 1)
+        if !iszero(nzval[j])
+            count += 1
+        end
+    end
+    nnz_per_row[row] = count
+end
+
+# Kernel to copy non-zero entries only (dropping zeros)
+@kernel inbounds = true function kernel_dropzeros_csr!(
+        new_colval,
+        new_nzval,
+        @Const(new_rowptr),
+        @Const(old_rowptr),
+        @Const(old_colval),
+        @Const(old_nzval),
+    )
+    row = @index(Global)
+    write_idx = new_rowptr[row]
+    for j in old_rowptr[row]:(old_rowptr[row + 1] - 1)
+        if !iszero(old_nzval[j])
+            new_colval[write_idx] = old_colval[j]
+            new_nzval[write_idx] = old_nzval[j]
+            write_idx += 1
+        end
+    end
+end
diff --git a/test/shared/matrix_coo.jl b/test/shared/matrix_coo.jl
index 0dc3762..8803ea0 100644
--- a/test/shared/matrix_coo.jl
+++ b/test/shared/matrix_coo.jl
@@ -329,6 +329,11 @@ function shared_test_linearalgebra_matrix_coo(
                 @test collect(result) ≈ Matrix(expected)
                 @test result isa GenericSparseMatrixCOO
 
+                # Test op_A(A) - op_A(A) drops zeros (should have zero stored elements)
+                result_sub = op_A(dA) - op_A(dA)
+                @test nnz(result_sub) == 0
+                @test collect(result_sub) ≈ zeros(T, size(op_A(A))...)
+
                 # Additional tests only for identity + identity
                 if op_A === identity && op_B === identity
                     # Test with overlapping entries
diff --git a/test/shared/matrix_csc.jl b/test/shared/matrix_csc.jl
index 67313d2..1edcfc2 100644
--- a/test/shared/matrix_csc.jl
+++ b/test/shared/matrix_csc.jl
@@ -327,6 +327,11 @@ function shared_test_linearalgebra_matrix_csc(
                 @test collect(result) ≈ Matrix(expected)
                 @test result isa GenericSparseMatrixCSC
 
+                # Test op_A(A) - op_A(A) drops zeros (should have zero stored elements)
+                result_sub = op_A(dA) - op_A(dA)
+                @test nnz(result_sub) == 0
+                @test collect(result_sub) ≈ zeros(T, size(op_A(A))...)
+
                 # Additional tests only for identity + identity
                 if op_A === identity && op_B === identity
                     # Test with overlapping entries
diff --git a/test/shared/matrix_csr.jl b/test/shared/matrix_csr.jl
index 75aaaba..bd95a62 100644
--- a/test/shared/matrix_csr.jl
+++ b/test/shared/matrix_csr.jl
@@ -326,6 +326,11 @@ function shared_test_linearalgebra_matrix_csr(
                 @test collect(result) ≈ Matrix(expected)
                 @test result isa GenericSparseMatrixCSR
 
+                # Test op_A(A) - op_A(A) drops zeros (should have zero stored elements)
+                result_sub = op_A(dA) - op_A(dA)
+                @test nnz(result_sub) == 0
+                @test collect(result_sub) ≈ zeros(T, size(op_A(A))...)
+
                 # Additional tests only for identity + identity
                 if op_A === identity && op_B === identity
                     # Test with overlapping entries