From 8e8a89e112cb11a896be143e2eb2bdd35e9cbed3 Mon Sep 17 00:00:00 2001 From: vt2211 Date: Fri, 10 Apr 2026 23:51:44 -0700 Subject: [PATCH 1/3] anisotropic kernels and linear solve speedups --- fvgp/fvgp.py | 28 +- fvgp/gp.py | 34 +- fvgp/gp_lin_alg.py | 792 ++++++++++++++++++++++++++++++--- fvgp/gp_marginal_likelihood.py | 62 ++- fvgp/kernels.py | 325 +++++++++++++- fvgp/kv_linalg.py | 215 ++++++++- tests/test_fvgp.py | 99 ++++- 7 files changed, 1449 insertions(+), 106 deletions(-) diff --git a/fvgp/fvgp.py b/fvgp/fvgp.py index 3a31556..2c88cbf 100755 --- a/fvgp/fvgp.py +++ b/fvgp/fvgp.py @@ -151,7 +151,13 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. Matrix batch size for distributed computing in gp2Scale. The default is 10000. gp2Scale_linalg_mode : str, optional One of `Chol`, `sparseLU`, `sparseCG`, `sparseMINRES`, `sparseSolve`, `sparseCGpre` - (incomplete LU preconditioner), or `sparseMINRESpre`. The default is None which amounts to + (cached sparse preconditioner; geometry selected via `args["sparse_preconditioner_type"]`), + or `sparseMINRESpre`. Supported preconditioner geometries are `ilu`, + `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, and `amg`. + Convenience aliases such as `sparseCGpre_ilu`, `sparseCGpre_ic`, + `sparseCGpre_blockjacobi`, `sparseCGpre_schwarz`, `sparseCGpre_amg`, + and the corresponding `sparseMINRESpre_*` forms are also accepted. + The default is None which amounts to an automatic determination of the mode. For advanced customization options this can also be an iterable with three callables: the first f(K), where K is the covariance matrix to compute a factorization object @@ -188,13 +194,29 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. 
- "random_logdet_verbose" : True/False; default = False - "random_logdet_print_info" : True/False; default = False - "sparse_minres_tol" : float - - "cg_minres_tol" : float - - "random_logdet_lanczos_compute_device" : str; default = "cpu"/"gpu" + - "sparse_cg_tol" : float + - "cg_minres_tol" : float; legacy CG tolerance alias still accepted + - "sparse_preconditioner_type" : one of `ilu`, `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used + - "sparse_preconditioner_drop_tol" : float; ILU only + - "sparse_preconditioner_fill_factor" : float; ILU only + - "sparse_preconditioner_block_size" : int; block Jacobi / additive Schwarz + - "sparse_preconditioner_schwarz_overlap" : int; additive Schwarz + - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift + - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. `V` + - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates + - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess + - "sparse_krylov_mode" : `single` or `block`; block mode currently applies to CG solves with multiple RHS + - "sparse_block_krylov" : True/False; legacy alias for `sparse_krylov_mode=\"block\"` + - "sparse_krylov_block_size" : int; number of RHS columns per block-CG group + - "sparse_krylov_maxiter" : int; generic Krylov iteration cap + - "sparse_cg_maxiter" : int; CG-specific iteration cap + - "sparse_minres_maxiter" : int; MINRES-specific iteration cap - "Chol_factor_compute_device" : str; default = "cpu"/"gpu" - "update_Chol_factor_compute_device": str; default = "cpu"/"gpu" - "Chol_solve_compute_device" : str; default = "cpu"/"gpu" - "Chol_logdet_compute_device" : str; default = "cpu"/"gpu" - "GPU_engine" : str; default = "torch"/"cupy" + - "GPU_device" : optional torch device string, e.g. 
"cuda:0" or "mps" All other keys will be stored and are available as part of the object instance. diff --git a/fvgp/gp.py b/fvgp/gp.py index 03bfacf..8a22c51 100755 --- a/fvgp/gp.py +++ b/fvgp/gp.py @@ -145,7 +145,13 @@ class GP: Matrix batch size for distributed computing in gp2Scale. The default is 10000. gp2Scale_linalg_mode : str, optional One of `Chol`, `sparseLU`, `sparseCG`, `sparseMINRES`, `sparseSolve`, `sparseCGpre` - (incomplete LU preconditioner), or `sparseMINRESpre`. The default is None which amounts to + (cached sparse preconditioner; geometry selected via `args["sparse_preconditioner_type"]`), + or `sparseMINRESpre`. Supported preconditioner geometries are `ilu`, + `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, and `amg`. + Convenience aliases such as `sparseCGpre_ilu`, `sparseCGpre_ic`, + `sparseCGpre_blockjacobi`, `sparseCGpre_schwarz`, `sparseCGpre_amg`, + and the corresponding `sparseMINRESpre_*` forms are also accepted. + The default is None which amounts to an automatic determination of the mode. 
For advanced customization options this can also be an iterable with three callables: the first f(K), where K is the covariance matrix to compute a factorization object @@ -182,13 +188,29 @@ class GP: - "random_logdet_verbose" : True/False; default = False - "random_logdet_print_info" : True/False; default = False - "sparse_minres_tol" : float - - "cg_minres_tol" : float - - "random_logdet_lanczos_compute_device" : str; default = "cpu"/"gpu" + - "sparse_cg_tol" : float + - "cg_minres_tol" : float; legacy CG tolerance alias still accepted + - "sparse_preconditioner_type" : one of `ilu`, `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used + - "sparse_preconditioner_drop_tol" : float; ILU only + - "sparse_preconditioner_fill_factor" : float; ILU only + - "sparse_preconditioner_block_size" : int; block Jacobi / additive Schwarz + - "sparse_preconditioner_schwarz_overlap" : int; additive Schwarz + - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift + - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. 
`V` + - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates + - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess + - "sparse_krylov_mode" : `single` or `block`; block mode currently applies to CG solves with multiple RHS + - "sparse_block_krylov" : True/False; legacy alias for `sparse_krylov_mode=\"block\"` + - "sparse_krylov_block_size" : int; number of RHS columns per block-CG group + - "sparse_krylov_maxiter" : int; generic Krylov iteration cap + - "sparse_cg_maxiter" : int; CG-specific iteration cap + - "sparse_minres_maxiter" : int; MINRES-specific iteration cap - "Chol_factor_compute_device" : str; default = "cpu"/"gpu" - "update_Chol_factor_compute_device": str; default = "cpu"/"gpu" - "Chol_solve_compute_device" : str; default = "cpu"/"gpu" - "Chol_logdet_compute_device" : str; default = "cpu"/"gpu" - "GPU_engine" : str; default = "torch"/"cupy" + - "GPU_device" : optional torch device string, e.g. "cuda:0" or "mps" All other keys will be stored and are available as part of the object instance and in kernel, mean, and noise functions. 
@@ -320,6 +342,7 @@ def __init__( self.trainer, gp2Scale_linalg_mode=gp2Scale_linalg_mode, ) + self.marginal_density = self.marginal_likelihood ########################################## #######prepare posterior evaluations###### @@ -1457,6 +1480,7 @@ def __getstate__(self): prior=self.prior, likelihood=self.likelihood, marginal_likelihood=self.marginal_likelihood, + marginal_density=self.marginal_likelihood, trainer=self.trainer, posterior=self.posterior ) @@ -1464,6 +1488,10 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + if "marginal_likelihood" not in self.__dict__ and "marginal_density" in self.__dict__: + self.marginal_likelihood = self.marginal_density + if "marginal_density" not in self.__dict__ and "marginal_likelihood" in self.__dict__: + self.marginal_density = self.marginal_likelihood #################################################################################### diff --git a/fvgp/gp_lin_alg.py b/fvgp/gp_lin_alg.py index bd7d835..60b31f5 100755 --- a/fvgp/gp_lin_alg.py +++ b/fvgp/gp_lin_alg.py @@ -2,6 +2,7 @@ import warnings warnings.simplefilter("once", UserWarning) import time +from collections import deque from loguru import logger from scipy.sparse.linalg import splu from scipy.sparse.linalg import minres, cg, spsolve @@ -12,11 +13,123 @@ import importlib +def _normalize_args(args): + return {} if args is None else args + + +def normalize_sparse_preconditioner_type(preconditioner_type): + preconditioner_type = str(preconditioner_type).lower() + aliases = { + "ilu": "ilu", + "ic": "incomplete_cholesky", + "ichol": "incomplete_cholesky", + "incomplete_cholesky": "incomplete_cholesky", + "block_jacobi": "block_jacobi", + "blockjacobi": "block_jacobi", + "schwarz": "additive_schwarz", + "additive_schwarz": "additive_schwarz", + "amg": "amg", + } + if preconditioner_type not in aliases: + raise ValueError( + "Unknown sparse preconditioner type " + f"{preconditioner_type!r}. 
Expected one of " + "{'ilu', 'ic', 'incomplete_cholesky', 'block_jacobi', 'blockjacobi', " + "'schwarz', 'additive_schwarz', 'amg'}." + ) + return aliases[preconditioner_type] + + +def resolve_gp2scale_linalg_mode(mode, args=None): + args = dict(_normalize_args(args)) + if not isinstance(mode, str): + return mode, args + + mode_lower = mode.lower() + alias_prefixes = { + "sparsecgpre_": "sparseCGpre", + "sparseminrespre_": "sparseMINRESpre", + } + for prefix, canonical_mode in alias_prefixes.items(): + if not mode_lower.startswith(prefix): + continue + inferred_type = normalize_sparse_preconditioner_type(mode_lower[len(prefix):]) + explicit_type = args.get("sparse_preconditioner_type") + if explicit_type is not None: + explicit_type = normalize_sparse_preconditioner_type(explicit_type) + if explicit_type != inferred_type: + raise ValueError( + f"Conflicting sparse preconditioner specifications: mode {mode!r} " + f"implies {inferred_type!r}, but args['sparse_preconditioner_type'] " + f"is {explicit_type!r}." 
+ ) + args["sparse_preconditioner_type"] = inferred_type + return canonical_mode, args + + return mode, args + + +def _torch_gpu_device(args=None): + args = _normalize_args(args) + if importlib.util.find_spec("torch") is None: + return None + import torch + + if "GPU_device" in args: + try: + requested_device = torch.device(str(args["GPU_device"])) + except Exception: + return None + if requested_device.type == "cuda": + return requested_device if torch.cuda.is_available() else None + if requested_device.type == "mps": + mps_backend = getattr(torch.backends, "mps", None) + return requested_device if mps_backend is not None and torch.backends.mps.is_available() else None + return requested_device + + if torch.cuda.is_available(): + device_index = int(args.get("GPU_device_index", torch.cuda.current_device() if torch.cuda.device_count() > 0 else 0)) + return torch.device(f"cuda:{device_index}") + + mps_backend = getattr(torch.backends, "mps", None) + if mps_backend is not None and torch.backends.mps.is_available(): + return torch.device("mps") + + return None + + +def _cupy_gpu_available(): + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy as cp + return cp.cuda.runtime.getDeviceCount() > 0 + except Exception: + return False + + def get_gpu_engine(args): - if "GPU_engine" in args: return args["GPU_engine"] - if importlib.util.find_spec("torch"): return "torch" - elif importlib.util.find_spec("cupy"): return "cupy" - else: return None + args = _normalize_args(args) + if "GPU_engine" in args: + preferred_engine = str(args["GPU_engine"]).lower() + if preferred_engine == "torch": + return "torch" if _torch_gpu_device(args) is not None else None + if preferred_engine == "cupy": + return "cupy" if _cupy_gpu_available() else None + return None + if _torch_gpu_device(args) is not None: + return "torch" + if _cupy_gpu_available(): + return "cupy" + return None + + +def _imate_gpu_enabled(args=None): + args = _normalize_args(args) + if 
_cupy_gpu_available(): + return True + device = _torch_gpu_device(args) + return device is not None and device.type == "cuda" def calculate_sparse_LU_factor(M, args=None): @@ -30,6 +143,10 @@ def calculate_LU_solve(LU, vec, args=None): assert isinstance(vec, np.ndarray) if np.ndim(vec) == 1: vec = vec.reshape(len(vec), 1) logger.debug("calculate_LU_solve") + lu_dtype = getattr(getattr(LU, "L", None), "dtype", None) + if lu_dtype is None: + lu_dtype = getattr(getattr(LU, "U", None), "dtype", vec.dtype) + vec = np.asarray(vec, dtype=lu_dtype) res = LU.solve(vec) if np.ndim(res) == 1: res = res.reshape(len(res), 1) assert np.ndim(res) == 2 @@ -45,6 +162,7 @@ def calculate_LU_logdet(LU, args=None): def calculate_Chol_factor(M, compute_device="cpu", args=None): + args = _normalize_args(args) assert isinstance(M, np.ndarray) if "Chol_factor_compute_device" in args: compute_device = args["Chol_factor_compute_device"] logger.debug(f"calculate_Chol_factor on {compute_device}") @@ -55,7 +173,8 @@ def calculate_Chol_factor(M, compute_device="cpu", args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - A = torch.tensor(M, device="cuda:0", dtype=torch.float32) + device = _torch_gpu_device(args) + A = torch.tensor(M, device=device, dtype=torch.float32) L = torch.linalg.cholesky(A) c = L.cpu().numpy() elif engine == "cupy": # pragma: no cover @@ -63,13 +182,20 @@ def calculate_Chol_factor(M, compute_device="cpu", args=None): A = cp.array(M, dtype=cp.float32) L = cp.linalg.cholesky(A) c = cp.asnumpy(L) - else: c = None + else: + warnings.warn( + "No usable GPU backend found for Cholesky factorization. Falling back to CPU.", + stacklevel=2, + ) + c, _ = cho_factor(M, lower=True) + c = np.tril(c) else: raise Exception("No valid compute device found. 
") return c def update_Chol_factor(old_chol_factor, new_matrix, compute_device="cpu", args=None): + args = _normalize_args(args) assert isinstance(new_matrix, np.ndarray) compute_device = "cpu" #if "update_Chol_factor_compute_device" in args: compute_device = args["update_Chol_factor_compute_device"] @@ -78,10 +204,11 @@ def update_Chol_factor(old_chol_factor, new_matrix, compute_device="cpu", args=N KV = new_matrix kk = KV[size:, size:] k = KV[size:, 0:size] - return cholesky_update_rank_n(old_chol_factor, k.T, kk, compute_device=compute_device) + return cholesky_update_rank_n(old_chol_factor, k.T, kk, compute_device=compute_device, args=args) def calculate_Chol_solve(factor, vec, compute_device="cpu", args=None): + args = _normalize_args(args) assert isinstance(vec, np.ndarray) if np.ndim(vec) == 1: vec = vec.reshape(len(vec), 1) if "Chol_solve_compute_device" in args: compute_device = args["Chol_solve_compute_device"] @@ -92,9 +219,9 @@ def calculate_Chol_solve(factor, vec, compute_device="cpu", args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - # Move to GPU - L = torch.tensor(factor, device="cuda", dtype=torch.float32) - b = torch.tensor(vec, device="cuda", dtype=torch.float32) + device = _torch_gpu_device(args) + L = torch.tensor(factor, device=device, dtype=torch.float32) + b = torch.tensor(vec, device=device, dtype=torch.float32) y = torch.linalg.solve_triangular(L, b, upper=False) x = torch.linalg.solve_triangular(L.T, y, upper=True) res = x.cpu().numpy() @@ -105,7 +232,12 @@ def calculate_Chol_solve(factor, vec, compute_device="cpu", args=None): y = cp.linalg.solve_triangular(L, b, lower=True) x = cp.linalg.solve_triangular(L.T, y, lower=False) res = cp.asnumpy(x) - else: res = None + else: + warnings.warn( + "No usable GPU backend found for Cholesky solve. Falling back to CPU.", + stacklevel=2, + ) + res = cho_solve((factor, True), vec) else: raise Exception("NO valid compute device found. 
") @@ -114,6 +246,7 @@ def calculate_Chol_solve(factor, vec, compute_device="cpu", args=None): def calculate_Chol_logdet(factor, compute_device="cpu", args=None): + args = _normalize_args(args) if "Chol_logdet_compute_device" in args: compute_device = args["Chol_logdet_compute_device"] logger.debug(f"calculate_Chol_logdet on {compute_device}") if compute_device == "cpu": @@ -123,13 +256,20 @@ def calculate_Chol_logdet(factor, compute_device="cpu", args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - L = torch.tensor(factor, device="cuda", dtype=torch.float32) + device = _torch_gpu_device(args) + L = torch.tensor(factor, device=device, dtype=torch.float32) logdet = 2.0 * torch.sum(torch.log(torch.diag(L))).cpu().item() elif engine == "cupy": # pragma: no cover import cupy as cp L = cp.array(factor, dtype=cp.float32) logdet = 2.0 * cp.sum(cp.log(cp.diag(L))).get() - else: logdet = None + else: + warnings.warn( + "No usable GPU backend found for Cholesky logdet. Falling back to CPU.", + stacklevel=2, + ) + upper_diag = abs(factor.diagonal()) + logdet = 2.0 * np.sum(np.log(upper_diag)) else: raise Exception("No valid compute device found. 
") assert np.isscalar(logdet) @@ -159,13 +299,467 @@ def spai(A, m, args=None): return M +def _as_symmetric_csr(KV): + assert sparse.issparse(KV) + A = KV.tocsr().astype(np.float64) + return ((A + A.T) * 0.5).tocsr() + + +def _shifted_dense_cholesky(matrix, args=None, shift_key="sparse_preconditioner_shift"): + args = _normalize_args(args) + base_shift = float(args.get(shift_key, args.get("sparse_preconditioner_shift", 0.0))) + growth = float(args.get("sparse_preconditioner_shift_growth", 10.0)) + attempts = int(args.get("sparse_preconditioner_shift_attempts", 5)) + eye = np.eye(matrix.shape[0], dtype=matrix.dtype) + last_exc = None + shift = base_shift + + for _ in range(max(attempts, 1)): + try: + local_matrix = matrix if shift == 0.0 else matrix + shift * eye + return cho_factor(local_matrix, lower=True), shift + except Exception as exc: + last_exc = exc + shift = 1e-10 if shift == 0.0 else shift * growth + + raise np.linalg.LinAlgError(f"Local Cholesky factorization failed after shifted retries: {last_exc}") + + +def _build_graph_blocks(KV, block_size): + A = _as_symmetric_csr(KV) + n = A.shape[0] + block_size = max(int(block_size), 1) + assigned = np.zeros(n, dtype=bool) + blocks = [] + + for seed in range(n): + if assigned[seed]: + continue + queue = deque([seed]) + queued = {seed} + block = [] + + while queue and len(block) < block_size: + node = queue.popleft() + if assigned[node]: + continue + assigned[node] = True + block.append(node) + start, end = A.indptr[node], A.indptr[node + 1] + for neighbor in A.indices[start:end]: + neighbor = int(neighbor) + if neighbor == node or assigned[neighbor] or neighbor in queued: + continue + queued.add(neighbor) + queue.append(neighbor) + + blocks.append(np.array(block, dtype=int)) + + return blocks + + +def _expand_block_overlap(KV, block, overlap): + A = _as_symmetric_csr(KV) + overlap = max(int(overlap), 0) + expanded = set(int(i) for i in block) + frontier = set(expanded) + + for _ in range(overlap): + if not 
frontier: + break + next_frontier = set() + for node in frontier: + start, end = A.indptr[node], A.indptr[node + 1] + for neighbor in A.indices[start:end]: + neighbor = int(neighbor) + if neighbor not in expanded: + expanded.add(neighbor) + next_frontier.add(neighbor) + frontier = next_frontier + + return np.array(sorted(expanded), dtype=int) + + +def _build_blockwise_operator(shape, local_models, dtype): + def matvec(x): + x = np.asarray(x, dtype=dtype) + result = np.zeros(shape[0], dtype=dtype) + for model in local_models: + local_rhs = x[model["indices"]] + local_solution = cho_solve(model["factor"], local_rhs) + result[model["indices"]] += local_solution + return result + + return sparse.linalg.LinearOperator(shape, matvec=matvec, rmatvec=matvec, dtype=dtype) + + +def _build_block_jacobi_preconditioner(KV, args=None): + args = _normalize_args(args) + A = _as_symmetric_csr(KV) + block_size = int(args.get("sparse_preconditioner_block_size", 64)) + blocks = _build_graph_blocks(A, block_size) + local_models = [] + for block in blocks: + local_matrix = A[block][:, block].toarray() + factor, shift = _shifted_dense_cholesky(local_matrix, args) + local_models.append({"indices": block, "factor": factor, "shift": shift}) + + factor = { + "type": "block_jacobi", + "blocks": blocks, + "local_models": local_models, + } + operator = _build_blockwise_operator(A.shape, local_models, A.dtype) + return factor, operator + + +def _build_additive_schwarz_preconditioner(KV, args=None): + args = _normalize_args(args) + A = _as_symmetric_csr(KV) + block_size = int(args.get("sparse_preconditioner_block_size", 64)) + overlap = int(args.get("sparse_preconditioner_schwarz_overlap", 1)) + base_blocks = _build_graph_blocks(A, block_size) + local_models = [] + + for base_block in base_blocks: + local_indices = _expand_block_overlap(A, base_block, overlap) + local_matrix = A[local_indices][:, local_indices].toarray() + factor, shift = _shifted_dense_cholesky(local_matrix, args) + 
local_models.append( + { + "indices": local_indices, + "core": base_block, + "factor": factor, + "shift": shift, + } + ) + + factor = { + "type": "additive_schwarz", + "blocks": base_blocks, + "overlap": overlap, + "local_models": local_models, + } + operator = _build_blockwise_operator(A.shape, local_models, A.dtype) + return factor, operator + + +def _build_ic0_factor(KV, args=None): + args = _normalize_args(args) + A = _as_symmetric_csr(KV) + n = A.shape[0] + base_shift = float(args.get("sparse_preconditioner_ic_shift", args.get("sparse_preconditioner_shift", 0.0))) + growth = float(args.get("sparse_preconditioner_ic_shift_growth", args.get("sparse_preconditioner_shift_growth", 10.0))) + attempts = int(args.get("sparse_preconditioner_ic_shift_attempts", args.get("sparse_preconditioner_shift_attempts", 5))) + last_exc = None + shift = base_shift + + for _ in range(max(attempts, 1)): + try: + A_shifted = A if shift == 0.0 else A + sparse.eye(n, format="csr", dtype=A.dtype) * shift + lower = sparse.tril(A_shifted, format="csr") + diag = np.zeros(n, dtype=np.float64) + rows = [] + + for i in range(n): + start, end = lower.indptr[i], lower.indptr[i + 1] + row_cols = lower.indices[start:end] + row_vals = lower.data[start:end] + diag_val = None + original_entries = {} + + for col, value in zip(row_cols, row_vals): + col = int(col) + value = float(value) + if col < i: + original_entries[col] = value + elif col == i: + diag_val = value + + if diag_val is None: + raise np.linalg.LinAlgError(f"IC(0) encountered a missing diagonal at row {i}") + + computed_entries = {} + for j in sorted(original_entries): + row_j = rows[j] + if len(computed_entries) < len(row_j): + coupling = sum( + value * row_j.get(k, 0.0) + for k, value in computed_entries.items() + ) + else: + coupling = sum( + computed_entries.get(k, 0.0) * value + for k, value in row_j.items() + ) + computed_entries[j] = (original_entries[j] - coupling) / diag[j] + + diag_sq = diag_val - sum(value * value for value in 
computed_entries.values()) + if diag_sq <= 0.0: + raise np.linalg.LinAlgError(f"IC(0) produced a non-positive pivot at row {i}") + diag[i] = np.sqrt(diag_sq) + rows.append(computed_entries) + + columns = [[] for _ in range(n)] + for i, row in enumerate(rows): + for j, value in row.items(): + columns[j].append((i, value)) + + def solve(vector): + vector = np.asarray(vector, dtype=np.float64) + y = np.zeros_like(vector) + for i, row in enumerate(rows): + total = vector[i] + for j, value in row.items(): + total -= value * y[j] + y[i] = total / diag[i] + + z = np.zeros_like(y) + for i in range(n - 1, -1, -1): + total = y[i] + for row_index, value in columns[i]: + if row_index > i: + total -= value * z[row_index] + z[i] = total / diag[i] + return z + + factor = { + "type": "incomplete_cholesky", + "diag": diag, + "rows": rows, + "columns": columns, + "solve": solve, + "shift": shift, + } + operator = sparse.linalg.LinearOperator(A.shape, matvec=solve, rmatvec=solve, dtype=A.dtype) + return factor, operator + except Exception as exc: + last_exc = exc + shift = 1e-10 if shift == 0.0 else shift * growth + + raise np.linalg.LinAlgError(f"IC(0) preconditioner construction failed after shifted retries: {last_exc}") + + +def _build_ilu_preconditioner(KV, args=None): + args = _normalize_args(args) + A = KV.tocsc() + spilu_kwargs = { + "drop_tol": args.get("sparse_preconditioner_drop_tol", 1e-8), + "fill_factor": args.get("sparse_preconditioner_fill_factor", 10), + } + if "sparse_preconditioner_drop_rule" in args: + spilu_kwargs["drop_rule"] = args["sparse_preconditioner_drop_rule"] + if "sparse_preconditioner_permc_spec" in args: + spilu_kwargs["permc_spec"] = args["sparse_preconditioner_permc_spec"] + if "sparse_preconditioner_diag_pivot_thresh" in args: + spilu_kwargs["diag_pivot_thresh"] = args["sparse_preconditioner_diag_pivot_thresh"] + + factor = sparse.linalg.spilu(A, **spilu_kwargs) + operator = sparse.linalg.LinearOperator( + A.shape, + matvec=factor.solve, + 
rmatvec=factor.solve, + dtype=A.dtype, + ) + return factor, operator + + +def _build_amg_preconditioner(KV, args=None): + args = _normalize_args(args) + try: + import pyamg + except ImportError as exc: + raise ImportError("pyamg is required for sparse_preconditioner_type='amg'") from exc + + A = _as_symmetric_csr(KV) + solver_kwargs = { + "max_levels": int(args.get("sparse_preconditioner_amg_max_levels", 10)), + "max_coarse": int(args.get("sparse_preconditioner_amg_max_coarse", 500)), + } + if "sparse_preconditioner_amg_strength" in args: + solver_kwargs["strength"] = args["sparse_preconditioner_amg_strength"] + if "sparse_preconditioner_amg_symmetry" in args: + solver_kwargs["symmetry"] = args["sparse_preconditioner_amg_symmetry"] + if "sparse_preconditioner_amg_presmoother" in args: + solver_kwargs["presmoother"] = args["sparse_preconditioner_amg_presmoother"] + if "sparse_preconditioner_amg_postsmoother" in args: + solver_kwargs["postsmoother"] = args["sparse_preconditioner_amg_postsmoother"] + + factor = pyamg.smoothed_aggregation_solver(A, **solver_kwargs) + cycle = args.get("sparse_preconditioner_amg_cycle", "V") + operator = factor.aspreconditioner(cycle=cycle) + return factor, operator + + +def calculate_sparse_preconditioner(KV, args=None): + args = _normalize_args(args) + assert sparse.issparse(KV) + logger.debug("calculate_sparse_preconditioner") + preconditioner_type = normalize_sparse_preconditioner_type( + args.get("sparse_preconditioner_type", "ilu") + ) + + builders = { + "ilu": _build_ilu_preconditioner, + "incomplete_cholesky": _build_ic0_factor, + "block_jacobi": _build_block_jacobi_preconditioner, + "additive_schwarz": _build_additive_schwarz_preconditioner, + "amg": _build_amg_preconditioner, + } + + if preconditioner_type not in builders: + raise ValueError( + "Unknown sparse preconditioner type " + f"{preconditioner_type!r}. Expected one of {sorted(builders)}." 
+ ) + + factor, operator = builders[preconditioner_type](KV, args=args) + if isinstance(factor, dict): + factor.setdefault("type", preconditioner_type) + return factor, operator + + +def _resolve_krylov_mode(args=None): + args = _normalize_args(args) + if bool(args.get("sparse_block_krylov", False)): + return "block" + mode = str(args.get("sparse_krylov_mode", "single")).lower() + aliases = { + "single": "single", + "columnwise": "single", + "block": "block", + "block_cg": "block", + } + if mode not in aliases: + raise ValueError( + f"Unknown sparse Krylov mode {mode!r}. Expected one of {{'single', 'block'}}." + ) + return aliases[mode] + + +def _resolve_krylov_maxiter(args=None, solver_key=None): + args = _normalize_args(args) + if solver_key is not None and solver_key in args: + value = args[solver_key] + return None if value is None else int(value) + if "sparse_krylov_maxiter" in args: + value = args["sparse_krylov_maxiter"] + return None if value is None else int(value) + return None + + +def _normalize_rhs(vec): + vec = np.asarray(vec, dtype=np.float64) + if np.ndim(vec) == 1: + vec = vec.reshape(len(vec), 1) + return vec + + +def _normalize_initial_guess(x0, shape): + if x0 is None: + return None + guess = np.asarray(x0, dtype=np.float64) + if guess.ndim == 1: + guess = guess.reshape(-1, 1) + if guess.shape[1] == 1 and shape[1] > 1: + guess = np.repeat(guess, shape[1], axis=1) + if guess.shape[1] != shape[1]: + raise ValueError( + f"Initial guess has {guess.shape[1]} columns but the RHS has {shape[1]} columns." 
+ ) + if guess.shape[0] < shape[0]: + padding = np.zeros((shape[0] - guess.shape[0], guess.shape[1]), dtype=guess.dtype) + guess = np.vstack([guess, padding]) + elif guess.shape[0] > shape[0]: + guess = guess[:shape[0], :] + return guess + + +def _column_initial_guess(x0, column_index): + if x0 is None: + return None + if x0.ndim == 1: + return x0 + return x0[:, min(column_index, x0.shape[1] - 1)] + + +def _apply_linear_operator(operator, matrix): + if matrix.size == 0: + return np.zeros_like(matrix) + if hasattr(operator, "matmat"): + try: + return np.asarray(operator.matmat(matrix), dtype=np.float64) + except Exception: + pass + return np.column_stack([np.asarray(operator @ matrix[:, i], dtype=np.float64) for i in range(matrix.shape[1])]) + + +def _apply_preconditioner(M, residual): + if M is None: + return residual.copy() + return _apply_linear_operator(M, residual) + + +def _block_conjugate_gradient(KV, vec, cg_tol, x0=None, M=None, maxiter=None): + vec = _normalize_rhs(vec) + x0 = _normalize_initial_guess(x0, vec.shape) + n, rhs_count = vec.shape + if rhs_count == 1: + rhs = vec[:, 0] + initial_guess = None if x0 is None else x0[:, 0] + solution, exit_code = cg(KV, rhs, M=M, rtol=cg_tol, x0=initial_guess, maxiter=maxiter) + return solution.reshape(n, 1), exit_code + + X = np.zeros((n, rhs_count), dtype=np.float64) if x0 is None else x0.copy() + R = vec - _apply_linear_operator(KV, X) + rhs_norm = np.linalg.norm(vec, axis=0) + rhs_norm[rhs_norm == 0.0] = 1.0 + rel_residual = np.linalg.norm(R, axis=0) / rhs_norm + if np.max(rel_residual) <= cg_tol: + return X, 0 + + Z = _apply_preconditioner(M, R) + P = Z.copy() + G = R.T @ Z + if maxiter is None: + maxiter = 10 * n + last_exit_code = 1 + + for _ in range(max(maxiter, 1)): + AP = _apply_linear_operator(KV, P) + H = P.T @ AP + try: + alpha = np.linalg.solve(H, G) + except np.linalg.LinAlgError: + last_exit_code = 2 + break + + X = X + P @ alpha + R = R - AP @ alpha + rel_residual = np.linalg.norm(R, axis=0) / 
rhs_norm + if np.max(rel_residual) <= cg_tol: + return X, 0 + + Z = _apply_preconditioner(M, R) + G_new = R.T @ Z + try: + beta = np.linalg.solve(G, G_new) + except np.linalg.LinAlgError: + last_exit_code = 3 + break + P = Z + P @ beta + G = G_new + + return X, last_exit_code + + def calculate_random_logdet(KV, compute_device, args=None): + args = _normalize_args(args) assert sparse.issparse(KV) logger.debug("calculate_random_logdet") from imate import logdet as imate_logdet st = time.time() - if compute_device == "gpu": gpu = True - else: gpu = False + gpu = compute_device == "gpu" and _imate_gpu_enabled(args) lanczos_degree = 20 error_rtol = 0.01 @@ -176,7 +770,6 @@ def calculate_random_logdet(KV, compute_device, args=None): if "random_logdet_error_rtol" in args: error_rtol = args["random_logdet_error_rtol"] if "random_logdet_verbose" in args: verbose = args["random_logdet_verbose"] if "random_logdet_print_info" in args: print_info = args["random_logdet_print_info"] - if "random_logdet_lanczos_compute_device" in args: lanczos_degree = args["random_logdet_lanczos_compute_device"] logdet, info_slq = imate_logdet(KV, method='slq', min_num_samples=10, max_num_samples=5000, lanczos_degree=lanczos_degree, error_rtol=error_rtol, gpu=gpu, @@ -188,6 +781,7 @@ def calculate_random_logdet(KV, compute_device, args=None): def calculate_sparse_minres(KV, vec, x0=None, M=None, args=None): + args = _normalize_args(args) assert sparse.issparse(KV) st = time.time() logger.debug("MINRES solve in progress ...") @@ -195,30 +789,88 @@ def calculate_sparse_minres(KV, vec, x0=None, M=None, args=None): if "sparse_minres_tol" in args: minres_tol = args["sparse_minres_tol"] logger.debug("sparse_minres_tol changed to ", minres_tol) + maxiter = _resolve_krylov_maxiter(args, solver_key="sparse_minres_maxiter") - if np.ndim(vec) == 1: vec = vec.reshape(len(vec), 1) - if isinstance(x0, np.ndarray) and len(x0) < KV.shape[0]: x0 = np.append(x0, np.zeros(KV.shape[0] - len(x0))) + vec = 
_normalize_rhs(vec) + x0 = _normalize_initial_guess(x0, vec.shape) res = np.zeros(vec.shape) for i in range(vec.shape[1]): - res[:, i], exit_code = minres(KV, vec[:, i], M=M, rtol=minres_tol, x0=x0) - if exit_code == 1: warnings.warn("MINRES not successful") + initial_guess = _column_initial_guess(x0, i) + res[:, i], exit_code = minres(KV, vec[:, i], M=M, rtol=minres_tol, x0=initial_guess, maxiter=maxiter) + if exit_code != 0: warnings.warn(f"MINRES not successful (exit_code={exit_code})") logger.debug("MINRES compute time: {} seconds.", time.time() - st) assert np.ndim(res) == 2 return res def calculate_sparse_conj_grad(KV, vec, x0=None, M=None, args=None): + args = _normalize_args(args) assert sparse.issparse(KV) st = time.time() logger.debug("CG solve in progress ...") cg_tol = 1e-5 - if "sparse_cg_tol" in args: cg_tol = args["sparse_minres_tol"] - if np.ndim(vec) == 1: vec = vec.reshape(len(vec), 1) - if isinstance(x0, np.ndarray) and len(x0) < KV.shape[0]: x0 = np.append(x0, np.zeros(KV.shape[0] - len(x0))) + # Backward-compatible tolerance resolution: + # - sparse_cg_tol is the corrected, explicit CG key + # - cg_minres_tol appeared in published docs + # - sparse_minres_tol was accidentally read by the published CG path + if "sparse_cg_tol" in args: + cg_tol = args["sparse_cg_tol"] + elif "cg_minres_tol" in args: + cg_tol = args["cg_minres_tol"] + elif "sparse_minres_tol" in args: + cg_tol = args["sparse_minres_tol"] + krylov_mode = _resolve_krylov_mode(args) + maxiter = _resolve_krylov_maxiter(args, solver_key="sparse_cg_maxiter") + vec = _normalize_rhs(vec) + x0 = _normalize_initial_guess(x0, vec.shape) + + if krylov_mode == "block" and vec.shape[1] > 1: + block_size = int(args.get("sparse_krylov_block_size", vec.shape[1])) + block_size = max(1, min(block_size, vec.shape[1])) + res = np.zeros(vec.shape) + exit_codes = [] + for start in range(0, vec.shape[1], block_size): + stop = min(start + block_size, vec.shape[1]) + block_x0 = None if x0 is None else 
x0[:, start:stop] + try: + block_solution, exit_code = _block_conjugate_gradient( + KV, + vec[:, start:stop], + cg_tol, + x0=block_x0, + M=M, + maxiter=maxiter, + ) + except Exception as exc: + warnings.warn( + "Block CG failed; falling back to columnwise CG for this RHS block: " + f"{type(exc).__name__}: {exc}" + ) + block_solution = np.zeros((KV.shape[0], stop - start)) + exit_code = 4 + for block_index, rhs_index in enumerate(range(start, stop)): + initial_guess = _column_initial_guess(x0, rhs_index) + block_solution[:, block_index], _ = cg( + KV, + vec[:, rhs_index], + M=M, + rtol=cg_tol, + x0=initial_guess, + maxiter=maxiter, + ) + res[:, start:stop] = block_solution + exit_codes.append(exit_code) + if any(code != 0 for code in exit_codes): + warnings.warn(f"Block CG not fully successful (exit_codes={exit_codes})") + logger.debug("CG compute time: {} seconds.", time.time() - st) + assert np.ndim(res) == 2 + return res + res = np.zeros(vec.shape) for i in range(vec.shape[1]): - res[:, i], exit_code = cg(KV, vec[:, i], M=M, rtol=cg_tol, x0=x0) - if exit_code == 1: warnings.warn("CG not successful") + initial_guess = _column_initial_guess(x0, i) + res[:, i], exit_code = cg(KV, vec[:, i], M=M, rtol=cg_tol, x0=initial_guess, maxiter=maxiter) + if exit_code != 0: warnings.warn(f"CG not successful (exit_code={exit_code})") logger.debug("CG compute time: {} seconds.", time.time() - st) assert np.ndim(res) == 2 return res @@ -253,7 +905,7 @@ def cholesky_update_rank_1(L, b, c, compute_device="cpu", args=None): elif compute_device == "gpu": engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover - L_prime = cholesky_update_rank_1_torch(L, b, c) + L_prime = cholesky_update_rank_1_torch(L, b, c, args=args) elif engine == "cupy": # pragma: no cover L_prime = cholesky_update_rank_1_cupy(L, b, c) else: L_prime = None @@ -289,7 +941,7 @@ def cholesky_update_rank_1_numpy(L, b, c, args=None): return L_prime -def cholesky_update_rank_1_torch(L, b, c): # pragma: no 
cover +def cholesky_update_rank_1_torch(L, b, c, args=None): # pragma: no cover """ Rank-1 Cholesky update on GPU using PyTorch. @@ -304,9 +956,10 @@ def cholesky_update_rank_1_torch(L, b, c): # pragma: no cover L_prime : (n+1, n+1) updated lower-triangular Cholesky factor on GPU """ import torch + device = _torch_gpu_device(args) # Solve L v = b (forward solve) - L = torch.tensor(L, device="cuda", dtype=torch.float32) - b = torch.tensor(b, device="cuda", dtype=torch.float32) + L = torch.tensor(L, device=device, dtype=torch.float32) + b = torch.tensor(b, device=device, dtype=torch.float32) v = torch.linalg.solve_triangular(L, b.unsqueeze(1), upper=False).squeeze(1) @@ -362,7 +1015,13 @@ def cholesky_update_rank_n(L, b, c, compute_device="cpu", args=None): # Solve Lv = b for v L_prime = L.copy() for i in range(b.shape[1]): - L_prime = cholesky_update_rank_1(L_prime, np.append(b[:, i], c[0:i, i]), c[i, i], compute_device=compute_device) + L_prime = cholesky_update_rank_1( + L_prime, + np.append(b[:, i], c[0:i, i]), + c[i, i], + compute_device=compute_device, + args=args, + ) return L_prime @@ -374,13 +1033,23 @@ def calculate_logdet(A, compute_device='cpu', args=None): return logdet elif compute_device == "gpu": try: - import torch - A = torch.from_numpy(A).cuda() - sign, logdet = torch.slogdet(A) - logdet = logdet.cpu().numpy() - logdet = np.nan_to_num(logdet) - assert np.isscalar(logdet) - return logdet + engine = get_gpu_engine(args) + if engine == "torch": + import torch + device = _torch_gpu_device(args) + A_dev = torch.as_tensor(A, device=device, dtype=torch.float32) + sign, logdet = torch.slogdet(A_dev) + logdet = np.nan_to_num(logdet.detach().cpu().numpy()) + assert np.isscalar(logdet) + return logdet + if engine == "cupy": + import cupy as cp + A_dev = cp.asarray(A, dtype=cp.float32) + sign, logdet = cp.linalg.slogdet(A_dev) + logdet = np.nan_to_num(cp.asnumpy(logdet)) + assert np.isscalar(logdet) + return logdet + raise RuntimeError("No usable GPU backend 
available") except Exception as e: warnings.warn( "I encountered a problem using the GPU via pytorch. Falling back to Numpy and the CPU.") @@ -399,7 +1068,7 @@ def update_logdet(old_logdet, old_inv, new_matrix, compute_device="cpu", args=No KV = new_matrix kk = KV[size:, size:] k = KV[size:, 0:size] - res = old_logdet + calculate_logdet(kk - k @ old_inv @ k.T, compute_device=compute_device) + res = old_logdet + calculate_logdet(kk - k @ old_inv @ k.T, compute_device=compute_device, args=args) assert np.isscalar(res) return res @@ -411,10 +1080,18 @@ def calculate_inv(A, compute_device='cpu', args=None): if compute_device == "cpu": return np.linalg.inv(A) elif compute_device == "gpu": - import torch - A = torch.from_numpy(A) - B = torch.inverse(A) - return B.numpy() + engine = get_gpu_engine(args) + if engine == "torch": # pragma: no cover + import torch + device = _torch_gpu_device(args) + A_dev = torch.as_tensor(A, device=device, dtype=torch.float32) + B = torch.inverse(A_dev) + return B.detach().cpu().numpy() + if engine == "cupy": # pragma: no cover + import cupy as cp + A_dev = cp.asarray(A, dtype=cp.float32) + return cp.asnumpy(cp.linalg.inv(A_dev)) + return np.linalg.inv(A) else: return np.linalg.inv(A) @@ -422,7 +1099,7 @@ def calculate_inv(A, compute_device='cpu', args=None): def calculate_inv_from_chol(L, compute_device="cpu", args=None): logger.debug("calculate_inv_from_chol") if compute_device == "cpu": A_inv = cho_solve((L, True), np.eye(L.shape[0])) - elif compute_device == "gpu": A_inv = calculate_Chol_solve(L, np.eye(L.shape[0]), compute_device="gpu") + elif compute_device == "gpu": A_inv = calculate_Chol_solve(L, np.eye(L.shape[0]), compute_device="gpu", args=args) else: raise Exception("No valid compute device found.") return A_inv @@ -433,7 +1110,7 @@ def update_inv(old_inv, new_matrix, compute_device="cpu", args=None): KV = new_matrix kk = KV[size:, size:] k = KV[size:, 0:size] - X = calculate_inv(kk - k @ old_inv @ k.T, 
compute_device=compute_device) + X = calculate_inv(kk - k @ old_inv @ k.T, compute_device=compute_device, args=args) F = -old_inv @ k.T @ X new_inv = np.block([[old_inv + old_inv @ k.T @ X @ k @ old_inv, F], [F.T, X]]) @@ -456,10 +1133,11 @@ def solve(A, b, compute_device='cpu', args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - At = torch.from_numpy(A).cuda() - bt = torch.from_numpy(b).cuda() + device = _torch_gpu_device(args) + At = torch.as_tensor(A, device=device, dtype=torch.float32) + bt = torch.as_tensor(b, device=device, dtype=torch.float32) x = torch.linalg.solve(At, bt) - x = x.cpu().numpy() + x = x.detach().cpu().numpy() if np.ndim(x) == 1: x = x.reshape(len(x), 1) assert np.ndim(x) == np.ndim(b) return x @@ -493,17 +1171,18 @@ def matmul(A, B, compute_device="cpu", args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - A = torch.tensor(A, device="cuda", dtype=torch.float32) - B = torch.tensor(B, device="cuda", dtype=torch.float32) + device = _torch_gpu_device(args) + A = torch.tensor(A, device=device, dtype=torch.float32) + B = torch.tensor(B, device=device, dtype=torch.float32) res = A@B - res = res.cpu().numpy() + res = res.detach().cpu().numpy() elif engine == "cupy": # pragma: no cover import cupy as cp A = cp.array(A) B = cp.array(B) res = A @ B res = cp.asnumpy(res) - else: res = None + else: res = A @ B else: raise Exception("NO valid compute device found. 
") return res @@ -522,11 +1201,12 @@ def matmul3(A, B, C, compute_device="cpu", args=None): engine = get_gpu_engine(args) if engine == "torch": # pragma: no cover import torch - A = torch.tensor(A, device="cuda", dtype=torch.float32) - B = torch.tensor(B, device="cuda", dtype=torch.float32) - C = torch.tensor(C, device="cuda", dtype=torch.float32) + device = _torch_gpu_device(args) + A = torch.tensor(A, device=device, dtype=torch.float32) + B = torch.tensor(B, device=device, dtype=torch.float32) + C = torch.tensor(C, device=device, dtype=torch.float32) res = A @ B @ C - res = res.cpu().numpy() + res = res.detach().cpu().numpy() elif engine == "cupy": # pragma: no cover import cupy as cp A = cp.array(A) @@ -534,7 +1214,7 @@ def matmul3(A, B, C, compute_device="cpu", args=None): C = cp.array(C) res = A @ B @ C res = cp.asnumpy(res) - else: res = None + else: res = A @ B @ C else: raise Exception("NO valid compute device found. ") return res diff --git a/fvgp/gp_marginal_likelihood.py b/fvgp/gp_marginal_likelihood.py index a141c16..2222ac3 100755 --- a/fvgp/gp_marginal_likelihood.py +++ b/fvgp/gp_marginal_likelihood.py @@ -135,6 +135,34 @@ def _set_state_KVinvY(self, K, V, m, mode): logger.debug("Solve in progress") KVinvY = self.KVlinalg.solve(y_mean).reshape(y_mean.shape) return KVinvY + + def _build_sparse_preconditioner_or_none(self, KV): + try: + _, operator = calculate_sparse_preconditioner(KV, args=self.args) + return operator + except Exception as exc: + logger.warning( + "Sparse preconditioner construction failed; " + "falling back to the unpreconditioned iterative solve: {}", + exc, + ) + return None + + def _iterative_initial_guess(self, target_shape): + if not bool(self.args.get("sparse_krylov_warm_start", False)): + return None + if self.KVinvY is None: + return None + guess = np.asarray(self.KVinvY) + if guess.ndim == 1: + guess = guess.reshape(-1, 1) + if guess.shape[0] != target_shape[0]: + return None + if guess.shape[1] == target_shape[1]: + return 
guess + if guess.shape[1] == 1 and target_shape[1] > 1: + return np.repeat(guess, target_shape[1], axis=1) + return None ################################################################## ################################################################## ################################################################## @@ -145,6 +173,7 @@ def compute_new_KVinvY(self, KV, m): This does not change the KV obj """ y_mean = self.y_data - m[:, None] + x0 = self._iterative_initial_guess(y_mean.shape) if self.gp2Scale: # pragma: no cover mode: Any = self._set_gp2Scale_mode(KV) if mode == "sparseLU": @@ -155,15 +184,15 @@ def compute_new_KVinvY(self, KV, m): Chol_factor = calculate_Chol_factor(KV, compute_device=self.compute_device, args=self.args) KVinvY = calculate_Chol_solve(Chol_factor, y_mean, compute_device=self.compute_device, args=self.args) elif mode == "sparseCG": - KVinvY = calculate_sparse_conj_grad(KV, y_mean, args=self.args) + KVinvY = calculate_sparse_conj_grad(KV, y_mean, x0=x0, args=self.args) elif mode == "sparseMINRES": - KVinvY = calculate_sparse_minres(KV, y_mean, args=self.args) + KVinvY = calculate_sparse_minres(KV, y_mean, x0=x0, args=self.args) elif mode == "sparseMINRESpre": - B = sparse.linalg.spilu(KV, drop_tol=1e-8) - KVinvY = calculate_sparse_minres(KV, y_mean, M=B.L.T @ B.L, args=self.args) + M = self._build_sparse_preconditioner_or_none(KV) + KVinvY = calculate_sparse_minres(KV, y_mean, M=M, x0=x0, args=self.args) elif mode == "sparseCGpre": - B = sparse.linalg.spilu(KV, drop_tol=1e-8) - KVinvY = calculate_sparse_conj_grad(KV, y_mean, M=B.L.T @ B.L, args=self.args) + M = self._build_sparse_preconditioner_or_none(KV) + KVinvY = calculate_sparse_conj_grad(KV, y_mean, M=M, x0=x0, args=self.args) elif mode == "sparseSolve": KVinvY = calculate_sparse_solve(KV, y_mean, args=self.args) elif callable(mode[0]) and callable(mode[1]) and callable(mode[2]): @@ -184,6 +213,7 @@ def _compute_new_KVlogdet_KVinvY(self, K, V, m): """ KV = self.addKV(K, V) 
y_mean = self.y_data - m[:, None] + x0 = self._iterative_initial_guess(y_mean.shape) if self.gp2Scale: mode: Any = self._set_gp2Scale_mode(KV) if mode == "sparseLU": @@ -196,18 +226,18 @@ def _compute_new_KVlogdet_KVinvY(self, K, V, m): KVinvY = calculate_Chol_solve(Chol_factor, y_mean, compute_device=self.compute_device, args=self.args) KVlogdet = calculate_Chol_logdet(Chol_factor, compute_device=self.compute_device, args=self.args) elif mode == "sparseCG": - KVinvY = calculate_sparse_conj_grad(KV, y_mean, args=self.args) + KVinvY = calculate_sparse_conj_grad(KV, y_mean, x0=x0, args=self.args) KVlogdet = calculate_random_logdet(KV, self.compute_device, args=self.args) elif mode == "sparseMINRES": - KVinvY = calculate_sparse_minres(KV, y_mean, args=self.args) + KVinvY = calculate_sparse_minres(KV, y_mean, x0=x0, args=self.args) KVlogdet = calculate_random_logdet(KV, self.compute_device, args=self.args) elif mode == "sparseMINRESpre": - B = sparse.linalg.spilu(KV, drop_tol=1e-8) - KVinvY = calculate_sparse_minres(KV, y_mean, M=B.L.T @ B.L, args=self.args) + M = self._build_sparse_preconditioner_or_none(KV) + KVinvY = calculate_sparse_minres(KV, y_mean, M=M, x0=x0, args=self.args) KVlogdet = calculate_random_logdet(KV, self.compute_device, args=self.args) elif mode == "sparseCGpre": - B = sparse.linalg.spilu(KV, drop_tol=1e-8) - KVinvY = calculate_sparse_conj_grad(KV, y_mean, M=B.L.T @ B.L, args=self.args) + M = self._build_sparse_preconditioner_or_none(KV) + KVinvY = calculate_sparse_conj_grad(KV, y_mean, M=M, x0=x0, args=self.args) KVlogdet = calculate_random_logdet(KV, self.compute_device, args=self.args) elif mode == "sparseSolve": KVinvY = calculate_sparse_solve(KV, y_mean, args=self.args) @@ -263,12 +293,16 @@ def addKV(K, V): def _set_gp2Scale_mode(self, KV): Ksparsity = float(KV.nnz) / float(len(self.x_data) ** 2) if self.gp2Scale_linalg_mode is not None: - return self.gp2Scale_linalg_mode + mode, resolved_args = 
resolve_gp2scale_linalg_mode(self.gp2Scale_linalg_mode, self.args) + self.data.args = resolved_args + return mode elif len(self.x_data) < 50001 and Ksparsity < 0.0001: mode = "sparseLU" elif len(self.x_data) < 2001 and Ksparsity >= 0.0001: mode = "Chol" else: + # Preserve the published gp2Scale automatic behavior unless the + # caller explicitly asks for a different solver mode. mode = "sparseMINRES" return mode @@ -472,5 +506,3 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) - - diff --git a/fvgp/kernels.py b/fvgp/kernels.py index 339a1ce..7928660 100755 --- a/fvgp/kernels.py +++ b/fvgp/kernels.py @@ -1,4 +1,9 @@ +import importlib +import warnings + import numpy as np +import scipy.sparse as sparse +from scipy.spatial import cKDTree from scipy.spatial.distance import cdist @@ -521,8 +526,184 @@ def wendland_anisotropic_gp2Scale_cpu(x1, x2, hps): for i in range(len(x1[0])): distance_matrix += (np.subtract.outer(x1[:, i], x2[:, i]) / hps[1 + i]) ** 2 d = np.sqrt(distance_matrix) d[d > 1.] = 1. - kernel = hps[0] * (1. - d) ** 8 * (35. * d ** 3 + 25. * d ** 2 + 8. * d + 1.) - return kernel + return _wendland_anisotropic_polynomial(d, hps[0]) + + +def _wendland_anisotropic_polynomial(d, amplitude): + return amplitude * (1. - d) ** 8 * (35. * d ** 3 + 25. * d ** 2 + 8. * d + 1.) + + +_GP2SCALE_SPARSE_SORT_WARNING_EMITTED = False + + +def _warn_gp2scale_sparse_kernel_sorting(): + global _GP2SCALE_SPARSE_SORT_WARNING_EMITTED + if _GP2SCALE_SPARSE_SORT_WARNING_EMITTED: + return + warnings.warn( + "The support-aware gp2Scale Wendland kernels work best when gp2Scale batches preserve locality. 
" + "If your data have a progression column, such as day in a climate dataset, sort by that column before batching.", + stacklevel=2, + ) + _GP2SCALE_SPARSE_SORT_WARNING_EMITTED = True + + +def _get_torch_gpu_device(): # pragma: no cover + if importlib.util.find_spec("torch") is None: + return None + import torch + + if torch.cuda.is_available(): + device_index = torch.cuda.current_device() if torch.cuda.device_count() > 0 else 0 + return torch.device(f"cuda:{device_index}") + + mps_backend = getattr(torch.backends, "mps", None) + if mps_backend is not None and torch.backends.mps.is_available(): + return torch.device("mps") + + return None + + +def _cupy_gpu_available(): # pragma: no cover + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy as cp + return cp.cuda.runtime.getDeviceCount() > 0 + except Exception: + return False + + +def _get_default_gpu_engine(): # pragma: no cover + if _get_torch_gpu_device() is not None: + return "torch" + if _cupy_gpu_available(): + return "cupy" + return None + + +def _wendland_triplets_from_neighbor_lists(neighbor_lists): + row_parts = [] + col_parts = [] + for row_idx, neighbors in enumerate(neighbor_lists): + if not neighbors: + continue + cols = np.asarray(neighbors, dtype=np.int64) + row_parts.append(np.full(cols.size, row_idx, dtype=np.int64)) + col_parts.append(cols) + + if not row_parts: + return ( + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + return np.concatenate(row_parts), np.concatenate(col_parts) + + +def _empty_gp2scale_sparse_block(x1, x2): + return sparse.coo_matrix((len(x1), len(x2))) + + +def _whiten_gp2scale_points(x1, x2, hps): + scales = np.asarray(hps[1:], dtype=float) + z1 = np.asarray(x1, dtype=float) / scales + z2 = np.asarray(x2, dtype=float) / scales + return z1, z2 + + +def _gp2scale_whitened_block_distance(z1, z2): + mins_1 = np.min(z1, axis=0) + maxs_1 = np.max(z1, axis=0) + mins_2 = np.min(z2, axis=0) + maxs_2 = np.max(z2, axis=0) + gap = 
np.maximum(0.0, np.maximum(mins_1 - maxs_2, mins_2 - maxs_1)) + return np.linalg.norm(gap) + + +def _wendland_support_aware_cpu_triplets(x1, x2, hps): + """ + Output-sensitive COO triplets for the anisotropic Wendland gp2Scale kernel. + The support condition is an ellipsoid in the original coordinates and a unit + ball in whitened coordinates, so block assembly can be written as a radius + search rather than dense all-pairs evaluation. + """ + if len(x1) == 0 or len(x2) == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + z1, z2 = _whiten_gp2scale_points(x1, x2, hps) + if _gp2scale_whitened_block_distance(z1, z2) > 1.0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + tree1 = cKDTree(z1) + tree2 = cKDTree(z2) + neighbor_lists = tree1.query_ball_tree(tree2, r=1.0) + + rows, cols = _wendland_triplets_from_neighbor_lists(neighbor_lists) + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + diff = z2[cols] - z1[rows] + dist_sq = np.sum(diff * diff, axis=1) + inside_mask = dist_sq <= 1.0 + if not np.all(inside_mask): + rows = rows[inside_mask] + cols = cols[inside_mask] + dist_sq = dist_sq[inside_mask] + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + distances = np.sqrt(np.minimum(dist_sq, 1.0)) + values = _wendland_anisotropic_polynomial(distances, hps[0]) + nonzero_mask = values != 0.0 + if not np.all(nonzero_mask): + rows = rows[nonzero_mask] + cols = cols[nonzero_mask] + values = values[nonzero_mask] + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + return ( + values, + rows, + cols, + ) + + +def wendland_anisotropic_gp2Scale_cpu_sparse(x1, x2, hps): + """ + Support-aware anisotropic Wendland kernel for gp2Scale. 
+ + This kernel preserves the usual kernel interface but performs block-local + support checks and radius-search assembly internally, returning a sparse COO + matrix directly. It is intended for gp2Scale workflows where the batchwise + kernel is assembled on sparse blocks. + """ + _warn_gp2scale_sparse_kernel_sorting() + values, rows, cols = _wendland_support_aware_cpu_triplets(x1, x2, hps) + if values.size == 0: + return _empty_gp2scale_sparse_block(x1, x2) + return sparse.coo_matrix((values, (rows, cols)), shape=(len(x1), len(x2))) def _get_distance_matrix_gpu(x1, x2, device, hps): # pragma: no cover @@ -551,15 +732,137 @@ def wendland_anisotropic_gp2Scale_gpu(x1, x2, hps): # pragma: no cover ------ Covariance matrix : np.ndarray """ - import torch - cuda_device = torch.device("cuda:0") - x1_dev = torch.from_numpy(x1).to(cuda_device, dtype=torch.float32) - x2_dev = torch.from_numpy(x2).to(cuda_device, dtype=torch.float32) - hps_dev = torch.from_numpy(hps).to(cuda_device, dtype=torch.float32) - d = _get_distance_matrix_gpu(x1_dev, x2_dev, cuda_device, hps_dev) - d[d > 1.] = 1. - kernel = hps[0] * (1. - d) ** 8 * (35. * d ** 3 + 25. * d ** 2 + 8. * d + 1.) - return kernel.cpu().numpy() + engine = _get_default_gpu_engine() + if engine == "torch": + import torch + device = _get_torch_gpu_device() + x1_dev = torch.as_tensor(x1, device=device, dtype=torch.float32) + x2_dev = torch.as_tensor(x2, device=device, dtype=torch.float32) + hps_dev = torch.as_tensor(hps, device=device, dtype=torch.float32) + d = _get_distance_matrix_gpu(x1_dev, x2_dev, device, hps_dev) + d = torch.clamp(d, max=1.0) + kernel = hps_dev[0] * (1. - d) ** 8 * (35. * d ** 3 + 25. * d ** 2 + 8. * d + 1.) 
+ return kernel.detach().cpu().numpy() + if engine == "cupy": + import cupy as cp + x1_dev = cp.asarray(x1, dtype=cp.float32) + x2_dev = cp.asarray(x2, dtype=cp.float32) + hps_dev = cp.asarray(hps, dtype=cp.float32) + d = cp.zeros((len(x1), len(x2)), dtype=cp.float32) + for i in range(x1.shape[1]): + d += ((x1_dev[:, i].reshape(-1, 1) - x2_dev[:, i]) / hps_dev[1 + i]) ** 2 + d = cp.sqrt(d) + d = cp.minimum(d, cp.float32(1.0)) + kernel = hps_dev[0] * (1. - d) ** 8 * (35. * d ** 3 + 25. * d ** 2 + 8. * d + 1.) + return cp.asnumpy(kernel) + + warnings.warn( + "No usable GPU backend was found for wendland_anisotropic_gp2Scale_gpu; " + "falling back to the CPU Wendland implementation.", + stacklevel=2, + ) + return wendland_anisotropic_gp2Scale_cpu(x1, x2, hps) + + +def _wendland_support_aware_gpu_triplets(x1, x2, hps): # pragma: no cover + """ + Output-sensitive COO triplets for the anisotropic Wendland gp2Scale kernel + with GPU-backed distance/polynomial evaluation when a usable GPU backend is + available. Neighbor search remains host-side. 
+ """ + if len(x1) == 0 or len(x2) == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + z1, z2 = _whiten_gp2scale_points(x1, x2, hps) + if _gp2scale_whitened_block_distance(z1, z2) > 1.0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + tree1 = cKDTree(z1) + tree2 = cKDTree(z2) + neighbor_lists = tree1.query_ball_tree(tree2, r=1.0) + rows, cols = _wendland_triplets_from_neighbor_lists(neighbor_lists) + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + + diff = z2[cols] - z1[rows] + engine = _get_default_gpu_engine() + if engine == "torch": + import torch + device = _get_torch_gpu_device() + diff_dev = torch.as_tensor(diff, device=device, dtype=torch.float32) + dist_sq_dev = torch.sum(diff_dev * diff_dev, dim=1) + inside_mask = (dist_sq_dev <= 1.0).detach().cpu().numpy() + if not np.all(inside_mask): + rows = rows[inside_mask] + cols = cols[inside_mask] + dist_sq_dev = dist_sq_dev[inside_mask] + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + distances = torch.sqrt(torch.clamp(dist_sq_dev, max=1.0)) + values = (_wendland_anisotropic_polynomial(distances, float(hps[0]))).detach().cpu().numpy() + elif engine == "cupy": + import cupy as cp + diff_dev = cp.asarray(diff, dtype=cp.float32) + dist_sq_dev = cp.sum(diff_dev * diff_dev, axis=1) + inside_mask = cp.asnumpy(dist_sq_dev <= cp.float32(1.0)) + if not np.all(inside_mask): + rows = rows[inside_mask] + cols = cols[inside_mask] + dist_sq_dev = dist_sq_dev[inside_mask] + if rows.size == 0: + return ( + np.empty(0, dtype=float), + np.empty(0, dtype=np.int64), + np.empty(0, dtype=np.int64), + ) + distances = cp.sqrt(cp.minimum(dist_sq_dev, cp.float32(1.0))) + values = cp.asnumpy(_wendland_anisotropic_polynomial(distances, float(hps[0]))) + else: + 
warnings.warn( + "No usable GPU backend was found for the support-aware GPU Wendland kernel; " + "falling back to the CPU sparse Wendland kernel.", + stacklevel=2, + ) + return _wendland_support_aware_cpu_triplets(x1, x2, hps) + + nonzero_mask = values != 0.0 + if not np.all(nonzero_mask): + rows = rows[nonzero_mask] + cols = cols[nonzero_mask] + values = values[nonzero_mask] + + return values, rows, cols + + +def wendland_anisotropic_gp2Scale_gpu_sparse(x1, x2, hps): # pragma: no cover + """ + GPU-backed support-aware anisotropic Wendland kernel for gp2Scale. + + Neighbor discovery remains host-side; only the distance/polynomial + evaluation on the discovered support graph is GPU-backed when a usable + backend is available. + """ + _warn_gp2scale_sparse_kernel_sorting() + values, rows, cols = _wendland_support_aware_gpu_triplets(x1, x2, hps) + if values.size == 0: + return _empty_gp2scale_sparse_block(x1, x2) + return sparse.coo_matrix((values, (rows, cols)), shape=(len(x1), len(x2))) def wasserstein_1d(a, b): diff --git a/fvgp/kv_linalg.py b/fvgp/kv_linalg.py index 8f2004a..52c1fc8 100755 --- a/fvgp/kv_linalg.py +++ b/fvgp/kv_linalg.py @@ -1,4 +1,5 @@ import numpy as np +import warnings from .gp_lin_alg import * from scipy.sparse import issparse import scipy.sparse as sparse @@ -14,47 +15,184 @@ def __init__(self, compute_device, data): self.KV = None self.Chol_factor = None self.LU_factor = None + self.Preconditioner_factor = None + self.Preconditioner_operator = None + self.Preconditioner_signature = None + self.Preconditioner_reuse_counter = 0 + self.Last_iterative_solution = None self.custom_obj = None self.allowed_modes = ["Chol", "CholInv", "Inv", "sparseMINRES", "sparseCG", - "sparseLU", "sparseMINRESpre", "sparseCGpre", "sparseSolve", "a set of callables"] + "sparseLU", "sparseMINRESpre", "sparseCGpre", + "sparseMINRESpre_", "sparseCGpre_", + "sparseSolve", "a set of callables"] @property def args(self): return self.data.args + def 
_warm_start_enabled(self): + return bool(self.args.get("sparse_krylov_warm_start", False)) + + def _preconditioner_refresh_interval(self): + return max(1, int(self.args.get("sparse_preconditioner_refresh_interval", 1))) + + def _preconditioner_signature(self): + relevant = { + key: value + for key, value in self.args.items() + if key.startswith("sparse_preconditioner_") + } + return tuple(sorted(relevant.items())) + + def _reset_sparse_preconditioner(self): + self.Preconditioner_factor = None + self.Preconditioner_operator = None + self.Preconditioner_signature = None + self.Preconditioner_reuse_counter = 0 + + def _can_reuse_sparse_preconditioner(self, KV, mode): + if mode not in {"sparseMINRESpre", "sparseCGpre"}: + return False + if self.Preconditioner_operator is None: + return False + if self.mode != mode: + return False + if self.KV is None: + return False + if self.KV.shape != KV.shape: + return False + if self.Preconditioner_signature != self._preconditioner_signature(): + return False + if self.Preconditioner_reuse_counter >= self._preconditioner_refresh_interval() - 1: + return False + return True + + def _refresh_sparse_preconditioner(self): + if self.mode not in {"sparseMINRESpre", "sparseCGpre"} or self.KV is None: + self._reset_sparse_preconditioner() + return + try: + factor, operator = calculate_sparse_preconditioner(self.KV, args=self.args) + except Exception as exc: + warnings.warn( + f"Failed to build sparse preconditioner for mode {self.mode}; " + "falling back to the unpreconditioned iterative solve." 
+ ) + logger.warning("Sparse preconditioner construction failed for {}: {}", self.mode, exc) + self._reset_sparse_preconditioner() + return + self.Preconditioner_factor = factor + self.Preconditioner_operator = operator + self.Preconditioner_signature = self._preconditioner_signature() + self.Preconditioner_reuse_counter = 0 + + def _get_sparse_preconditioner(self): + if self.mode in {"sparseMINRESpre", "sparseCGpre"} and self.Preconditioner_operator is None: + self._refresh_sparse_preconditioner() + return self.Preconditioner_operator + + def _iterative_modes(self): + return {"sparseMINRES", "sparseCG", "sparseMINRESpre", "sparseCGpre"} + + def _store_iterative_solution(self, solution): + if self._warm_start_enabled(): + self.Last_iterative_solution = np.array(solution, copy=True) + + def _consume_warm_start(self, b, x0): + if x0 is not None: + return x0 + if not self._warm_start_enabled(): + return None + if self.Last_iterative_solution is None: + return None + guess = np.asarray(self.Last_iterative_solution) + rhs = np.asarray(b) + if rhs.ndim == 1: + rhs = rhs.reshape(-1, 1) + if guess.ndim == 1: + guess = guess.reshape(-1, 1) + if guess.shape[0] != rhs.shape[0]: + return None + if guess.shape[1] == rhs.shape[1]: + return guess + if guess.shape[1] == 1 and rhs.shape[1] > 1: + return np.repeat(guess, rhs.shape[1], axis=1) + return None + + def _prepare_iterative_state(self, KV, mode): + if mode in {"sparseMINRESpre", "sparseCGpre"}: + if self._can_reuse_sparse_preconditioner(KV, mode): + self.KV = KV + self.mode = mode + self.Preconditioner_reuse_counter += 1 + else: + self.KV = KV + self.mode = mode + self._reset_sparse_preconditioner() + self._refresh_sparse_preconditioner() + else: + if mode not in {"sparseMINRES", "sparseCG"}: + self.Last_iterative_solution = None + self._reset_sparse_preconditioner() + self.KV = KV + self.mode = mode + def set_KV(self, KV, mode): - self.mode = mode + previous_solution = self.Last_iterative_solution + previous_mode = 
self.mode + previous_custom_obj = self.custom_obj + mode, resolved_args = resolve_gp2scale_linalg_mode(mode, self.args) + self.data.args = resolved_args self.KVinv = None self.KV = None self.Chol_factor = None self.LU_factor = None - assert self.mode is not None + self.custom_obj = None + if mode not in self._iterative_modes(): + self.Last_iterative_solution = None + assert mode is not None + self.mode = mode if self.mode == "Chol": + self._reset_sparse_preconditioner() if issparse(KV): KV = KV.toarray() self.Chol_factor = calculate_Chol_factor(KV, compute_device=self.compute_device, args=self.args) elif self.mode == "CholInv": + self._reset_sparse_preconditioner() if issparse(KV): KV = KV.toarray() self.Chol_factor = calculate_Chol_factor(KV, compute_device=self.compute_device, args=self.args) self.KVinv = calculate_inv_from_chol(self.Chol_factor, compute_device=self.compute_device, args=self.args) elif self.mode == "Inv": + self._reset_sparse_preconditioner() self.KV = KV self.KVinv = calculate_inv(KV, compute_device=self.compute_device, args=self.args) elif self.mode == "sparseMINRES": - self.KV = KV + self._prepare_iterative_state(KV, self.mode) elif self.mode == "sparseCG": - self.KV = KV + self._prepare_iterative_state(KV, self.mode) elif self.mode == "sparseLU": + self._reset_sparse_preconditioner() self.LU_factor = calculate_sparse_LU_factor(KV, args=self.args) elif self.mode == "sparseMINRESpre": - self.KV = KV + self._prepare_iterative_state(KV, self.mode) elif self.mode == "sparseCGpre": - self.KV = KV + self._prepare_iterative_state(KV, self.mode) elif self.mode == "sparseSolve": + self._reset_sparse_preconditioner() self.KV = KV elif callable(self.mode[0]): + self._reset_sparse_preconditioner() self.custom_obj = self.mode[0](KV) else: + self.custom_obj = previous_custom_obj raise Exception("No Mode. 
Choose from: ", self.allowed_modes) + if self._warm_start_enabled() and previous_solution is not None and mode in self._iterative_modes(): + previous = np.asarray(previous_solution) + if previous.ndim == 1: + previous = previous.reshape(-1, 1) + if previous.shape[0] == KV.shape[0]: + self.Last_iterative_solution = previous + elif mode != previous_mode: + self.Last_iterative_solution = None def update_KV(self, KV): if self.mode == "Chol": @@ -85,15 +223,29 @@ def update_KV(self, KV): elif self.mode == "sparseLU": self.LU_factor = calculate_sparse_LU_factor(KV, args=self.args) elif self.mode == "sparseMINRESpre": - self.KV = KV + if self._can_reuse_sparse_preconditioner(KV, self.mode): + self.KV = KV + self.Preconditioner_reuse_counter += 1 + else: + self.KV = KV + self._reset_sparse_preconditioner() + self._refresh_sparse_preconditioner() elif self.mode == "sparseCGpre": - self.KV = KV + if self._can_reuse_sparse_preconditioner(KV, self.mode): + self.KV = KV + self.Preconditioner_reuse_counter += 1 + else: + self.KV = KV + self._reset_sparse_preconditioner() + self._refresh_sparse_preconditioner() elif self.mode == "sparseSolve": self.KV = KV elif callable(self.mode[0]): self.custom_obj = self.mode[0](KV) else: raise Exception("No Mode. Choose from: ", self.allowed_modes) + if self.mode not in self._iterative_modes(): + self.Last_iterative_solution = None def solve(self, b, x0=None): if self.mode == "Chol": @@ -104,17 +256,35 @@ def solve(self, b, x0=None): #return matmul(self.KVinv, b, compute_device=self.compute_device) #is this really faster? 
return self.KVinv @ b elif self.mode == "sparseCG": - return calculate_sparse_conj_grad(self.KV, b, x0=x0, args=self.args) + res = calculate_sparse_conj_grad(self.KV, b, x0=self._consume_warm_start(b, x0), args=self.args) + self._store_iterative_solution(res) + return res elif self.mode == "sparseMINRES": - return calculate_sparse_minres(self.KV, b, x0=x0, args=self.args) + res = calculate_sparse_minres(self.KV, b, x0=self._consume_warm_start(b, x0), args=self.args) + self._store_iterative_solution(res) + return res elif self.mode == "sparseLU": return calculate_LU_solve(self.LU_factor, b, args=self.args) elif self.mode == "sparseMINRESpre": - B = sparse.linalg.spilu(self.KV, drop_tol=1e-8) - return calculate_sparse_minres(self.KV, b, M=B.L.T @ B.L, x0=x0, args=self.args) + res = calculate_sparse_minres( + self.KV, + b, + M=self._get_sparse_preconditioner(), + x0=self._consume_warm_start(b, x0), + args=self.args, + ) + self._store_iterative_solution(res) + return res elif self.mode == "sparseCGpre": - B = sparse.linalg.spilu(self.KV, drop_tol=1e-8) - return calculate_sparse_conj_grad(self.KV, b, M=B.L.T @ B.L, x0=x0, args=self.args) + res = calculate_sparse_conj_grad( + self.KV, + b, + M=self._get_sparse_preconditioner(), + x0=self._consume_warm_start(b, x0), + args=self.args, + ) + self._store_iterative_solution(res) + return res elif self.mode == "sparseSolve": return calculate_sparse_solve(self.KV, b, args=self.args) elif callable(self.mode[1]): @@ -144,6 +314,11 @@ def __getstate__(self): KV=self.KV, Chol_factor=self.Chol_factor, LU_factor=self.LU_factor, + Preconditioner_factor=None, + Preconditioner_operator=None, + Preconditioner_signature=self.Preconditioner_signature, + Preconditioner_reuse_counter=self.Preconditioner_reuse_counter, + Last_iterative_solution=self.Last_iterative_solution, custom_obj=self.custom_obj, allowed_modes=self.allowed_modes ) @@ -151,3 +326,13 @@ def __getstate__(self): def __setstate__(self, state): self.__dict__.update(state) + if 
"Preconditioner_factor" not in self.__dict__: + self.Preconditioner_factor = None + if "Preconditioner_operator" not in self.__dict__: + self.Preconditioner_operator = None + if "Preconditioner_signature" not in self.__dict__: + self.Preconditioner_signature = None + if "Preconditioner_reuse_counter" not in self.__dict__: + self.Preconditioner_reuse_counter = 0 + if "Last_iterative_solution" not in self.__dict__: + self.Last_iterative_solution = None diff --git a/tests/test_fvgp.py b/tests/test_fvgp.py index 261a59a..f6e2e67 100755 --- a/tests/test_fvgp.py +++ b/tests/test_fvgp.py @@ -5,6 +5,7 @@ import unittest import numpy as np +import pytest from fvgp import fvGP from fvgp import GP import time @@ -20,6 +21,7 @@ from distributed.utils_test import gen_cluster, client, loop, cluster_fixture, loop_in_thread, cleanup from fvgp.kernels import * from fvgp.gp_lin_alg import * +from fvgp.kv_linalg import KVlinalg from scipy import sparse from fvgp import deep_kernel_network @@ -73,6 +75,79 @@ def test_lin_alg(): how_sparse_is(A) +def test_gp2scale_sparse_kernel_matches_dense_kernel(): + rng = np.random.default_rng(7) + x1 = rng.random((24, 3)) + x2 = rng.random((19, 3)) + hps = np.array([1.7, 0.35, 0.45, 0.55]) + + dense_block = wendland_anisotropic_gp2Scale_cpu(x1, x2, hps) + sparse_block = wendland_anisotropic_gp2Scale_cpu_sparse(x1, x2, hps) + + assert sparse.isspmatrix_coo(sparse_block) + assert np.allclose(dense_block, sparse_block.toarray()) + + far_block = wendland_anisotropic_gp2Scale_cpu_sparse(x1, x2 + 10.0, hps) + assert sparse.isspmatrix_coo(far_block) + assert far_block.nnz == 0 + + +def test_gp2scale_linalg_mode_alias_resolution(): + mode, args = resolve_gp2scale_linalg_mode("sparseCGpre_blockjacobi") + assert mode == "sparseCGpre" + assert args["sparse_preconditioner_type"] == "block_jacobi" + + mode, args = resolve_gp2scale_linalg_mode( + "sparseMINRESpre_ic", + {"sparse_preconditioner_ic_shift": 1e-8}, + ) + assert mode == "sparseMINRESpre" + assert 
args["sparse_preconditioner_type"] == "incomplete_cholesky" + assert args["sparse_preconditioner_ic_shift"] == 1e-8 + + with pytest.raises(ValueError): + resolve_gp2scale_linalg_mode( + "sparseCGpre_ilu", + {"sparse_preconditioner_type": "amg"}, + ) + + +def test_kvlinalg_sparse_preconditioner_reuse_and_warm_start(): + rng = np.random.default_rng(11) + A = rng.standard_normal((40, 40)) + A = A @ A.T + np.eye(40) + KV = sparse.csr_matrix(A) + data = argparse.Namespace( + args={ + "sparse_preconditioner_block_size": 8, + "sparse_preconditioner_refresh_interval": 3, + "sparse_krylov_warm_start": True, + } + ) + kv = KVlinalg("cpu", data) + kv.set_KV(KV, "sparseCGpre_blockjacobi") + + rhs = rng.standard_normal(40) + solution = kv.solve(rhs) + operator_before = kv.Preconditioner_operator + + assert kv.mode == "sparseCGpre" + assert data.args["sparse_preconditioner_type"] == "block_jacobi" + assert operator_before is not None + assert kv.Last_iterative_solution is not None + + KV_updated = sparse.csr_matrix(A + 0.01 * np.eye(40)) + kv.update_KV(KV_updated) + + assert kv.Preconditioner_operator is operator_before + assert kv.Preconditioner_reuse_counter == 1 + + updated_solution = kv.solve(rhs) + residual = KV_updated @ np.asarray(updated_solution).reshape(-1) - rhs + assert np.linalg.norm(residual) / np.linalg.norm(rhs) < 1e-4 + assert solution.shape == updated_solution.shape == (40, 1) + + def test_single_task_init_basic(): def kernel(x1,x2,hps): d = get_distance_matrix(x1,x2) @@ -345,6 +420,7 @@ def mkernel(x1,x2,hps): def test_gp2Scale(client): + pytest.importorskip("imate") input_dim = 1 N = 200 x_data = np.random.rand(N,input_dim) @@ -400,6 +476,24 @@ def test_gp2Scale(client): my_gp2S.update_gp_data(x_data,y_data, append = False) my_gp2S.update_gp_data(x_new,y_new, append = True) + my_gp2S = GP( + np.sort(x_data, axis=0), + y_data, + init_hps, + gp2Scale=True, + kernel_function=wendland_anisotropic_gp2Scale_cpu_sparse, + gp2Scale_batch_size=100, + 
gp2Scale_dask_client=client, + gp2Scale_linalg_mode="sparseCGpre_blockjacobi", + args={ + "sparse_preconditioner_block_size": 16, + "sparse_krylov_warm_start": True, + }, + ) + my_gp2S.log_likelihood(hyperparameters=init_hps) + my_gp2S.log_likelihood(hyperparameters=init_hps) + my_gp2S.update_gp_data(x_new, y_new, append=True) + my_gp2S.train(hyperparameter_bounds=hps_bounds, max_iter = 2, init_hyperparameters = init_hps, info = True) def obj_func(hps,args): @@ -470,7 +564,8 @@ def test_pickle(): #TEST0 #tests empty gp pickling my_gpo = GP(x_data, y_data) - pickle.loads(pickle.dumps(my_gpo)) + my_gpo2 = pickle.loads(pickle.dumps(my_gpo)) + assert my_gpo2.marginal_density is my_gpo2.marginal_likelihood #TEST1 #initialize the GPOptimizer @@ -593,5 +688,3 @@ def is_pickle_equal(obj): assert is_pickle_equal(my_gpo.posterior) assert is_pickle_equal(my_gpo.data) assert is_pickle_equal(my_gpo.marginal_likelihood.KVlinalg) - - From 91398f8b25ed01a1b8fb72c5875884f0b106f9f6 Mon Sep 17 00:00:00 2001 From: vt2211 Date: Tue, 14 Apr 2026 16:11:14 -0700 Subject: [PATCH 2/3] logdet ccompute device readded, removed marginal_density aliases, and enforced warm starts only for training --- fvgp/fvgp.py | 3 +- fvgp/ggmp.py | 11 +++--- fvgp/gp.py | 9 ++--- fvgp/gp_lin_alg.py | 3 +- fvgp/gp_marginal_likelihood.py | 8 ++-- fvgp/gp_posterior.py | 2 +- fvgp/kv_linalg.py | 26 +++++++------ tests/test_fvgp.py | 70 ++++++++++++++++++++++++++++++++-- 8 files changed, 100 insertions(+), 32 deletions(-) diff --git a/fvgp/fvgp.py b/fvgp/fvgp.py index 2c88cbf..a556d1d 100755 --- a/fvgp/fvgp.py +++ b/fvgp/fvgp.py @@ -204,13 +204,14 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. 
`V` - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates - - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess + - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess during training-time iterative solves - "sparse_krylov_mode" : `single` or `block`; block mode currently applies to CG solves with multiple RHS - "sparse_block_krylov" : True/False; legacy alias for `sparse_krylov_mode=\"block\"` - "sparse_krylov_block_size" : int; number of RHS columns per block-CG group - "sparse_krylov_maxiter" : int; generic Krylov iteration cap - "sparse_cg_maxiter" : int; CG-specific iteration cap - "sparse_minres_maxiter" : int; MINRES-specific iteration cap + - "random_logdet_lanczos_compute_device" : str; overrides the stochastic logdet device with "cpu" or "gpu" - "Chol_factor_compute_device" : str; default = "cpu"/"gpu" - "update_Chol_factor_compute_device": str; default = "cpu"/"gpu" - "Chol_solve_compute_device" : str; default = "cpu"/"gpu" diff --git a/fvgp/ggmp.py b/fvgp/ggmp.py index 9df5224..9e2df4e 100644 --- a/fvgp/ggmp.py +++ b/fvgp/ggmp.py @@ -496,10 +496,11 @@ def _gp_log_likelihood(self, gp): """ Compute scalar log-likelihood for a configured GP object. - Prefers marginal_density.neg_log_likelihood (most consistent with gradients), - and falls back to marginal_density.log_likelihood / gp.log_likelihood. + Prefers marginal_likelihood.neg_log_likelihood (most consistent with + gradients), and falls back to marginal_likelihood.log_likelihood / + gp.log_likelihood. 
""" - md = getattr(gp, "marginal_density", None) + md = getattr(gp, "marginal_likelihood", None) if md is not None: if hasattr(md, "neg_log_likelihood"): return -self._as_float(md.neg_log_likelihood(hyperparameters=None), reduce="sum") @@ -517,9 +518,9 @@ def _gp_neg_log_likelihood_gradient(self, gp): Uses fvGP's analytical gradient (fixed in v4.7.8), falling back to numerical gradient for gp2Scale/GPU mode where analytical is not supported. """ - md = getattr(gp, "marginal_density", None) + md = getattr(gp, "marginal_likelihood", None) if md is None: - raise AttributeError("GP has no marginal_density object") + raise AttributeError("GP has no marginal_likelihood object") hps = np.asarray(gp.hyperparameters, dtype=float) diff --git a/fvgp/gp.py b/fvgp/gp.py index 8a22c51..a7b557f 100755 --- a/fvgp/gp.py +++ b/fvgp/gp.py @@ -198,13 +198,14 @@ class GP: - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. 
`V` - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates - - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess + - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess during training-time iterative solves - "sparse_krylov_mode" : `single` or `block`; block mode currently applies to CG solves with multiple RHS - "sparse_block_krylov" : True/False; legacy alias for `sparse_krylov_mode=\"block\"` - "sparse_krylov_block_size" : int; number of RHS columns per block-CG group - "sparse_krylov_maxiter" : int; generic Krylov iteration cap - "sparse_cg_maxiter" : int; CG-specific iteration cap - "sparse_minres_maxiter" : int; MINRES-specific iteration cap + - "random_logdet_lanczos_compute_device" : str; overrides the stochastic logdet device with "cpu" or "gpu" - "Chol_factor_compute_device" : str; default = "cpu"/"gpu" - "update_Chol_factor_compute_device": str; default = "cpu"/"gpu" - "Chol_solve_compute_device" : str; default = "cpu"/"gpu" @@ -342,7 +343,6 @@ def __init__( self.trainer, gp2Scale_linalg_mode=gp2Scale_linalg_mode, ) - self.marginal_density = self.marginal_likelihood ########################################## #######prepare posterior evaluations###### @@ -1480,7 +1480,6 @@ def __getstate__(self): prior=self.prior, likelihood=self.likelihood, marginal_likelihood=self.marginal_likelihood, - marginal_density=self.marginal_likelihood, trainer=self.trainer, posterior=self.posterior ) @@ -1490,8 +1489,8 @@ def __setstate__(self, state): self.__dict__.update(state) if "marginal_likelihood" not in self.__dict__ and "marginal_density" in self.__dict__: self.marginal_likelihood = self.marginal_density - if "marginal_density" not in self.__dict__ and "marginal_likelihood" in self.__dict__: - self.marginal_density = self.marginal_likelihood + if "marginal_density" in self.__dict__: + del 
self.__dict__["marginal_density"] #################################################################################### diff --git a/fvgp/gp_lin_alg.py b/fvgp/gp_lin_alg.py index 60b31f5..5496e7e 100755 --- a/fvgp/gp_lin_alg.py +++ b/fvgp/gp_lin_alg.py @@ -759,7 +759,8 @@ def calculate_random_logdet(KV, compute_device, args=None): logger.debug("calculate_random_logdet") from imate import logdet as imate_logdet st = time.time() - gpu = compute_device == "gpu" and _imate_gpu_enabled(args) + logdet_compute_device = str(args.get("random_logdet_lanczos_compute_device", compute_device)).lower() + gpu = logdet_compute_device == "gpu" and _imate_gpu_enabled(args) lanczos_degree = 20 error_rtol = 0.01 diff --git a/fvgp/gp_marginal_likelihood.py b/fvgp/gp_marginal_likelihood.py index 2222ac3..a96a256 100755 --- a/fvgp/gp_marginal_likelihood.py +++ b/fvgp/gp_marginal_likelihood.py @@ -121,7 +121,7 @@ def _update_state_KVinvY(self, K, V, m): y_mean = self.y_data - m[:, None] KV = self.addKV(K, V) self.KVlinalg.update_KV(KV) - KVinvY = self.KVlinalg.solve(y_mean, x0=self.KVinvY).reshape(y_mean.shape) + KVinvY = self.KVlinalg.solve(y_mean, x0=self.KVinvY, training=False).reshape(y_mean.shape) return KVinvY def _set_state_KVinvY(self, K, V, m, mode): @@ -133,7 +133,7 @@ def _set_state_KVinvY(self, K, V, m, mode): self.KVlinalg.set_KV(KV, mode) logger.debug("KVlinalg obj set") logger.debug("Solve in progress") - KVinvY = self.KVlinalg.solve(y_mean).reshape(y_mean.shape) + KVinvY = self.KVlinalg.solve(y_mean, training=False).reshape(y_mean.shape) return KVinvY def _build_sparse_preconditioner_or_none(self, KV): @@ -166,14 +166,14 @@ def _iterative_initial_guess(self, target_shape): ################################################################## ################################################################## ################################################################## - def compute_new_KVinvY(self, KV, m): + def compute_new_KVinvY(self, KV, m, training=False): """ 
Recompute KVinvY for new hyperparameters (e.g. during training, for instance) This is only used by some posterior functions and in the gradient of the log likelihood function. This does not change the KV obj """ y_mean = self.y_data - m[:, None] - x0 = self._iterative_initial_guess(y_mean.shape) + x0 = self._iterative_initial_guess(y_mean.shape) if training else None if self.gp2Scale: # pragma: no cover mode: Any = self._set_gp2Scale_mode(KV) if mode == "sparseLU": diff --git a/fvgp/gp_posterior.py b/fvgp/gp_posterior.py index 0099879..84ab24c 100755 --- a/fvgp/gp_posterior.py +++ b/fvgp/gp_posterior.py @@ -28,7 +28,7 @@ def d_kernel_dx(self, x_pred, x_data, direction, hyperparameters): return self.prior.d_kernel_dx(x_pred, x_data, direction, hyperparameters) def KVsolve(self, v): - return self.marginal_likelihood.KVlinalg.solve(v) + return self.marginal_likelihood.KVlinalg.solve(v, training=False) def compute_new_KVinvY(self, KV, m): return self.marginal_likelihood.compute_new_KVinvY(KV, m) diff --git a/fvgp/kv_linalg.py b/fvgp/kv_linalg.py index 52c1fc8..2ac30c1 100755 --- a/fvgp/kv_linalg.py +++ b/fvgp/kv_linalg.py @@ -94,13 +94,15 @@ def _get_sparse_preconditioner(self): def _iterative_modes(self): return {"sparseMINRES", "sparseCG", "sparseMINRESpre", "sparseCGpre"} - def _store_iterative_solution(self, solution): - if self._warm_start_enabled(): + def _store_iterative_solution(self, solution, training=False): + if training and self._warm_start_enabled(): self.Last_iterative_solution = np.array(solution, copy=True) - def _consume_warm_start(self, b, x0): + def _consume_warm_start(self, b, x0, training=False): if x0 is not None: return x0 + if not training: + return None if not self._warm_start_enabled(): return None if self.Last_iterative_solution is None: @@ -247,7 +249,7 @@ def update_KV(self, KV): if self.mode not in self._iterative_modes(): self.Last_iterative_solution = None - def solve(self, b, x0=None): + def solve(self, b, x0=None, training=False): if 
self.mode == "Chol": return calculate_Chol_solve(self.Chol_factor, b, compute_device=self.compute_device, args=self.args) elif self.mode == "CholInv": @@ -256,12 +258,12 @@ def solve(self, b, x0=None): #return matmul(self.KVinv, b, compute_device=self.compute_device) #is this really faster? return self.KVinv @ b elif self.mode == "sparseCG": - res = calculate_sparse_conj_grad(self.KV, b, x0=self._consume_warm_start(b, x0), args=self.args) - self._store_iterative_solution(res) + res = calculate_sparse_conj_grad(self.KV, b, x0=self._consume_warm_start(b, x0, training=training), args=self.args) + self._store_iterative_solution(res, training=training) return res elif self.mode == "sparseMINRES": - res = calculate_sparse_minres(self.KV, b, x0=self._consume_warm_start(b, x0), args=self.args) - self._store_iterative_solution(res) + res = calculate_sparse_minres(self.KV, b, x0=self._consume_warm_start(b, x0, training=training), args=self.args) + self._store_iterative_solution(res, training=training) return res elif self.mode == "sparseLU": return calculate_LU_solve(self.LU_factor, b, args=self.args) @@ -270,20 +272,20 @@ def solve(self, b, x0=None): self.KV, b, M=self._get_sparse_preconditioner(), - x0=self._consume_warm_start(b, x0), + x0=self._consume_warm_start(b, x0, training=training), args=self.args, ) - self._store_iterative_solution(res) + self._store_iterative_solution(res, training=training) return res elif self.mode == "sparseCGpre": res = calculate_sparse_conj_grad( self.KV, b, M=self._get_sparse_preconditioner(), - x0=self._consume_warm_start(b, x0), + x0=self._consume_warm_start(b, x0, training=training), args=self.args, ) - self._store_iterative_solution(res) + self._store_iterative_solution(res, training=training) return res elif self.mode == "sparseSolve": return calculate_sparse_solve(self.KV, b, args=self.args) diff --git a/tests/test_fvgp.py b/tests/test_fvgp.py index f6e2e67..cd6bf7d 100755 --- a/tests/test_fvgp.py +++ b/tests/test_fvgp.py @@ -6,6 +6,8 
@@ import unittest import numpy as np import pytest +import sys +import types from fvgp import fvGP from fvgp import GP import time @@ -20,6 +22,8 @@ from dask.distributed import performance_report from distributed.utils_test import gen_cluster, client, loop, cluster_fixture, loop_in_thread, cleanup from fvgp.kernels import * +import fvgp.gp_lin_alg as gp_lin_alg_module +import fvgp.kv_linalg as kv_linalg_module from fvgp.gp_lin_alg import * from fvgp.kv_linalg import KVlinalg from scipy import sparse @@ -112,6 +116,35 @@ def test_gp2scale_linalg_mode_alias_resolution(): ) +def test_random_logdet_compute_device_override(monkeypatch): + captured_gpu_flags = [] + + fake_imate = types.ModuleType("imate") + + def fake_logdet(*args, **kwargs): + captured_gpu_flags.append(kwargs["gpu"]) + return 0.0, {} + + fake_imate.logdet = fake_logdet + monkeypatch.setitem(sys.modules, "imate", fake_imate) + monkeypatch.setattr(gp_lin_alg_module, "_imate_gpu_enabled", lambda args=None: True) + + KV = sparse.eye(4, format="csr") + + calculate_random_logdet( + KV, + "cpu", + args={"random_logdet_lanczos_compute_device": "gpu"}, + ) + calculate_random_logdet( + KV, + "gpu", + args={"random_logdet_lanczos_compute_device": "cpu"}, + ) + + assert captured_gpu_flags == [True, False] + + def test_kvlinalg_sparse_preconditioner_reuse_and_warm_start(): rng = np.random.default_rng(11) A = rng.standard_normal((40, 40)) @@ -128,7 +161,7 @@ def test_kvlinalg_sparse_preconditioner_reuse_and_warm_start(): kv.set_KV(KV, "sparseCGpre_blockjacobi") rhs = rng.standard_normal(40) - solution = kv.solve(rhs) + solution = kv.solve(rhs, training=True) operator_before = kv.Preconditioner_operator assert kv.mode == "sparseCGpre" @@ -142,12 +175,35 @@ def test_kvlinalg_sparse_preconditioner_reuse_and_warm_start(): assert kv.Preconditioner_operator is operator_before assert kv.Preconditioner_reuse_counter == 1 - updated_solution = kv.solve(rhs) + updated_solution = kv.solve(rhs, training=True) residual = 
KV_updated @ np.asarray(updated_solution).reshape(-1) - rhs assert np.linalg.norm(residual) / np.linalg.norm(rhs) < 1e-4 assert solution.shape == updated_solution.shape == (40, 1) +def test_kvlinalg_warm_start_is_training_only(monkeypatch): + captured_x0 = [] + + def fake_cg(KV, vec, x0=None, M=None, args=None): + captured_x0.append(None if x0 is None else np.array(x0, copy=True)) + return np.ones((KV.shape[0], 1)) + + monkeypatch.setattr(kv_linalg_module, "calculate_sparse_conj_grad", fake_cg) + + data = argparse.Namespace(args={"sparse_krylov_warm_start": True}) + kv = KVlinalg("cpu", data) + kv.mode = "sparseCG" + kv.KV = sparse.eye(4, format="csr") + kv.Last_iterative_solution = 2.0 * np.ones((4, 1)) + + rhs = np.arange(4.0) + kv.solve(rhs, training=False) + kv.solve(rhs, training=True) + + assert captured_x0[0] is None + assert np.allclose(captured_x0[1], 2.0 * np.ones((4, 1))) + + def test_single_task_init_basic(): def kernel(x1,x2,hps): d = get_distance_matrix(x1,x2) @@ -565,7 +621,15 @@ def test_pickle(): #tests empty gp pickling my_gpo = GP(x_data, y_data) my_gpo2 = pickle.loads(pickle.dumps(my_gpo)) - assert my_gpo2.marginal_density is my_gpo2.marginal_likelihood + assert hasattr(my_gpo2, "marginal_likelihood") + assert not hasattr(my_gpo2, "marginal_density") + + legacy_state = my_gpo.__getstate__() + legacy_state["marginal_density"] = legacy_state.pop("marginal_likelihood") + my_gpo3 = object.__new__(GP) + my_gpo3.__setstate__(legacy_state) + assert hasattr(my_gpo3, "marginal_likelihood") + assert not hasattr(my_gpo3, "marginal_density") #TEST1 #initialize the GPOptimizer From baf457e2769f69e97e4ca9a2ab4de93b44852d99 Mon Sep 17 00:00:00 2001 From: Vardaan Tekriwal Date: Tue, 21 Apr 2026 16:01:30 -0700 Subject: [PATCH 3/3] New ichol implementation, error handling, and guidance from benchmarks --- fvgp/fvgp.py | 31 +++- fvgp/gp.py | 32 +++- fvgp/gp_lin_alg.py | 316 +++++++++++++++++++++++++++++---- fvgp/gp_marginal_likelihood.py | 3 +- fvgp/kv_linalg.py | 
11 +- tests/test_fvgp.py | 30 +++- 6 files changed, 376 insertions(+), 47 deletions(-) diff --git a/fvgp/fvgp.py b/fvgp/fvgp.py index a556d1d..91ed060 100755 --- a/fvgp/fvgp.py +++ b/fvgp/fvgp.py @@ -153,9 +153,13 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. One of `Chol`, `sparseLU`, `sparseCG`, `sparseMINRES`, `sparseSolve`, `sparseCGpre` (cached sparse preconditioner; geometry selected via `args["sparse_preconditioner_type"]`), or `sparseMINRESpre`. Supported preconditioner geometries are `ilu`, - `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, and `amg`. + compiled incomplete Cholesky (`ic`, `ichol`, or `incomplete_cholesky`) and + zero-fill `ichol0`; these use the optional `ilupp` backend. `native_ic` keeps + the legacy Python IC(0) implementation for debugging/comparison. Other options are `block_jacobi`, + `additive_schwarz`, and `amg`. Convenience aliases such as `sparseCGpre_ilu`, `sparseCGpre_ic`, - `sparseCGpre_blockjacobi`, `sparseCGpre_schwarz`, `sparseCGpre_amg`, + `sparseCGpre_ichol0`, `sparseCGpre_native_ic`, `sparseCGpre_blockjacobi`, + `sparseCGpre_schwarz`, `sparseCGpre_amg`, and the corresponding `sparseMINRESpre_*` forms are also accepted. The default is None which amounts to an automatic determination of the mode. For advanced customization options @@ -196,12 +200,14 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. 
- "sparse_minres_tol" : float - "sparse_cg_tol" : float - "cg_minres_tol" : float; legacy CG tolerance alias still accepted - - "sparse_preconditioner_type" : one of `ilu`, `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used + - "sparse_preconditioner_type" : one of `ilu`, `ic`, `ichol`, `incomplete_cholesky`, `ichol0`, `native_ic`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used - "sparse_preconditioner_drop_tol" : float; ILU only - "sparse_preconditioner_fill_factor" : float; ILU only - "sparse_preconditioner_block_size" : int; block Jacobi / additive Schwarz - "sparse_preconditioner_schwarz_overlap" : int; additive Schwarz - - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift + - "sparse_preconditioner_ic_shift" : float; legacy Python IC(0) diagonal shift only + - "sparse_preconditioner_ichol_fill_in" : int; thresholded IC (`ic`, `ichol`, `incomplete_cholesky`) only + - "sparse_preconditioner_ichol_threshold" : float; thresholded IC (`ic`, `ichol`, `incomplete_cholesky`) only - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. `V` - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess during training-time iterative solves @@ -219,6 +225,23 @@ class provides all the methods described for the GP (:py:class:`fvgp.GP`) class. - "GPU_engine" : str; default = "torch"/"cupy" - "GPU_device" : optional torch device string, e.g. "cuda:0" or "mps" + Practical sparse-solver guidance: + + - Keep compact-support covariance matrices genuinely sparse before using global + factor-based preconditioners. If the kernel support is too broad, ILU/IC + failures are usually memory/fill failures, not proof that Krylov solvers are bad. 
+ - Prefer `sparseCGpre_*` as the primary path for covariance systems, which are + symmetric positive definite in the usual case. `MINRES` can be useful as a + comparison, but it may need a stricter `sparse_minres_tol` to satisfy a raw + relative residual check. + - Sweep preconditioner parameters at the target scale. For ILU, `drop_tol` and + `fill_factor` control a memory/solve-time tradeoff; a slightly more expressive + factor can cost only marginally more to build but reduce solve time substantially. + - For repeated nearby K+V updates, enable `sparse_krylov_warm_start` and reuse + preconditioners with `sparse_preconditioner_refresh_interval`. The best refresh + interval is problem-dependent because preconditioner build cost can be comparable + to an unconditioned solve. + All other keys will be stored and are available as part of the object instance. Attributes diff --git a/fvgp/gp.py b/fvgp/gp.py index a7b557f..05b1a24 100755 --- a/fvgp/gp.py +++ b/fvgp/gp.py @@ -147,9 +147,13 @@ class GP: One of `Chol`, `sparseLU`, `sparseCG`, `sparseMINRES`, `sparseSolve`, `sparseCGpre` (cached sparse preconditioner; geometry selected via `args["sparse_preconditioner_type"]`), or `sparseMINRESpre`. Supported preconditioner geometries are `ilu`, - `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, and `amg`. + compiled incomplete Cholesky (`ic`, `ichol`, or `incomplete_cholesky`) and + zero-fill `ichol0`; these use the optional `ilupp` backend. `native_ic` keeps + the legacy Python IC(0) implementation for debugging/comparison. Other options are `block_jacobi`, + `additive_schwarz`, and `amg`. Convenience aliases such as `sparseCGpre_ilu`, `sparseCGpre_ic`, - `sparseCGpre_blockjacobi`, `sparseCGpre_schwarz`, `sparseCGpre_amg`, + `sparseCGpre_ichol0`, `sparseCGpre_native_ic`, `sparseCGpre_blockjacobi`, + `sparseCGpre_schwarz`, `sparseCGpre_amg`, and the corresponding `sparseMINRESpre_*` forms are also accepted. 
The default is None which amounts to an automatic determination of the mode. For advanced customization options @@ -190,12 +194,14 @@ class GP: - "sparse_minres_tol" : float - "sparse_cg_tol" : float - "cg_minres_tol" : float; legacy CG tolerance alias still accepted - - "sparse_preconditioner_type" : one of `ilu`, `incomplete_cholesky`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used + - "sparse_preconditioner_type" : one of `ilu`, `ic`, `ichol`, `incomplete_cholesky`, `ichol0`, `native_ic`, `block_jacobi`, `additive_schwarz`, `amg`; optional if a `sparseCGpre_*` or `sparseMINRESpre_*` alias mode is used - "sparse_preconditioner_drop_tol" : float; ILU only - "sparse_preconditioner_fill_factor" : float; ILU only - "sparse_preconditioner_block_size" : int; block Jacobi / additive Schwarz - "sparse_preconditioner_schwarz_overlap" : int; additive Schwarz - - "sparse_preconditioner_ic_shift" : float; incomplete Cholesky diagonal shift + - "sparse_preconditioner_ic_shift" : float; legacy Python IC(0) diagonal shift only + - "sparse_preconditioner_ichol_fill_in" : int; thresholded IC (`ic`, `ichol`, `incomplete_cholesky`) only + - "sparse_preconditioner_ichol_threshold" : float; thresholded IC (`ic`, `ichol`, `incomplete_cholesky`) only - "sparse_preconditioner_amg_cycle" : str; AMG cycle, e.g. `V` - "sparse_preconditioner_refresh_interval" : int; rebuild the cached preconditioner every k matrix updates - "sparse_krylov_warm_start" : True/False; reuse the previous iterative solution as the next initial guess during training-time iterative solves @@ -213,6 +219,24 @@ class GP: - "GPU_engine" : str; default = "torch"/"cupy" - "GPU_device" : optional torch device string, e.g. "cuda:0" or "mps" + Practical sparse-solver guidance: + + - Keep compact-support covariance matrices genuinely sparse before using global + factor-based preconditioners. 
If the kernel support is too broad, ILU/IC + failures are usually memory/fill failures, not proof that Krylov solvers are bad. + - Prefer `sparseCGpre_*` as the primary path for covariance systems, which are + symmetric positive definite in the usual case. `MINRES` can be useful as a + comparison, but it may need a stricter `sparse_minres_tol` to satisfy a raw + relative residual check. + - Sweep preconditioner parameters at the target scale. For ILU, `drop_tol` and + `fill_factor` control a memory/solve-time tradeoff; a slightly more expressive + factor can cost only marginally more to build but reduce solve time substantially. + - For repeated nearby K+V updates, enable `sparse_krylov_warm_start` and reuse + preconditioners with `sparse_preconditioner_refresh_interval`. The best refresh + interval is problem-dependent because preconditioner build cost can be comparable + to an unconditioned solve. + + All other keys will be stored and are available as part of the object instance and in kernel, mean, and noise functions. 
diff --git a/fvgp/gp_lin_alg.py b/fvgp/gp_lin_alg.py index 5496e7e..ce5de6d 100755 --- a/fvgp/gp_lin_alg.py +++ b/fvgp/gp_lin_alg.py @@ -5,7 +5,7 @@ from collections import deque from loguru import logger from scipy.sparse.linalg import splu -from scipy.sparse.linalg import minres, cg, spsolve +from scipy.sparse.linalg import minres, cg, spsolve, spsolve_triangular from scipy.sparse import identity from scipy.sparse.linalg import onenormest from scipy.linalg import cho_factor, cho_solve, solve_triangular @@ -21,9 +21,16 @@ def normalize_sparse_preconditioner_type(preconditioner_type): preconditioner_type = str(preconditioner_type).lower() aliases = { "ilu": "ilu", - "ic": "incomplete_cholesky", - "ichol": "incomplete_cholesky", - "incomplete_cholesky": "incomplete_cholesky", + "ic": "ic", + "ichol": "ic", + "incomplete_cholesky": "ic", + "ichol0": "ichol0", + "ilupp_ichol0": "ichol0", + "ilupp_icholt": "ic", + "native_ic": "native_incomplete_cholesky", + "legacy_ic": "native_incomplete_cholesky", + "native_incomplete_cholesky": "native_incomplete_cholesky", + "legacy_incomplete_cholesky": "native_incomplete_cholesky", "block_jacobi": "block_jacobi", "blockjacobi": "block_jacobi", "schwarz": "additive_schwarz", @@ -34,12 +41,57 @@ def normalize_sparse_preconditioner_type(preconditioner_type): raise ValueError( "Unknown sparse preconditioner type " f"{preconditioner_type!r}. Expected one of " - "{'ilu', 'ic', 'incomplete_cholesky', 'block_jacobi', 'blockjacobi', " + "{'ilu', 'ic', 'ichol', 'ichol0', 'incomplete_cholesky', " + "'native_ic', 'legacy_ic', 'block_jacobi', 'blockjacobi', " "'schwarz', 'additive_schwarz', 'amg'}." ) return aliases[preconditioner_type] +def _raise_missing_ilupp(preconditioner_type, exc): + raise ImportError( + "The sparse incomplete-Cholesky preconditioners (`ic`, `ichol`, " + "`incomplete_cholesky`, and `ichol0`) require the optional `ilupp` " + "package. 
Install it in the Python environment running fvGP with:\n\n" + " pip install ilupp\n\n" + f"Requested sparse preconditioner resolved to backend={preconditioner_type!r}." + ) from exc + + +def sparse_preconditioner_failure_guidance(args=None): + args = _normalize_args(args) + preconditioner_type = args.get("sparse_preconditioner_type") + try: + preconditioner_type = normalize_sparse_preconditioner_type(preconditioner_type) + except Exception: + preconditioner_type = str(preconditioner_type) + + guidance = [ + "Practical guidance: preconditioner failures often mean the covariance graph is too dense or the factor is too expressive for available memory.", + "First check the compact-support kernel length scale/support radius and keep matrix density low before tuning solver parameters.", + "Run a small preconditioner build sweep before a full solve run; a buildable preconditioner can still be slow to apply.", + ] + if preconditioner_type == "ilu": + guidance.append( + "For ILU, sweep `sparse_preconditioner_drop_tol` and `sparse_preconditioner_fill_factor`; looser drop tolerances and smaller fill factors are more likely to fit, while stronger factors may reduce solve time." + ) + elif preconditioner_type in {"ic", "ichol0"}: + guidance.append( + "For IC/IChol, install the optional backend with `pip install ilupp`; if thresholded IC does not fit, try softer fill/threshold settings or `ichol0`, then verify actual solve time." + ) + elif preconditioner_type in {"block_jacobi", "additive_schwarz"}: + guidance.append( + "For block/local preconditioners, sweep block size and overlap; these may fit easily but can be weaker than ILU on large covariance systems." + ) + guidance.append( + "For repeated nearby K+V updates, `sparse_krylov_warm_start=True` and a nontrivial `sparse_preconditioner_refresh_interval` can avoid rebuilding every solve." + ) + guidance.append( + "If MINRES returns with a poor raw residual, try a stricter `sparse_minres_tol` before judging the method." 
+ ) + return " ".join(guidance) + + def resolve_gp2scale_linalg_mode(mode, args=None): args = dict(_normalize_args(args)) if not isinstance(mode, str): @@ -500,38 +552,41 @@ def _build_ic0_factor(KV, args=None): diag[i] = np.sqrt(diag_sq) rows.append(computed_entries) - columns = [[] for _ in range(n)] + indptr = [0] + indices = [] + data = [] for i, row in enumerate(rows): for j, value in row.items(): - columns[j].append((i, value)) + indices.append(int(j)) + data.append(float(value)) + indices.append(i) + data.append(float(diag[i])) + indptr.append(len(indices)) + + L = sparse.csr_matrix( + ( + np.asarray(data, dtype=np.float64), + np.asarray(indices, dtype=np.int32), + np.asarray(indptr, dtype=np.int32), + ), + shape=A.shape, + ) + LT = L.transpose().tocsr() def solve(vector): vector = np.asarray(vector, dtype=np.float64) - y = np.zeros_like(vector) - for i, row in enumerate(rows): - total = vector[i] - for j, value in row.items(): - total -= value * y[j] - y[i] = total / diag[i] - - z = np.zeros_like(y) - for i in range(n - 1, -1, -1): - total = y[i] - for row_index, value in columns[i]: - if row_index > i: - total -= value * z[row_index] - z[i] = total / diag[i] - return z + y = spsolve_triangular(L, vector, lower=True) + return spsolve_triangular(LT, y, lower=False) factor = { "type": "incomplete_cholesky", "diag": diag, "rows": rows, - "columns": columns, - "solve": solve, + "L": L, + "LT": LT, "shift": shift, } - operator = sparse.linalg.LinearOperator(A.shape, matvec=solve, rmatvec=solve, dtype=A.dtype) + operator = _build_dtype_adapted_operator(A.shape, solve, factor_dtype=np.float64) return factor, operator except Exception as exc: last_exc = exc @@ -540,6 +595,30 @@ def solve(vector): raise np.linalg.LinAlgError(f"IC(0) preconditioner construction failed after shifted retries: {last_exc}") +def _build_dtype_adapted_operator(shape, solve, factor_dtype, operator_dtype=np.float64): + factor_dtype = np.dtype(factor_dtype) + operator_dtype = 
np.dtype(operator_dtype) + + def _apply(vec): + arr = np.asarray(vec, dtype=operator_dtype) + if arr.ndim == 1: + solved = solve(np.asarray(arr, dtype=factor_dtype)) + return np.asarray(solved, dtype=operator_dtype) + columns = [ + np.asarray(solve(np.asarray(arr[:, i], dtype=factor_dtype)), dtype=operator_dtype) + for i in range(arr.shape[1]) + ] + return np.column_stack(columns) + + return sparse.linalg.LinearOperator( + shape, + matvec=_apply, + rmatvec=_apply, + matmat=_apply, + dtype=operator_dtype, + ) + + def _build_ilu_preconditioner(KV, args=None): args = _normalize_args(args) A = KV.tocsc() @@ -555,12 +634,38 @@ def _build_ilu_preconditioner(KV, args=None): spilu_kwargs["diag_pivot_thresh"] = args["sparse_preconditioner_diag_pivot_thresh"] factor = sparse.linalg.spilu(A, **spilu_kwargs) - operator = sparse.linalg.LinearOperator( - A.shape, - matvec=factor.solve, - rmatvec=factor.solve, - dtype=A.dtype, - ) + # SciPy Krylov paths in this module normalize RHS vectors to float64, while + # `spilu` factors on the matrix dtype (often float32 for gp2Scale artifacts). + # Adapt the operator boundary so the preconditioner can still be applied + # without forcing a full float64 refactorization of the sparse matrix. 
+ operator = _build_dtype_adapted_operator(A.shape, factor.solve, factor_dtype=A.dtype) + return factor, operator + + +def _build_ilupp_ichol0_preconditioner(KV, args=None): + try: + import ilupp + except ImportError as exc: + _raise_missing_ilupp("ichol0", exc) + + A = _as_symmetric_csr(KV).astype(np.float64) + factor = ilupp.IChol0Preconditioner(A) + operator = _build_dtype_adapted_operator(A.shape, factor.dot, factor_dtype=np.float64) + return factor, operator + + +def _build_ilupp_icholt_preconditioner(KV, args=None): + args = _normalize_args(args) + try: + import ilupp + except ImportError as exc: + _raise_missing_ilupp("ic", exc) + + A = _as_symmetric_csr(KV).astype(np.float64) + add_fill_in = int(args.get("sparse_preconditioner_ichol_fill_in", 16)) + threshold = float(args.get("sparse_preconditioner_ichol_threshold", 1e-4)) + factor = ilupp.ICholTPreconditioner(A, add_fill_in=add_fill_in, threshold=threshold) + operator = _build_dtype_adapted_operator(A.shape, factor.dot, factor_dtype=np.float64) return factor, operator @@ -601,7 +706,9 @@ def calculate_sparse_preconditioner(KV, args=None): builders = { "ilu": _build_ilu_preconditioner, - "incomplete_cholesky": _build_ic0_factor, + "native_incomplete_cholesky": _build_ic0_factor, + "ichol0": _build_ilupp_ichol0_preconditioner, + "ic": _build_ilupp_icholt_preconditioner, "block_jacobi": _build_block_jacobi_preconditioner, "additive_schwarz": _build_additive_schwarz_preconditioner, "amg": _build_amg_preconditioner, @@ -649,6 +756,8 @@ def _resolve_krylov_maxiter(args=None, solver_key=None): def _normalize_rhs(vec): + if sparse.issparse(vec): + vec = vec.toarray() vec = np.asarray(vec, dtype=np.float64) if np.ndim(vec) == 1: vec = vec.reshape(len(vec), 1) @@ -700,14 +809,75 @@ def _apply_preconditioner(M, residual): return _apply_linear_operator(M, residual) -def _block_conjugate_gradient(KV, vec, cg_tol, x0=None, M=None, maxiter=None): +def _krylov_progress_interval(args=None): + args = _normalize_args(args) + 
value = args.get("sparse_progress_interval", 0) + return max(int(value or 0), 0) + + +def _krylov_progress_label(args, solver_name, rhs_index=None): + args = _normalize_args(args) + base = str(args.get("sparse_progress_label", solver_name)).strip() or solver_name + if rhs_index is None: + return base + return f"{base} rhs={rhs_index}" + + +def _emit_krylov_progress(label, iteration, progress_interval): + if progress_interval <= 0: + return + if iteration == 1 or iteration % progress_interval == 0: + print(f"{label}: iteration {iteration}", flush=True) + + +def _emit_krylov_completion(label, solver_name, iteration_count, exit_code, elapsed_seconds, progress_interval): + if progress_interval <= 0: + return + print( + f"{label}: {solver_name} finished after {iteration_count} iterations " + f"(exit_code={exit_code}, seconds={elapsed_seconds:.3f})", + flush=True, + ) + + +def _build_krylov_callback(args, solver_name, rhs_index=None): + progress_interval = _krylov_progress_interval(args) + if progress_interval <= 0: + return None, None, progress_interval + + label = _krylov_progress_label(args, solver_name, rhs_index=rhs_index) + counter = {"count": 0} + + def callback(_xk): + counter["count"] += 1 + _emit_krylov_progress(label, counter["count"], progress_interval) + + return callback, counter, progress_interval + + +def _block_conjugate_gradient(KV, vec, cg_tol, x0=None, M=None, maxiter=None, args=None, rhs_offset=0): vec = _normalize_rhs(vec) x0 = _normalize_initial_guess(x0, vec.shape) n, rhs_count = vec.shape + progress_interval = _krylov_progress_interval(args) + progress_label = _krylov_progress_label(args, "BlockCG", rhs_index=rhs_offset if rhs_count == 1 else None) + iteration_count = 0 + block_start = time.time() if rhs_count == 1: rhs = vec[:, 0] initial_guess = None if x0 is None else x0[:, 0] - solution, exit_code = cg(KV, rhs, M=M, rtol=cg_tol, x0=initial_guess, maxiter=maxiter) + callback, counter, progress_interval = _build_krylov_callback(args, "BlockCG", 
rhs_index=rhs_offset) + start = time.time() + solution, exit_code = cg(KV, rhs, M=M, rtol=cg_tol, x0=initial_guess, maxiter=maxiter, callback=callback) + iteration_count = 0 if counter is None else counter["count"] + _emit_krylov_completion( + _krylov_progress_label(args, "BlockCG", rhs_index=rhs_offset), + "BlockCG", + iteration_count, + exit_code, + time.time() - start, + progress_interval, + ) return solution.reshape(n, 1), exit_code X = np.zeros((n, rhs_count), dtype=np.float64) if x0 is None else x0.copy() @@ -726,6 +896,8 @@ def _block_conjugate_gradient(KV, vec, cg_tol, x0=None, M=None, maxiter=None): last_exit_code = 1 for _ in range(max(maxiter, 1)): + iteration_count += 1 + _emit_krylov_progress(progress_label, iteration_count, progress_interval) AP = _apply_linear_operator(KV, P) H = P.T @ AP try: @@ -750,6 +922,14 @@ def _block_conjugate_gradient(KV, vec, cg_tol, x0=None, M=None, maxiter=None): P = Z + P @ beta G = G_new + _emit_krylov_completion( + progress_label, + "BlockCG", + iteration_count, + last_exit_code, + time.time() - block_start, + progress_interval, + ) return X, last_exit_code @@ -797,7 +977,30 @@ def calculate_sparse_minres(KV, vec, x0=None, M=None, args=None): res = np.zeros(vec.shape) for i in range(vec.shape[1]): initial_guess = _column_initial_guess(x0, i) - res[:, i], exit_code = minres(KV, vec[:, i], M=M, rtol=minres_tol, x0=initial_guess, maxiter=maxiter) + callback, counter, progress_interval = _build_krylov_callback( + args, + "MINRES", + rhs_index=i if vec.shape[1] > 1 else None, + ) + column_start = time.time() + res[:, i], exit_code = minres( + KV, + vec[:, i], + M=M, + rtol=minres_tol, + x0=initial_guess, + maxiter=maxiter, + callback=callback, + ) + iteration_count = 0 if counter is None else counter["count"] + _emit_krylov_completion( + _krylov_progress_label(args, "MINRES", rhs_index=i if vec.shape[1] > 1 else None), + "MINRES", + iteration_count, + exit_code, + time.time() - column_start, + progress_interval, + ) if 
exit_code != 0: warnings.warn(f"MINRES not successful (exit_code={exit_code})") logger.debug("MINRES compute time: {} seconds.", time.time() - st) assert np.ndim(res) == 2 @@ -841,6 +1044,8 @@ def calculate_sparse_conj_grad(KV, vec, x0=None, M=None, args=None): x0=block_x0, M=M, maxiter=maxiter, + args=args, + rhs_offset=start, ) except Exception as exc: warnings.warn( @@ -851,6 +1056,12 @@ def calculate_sparse_conj_grad(KV, vec, x0=None, M=None, args=None): exit_code = 4 for block_index, rhs_index in enumerate(range(start, stop)): initial_guess = _column_initial_guess(x0, rhs_index) + callback, counter, progress_interval = _build_krylov_callback( + args, + "CG", + rhs_index=rhs_index if vec.shape[1] > 1 else None, + ) + column_start = time.time() block_solution[:, block_index], _ = cg( KV, vec[:, rhs_index], @@ -858,6 +1069,16 @@ def calculate_sparse_conj_grad(KV, vec, x0=None, M=None, args=None): rtol=cg_tol, x0=initial_guess, maxiter=maxiter, + callback=callback, + ) + iteration_count = 0 if counter is None else counter["count"] + _emit_krylov_completion( + _krylov_progress_label(args, "CG", rhs_index=rhs_index if vec.shape[1] > 1 else None), + "CG", + iteration_count, + 0, + time.time() - column_start, + progress_interval, ) res[:, start:stop] = block_solution exit_codes.append(exit_code) @@ -870,7 +1091,30 @@ def calculate_sparse_conj_grad(KV, vec, x0=None, M=None, args=None): res = np.zeros(vec.shape) for i in range(vec.shape[1]): initial_guess = _column_initial_guess(x0, i) - res[:, i], exit_code = cg(KV, vec[:, i], M=M, rtol=cg_tol, x0=initial_guess, maxiter=maxiter) + callback, counter, progress_interval = _build_krylov_callback( + args, + "CG", + rhs_index=i if vec.shape[1] > 1 else None, + ) + column_start = time.time() + res[:, i], exit_code = cg( + KV, + vec[:, i], + M=M, + rtol=cg_tol, + x0=initial_guess, + maxiter=maxiter, + callback=callback, + ) + iteration_count = 0 if counter is None else counter["count"] + _emit_krylov_completion( + 
_krylov_progress_label(args, "CG", rhs_index=i if vec.shape[1] > 1 else None), + "CG", + iteration_count, + exit_code, + time.time() - column_start, + progress_interval, + ) if exit_code != 0: warnings.warn(f"CG not successful (exit_code={exit_code})") logger.debug("CG compute time: {} seconds.", time.time() - st) assert np.ndim(res) == 2 diff --git a/fvgp/gp_marginal_likelihood.py b/fvgp/gp_marginal_likelihood.py index a96a256..5194b2d 100755 --- a/fvgp/gp_marginal_likelihood.py +++ b/fvgp/gp_marginal_likelihood.py @@ -143,8 +143,9 @@ def _build_sparse_preconditioner_or_none(self, KV): except Exception as exc: logger.warning( "Sparse preconditioner construction failed; " - "falling back to the unpreconditioned iterative solve: {}", + "falling back to the unpreconditioned iterative solve: {}. {}", exc, + sparse_preconditioner_failure_guidance(self.args), ) return None diff --git a/fvgp/kv_linalg.py b/fvgp/kv_linalg.py index 2ac30c1..aefbb97 100755 --- a/fvgp/kv_linalg.py +++ b/fvgp/kv_linalg.py @@ -19,6 +19,7 @@ def __init__(self, compute_device, data): self.Preconditioner_operator = None self.Preconditioner_signature = None self.Preconditioner_reuse_counter = 0 + self.Last_preconditioner_error = None self.Last_iterative_solution = None self.custom_obj = None self.allowed_modes = ["Chol", "CholInv", "Inv", "sparseMINRES", "sparseCG", @@ -49,6 +50,7 @@ def _reset_sparse_preconditioner(self): self.Preconditioner_operator = None self.Preconditioner_signature = None self.Preconditioner_reuse_counter = 0 + self.Last_preconditioner_error = None def _can_reuse_sparse_preconditioner(self, KV, mode): if mode not in {"sparseMINRESpre", "sparseCGpre"}: @@ -74,9 +76,12 @@ def _refresh_sparse_preconditioner(self): try: factor, operator = calculate_sparse_preconditioner(self.KV, args=self.args) except Exception as exc: + self.Last_preconditioner_error = f"{type(exc).__name__}: {exc}" warnings.warn( f"Failed to build sparse preconditioner for mode {self.mode}; " - "falling back 
to the unpreconditioned iterative solve." + f"falling back to the unpreconditioned iterative solve. " + f"Reason: {self.Last_preconditioner_error}. " + f"{sparse_preconditioner_failure_guidance(self.args)}" ) logger.warning("Sparse preconditioner construction failed for {}: {}", self.mode, exc) self._reset_sparse_preconditioner() @@ -85,6 +90,7 @@ def _refresh_sparse_preconditioner(self): self.Preconditioner_operator = operator self.Preconditioner_signature = self._preconditioner_signature() self.Preconditioner_reuse_counter = 0 + self.Last_preconditioner_error = None def _get_sparse_preconditioner(self): if self.mode in {"sparseMINRESpre", "sparseCGpre"} and self.Preconditioner_operator is None: @@ -320,6 +326,7 @@ def __getstate__(self): Preconditioner_operator=None, Preconditioner_signature=self.Preconditioner_signature, Preconditioner_reuse_counter=self.Preconditioner_reuse_counter, + Last_preconditioner_error=self.Last_preconditioner_error, Last_iterative_solution=self.Last_iterative_solution, custom_obj=self.custom_obj, allowed_modes=self.allowed_modes @@ -336,5 +343,7 @@ def __setstate__(self, state): self.Preconditioner_signature = None if "Preconditioner_reuse_counter" not in self.__dict__: self.Preconditioner_reuse_counter = 0 + if "Last_preconditioner_error" not in self.__dict__: + self.Last_preconditioner_error = None if "Last_iterative_solution" not in self.__dict__: self.Last_iterative_solution = None diff --git a/tests/test_fvgp.py b/tests/test_fvgp.py index cd6bf7d..30850ef 100755 --- a/tests/test_fvgp.py +++ b/tests/test_fvgp.py @@ -106,7 +106,15 @@ def test_gp2scale_linalg_mode_alias_resolution(): {"sparse_preconditioner_ic_shift": 1e-8}, ) assert mode == "sparseMINRESpre" - assert args["sparse_preconditioner_type"] == "incomplete_cholesky" + assert args["sparse_preconditioner_type"] == "ic" + assert args["sparse_preconditioner_ic_shift"] == 1e-8 + + mode, args = resolve_gp2scale_linalg_mode( + "sparseMINRESpre_native_ic", + 
{"sparse_preconditioner_ic_shift": 1e-8}, + ) + assert mode == "sparseMINRESpre" + assert args["sparse_preconditioner_type"] == "native_incomplete_cholesky" assert args["sparse_preconditioner_ic_shift"] == 1e-8 with pytest.raises(ValueError): @@ -116,6 +124,26 @@ def test_gp2scale_linalg_mode_alias_resolution(): ) +def test_missing_ilupp_message_for_ic_aliases(monkeypatch): + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "ilupp": + raise ImportError("simulated missing ilupp") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + KV = sparse.eye(4, format="csr") + + with pytest.raises(ImportError, match="pip install ilupp"): + calculate_sparse_preconditioner(KV, args={"sparse_preconditioner_type": "ic"}) + + with pytest.raises(ImportError, match="pip install ilupp"): + calculate_sparse_preconditioner(KV, args={"sparse_preconditioner_type": "ichol0"}) + + def test_random_logdet_compute_device_override(monkeypatch): captured_gpu_flags = []