From 976ddc4f662a462f4871d30e28fae667a4722948 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Fri, 29 May 2026 14:43:39 +1000
Subject: [PATCH 01/12] change dims

---
 src/c/toolkits/petsc/patches/NewMat.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/src/c/toolkits/petsc/patches/NewMat.cpp b/src/c/toolkits/petsc/patches/NewMat.cpp
index 7442a0854..e053ea64f 100644
--- a/src/c/toolkits/petsc/patches/NewMat.cpp
+++ b/src/c/toolkits/petsc/patches/NewMat.cpp
@@ -109,13 +109,10 @@ PMat NewMat(int M,int N,int connectivity,int numberofdofspernode,ISSM_MPI_Comm c
 	MatSetFromOptions(outmatrix);
 	MatSetOption(outmatrix,MAT_IGNORE_ZERO_ENTRIES,PETSC_TRUE);
 
-	/*preallocation  according to type: */
-	MatGetType(outmatrix,&type);
+        /*preallocation according to type:
+         * Use strncmp("mpiaij",...) so that CUDA variants (mpiaijcusparse) also
+         * get preallocated — they inherit from MPIAIJ and accept the same call. */
+        MatGetType(outmatrix,&type);
 
-	if((strcmp(type,"mpiaij")==0) || (strcmp(type,"mpidense")==0)){
-		MatMPIAIJSetPreallocation(outmatrix,d_nz,NULL,o_nz,NULL);
-	}
-
-	return outmatrix;
-}
+        if((strncmp(type,"mpiaij",6)==0) || (strcmp(type,"mpidense")==0)){
 /*}}}*/

From 590cddc6e41db3027a80471b82d2fb67070b9940 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Fri, 29 May 2026 15:00:28 +1000
Subject: [PATCH 02/12] GPU patches

---
 m4/issm_options.m4                            | 14 +++
 src/c/toolkits/petsc/objects/PetscVec.cpp     | 13 ++-
 .../toolkits/petsc/patches/VecToMPISerial.cpp | 97 ++++++-------------
 src/m/classes/toolkits.py                     |  1 +
 4 files changed, 54 insertions(+), 71 deletions(-)

diff --git a/m4/issm_options.m4 b/m4/issm_options.m4
index 4ce29b74a..cb24d1057 100644
--- a/m4/issm_options.m4
+++ b/m4/issm_options.m4
@@ -1394,6 +1394,20 @@ AC_DEFUN([ISSM_OPTIONS],[
 		AC_DEFINE([_HAVE_PETSC_], [1], [with PETSc in ISSM src])
 		AC_SUBST([PETSCINCL])
 		AC_SUBST([PETSCLIB])
+
+		dnl Detect CUDA support in the configured PETSc install {{{
+		AC_MSG_CHECKING(whether PETSc was built with CUDA support)
+		PETSC_HAS_CUDA=no
+		if test -f "${PETSC_ROOT}/include/petscconf.h"; then
+			if grep -q "PETSC_HAVE_CUDA" "${PETSC_ROOT}/include/petscconf.h"; then
+				PETSC_HAS_CUDA=yes
+			fi
+		fi
+		AC_MSG_RESULT([${PETSC_HAS_CUDA}])
+		if test "x${PETSC_HAS_CUDA}" == "xyes"; then
+			AC_DEFINE([_HAVE_PETSC_CUDA_], [1], [PETSc built with CUDA support])
+		fi
+		dnl }}}
 	fi
 	dnl }}}
 	dnl MPI{{{
diff --git a/src/c/toolkits/petsc/objects/PetscVec.cpp b/src/c/toolkits/petsc/objects/PetscVec.cpp
index 85ecc45b7..f9b9449e2 100644
--- a/src/c/toolkits/petsc/objects/PetscVec.cpp
+++ b/src/c/toolkits/petsc/objects/PetscVec.cpp
@@ -170,9 +170,16 @@ void PetscVec<doubletype>::GetLocalVector(doubletype** pvector,int** pindices){/
 	/*Build indices*/
 	int* indices=xNew<int>(range);
 	for(int i=0;i<range;i++) indices[i]=lower_row+i;
-	/*Get vector*/
-	doubletype* values =xNew<doubletype>(range);
-	VecGetValues(this->vector,range,indices,values);
+	/*Get vector: use VecGetArrayRead which is CUDA-aware (PETSc downloads from
+	 * device transparently) instead of VecGetValues which forces a stalling
+	 * GPU->CPU copy for every index lookup.*/
+	doubletype* values=xNew<doubletype>(range);
+	{
+		const PetscScalar* arr=NULL;
+		VecGetArrayRead(this->vector,&arr);
+		for(int i=0;i<range;i++) values[i]=(doubletype)arr[i];
+		VecRestoreArrayRead(this->vector,&arr);
+	}
 
 	*pvector  = values;
 	*pindices = indices;
diff --git a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
index 368e13bda..4fc32dcab 100644
--- a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
+++ b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
@@ -55,91 +55,52 @@
 template<typename doubletype, typename vectype>
 int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm comm,bool broadcast){
 
-	int i;
-	int num_procs; 
-	int my_rank;
-
-	/*Petsc*/
-	ISSM_MPI_Status status;
-	PetscInt lower_row,upper_row; 
-	int range;
-	int * idxn=NULL; 
-	int buffer[3];
-
-	/*intermediary results*/
-	doubletype* local_vector=NULL;
-
-	/*input*/
-	int vector_size;
-
 	/*Output*/
-	doubletype* gathered_vector=NULL; //Global vector holding the final assembled vector on all nodes.
+	doubletype*        gathered_vector = NULL;
+	const PetscScalar* vec_array       = NULL;
 
-	/*recover my_rank and num_procs*/
-	ISSM_MPI_Comm_size(comm,&num_procs);
-	ISSM_MPI_Comm_rank(comm,&my_rank);
+	/*Sequential vector and scatter context*/
+	vectype    vector_seq = NULL;
+	VecScatter ctx        = NULL;
 
+	/*Check for empty vector*/
+	int vector_size;
 	VecGetSize(vector,&vector_size);
 	if(vector_size==0){
 		*pgathered_vector=NULL;
 		return 1;
 	}
 
-	/*Allocate gathered vector on all nodes .*/
-	if(broadcast || my_rank==0){ 
-		gathered_vector=xNew<doubletype>(vector_size);
-	}
-
-	/*Allocate local vectors*/
-	VecGetOwnershipRange(vector,&lower_row,&upper_row);
-	upper_row--;
-	range=upper_row-lower_row+1;    
-
-	if (range){
-		idxn=xNew<int>(range); 
-		for (i=0;i<range;i++){
-			*(idxn+i)=lower_row+i;
-		} 
-		local_vector=xNew<doubletype>(range);
-		/*Extract values from MPI vector to serial local_vector on each node*/
-		VecGetValues(vector,range,idxn,local_vector); 
+	/*Create scatter: ToAll replicates on every rank; ToZero gathers only on rank 0.
+	 * VecScatterBegin/End are CUDA-aware: PETSc handles GPU->CPU transfer internally,
+	 * avoiding the deprecated VecGetValues path which stalls the GPU pipeline.*/
+	if(broadcast){
+		VecScatterCreateToAll(vector,&ctx,&vector_seq);
 	}
-
-	/*Now each node holds its local_vector containing range rows. 
-	 * We send this local_vector  to the gathered_vector on node 0*/
-	for (i=1;i<num_procs;i++){
-		if (my_rank==i){ 
-			buffer[0]=my_rank;
-			buffer[1]=lower_row;
-			buffer[2]=range;
-			ISSM_MPI_Send(buffer,3,ISSM_MPI_INT,0,1,comm);
-			if (range)ISSM_MPI_Send(local_vector,range,TypeToMPIType<doubletype>(),0,1,comm);
-		}
-		if (my_rank==0){
-			ISSM_MPI_Recv(buffer,3,ISSM_MPI_INT,i,1,comm,&status); 
-			if (buffer[2])ISSM_MPI_Recv(gathered_vector+buffer[1],buffer[2],TypeToMPIType<doubletype>(),i,1,comm,&status);
-		}
+	else{
+		VecScatterCreateToZero(vector,&ctx,&vector_seq);
 	}
 
-	if (my_rank==0){ 
-		//Still have the local_vector on node 0 to take care of.
-		if (range) {
-			xMemCpy<doubletype>(&gathered_vector[lower_row], local_vector, range);
-		}
+	VecScatterBegin(ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
+	VecScatterEnd(  ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
+
+	/*Copy data from sequential vector (always CPU-resident after scatter)*/
+	int n;
+	VecGetSize(vector_seq,&n);
+	if(n>0){
+		gathered_vector=xNew<doubletype>(n);
+		VecGetArrayRead(vector_seq,&vec_array);
+		for(int i=0;i<n;i++) gathered_vector[i]=(doubletype)vec_array[i];
+		VecRestoreArrayRead(vector_seq,&vec_array);
 	}
 
-	if(broadcast){
-		/*Now, broadcast gathered_vector from node 0 to other nodes: */
-		ISSM_MPI_Bcast(gathered_vector,vector_size,TypeToMPIType<doubletype>(),0,comm);
-	}
+	/*Destroy scatter context and sequential vector*/
+	VecScatterDestroy(&ctx);
+	VecDestroy(&vector_seq);
 
-	/*Assign output pointers: */
+	/*Assign output pointer*/
 	*pgathered_vector=gathered_vector;
 
-	/*Free resources: */
-	xDelete<int>(idxn);
-	xDelete<doubletype>(local_vector);
-
 	return 1;
 }
 
diff --git a/src/m/classes/toolkits.py b/src/m/classes/toolkits.py
index 97e812a22..14f1d3c8c 100644
--- a/src/m/classes/toolkits.py
+++ b/src/m/classes/toolkits.py
@@ -1,4 +1,5 @@
 from fielddisplay import fielddisplay
+from gpuoptions import gpuoptions
 from iluasmoptions import iluasmoptions
 from IssmConfig import IssmConfig
 from issmgslsolver import issmgslsolver

From 984516722fa7849b7617b12e2c7d17c2e60afc86 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 09:35:05 +1000
Subject: [PATCH 03/12] remove duplicate gadi file from cluster dir

---
 src/m/contrib/jkjhew/gadi.py | 248 -----------------------------------
 1 file changed, 248 deletions(-)
 delete mode 100644 src/m/contrib/jkjhew/gadi.py

diff --git a/src/m/contrib/jkjhew/gadi.py b/src/m/contrib/jkjhew/gadi.py
deleted file mode 100644
index 37340573a..000000000
--- a/src/m/contrib/jkjhew/gadi.py
+++ /dev/null
@@ -1,248 +0,0 @@
-import subprocess
-
-from fielddisplay import fielddisplay
-from helpers import *
-from IssmConfig import IssmConfig
-from issmscpin import issmscpin
-from issmscpout import issmscpout
-from issmssh import issmssh
-from MatlabFuncs import *
-from pairoptions import pairoptions
-try:
-    from gadi_settings import gadi_settings
-except ImportError:
-    pass
-from QueueRequirements import QueueRequirements
-
-class gadi(object):
-    """Gadi cluster class definition.
-       Usage:
-           cluster = gadi()  # default constructor
-           cluster = gadi('numnodes',2,'cpuspernode',24,'login','username') ...
-    """
-
-    def __init__(self, *args):  # {{{
-        self.name           = 'gadi.nci.org.au'
-        self.login          = ''
-        # Adjust modules for Gadi as needed:
-        self.modules        = [
-            # Example modules or spack loads:
-            'openmpi/4.1.2', 
-            # 'python3/3.9.2',
-            # or just keep them empty if you handle externally:
-        ]
-        # For Gadi, you usually specify the total ncpus. But if you still want
-        # to specify by numnodes & cpuspernode, that’s fine. Adjust as needed:
-        self.numnodes       = 1
-        self.cpuspernode    = 4
-        self.port           = ''  # typical SSH port
-        self.queue          = 'normal'  # or "hugemem", "express", etc.
-        self.time           = 60  # total minutes of walltime, e.g. 60 => 1 hour
-        self.processor      = ''  # not usually needed for Gadi
-        self.srcpath        = ''
-        self.extpkgpath     = ''
-        self.codepath       = ''
-        self.executionpath  = ''
-        self.project        = ''  # Gadi uses -P <PROJECT>
-        self.interactive    = 0
-        self.bbftp          = 0
-        self.numstreams     = 8
-        self.hyperthreading = 0
-
-        # Use provided options to change fields
-        options = pairoptions(*args)
-
-        # If you had a user settings file, try it:
-        try:
-            self = gadi_settings(self)
-        except NameError:
-            pass
-
-        # Override defaults with user-specified parameters
-        self = options.AssignObjectFields(self)
-
-        # Ensure login is set correctly
-        if not self.login:
-            raise ValueError("Login must be specified for Gadi cluster.")
-        
-    # }}}
-
-    def __repr__(self):  # {{{
-        s = 'class Gadi object\n'
-        s += '    name: {}\n'.format(self.name)
-        s += '    login: {}\n'.format(self.login)
-        s += '    modules: {}\n'.format(strjoin(self.modules, ', '))
-        s += '    numnodes: {}\n'.format(self.numnodes)
-        s += '    cpuspernode: {}\n'.format(self.cpuspernode)
-        s += '    np: {}\n'.format(self.nprocs())
-        s += '    port: {}\n'.format(self.port)
-        s += '    queue: {}\n'.format(self.queue)
-        s += '    time (min): {}\n'.format(self.time)
-        s += '    processor: {}\n'.format(self.processor)
-        s += '    srcpath: {}\n'.format(self.srcpath)
-        s += '    extpkgpath: {}\n'.format(self.extpkgpath)
-        s += '    codepath: {}\n'.format(self.codepath)
-        s += '    executionpath: {}\n'.format(self.executionpath)
-        s += '    project: {}\n'.format(self.project)
-        s += '    interactive: {}\n'.format(self.interactive)
-        s += '    bbftp: {}\n'.format(self.bbftp)
-        s += '    numstreams: {}\n'.format(self.numstreams)
-        s += '    hyperthreading: {}\n'.format(self.hyperthreading)
-        return s
-    # }}}
-
-    def nprocs(self):  # {{{
-        return self.numnodes * self.cpuspernode
-    # }}}
-
-    def checkconsistency(self, md, solution, analyses):  # {{{
-        queuedict = {
-            'normal': [48*60,  3072],  # e.g. up to 48h, 3072 cores
-            'express': [2*60,  960],   # e.g. up to 2h, 960 cores
-            'hugemem': [48*60, 3072],
-        }
-        QueueRequirements(queuedict, self.queue, self.nprocs(), self.time)
-
-        # Some minimal checks
-        if not self.login:
-            md = md.checkmessage('login is empty')
-        if not self.srcpath:
-            md = md.checkmessage('srcpath is empty')
-        if not self.codepath:
-            md = md.checkmessage('codepath is empty')
-        if not self.executionpath:
-            md = md.checkmessage('executionpath is empty')
-        if not self.project:
-            md = md.checkmessage('project (PBS -P) is empty')
-        return self
-    # }}}
-
-    def BuildQueueScript(
-        self, dirname, modelname, solution,
-        io_gather, isvalgrind, isgprof, isdakota, isoceancoupling
-    ):  # {{{
-        """
-        Create a PBS script for Gadi. 
-        Gadi typically uses #PBS lines like:
-         - #PBS -P <PROJECT>
-         - #PBS -q normal
-         - #PBS -l ncpus=...,walltime=HH:MM:SS,mem=...
-         - #PBS -l wd
-         - #PBS -j oe
-        """
-
-        if isgprof:
-            print('gprof not typically used on Gadi via this script, ignoring...')
-
-        executable = 'issm.exe'
-        if isdakota:
-            version_str = IssmConfig('_DAKOTA_VERSION_')
-            # e.g. "6.14"
-            version_float = float(version_str[0:3])
-            if version_float >= 6.0:
-                executable = 'issm_dakota.exe'
-        if isoceancoupling:
-            executable = 'issm_ocean.exe'
-
-        # Convert self.time (minutes) to hh:mm:ss
-        hours   = self.time // 60
-        minutes = self.time % 60
-        walltime_str = '{:02d}:{:02d}:00'.format(hours, minutes)
-
-        # Write queue script
-        fid = open(modelname + '.queue', 'w')
-        fid.write('#!/bin/bash\n')
-        fid.write('#PBS -P {}\n'.format(self.project))
-        fid.write('#PBS -q {}\n'.format(self.queue))
-        fid.write('#PBS -l ncpus={}\n'.format(self.nprocs()))
-        # Optionally request memory (example: 4GB x nprocs):
-        # fid.write('#PBS -l mem={}GB\n'.format(4 * self.nprocs()))
-        fid.write('#PBS -l walltime={}\n'.format(walltime_str))
-        fid.write('#PBS -l wd\n')  
-        fid.write('#PBS -j oe\n')
-        fid.write('#PBS -l storage=scratch/{0}+gdata/{0}\n'.format(self.project))
-        fid.write('#PBS -m bea\n')
-        fid.write('#PBS -o {}/{}/{}.outlog \n'.format(self.executionpath, dirname, modelname))
-        fid.write('#PBS -e {}/{}/{}.errlog \n\n'.format(self.executionpath, dirname, modelname))   
-
-        fid.write('\n# Load modules as needed:\n')
-        fid.write('module purge\n')
-        for m in self.modules:
-            fid.write('module load {}\n'.format(m))
-
-        # Optionally source environment scripts if needed:
-        # fid.write('source /g/data/...someSpackOrCondaEnv...\n')
-
-        fid.write('\n# Switch to run directory (if not using -l wd):\n')
-        fid.write('cd {}/{}/\n'.format(self.executionpath, dirname))
-        fid.write('\n# Now launch the job:\n')
-        
-
-        if not isvalgrind:
-            fid.write('mpiexec -n {} {}/{} {} {}/{} {}\n'.format(
-                self.nprocs(), self.codepath, executable,
-                solution, self.executionpath, dirname, modelname))
-        else:
-            # Example valgrind usage
-            supstring = ''
-            # If you have self.valgrindsup or so, otherwise remove
-            # for supfile in self.valgrindsup:
-            #     supstring += ' --suppressions=' + supfile
-            fid.write(
-                'mpiexec -n {} valgrind --leak-check=full{} {}/{} {} {}/{} {}\n'.format(
-                    self.nprocs(), supstring, self.codepath, executable,
-                    solution, self.executionpath, dirname, modelname))
-
-        if not io_gather:
-            # Merge partial outbin files if your code writes multiple .outbin.#
-            fid.write('\ncat {}.outbin.* > {}.outbin\n'.format(modelname, modelname))
-
-        fid.close()
-    # }}}
-
-    def UploadQueueJob(self, modelname, dirname, filelist):  # {{{
-        # Compress inputs into a tarball
-        compressstring = 'tar -zcf {}.tar.gz'.format(dirname)
-        for f in filelist:
-            compressstring += ' {}'.format(f)
-
-        subprocess.call(compressstring, shell=True)
-        print('Uploading input file and queueing script to Gadi...')
-        directory = self.executionpath
-        issmscpout(self.name, directory, self.login, self.port, [dirname + '.tar.gz'])
-    # }}}
-
-    def LaunchQueueJob(self, modelname, dirname, filelist, restart, batch):  # {{{
-        """
-        On Gadi, typically you do: 
-          qsub modelname.queue
-        rather than `./modelname.queue`.
-        """
-        if not isempty(restart):
-            # If "restart" logic is needed, adapt as necessary
-            launchcommand = (
-                'cd {executionpath}/{dirname} && qsub {modelname}.queue'
-                .format(executionpath=self.executionpath, 
-                        dirname=dirname, modelname=modelname)
-            )
-        else:
-            # Create/clean directory, extract tar, then qsub
-            launchcommand = (
-                'cd {executionpath} && '
-                'rm -rf ./{dirname} && mkdir {dirname} && cd {dirname} && '
-                'mv ../{dirname}.tar.gz ./ && '
-                'tar -zxf {dirname}.tar.gz && '
-                'qsub {modelname}.queue'
-                .format(executionpath=self.executionpath, 
-                        dirname=dirname, modelname=modelname)
-            )
-            
-        print('Launching solution sequence on Gadi via SSH...')
-        issmssh(self.name, self.login, self.port, launchcommand)
-    # }}}
-
-    def Download(self, dirname, filelist):  # {{{
-        # Copy files from Gadi back to local machine
-        directory = '{}/{}/'.format(self.executionpath, dirname)
-        issmscpin(self.name, self.login, self.port, directory, filelist)
-    # }}}

From 0b869271986bd9f6c97d3c71e45c105ad7e69e4b Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 13:17:04 +1000
Subject: [PATCH 04/12] Fix +ad CI: guard CUDA scatter path with
 _HAVE_PETSC_CUDA_
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In +ad builds PVec resolves to adjoint_petsc::ADVec (ADVecImpl*) which
is not compatible with VecScatterCreate*/VecGetArrayRead — those APIs
only accept plain Vec.

+cuda and +ad are mutually exclusive (Spack conflict), so:
  _HAVE_PETSC_CUDA_ defined  -> scatter / VecGetArrayRead (Vec only)
  _HAVE_PETSC_CUDA_ not set  -> original VecGetValues path (handles ADVec)

Fixes CI failure in gcc-ad build (PR #35).
---
 src/c/toolkits/petsc/objects/PetscVec.cpp     |  11 +-
 .../toolkits/petsc/patches/VecToMPISerial.cpp | 122 +++++++++++++-----
 2 files changed, 96 insertions(+), 37 deletions(-)

diff --git a/src/c/toolkits/petsc/objects/PetscVec.cpp b/src/c/toolkits/petsc/objects/PetscVec.cpp
index f9b9449e2..1f75b0d2d 100644
--- a/src/c/toolkits/petsc/objects/PetscVec.cpp
+++ b/src/c/toolkits/petsc/objects/PetscVec.cpp
@@ -170,16 +170,21 @@ void PetscVec<doubletype>::GetLocalVector(doubletype** pvector,int** pindices){/
 	/*Build indices*/
 	int* indices=xNew<int>(range);
 	for(int i=0;i<range;i++) indices[i]=lower_row+i;
-	/*Get vector: use VecGetArrayRead which is CUDA-aware (PETSc downloads from
-	 * device transparently) instead of VecGetValues which forces a stalling
-	 * GPU->CPU copy for every index lookup.*/
 	doubletype* values=xNew<doubletype>(range);
+#ifdef _HAVE_PETSC_CUDA_
+	/*CUDA build: VecGetArrayRead is GPU-aware; returns the local block after a
+	 * single device->host transfer. Safe here because +cuda implies ~ad, so
+	 * this->vector is plain Vec, never ADVecImpl. */
 	{
 		const PetscScalar* arr=NULL;
 		VecGetArrayRead(this->vector,&arr);
 		for(int i=0;i<range;i++) values[i]=(doubletype)arr[i];
 		VecRestoreArrayRead(this->vector,&arr);
 	}
+#else
+	/*Non-CUDA build (including +ad where this->vector may be ADVecImpl*). */
+	VecGetValues(this->vector,range,indices,values);
+#endif
 
 	*pvector  = values;
 	*pindices = indices;
diff --git a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
index 4fc32dcab..2113d87d8 100644
--- a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
+++ b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp
@@ -55,15 +55,6 @@
 template<typename doubletype, typename vectype>
 int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm comm,bool broadcast){
 
-	/*Output*/
-	doubletype*        gathered_vector = NULL;
-	const PetscScalar* vec_array       = NULL;
-
-	/*Sequential vector and scatter context*/
-	vectype    vector_seq = NULL;
-	VecScatter ctx        = NULL;
-
-	/*Check for empty vector*/
 	int vector_size;
 	VecGetSize(vector,&vector_size);
 	if(vector_size==0){
@@ -71,36 +62,99 @@ int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm c
 		return 1;
 	}
 
-	/*Create scatter: ToAll replicates on every rank; ToZero gathers only on rank 0.
-	 * VecScatterBegin/End are CUDA-aware: PETSc handles GPU->CPU transfer internally,
-	 * avoiding the deprecated VecGetValues path which stalls the GPU pipeline.*/
-	if(broadcast){
-		VecScatterCreateToAll(vector,&ctx,&vector_seq);
-	}
-	else{
-		VecScatterCreateToZero(vector,&ctx,&vector_seq);
-	}
+	doubletype* gathered_vector=NULL;
+
+#ifdef _HAVE_PETSC_CUDA_
+	/*CUDA build: use VecScatter which is GPU-aware. PETSc handles device->host
+	 * transfer internally. Cannot use VecGetValues on CUDA vectors (deprecated
+	 * and stalls the GPU pipeline).
+	 * Safe here because +cuda and +ad are mutually exclusive (Spack conflict),
+	 * so vectype is always plain Vec, never ADVecImpl. */
+	{
+		const PetscScalar* vec_array=NULL;
+		vectype    vector_seq=NULL;
+		VecScatter ctx=NULL;
 
-	VecScatterBegin(ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
-	VecScatterEnd(  ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
-
-	/*Copy data from sequential vector (always CPU-resident after scatter)*/
-	int n;
-	VecGetSize(vector_seq,&n);
-	if(n>0){
-		gathered_vector=xNew<doubletype>(n);
-		VecGetArrayRead(vector_seq,&vec_array);
-		for(int i=0;i<n;i++) gathered_vector[i]=(doubletype)vec_array[i];
-		VecRestoreArrayRead(vector_seq,&vec_array);
+		if(broadcast){
+			VecScatterCreateToAll(vector,&ctx,&vector_seq);
+		}
+		else{
+			VecScatterCreateToZero(vector,&ctx,&vector_seq);
+		}
+
+		VecScatterBegin(ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
+		VecScatterEnd(  ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD);
+
+		int n;
+		VecGetSize(vector_seq,&n);
+		if(n>0){
+			gathered_vector=xNew<doubletype>(n);
+			VecGetArrayRead(vector_seq,&vec_array);
+			for(int i=0;i<n;i++) gathered_vector[i]=(doubletype)vec_array[i];
+			VecRestoreArrayRead(vector_seq,&vec_array);
+		}
+
+		VecScatterDestroy(&ctx);
+		VecDestroy(&vector_seq);
 	}
+#else
+	/*Non-CUDA build (including +ad where vectype may be ADVecImpl*):
+	 * use the original VecGetValues-based gather. */
+	{
+		int i;
+		int num_procs;
+		int my_rank;
+		ISSM_MPI_Status status;
+		PetscInt lower_row,upper_row;
+		int range;
+		int* idxn=NULL;
+		int buffer[3];
+		doubletype* local_vector=NULL;
 
-	/*Destroy scatter context and sequential vector*/
-	VecScatterDestroy(&ctx);
-	VecDestroy(&vector_seq);
+		ISSM_MPI_Comm_size(comm,&num_procs);
+		ISSM_MPI_Comm_rank(comm,&my_rank);
 
-	/*Assign output pointer*/
-	*pgathered_vector=gathered_vector;
+		if(broadcast || my_rank==0){
+			gathered_vector=xNew<doubletype>(vector_size);
+		}
 
+		VecGetOwnershipRange(vector,&lower_row,&upper_row);
+		upper_row--;
+		range=upper_row-lower_row+1;
+
+		if(range){
+			idxn=xNew<int>(range);
+			for(i=0;i<range;i++) idxn[i]=lower_row+i;
+			local_vector=xNew<doubletype>(range);
+			VecGetValues(vector,range,idxn,local_vector);
+		}
+
+		for(i=1;i<num_procs;i++){
+			if(my_rank==i){
+				buffer[0]=my_rank; buffer[1]=lower_row; buffer[2]=range;
+				ISSM_MPI_Send(buffer,3,ISSM_MPI_INT,0,1,comm);
+				if(range) ISSM_MPI_Send(local_vector,range,TypeToMPIType<doubletype>(),0,1,comm);
+			}
+			if(my_rank==0){
+				ISSM_MPI_Recv(buffer,3,ISSM_MPI_INT,i,1,comm,&status);
+				if(buffer[2]) ISSM_MPI_Recv(gathered_vector+buffer[1],buffer[2],TypeToMPIType<doubletype>(),i,1,comm,&status);
+			}
+		}
+
+		if(my_rank==0 && range){
+			xMemCpy<doubletype>(&gathered_vector[lower_row],local_vector,range);
+		}
+
+		if(broadcast){
+			ISSM_MPI_Bcast(gathered_vector,vector_size,TypeToMPIType<doubletype>(),0,comm);
+		}
+
+		xDelete<int>(idxn);
+		xDelete<doubletype>(local_vector);
+	}
+#endif
+
+	*pgathered_vector=gathered_vector;
 	return 1;
 }
 

From df382b430fce5eaa7eec1a1179bb35b0437011f8 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 13:30:12 +1000
Subject: [PATCH 05/12] Fix NewMat.cpp: restore missing function body after
 strncmp edit

The strncmp commit accidentally dropped the preallocation call, closing
braces and return statement, leaving only the fold marker. Restore them.
---
 src/c/toolkits/petsc/patches/NewMat.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/c/toolkits/petsc/patches/NewMat.cpp b/src/c/toolkits/petsc/patches/NewMat.cpp
index e053ea64f..4f667bf6e 100644
--- a/src/c/toolkits/petsc/patches/NewMat.cpp
+++ b/src/c/toolkits/petsc/patches/NewMat.cpp
@@ -115,4 +115,9 @@ PMat NewMat(int M,int N,int connectivity,int numberofdofspernode,ISSM_MPI_Comm c
         MatGetType(outmatrix,&type);
 
         if((strncmp(type,"mpiaij",6)==0) || (strcmp(type,"mpidense")==0)){
+                MatMPIAIJSetPreallocation(outmatrix,d_nz,NULL,o_nz,NULL);
+        }
+
+        return outmatrix;
+}
 /*}}}*/

From 5193072e1d02a45e4aaea8bf57941ef5225a26b9 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 14:02:05 +1000
Subject: [PATCH 06/12] Add gcc-cuda CI manifest for GPU/CUDA build variant

---
 .../build-ci/manifests/gcc-cuda.spack.yaml.j2 | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 .github/build-ci/manifests/gcc-cuda.spack.yaml.j2

diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
new file mode 100644
index 000000000..2780d94b9
--- /dev/null
+++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
@@ -0,0 +1,19 @@
+spack:
+  specs:
+  - issm @git.{{ ref }} +wrappers ~ad +cuda
+  packages:
+    # Workaround due to https://github.com/spack/spack-packages/pull/3644
+    re2c:
+      require:
+      - '@3.1'
+
+    gcc:
+      require:
+      - '@{{ gcc_compiler_version }}'
+    all:
+      require:
+      - '%access_gcc'
+      - 'target={{ target }}'
+  concretizer:
+    unify: false
+  view: false

From 5c2ec4852d9f2ea56dc0030bc12b8f21960d4179 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 14:50:40 +1000
Subject: [PATCH 07/12] update

---
 .github/workflows/ci.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2dc0bf9f5..e6b3a6177 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,5 +31,6 @@ jobs:
       spack-manifest-path: ${{ matrix.file }}
       allow-ssh-into-spack-install: false  # If true, PR author must ssh into instance to complete job
       spack-manifest-data-path: .github/build-ci/data/standard.json
+      access-spack-packages-ref: justinh2002/cuda_issm_gpu
       # Default args (including explicit spack/spack-packages/spack-config versions)
       # are specified in https://github.com/ACCESS-NRI/build-ci/tree/v3/.github/workflows#inputs

From 2010bbb28529509295c3f6564c671e81fb938532 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 15:01:24 +1000
Subject: [PATCH 08/12] test


From 62fb8652cd665edca7179cd35e0c283c7af5b812 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 2 Jun 2026 15:05:20 +1000
Subject: [PATCH 09/12] test


From 5d670dda72c6b7fe490867494b0a51e8e4cf5abb Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Wed, 3 Jun 2026 10:56:48 +1000
Subject: [PATCH 10/12] add cuda_arch specification

---
 .github/build-ci/manifests/gcc-cuda.spack.yaml.j2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
index 2780d94b9..4eb669b6d 100644
--- a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
+++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
@@ -1,6 +1,6 @@
 spack:
   specs:
-  - issm @git.{{ ref }} +wrappers ~ad +cuda
+  - issm @git.{{ ref }} +wrappers ~ad +cuda cuda_arch=80
   packages:
     # Workaround due to https://github.com/spack/spack-packages/pull/3644
     re2c:

From 4a89911b693fc8dc0c365370cf4332d724880d04 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 16 Jun 2026 15:36:31 +1000
Subject: [PATCH 11/12] add remaining gpu option changes

---
 .github/build-ci/data/standard.json           |   2 +-
 .../build-ci/manifests/gcc-cuda.spack.yaml.j2 |   2 +-
 .../petsc/install-3.22-gadi-gpu.sh            |  20 +-
 src/c/Makefile.am                             |   1 +
 src/c/shared/Enum/EnumDefinitions.h           |   1 +
 src/c/shared/Enum/EnumToStringx.cpp           |   1 +
 src/c/shared/Enum/StringToEnumx.cpp           |   1 +
 .../solutionsequences/AndersonAccelerator.cpp | 202 ++++++++++++++++++
 src/c/solutionsequences/AndersonAccelerator.h |  56 +++++
 .../solutionsequence_nonlinear.cpp            |  22 +-
 src/m/classes/gpuoptions.py                   |  50 ++++-
 11 files changed, 345 insertions(+), 13 deletions(-)
 mode change 100644 => 100755 externalpackages/petsc/install-3.22-gadi-gpu.sh
 create mode 100644 src/c/solutionsequences/AndersonAccelerator.cpp
 create mode 100644 src/c/solutionsequences/AndersonAccelerator.h

diff --git a/.github/build-ci/data/standard.json b/.github/build-ci/data/standard.json
index 2dabd49cd..0112f15e3 100644
--- a/.github/build-ci/data/standard.json
+++ b/.github/build-ci/data/standard.json
@@ -1,4 +1,4 @@
 {
     "gcc_compiler_version": "13.2.0",
-    "target": "x86_64"
+    "target": " x86_64"
 }
diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
index 4eb669b6d..2780d94b9 100644
--- a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
+++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
@@ -1,6 +1,6 @@
 spack:
   specs:
-  - issm @git.{{ ref }} +wrappers ~ad +cuda cuda_arch=80
+  - issm @git.{{ ref }} +wrappers ~ad +cuda
   packages:
     # Workaround due to https://github.com/spack/spack-packages/pull/3644
     re2c:
diff --git a/externalpackages/petsc/install-3.22-gadi-gpu.sh b/externalpackages/petsc/install-3.22-gadi-gpu.sh
old mode 100644
new mode 100755
index 8bbf28b68..6198d0460
--- a/externalpackages/petsc/install-3.22-gadi-gpu.sh
+++ b/externalpackages/petsc/install-3.22-gadi-gpu.sh
@@ -24,6 +24,21 @@ if [ ! -d "${CUDA_DIR}" ]; then
     exit 1
 fi
 
+# Build PETSc against the SAME MPI that ISSM uses (system OpenMPI), rather than
+# letting PETSc download its own MPICH.  A mismatched MPI causes issm.exe to be
+# linked against two MPI runtimes at once, which breaks any np>1 (multi-rank)
+# run: each rank initialises as its own standalone MPI_COMM_WORLD rank-0.
+# The OpenMPI compiler wrappers must be on PATH (module load openmpi/4.1.3).
+for w in mpicc mpicxx mpif90; do
+    if ! command -v ${w} &>/dev/null; then
+        echo "ERROR: ${w} not found. Load OpenMPI first: module load openmpi/4.1.3"
+        exit 1
+    fi
+done
+if ! mpicc --showme:version 2>/dev/null | grep -qi "open mpi"; then
+    echo "WARNING: mpicc does not appear to be OpenMPI; check your modules."
+fi
+
 # Environment
 if [ -z ${LDFLAGS+x} ]; then
     LDFLAGS=""
@@ -51,14 +66,17 @@ cd ${PETSC_DIR}
     --prefix="${PREFIX}" \
     --PETSC_DIR="${PETSC_DIR}" \
     --LDFLAGS="${LDFLAGS}" \
+    --CFLAGS="-g -O2" --CXXFLAGS="-g -O2" --FFLAGS="-g -O2" \
     --with-debugging=0 \
     --with-valgrind=0 \
     --with-x=0 \
     --with-ssl=0 \
     --with-pic=1 \
+    --with-cc=mpicc \
+    --with-cxx=mpicxx \
+    --with-fc=mpif90 \
     --download-fblaslapack=1 \
     --download-metis=1 \
-    --download-mpich=1 \
     --download-mumps=1 \
     --download-parmetis=1 \
     --download-scalapack=1 \
diff --git a/src/c/Makefile.am b/src/c/Makefile.am
index cb7be17ae..623b61905 100644
--- a/src/c/Makefile.am
+++ b/src/c/Makefile.am
@@ -266,6 +266,7 @@ issm_sources += \
 	./solutionsequences/solutionsequence_la_theta.cpp \
 	./solutionsequences/solutionsequence_linear.cpp \
 	./solutionsequences/solutionsequence_nonlinear.cpp \
+	./solutionsequences/AndersonAccelerator.cpp \
 	./solutionsequences/solutionsequence_newton.cpp \
 	./solutionsequences/solutionsequence_fct.cpp \
 	./solutionsequences/solutionsequence_schurcg.cpp \
diff --git a/src/c/shared/Enum/EnumDefinitions.h b/src/c/shared/Enum/EnumDefinitions.h
index e65b65dcd..4c3731309 100644
--- a/src/c/shared/Enum/EnumDefinitions.h
+++ b/src/c/shared/Enum/EnumDefinitions.h
@@ -695,6 +695,7 @@ enum definitions{
 	StepEnum,
 	StepsEnum,
 	StressbalanceAbstolEnum,
+	StressbalanceAndersonDepthEnum,
 	StressbalanceFSreconditioningEnum,
 	StressbalanceIsHydrologyLayerEnum,
 	StressbalanceIsnewtonEnum,
diff --git a/src/c/shared/Enum/EnumToStringx.cpp b/src/c/shared/Enum/EnumToStringx.cpp
index 2e76cfab7..d38b909b0 100644
--- a/src/c/shared/Enum/EnumToStringx.cpp
+++ b/src/c/shared/Enum/EnumToStringx.cpp
@@ -703,6 +703,7 @@ const char* EnumToStringx(int en){
 		case StepEnum : return "Step";
 		case StepsEnum : return "Steps";
 		case StressbalanceAbstolEnum : return "StressbalanceAbstol";
+		case StressbalanceAndersonDepthEnum : return "StressbalanceAndersonDepth";
 		case StressbalanceFSreconditioningEnum : return "StressbalanceFSreconditioning";
 		case StressbalanceIsHydrologyLayerEnum : return "StressbalanceIsHydrologyLayer";
 		case StressbalanceIsnewtonEnum : return "StressbalanceIsnewton";
diff --git a/src/c/shared/Enum/StringToEnumx.cpp b/src/c/shared/Enum/StringToEnumx.cpp
index 92e93b392..5e2fc9466 100644
--- a/src/c/shared/Enum/StringToEnumx.cpp
+++ b/src/c/shared/Enum/StringToEnumx.cpp
@@ -718,6 +718,7 @@ int  StringToEnumx(const char* name,bool notfounderror){
 	      else if (strcmp(name,"Step")==0) return StepEnum;
 	      else if (strcmp(name,"Steps")==0) return StepsEnum;
 	      else if (strcmp(name,"StressbalanceAbstol")==0) return StressbalanceAbstolEnum;
+	      else if (strcmp(name,"StressbalanceAndersonDepth")==0) return StressbalanceAndersonDepthEnum;
 	      else if (strcmp(name,"StressbalanceFSreconditioning")==0) return StressbalanceFSreconditioningEnum;
 	      else if (strcmp(name,"StressbalanceIsHydrologyLayer")==0) return StressbalanceIsHydrologyLayerEnum;
 	      else if (strcmp(name,"StressbalanceIsnewton")==0) return StressbalanceIsnewtonEnum;
diff --git a/src/c/solutionsequences/AndersonAccelerator.cpp b/src/c/solutionsequences/AndersonAccelerator.cpp
new file mode 100644
index 000000000..e379ee51c
--- /dev/null
+++ b/src/c/solutionsequences/AndersonAccelerator.cpp
@@ -0,0 +1,202 @@
+/*!\file:  AndersonAccelerator.cpp
+ * \brief: Anderson acceleration for Picard fixed-point iterations.
+ *
+ * See AndersonAccelerator.h for theory and usage.
+ */
+
+#include "AndersonAccelerator.h"
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * Construction / destruction
+ * ────────────────────────────────────────────────────────────────────────── */
+
+AndersonAccelerator::AndersonAccelerator(int m) : depth(m), iter(0) {
+    /*reserve to avoid repeated allocation for typical depths <= 10*/
+    if(m > 0){
+        f_hist.reserve(m+1);
+        G_hist.reserve(m+1);
+    }
+}
+
+AndersonAccelerator::~AndersonAccelerator(){
+    for(int i=0; i<(int)f_hist.size(); i++) delete f_hist[i];
+    for(int i=0; i<(int)G_hist.size(); i++) delete G_hist[i];
+}
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * SmallSolve
+ *
+ * Gaussian elimination with partial pivoting for a dense mk x mk system.
+ * On entry:  A[i*mk+j], b[i].
+ * On exit:   theta[i] holds the solution.
+ * Returns false and leaves theta zeroed if A is numerically singular.
+ * ────────────────────────────────────────────────────────────────────────── */
+bool AndersonAccelerator::SmallSolve(IssmDouble* A, IssmDouble* b,
+                                     IssmDouble* theta, int mk){
+
+    const IssmDouble tol = 1.0e-14;
+
+    /*copy b into theta; we will overwrite it as we go*/
+    for(int i=0; i<mk; i++) theta[i] = b[i];
+
+    /*forward elimination*/
+    for(int col=0; col<mk; col++){
+
+        /*partial pivot: find row with largest absolute value in this column*/
+        int pivot = col;
+        IssmDouble maxval = fabs(A[col*mk+col]);
+        for(int row=col+1; row<mk; row++){
+            IssmDouble val = fabs(A[row*mk+col]);
+            if(val > maxval){ maxval = val; pivot = row; }
+        }
+
+        if(maxval < tol){
+            /*singular or near-singular — fall back to Picard (theta = 0)*/
+            for(int i=0; i<mk; i++) theta[i] = 0.0;
+            return false;
+        }
+
+        /*swap rows col and pivot in A and theta*/
+        if(pivot != col){
+            for(int j=0; j<mk; j++){
+                IssmDouble tmp = A[col*mk+j];
+                A[col*mk+j]    = A[pivot*mk+j];
+                A[pivot*mk+j]  = tmp;
+            }
+            IssmDouble tmp  = theta[col];
+            theta[col]      = theta[pivot];
+            theta[pivot]    = tmp;
+        }
+
+        /*eliminate column below the pivot*/
+        IssmDouble inv_diag = 1.0 / A[col*mk+col];
+        for(int row=col+1; row<mk; row++){
+            IssmDouble factor = A[row*mk+col] * inv_diag;
+            for(int j=col; j<mk; j++)
+                A[row*mk+j] -= factor * A[col*mk+j];
+            theta[row] -= factor * theta[col];
+        }
+    }
+
+    /*back substitution*/
+    for(int row=mk-1; row>=0; row--){
+        for(int j=row+1; j<mk; j++)
+            theta[row] -= A[row*mk+j] * theta[j];
+        theta[row] /= A[row*mk+row];
+    }
+
+    return true;
+}
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * Apply
+ *
+ * Core of Anderson acceleration.  Called once per nonlinear iteration
+ * after the linear solve has produced a new Picard update in *puf.
+ *
+ * Algorithm (Walker & Ni 2011, "Type 1"):
+ *
+ *   f_k  = G(u_k) - u_k                    Picard residual
+ *   mk   = min(iter, depth)                 effective window
+ *
+ *   if mk == 0:  u_{k+1} = G(u_k)          plain Picard
+ *   else:
+ *     dF[:,j] = f_hist[j+1] - f_hist[j]    mk difference columns
+ *     dG[:,j] = G_hist[j+1] - G_hist[j]
+ *     theta   = argmin_t || f_k - dF t ||  (mk x mk normal equations)
+ *     u_{k+1} = G(u_k) - dG theta          accelerated update
+ * ────────────────────────────────────────────────────────────────────────── */
+void AndersonAccelerator::Apply(Vector<IssmDouble>** puf,
+                                Vector<IssmDouble>* old_uf){
+
+    if(depth == 0) return;   /* disabled: leave *puf untouched */
+
+    Vector<IssmDouble>* G_k = *puf;   /* Picard update G(u_k) from linear solve */
+
+    /* ── 1. Compute Picard residual  f_k = G_k - u_k  ───────────────────── */
+    Vector<IssmDouble>* f_k = G_k->Duplicate();
+    G_k->Copy(f_k);
+    f_k->AXPY(old_uf, -1.0);   /* f_k = G_k - old_uf */
+
+    /* ── 2. Store G_k and f_k in history before any modification ─────────── */
+    Vector<IssmDouble>* G_store = G_k->Duplicate();
+    G_k->Copy(G_store);
+
+    f_hist.push_back(f_k);
+    G_hist.push_back(G_store);
+
+    /* trim history to at most depth+1 entries (depth+1 gives depth differences)*/
+    while((int)f_hist.size() > depth+1){
+        delete f_hist.front();  f_hist.erase(f_hist.begin());
+        delete G_hist.front();  G_hist.erase(G_hist.begin());
+    }
+
+    /* ── 3. Determine effective window size ──────────────────────────────── */
+    int mk = (int)f_hist.size() - 1;   /* number of difference columns */
+
+    if(mk == 0){
+        /* first iteration or depth==1 first step: keep plain Picard update */
+        iter++;
+        return;
+    }
+
+    /* ── 4. Build difference vectors dF[j] and dG[j]  ───────────────────── */
+    /*  dF[j] = f_hist[j+1] - f_hist[j]   (j = 0 ... mk-1)
+     *  dG[j] = G_hist[j+1] - G_hist[j]                                      */
+    std::vector<Vector<IssmDouble>*> dF(mk, NULL);
+    std::vector<Vector<IssmDouble>*> dG(mk, NULL);
+
+    for(int j=0; j<mk; j++){
+        dF[j] = f_hist[j+1]->Duplicate();
+        f_hist[j+1]->Copy(dF[j]);
+        dF[j]->AXPY(f_hist[j], -1.0);      /* dF[j] = f_hist[j+1] - f_hist[j] */
+
+        dG[j] = G_hist[j+1]->Duplicate();
+        G_hist[j+1]->Copy(dG[j]);
+        dG[j]->AXPY(G_hist[j], -1.0);      /* dG[j] = G_hist[j+1] - G_hist[j] */
+    }
+
+    /* ── 5. Form normal equations  A theta = b  where
+     *       A[i][j] = dF[i] . dF[j]
+     *       b[i]    = dF[i] . f_k                                            */
+    IssmDouble* A     = new IssmDouble[mk*mk];
+    IssmDouble* b_rhs = new IssmDouble[mk];
+    IssmDouble* theta = new IssmDouble[mk]();
+
+    Vector<IssmDouble>* f_k_cur = f_hist.back();   /* = f_k we just pushed */
+
+    for(int i=0; i<mk; i++){
+        b_rhs[i] = dF[i]->Dot(f_k_cur);
+        for(int j=0; j<=i; j++){
+            A[i*mk+j] = A[j*mk+i] = dF[i]->Dot(dF[j]);
+        }
+    }
+
+    /* ── 6. Solve the mk x mk system ────────────────────────────────────── */
+    bool ok = SmallSolve(A, b_rhs, theta, mk);
+
+    /* ── 7. Anderson update:  G_k -= sum_j theta[j] * dG[j]  ─────────────
+     *   If SmallSolve failed (singular) theta is zeroed so G_k is unchanged,
+     *   effectively falling back to Picard for this iteration.               */
+    if(ok){
+        for(int j=0; j<mk; j++){
+            G_k->AXPY(dG[j], -theta[j]);   /* G_k -= theta[j] * dG[j] */
+        }
+        if(VerboseConvergence())
+            _printf0_("   Anderson(" << depth << "): mk=" << mk << " applied\n");
+    }
+    else{
+        if(VerboseConvergence())
+            _printf0_("   Anderson(" << depth << "): mk=" << mk
+                      << " singular, falling back to Picard\n");
+    }
+
+    /* ── 8. Clean up ─────────────────────────────────────────────────────── */
+    for(int j=0; j<mk; j++){ delete dF[j]; delete dG[j]; }
+    delete[] A;
+    delete[] b_rhs;
+    delete[] theta;
+
+    /* *puf still points to G_k which has been modified in-place */
+    iter++;
+}
diff --git a/src/c/solutionsequences/AndersonAccelerator.h b/src/c/solutionsequences/AndersonAccelerator.h
new file mode 100644
index 000000000..431523533
--- /dev/null
+++ b/src/c/solutionsequences/AndersonAccelerator.h
@@ -0,0 +1,56 @@
+/*!\file:  AndersonAccelerator.h
+ * \brief: Anderson acceleration (depth-m mixing) for Picard fixed-point iterations.
+ *
+ * Theory: Walker & Ni (2011), SIAM J. Numer. Anal. 49(4), 1715-1735.
+ *
+ * Given the Picard map  G : u -> u_new = K(u)^{-1} f(u),
+ * standard Picard sets  u_{k+1} = G(u_k).
+ * Anderson(m) instead solves a small m x m least-squares problem each
+ * iteration to find the best linear combination of the last m Picard updates,
+ * replacing the plain update at negligible extra cost.
+ *
+ * Usage:
+ *   AndersonAccelerator aa(m);          // m=0 -> pure Picard (no-op)
+ *   for(;;){
+ *     ... linear solve -> uf ...
+ *     aa.Apply(&uf, old_uf);            // uf replaced with accelerated update
+ *     ... UpdateInputFromSolution(uf) ...
+ *   }
+ */
+
+#ifndef _ANDERSONACCELERATOR_H_
+#define _ANDERSONACCELERATOR_H_
+
+#include <vector>
+#include "../toolkits/toolkits.h"
+#include "../shared/shared.h"
+
+class AndersonAccelerator {
+
+  public:
+
+    int  depth;  /* window size m; 0 = disabled (pure Picard) */
+    int  iter;   /* iteration counter (reset on construction)  */
+
+    /* history of Picard residuals  f_k = G(u_k) - u_k  */
+    std::vector<Vector<IssmDouble>*> f_hist;
+    /* history of raw Picard updates  G(u_k)  */
+    std::vector<Vector<IssmDouble>*> G_hist;
+
+    AndersonAccelerator(int m);
+    ~AndersonAccelerator();
+
+    /*! Replace *puf (= G(u_k) from linear solve) with the Anderson-accelerated
+     *  update, given old_uf = u_k (solution vector from the previous iteration).
+     *  On return *puf holds u_{k+1}. */
+    void Apply(Vector<IssmDouble>** puf, Vector<IssmDouble>* old_uf);
+
+  private:
+
+    /*! Solve the small mk x mk system  A theta = b  in-place via
+     *  Gaussian elimination with partial pivoting.  A and b are
+     *  overwritten.  Returns false if the system is singular. */
+    bool SmallSolve(IssmDouble* A, IssmDouble* b, IssmDouble* theta, int mk);
+};
+
+#endif  /* _ANDERSONACCELERATOR_H_ */
diff --git a/src/c/solutionsequences/solutionsequence_nonlinear.cpp b/src/c/solutionsequences/solutionsequence_nonlinear.cpp
index da407f1ae..0d1459519 100644
--- a/src/c/solutionsequences/solutionsequence_nonlinear.cpp
+++ b/src/c/solutionsequences/solutionsequence_nonlinear.cpp
@@ -1,8 +1,13 @@
 /*!\file: solutionsequence_nonlinear.cpp
- * \brief: core of a non-linear solution, using fixed-point method 
- */ 
+ * \brief: core of a non-linear solution, using fixed-point method
+ *         with optional Anderson acceleration (depth m >= 1).
+ *
+ * Anderson depth is read from parameter StressbalanceAndersonDepthEnum
+ * (default 0 = pure Picard).  Recommended starting value: 5.
+ */
 
 #include "./solutionsequences.h"
+#include "./AndersonAccelerator.h"
 #include "../toolkits/toolkits.h"
 #include "../classes/classes.h"
 #include "../shared/shared.h"
@@ -27,6 +32,7 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){
 	int min_mechanical_constraints;
 	int max_nonlinear_iterations;
 	int configuration_type;
+	int anderson_depth;
 	IssmDouble eps_res,eps_rel,eps_abs;
 
 	/*Recover parameters: */
@@ -36,6 +42,10 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){
 	femmodel->parameters->FindParam(&eps_rel,StressbalanceReltolEnum);
 	femmodel->parameters->FindParam(&eps_abs,StressbalanceAbstolEnum);
 	femmodel->parameters->FindParam(&configuration_type,ConfigurationTypeEnum);
+	/*Anderson depth: 0 = pure Picard (default), >=1 enables acceleration*/
+	anderson_depth = 0;
+	if(femmodel->parameters->Exist(StressbalanceAndersonDepthEnum))
+		femmodel->parameters->FindParam(&anderson_depth,StressbalanceAndersonDepthEnum);
 	femmodel->UpdateConstraintsx();
 
 	/*Were loads requested as output? : */
@@ -47,6 +57,9 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){
 	int  count=0;
 	bool converged=false;
 
+	/*Anderson accelerator (no-op when anderson_depth==0)*/
+	AndersonAccelerator anderson(anderson_depth);
+
 	/*Start non-linear iteration using input velocity: */
 	GetSolutionFromInputsx(&ug,femmodel);
 	Reducevectorgtofx(&uf, ug, femmodel->nodes,femmodel->parameters);
@@ -76,6 +89,11 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){
 		Solverx(&uf, Kff, pf, old_uf, df, femmodel->parameters);
 		femmodel->profiler->Stop(SOLVER);
 
+		/*Anderson acceleration: replaces uf with the depth-m accelerated update.
+		 * old_uf = u^(k), uf = G(u^(k)) from linear solve.
+		 * No-op when anderson_depth==0 (pure Picard).                           */
+		anderson.Apply(&uf, old_uf);
+
 		Mergesolutionfromftogx(&ug, uf,ys,femmodel->nodes,femmodel->parameters);delete ys;
 
 		convergence(&converged,Kff,pf,uf,old_uf,eps_res,eps_rel,eps_abs);
diff --git a/src/m/classes/gpuoptions.py b/src/m/classes/gpuoptions.py
index 8d636ed75..4575eedd7 100644
--- a/src/m/classes/gpuoptions.py
+++ b/src/m/classes/gpuoptions.py
@@ -10,13 +10,16 @@ def gpuoptions(*args):
     Requires a CUDA-enabled PETSc build (see
     externalpackages/petsc/install-3.22-gadi-gpu.sh).
 
-    The preconditioner defaults to block-Jacobi with ILU sub-solves, which
-    PETSc can run on GPU via CUDA.  For larger problems consider switching to
-    'gamg' (algebraic multigrid) which also has CUDA support.
+    Defaults to GMRES preconditioned with smoothed-aggregation algebraic
+    multigrid (GAMG), whose convergence rate is nearly independent of mesh size
+    -- essential for the iterative GPU solve to scale, since the simpler
+    block-Jacobi/ILU preconditioner degrades badly under mesh refinement.  The
+    matrix block size and the host-side coarse-grid products below are required
+    for GAMG to converge and to fit in GPU memory on large meshes.
 
     Usage:
         options = gpuoptions()
-        options = gpuoptions('ksp_type', 'cg', 'pc_type', 'gamg')
+        options = gpuoptions('ksp_type', 'cg', 'pc_type', 'bjacobi')
     """
 
     # Retrieve any overrides passed by the caller
@@ -27,9 +30,40 @@ def gpuoptions(*args):
     gpu['vec_type']  = options.getfieldvalue('vec_type',  'cuda')
     gpu['mat_type']  = options.getfieldvalue('mat_type',  'aijcusparse')
     gpu['ksp_type']  = options.getfieldvalue('ksp_type',  'gmres')
-    gpu['pc_type']   = options.getfieldvalue('pc_type',   'bjacobi')
-    gpu['sub_pc_type'] = options.getfieldvalue('sub_pc_type', 'ilu')
-    gpu['ksp_rtol']  = options.getfieldvalue('ksp_rtol',  1e-10)
-    gpu['ksp_max_it'] = options.getfieldvalue('ksp_max_it', 500)
+    gpu['pc_type']   = options.getfieldvalue('pc_type',   'gamg')
+    # SSA stress-balance has 2 coupled DOFs per node (vx, vy).  Telling PETSc the
+    # block size lets GAMG aggregate by node rather than by scalar DOF, which is
+    # essential for the multigrid hierarchy to converge on large vector-elliptic
+    # systems.  Read by MatSetFromOptions when the stiffness matrix is created.
+    gpu['mat_block_size'] = options.getfieldvalue('mat_block_size', 2)
+    # GAMG builds each coarse level with a sparse matrix-matrix product (PtAP).
+    # CUSPARSE's SpGEMM needs huge scratch buffers and runs the GPU out of memory
+    # on large meshes (cudaErrorMemoryAllocation in MatProductSymbolic).  Compute
+    # these products on the host (96 GB) instead; the resulting coarse operators
+    # are small and the Krylov iteration itself stays on the GPU.
+    #
+    # GAMG calls MatPtAP() directly (api_user=true), so PETSc reads the
+    # api-specific option name '-matptap_backend_cpu' -- NOT the generic
+    # '-mat_product_algorithm_backend_cpu', which is only read on the MatProduct
+    # interface path.  PtAP decomposes into inner A*B products, so cover those too.
+    gpu['matptap_backend_cpu']    = options.getfieldvalue('matptap_backend_cpu', 'true')
+    gpu['matmatmult_backend_cpu'] = options.getfieldvalue('matmatmult_backend_cpu', 'true')
+    gpu['mat_product_algorithm_backend_cpu'] = options.getfieldvalue('mat_product_algorithm_backend_cpu', 'true')
+    # Stop coarsening when grid has <= 2000 equations (direct solve on coarse grid)
+    gpu['pc_gamg_coarse_eq_limit'] = options.getfieldvalue('pc_gamg_coarse_eq_limit', 2000)
+    # Aggressive dropping of weak connections improves coarse grid quality
+    gpu['pc_gamg_threshold']  = options.getfieldvalue('pc_gamg_threshold',  0.08)
+    # ksp_rtol is the relative reduction of the *preconditioned* residual at which
+    # GMRES stops.  Empirically a 1e-10 reduction leaves the *true* residual
+    # ||KU-F||/||F|| at ~1.5e-6 on the 1.6M-DOF case -- just above ISSM's 1e-6
+    # acceptance gate (md.settings.solver_residue_threshold).  1e-12 drives the
+    # true residual to ~1.5e-8, clearing the gate with margin.  (Confirmed not an
+    # iteration/restart limit: result was bit-identical across max_it and restart
+    # changes, i.e. GMRES converges to rtol well before exhausting iterations.)
+    gpu['ksp_rtol']  = options.getfieldvalue('ksp_rtol',  1e-12)
+    gpu['ksp_max_it'] = options.getfieldvalue('ksp_max_it', 5000)
+    # Longer restart than the GMRES(30) default, so the extra iterations needed to
+    # reach the tighter rtol on large systems don't stagnate.
+    gpu['ksp_gmres_restart'] = options.getfieldvalue('ksp_gmres_restart', 100)
 
     return gpu

From 0ae95149cca67e09e75684b32a1665536aa3e754 Mon Sep 17 00:00:00 2001
From: Justin Kin Jun Hew <justinkinjun.hew@anu.edu.au>
Date: Tue, 16 Jun 2026 16:23:38 +1000
Subject: [PATCH 12/12] fix target

---
 .github/build-ci/data/standard.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/build-ci/data/standard.json b/.github/build-ci/data/standard.json
index 0112f15e3..2dabd49cd 100644
--- a/.github/build-ci/data/standard.json
+++ b/.github/build-ci/data/standard.json
@@ -1,4 +1,4 @@
 {
     "gcc_compiler_version": "13.2.0",
-    "target": " x86_64"
+    "target": "x86_64"
 }