From 976ddc4f662a462f4871d30e28fae667a4722948 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Fri, 29 May 2026 14:43:39 +1000 Subject: [PATCH 01/12] change dims --- src/c/toolkits/petsc/patches/NewMat.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/c/toolkits/petsc/patches/NewMat.cpp b/src/c/toolkits/petsc/patches/NewMat.cpp index 7442a0854..e053ea64f 100644 --- a/src/c/toolkits/petsc/patches/NewMat.cpp +++ b/src/c/toolkits/petsc/patches/NewMat.cpp @@ -109,13 +109,10 @@ PMat NewMat(int M,int N,int connectivity,int numberofdofspernode,ISSM_MPI_Comm c MatSetFromOptions(outmatrix); MatSetOption(outmatrix,MAT_IGNORE_ZERO_ENTRIES,PETSC_TRUE); - /*preallocation according to type: */ - MatGetType(outmatrix,&type); + /*preallocation according to type: + * Use strncmp("mpiaij",...) so that CUDA variants (mpiaijcusparse) also + * get preallocated — they inherit from MPIAIJ and accept the same call. */ + MatGetType(outmatrix,&type); - if((strcmp(type,"mpiaij")==0) || (strcmp(type,"mpidense")==0)){ - MatMPIAIJSetPreallocation(outmatrix,d_nz,NULL,o_nz,NULL); - } - - return outmatrix; -} + if((strncmp(type,"mpiaij",6)==0) || (strcmp(type,"mpidense")==0)){ /*}}}*/ From 590cddc6e41db3027a80471b82d2fb67070b9940 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Fri, 29 May 2026 15:00:28 +1000 Subject: [PATCH 02/12] GPU patches --- m4/issm_options.m4 | 14 +++ src/c/toolkits/petsc/objects/PetscVec.cpp | 13 ++- .../toolkits/petsc/patches/VecToMPISerial.cpp | 97 ++++++------------- src/m/classes/toolkits.py | 1 + 4 files changed, 54 insertions(+), 71 deletions(-) diff --git a/m4/issm_options.m4 b/m4/issm_options.m4 index 4ce29b74a..cb24d1057 100644 --- a/m4/issm_options.m4 +++ b/m4/issm_options.m4 @@ -1394,6 +1394,20 @@ AC_DEFUN([ISSM_OPTIONS],[ AC_DEFINE([_HAVE_PETSC_], [1], [with PETSc in ISSM src]) AC_SUBST([PETSCINCL]) AC_SUBST([PETSCLIB]) + + dnl Detect CUDA support in the configured PETSc install {{{ + AC_MSG_CHECKING(whether PETSc was built with CUDA support) + PETSC_HAS_CUDA=no + if test -f "${PETSC_ROOT}/include/petscconf.h"; then + if grep -q "PETSC_HAVE_CUDA" "${PETSC_ROOT}/include/petscconf.h"; then + PETSC_HAS_CUDA=yes + fi + fi + AC_MSG_RESULT([${PETSC_HAS_CUDA}]) + if test "x${PETSC_HAS_CUDA}" == "xyes"; then + AC_DEFINE([_HAVE_PETSC_CUDA_], [1], [PETSc built with CUDA support]) + fi + dnl }}} fi dnl }}} dnl MPI{{{ diff --git a/src/c/toolkits/petsc/objects/PetscVec.cpp b/src/c/toolkits/petsc/objects/PetscVec.cpp index 85ecc45b7..f9b9449e2 100644 --- a/src/c/toolkits/petsc/objects/PetscVec.cpp +++ b/src/c/toolkits/petsc/objects/PetscVec.cpp @@ -170,9 +170,16 @@ void PetscVec::GetLocalVector(doubletype** pvector,int** pindices){/ /*Build indices*/ int* indices=xNew(range); for(int i=0;i(range); - VecGetValues(this->vector,range,indices,values); + /*Get vector: use VecGetArrayRead which is CUDA-aware (PETSc downloads from + * device transparently) instead of VecGetValues which forces a stalling + * GPU->CPU copy for every index lookup.*/ + doubletype* values=xNew(range); + { + const PetscScalar* arr=NULL; + VecGetArrayRead(this->vector,&arr); + for(int i=0;ivector,&arr); + } *pvector = values; *pindices = indices; diff --git a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp index 368e13bda..4fc32dcab 100644 --- a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp +++ b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp @@ -55,91 +55,52 @@ template int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm comm,bool broadcast){ - int i; - int num_procs; - int my_rank; - - /*Petsc*/ - ISSM_MPI_Status status; - PetscInt lower_row,upper_row; - int range; - int * idxn=NULL; - int buffer[3]; - - /*intermediary results*/ - doubletype* local_vector=NULL; - - /*input*/ - int vector_size; - /*Output*/ - doubletype* gathered_vector=NULL; //Global vector holding the final assembled vector on all nodes. + doubletype* gathered_vector = NULL; + const PetscScalar* vec_array = NULL; - /*recover my_rank and num_procs*/ - ISSM_MPI_Comm_size(comm,&num_procs); - ISSM_MPI_Comm_rank(comm,&my_rank); + /*Sequential vector and scatter context*/ + vectype vector_seq = NULL; + VecScatter ctx = NULL; + /*Check for empty vector*/ + int vector_size; VecGetSize(vector,&vector_size); if(vector_size==0){ *pgathered_vector=NULL; return 1; } - /*Allocate gathered vector on all nodes .*/ - if(broadcast || my_rank==0){ - gathered_vector=xNew(vector_size); - } - - /*Allocate local vectors*/ - VecGetOwnershipRange(vector,&lower_row,&upper_row); - upper_row--; - range=upper_row-lower_row+1; - - if (range){ - idxn=xNew(range); - for (i=0;i(range); - /*Extract values from MPI vector to serial local_vector on each node*/ - VecGetValues(vector,range,idxn,local_vector); + /*Create scatter: ToAll replicates on every rank; ToZero gathers only on rank 0. + * VecScatterBegin/End are CUDA-aware: PETSc handles GPU->CPU transfer internally, + * avoiding the deprecated VecGetValues path which stalls the GPU pipeline.*/ + if(broadcast){ + VecScatterCreateToAll(vector,&ctx,&vector_seq); } - - /*Now each node holds its local_vector containing range rows. - * We send this local_vector to the gathered_vector on node 0*/ - for (i=1;i(),0,1,comm); - } - if (my_rank==0){ - ISSM_MPI_Recv(buffer,3,ISSM_MPI_INT,i,1,comm,&status); - if (buffer[2])ISSM_MPI_Recv(gathered_vector+buffer[1],buffer[2],TypeToMPIType(),i,1,comm,&status); - } + else{ + VecScatterCreateToZero(vector,&ctx,&vector_seq); } - if (my_rank==0){ - //Still have the local_vector on node 0 to take care of. - if (range) { - xMemCpy(&gathered_vector[lower_row], local_vector, range); - } + VecScatterBegin(ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD); + VecScatterEnd( ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD); + + /*Copy data from sequential vector (always CPU-resident after scatter)*/ + int n; + VecGetSize(vector_seq,&n); + if(n>0){ + gathered_vector=xNew(n); + VecGetArrayRead(vector_seq,&vec_array); + for(int i=0;i(),0,comm); - } + /*Destroy scatter context and sequential vector*/ + VecScatterDestroy(&ctx); + VecDestroy(&vector_seq); - /*Assign output pointers: */ + /*Assign output pointer*/ *pgathered_vector=gathered_vector; - /*Free resources: */ - xDelete(idxn); - xDelete(local_vector); - return 1; } diff --git a/src/m/classes/toolkits.py b/src/m/classes/toolkits.py index 97e812a22..14f1d3c8c 100644 --- a/src/m/classes/toolkits.py +++ b/src/m/classes/toolkits.py @@ -1,4 +1,5 @@ from fielddisplay import fielddisplay +from gpuoptions import gpuoptions from iluasmoptions import iluasmoptions from IssmConfig import IssmConfig from issmgslsolver import issmgslsolver From 984516722fa7849b7617b12e2c7d17c2e60afc86 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 09:35:05 +1000 Subject: [PATCH 03/12] remove duplicate gadi file from cluster dir --- src/m/contrib/jkjhew/gadi.py | 248 ----------------------------------- 1 file changed, 248 deletions(-) delete mode 100644 src/m/contrib/jkjhew/gadi.py diff --git a/src/m/contrib/jkjhew/gadi.py b/src/m/contrib/jkjhew/gadi.py deleted file mode 100644 index 37340573a..000000000 --- a/src/m/contrib/jkjhew/gadi.py +++ /dev/null @@ -1,248 +0,0 @@ -import subprocess - -from fielddisplay import fielddisplay -from helpers import * -from IssmConfig import IssmConfig -from issmscpin import issmscpin -from issmscpout import issmscpout -from issmssh import issmssh -from MatlabFuncs import * -from pairoptions import pairoptions -try: - from gadi_settings import gadi_settings -except ImportError: - pass -from QueueRequirements import QueueRequirements - -class gadi(object): - """Gadi cluster class definition. - Usage: - cluster = gadi() # default constructor - cluster = gadi('numnodes',2,'cpuspernode',24,'login','username') ... - """ - - def __init__(self, *args): # {{{ - self.name = 'gadi.nci.org.au' - self.login = '' - # Adjust modules for Gadi as needed: - self.modules = [ - # Example modules or spack loads: - 'openmpi/4.1.2', - # 'python3/3.9.2', - # or just keep them empty if you handle externally: - ] - # For Gadi, you usually specify the total ncpus. But if you still want - # to specify by numnodes & cpuspernode, that’s fine. Adjust as needed: - self.numnodes = 1 - self.cpuspernode = 4 - self.port = '' # typical SSH port - self.queue = 'normal' # or "hugemem", "express", etc. - self.time = 60 # total minutes of walltime, e.g. 60 => 1 hour - self.processor = '' # not usually needed for Gadi - self.srcpath = '' - self.extpkgpath = '' - self.codepath = '' - self.executionpath = '' - self.project = '' # Gadi uses -P - self.interactive = 0 - self.bbftp = 0 - self.numstreams = 8 - self.hyperthreading = 0 - - # Use provided options to change fields - options = pairoptions(*args) - - # If you had a user settings file, try it: - try: - self = gadi_settings(self) - except NameError: - pass - - # Override defaults with user-specified parameters - self = options.AssignObjectFields(self) - - # Ensure login is set correctly - if not self.login: - raise ValueError("Login must be specified for Gadi cluster.") - - # }}} - - def __repr__(self): # {{{ - s = 'class Gadi object\n' - s += ' name: {}\n'.format(self.name) - s += ' login: {}\n'.format(self.login) - s += ' modules: {}\n'.format(strjoin(self.modules, ', ')) - s += ' numnodes: {}\n'.format(self.numnodes) - s += ' cpuspernode: {}\n'.format(self.cpuspernode) - s += ' np: {}\n'.format(self.nprocs()) - s += ' port: {}\n'.format(self.port) - s += ' queue: {}\n'.format(self.queue) - s += ' time (min): {}\n'.format(self.time) - s += ' processor: {}\n'.format(self.processor) - s += ' srcpath: {}\n'.format(self.srcpath) - s += ' extpkgpath: {}\n'.format(self.extpkgpath) - s += ' codepath: {}\n'.format(self.codepath) - s += ' executionpath: {}\n'.format(self.executionpath) - s += ' project: {}\n'.format(self.project) - s += ' interactive: {}\n'.format(self.interactive) - s += ' bbftp: {}\n'.format(self.bbftp) - s += ' numstreams: {}\n'.format(self.numstreams) - s += ' hyperthreading: {}\n'.format(self.hyperthreading) - return s - # }}} - - def nprocs(self): # {{{ - return self.numnodes * self.cpuspernode - # }}} - - def checkconsistency(self, md, solution, analyses): # {{{ - queuedict = { - 'normal': [48*60, 3072], # e.g. up to 48h, 3072 cores - 'express': [2*60, 960], # e.g. up to 2h, 960 cores - 'hugemem': [48*60, 3072], - } - QueueRequirements(queuedict, self.queue, self.nprocs(), self.time) - - # Some minimal checks - if not self.login: - md = md.checkmessage('login is empty') - if not self.srcpath: - md = md.checkmessage('srcpath is empty') - if not self.codepath: - md = md.checkmessage('codepath is empty') - if not self.executionpath: - md = md.checkmessage('executionpath is empty') - if not self.project: - md = md.checkmessage('project (PBS -P) is empty') - return self - # }}} - - def BuildQueueScript( - self, dirname, modelname, solution, - io_gather, isvalgrind, isgprof, isdakota, isoceancoupling - ): # {{{ - """ - Create a PBS script for Gadi. - Gadi typically uses #PBS lines like: - - #PBS -P - - #PBS -q normal - - #PBS -l ncpus=...,walltime=HH:MM:SS,mem=... - - #PBS -l wd - - #PBS -j oe - """ - - if isgprof: - print('gprof not typically used on Gadi via this script, ignoring...') - - executable = 'issm.exe' - if isdakota: - version_str = IssmConfig('_DAKOTA_VERSION_') - # e.g. "6.14" - version_float = float(version_str[0:3]) - if version_float >= 6.0: - executable = 'issm_dakota.exe' - if isoceancoupling: - executable = 'issm_ocean.exe' - - # Convert self.time (minutes) to hh:mm:ss - hours = self.time // 60 - minutes = self.time % 60 - walltime_str = '{:02d}:{:02d}:00'.format(hours, minutes) - - # Write queue script - fid = open(modelname + '.queue', 'w') - fid.write('#!/bin/bash\n') - fid.write('#PBS -P {}\n'.format(self.project)) - fid.write('#PBS -q {}\n'.format(self.queue)) - fid.write('#PBS -l ncpus={}\n'.format(self.nprocs())) - # Optionally request memory (example: 4GB x nprocs): - # fid.write('#PBS -l mem={}GB\n'.format(4 * self.nprocs())) - fid.write('#PBS -l walltime={}\n'.format(walltime_str)) - fid.write('#PBS -l wd\n') - fid.write('#PBS -j oe\n') - fid.write('#PBS -l storage=scratch/{0}+gdata/{0}\n'.format(self.project)) - fid.write('#PBS -m bea\n') - fid.write('#PBS -o {}/{}/{}.outlog \n'.format(self.executionpath, dirname, modelname)) - fid.write('#PBS -e {}/{}/{}.errlog \n\n'.format(self.executionpath, dirname, modelname)) - - fid.write('\n# Load modules as needed:\n') - fid.write('module purge\n') - for m in self.modules: - fid.write('module load {}\n'.format(m)) - - # Optionally source environment scripts if needed: - # fid.write('source /g/data/...someSpackOrCondaEnv...\n') - - fid.write('\n# Switch to run directory (if not using -l wd):\n') - fid.write('cd {}/{}/\n'.format(self.executionpath, dirname)) - fid.write('\n# Now launch the job:\n') - - - if not isvalgrind: - fid.write('mpiexec -n {} {}/{} {} {}/{} {}\n'.format( - self.nprocs(), self.codepath, executable, - solution, self.executionpath, dirname, modelname)) - else: - # Example valgrind usage - supstring = '' - # If you have self.valgrindsup or so, otherwise remove - # for supfile in self.valgrindsup: - # supstring += ' --suppressions=' + supfile - fid.write( - 'mpiexec -n {} valgrind --leak-check=full{} {}/{} {} {}/{} {}\n'.format( - self.nprocs(), supstring, self.codepath, executable, - solution, self.executionpath, dirname, modelname)) - - if not io_gather: - # Merge partial outbin files if your code writes multiple .outbin.# - fid.write('\ncat {}.outbin.* > {}.outbin\n'.format(modelname, modelname)) - - fid.close() - # }}} - - def UploadQueueJob(self, modelname, dirname, filelist): # {{{ - # Compress inputs into a tarball - compressstring = 'tar -zcf {}.tar.gz'.format(dirname) - for f in filelist: - compressstring += ' {}'.format(f) - - subprocess.call(compressstring, shell=True) - print('Uploading input file and queueing script to Gadi...') - directory = self.executionpath - issmscpout(self.name, directory, self.login, self.port, [dirname + '.tar.gz']) - # }}} - - def LaunchQueueJob(self, modelname, dirname, filelist, restart, batch): # {{{ - """ - On Gadi, typically you do: - qsub modelname.queue - rather than `./modelname.queue`. - """ - if not isempty(restart): - # If "restart" logic is needed, adapt as necessary - launchcommand = ( - 'cd {executionpath}/{dirname} && qsub {modelname}.queue' - .format(executionpath=self.executionpath, - dirname=dirname, modelname=modelname) - ) - else: - # Create/clean directory, extract tar, then qsub - launchcommand = ( - 'cd {executionpath} && ' - 'rm -rf ./{dirname} && mkdir {dirname} && cd {dirname} && ' - 'mv ../{dirname}.tar.gz ./ && ' - 'tar -zxf {dirname}.tar.gz && ' - 'qsub {modelname}.queue' - .format(executionpath=self.executionpath, - dirname=dirname, modelname=modelname) - ) - - print('Launching solution sequence on Gadi via SSH...') - issmssh(self.name, self.login, self.port, launchcommand) - # }}} - - def Download(self, dirname, filelist): # {{{ - # Copy files from Gadi back to local machine - directory = '{}/{}/'.format(self.executionpath, dirname) - issmscpin(self.name, self.login, self.port, directory, filelist) - # }}} From 0b869271986bd9f6c97d3c71e45c105ad7e69e4b Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 13:17:04 +1000 Subject: [PATCH 04/12] Fix +ad CI: guard CUDA scatter path with _HAVE_PETSC_CUDA_ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In +ad builds PVec resolves to adjoint_petsc::ADVec (ADVecImpl*) which is not compatible with VecScatterCreate*/VecGetArrayRead — those APIs only accept plain Vec. +cuda and +ad are mutually exclusive (Spack conflict), so: _HAVE_PETSC_CUDA_ defined -> scatter / VecGetArrayRead (Vec only) _HAVE_PETSC_CUDA_ not set -> original VecGetValues path (handles ADVec) Fixes CI failure in gcc-ad build (PR #35). --- src/c/toolkits/petsc/objects/PetscVec.cpp | 11 +- .../toolkits/petsc/patches/VecToMPISerial.cpp | 122 +++++++++++++----- 2 files changed, 96 insertions(+), 37 deletions(-) diff --git a/src/c/toolkits/petsc/objects/PetscVec.cpp b/src/c/toolkits/petsc/objects/PetscVec.cpp index f9b9449e2..1f75b0d2d 100644 --- a/src/c/toolkits/petsc/objects/PetscVec.cpp +++ b/src/c/toolkits/petsc/objects/PetscVec.cpp @@ -170,16 +170,21 @@ void PetscVec::GetLocalVector(doubletype** pvector,int** pindices){/ /*Build indices*/ int* indices=xNew(range); for(int i=0;iCPU copy for every index lookup.*/ doubletype* values=xNew(range); +#ifdef _HAVE_PETSC_CUDA_ + /*CUDA build: VecGetArrayRead is GPU-aware; returns the local block after a + * single device->host transfer. Safe here because +cuda implies ~ad, so + * this->vector is plain Vec, never ADVecImpl. */ { const PetscScalar* arr=NULL; VecGetArrayRead(this->vector,&arr); for(int i=0;ivector,&arr); } +#else + /*Non-CUDA build (including +ad where this->vector may be ADVecImpl*). */ + VecGetValues(this->vector,range,indices,values); +#endif *pvector = values; *pindices = indices; diff --git a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp index 4fc32dcab..2113d87d8 100644 --- a/src/c/toolkits/petsc/patches/VecToMPISerial.cpp +++ b/src/c/toolkits/petsc/patches/VecToMPISerial.cpp @@ -55,15 +55,6 @@ template int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm comm,bool broadcast){ - /*Output*/ - doubletype* gathered_vector = NULL; - const PetscScalar* vec_array = NULL; - - /*Sequential vector and scatter context*/ - vectype vector_seq = NULL; - VecScatter ctx = NULL; - - /*Check for empty vector*/ int vector_size; VecGetSize(vector,&vector_size); if(vector_size==0){ @@ -71,36 +62,99 @@ int VecToMPISerial(doubletype** pgathered_vector, vectype vector,ISSM_MPI_Comm c return 1; } - /*Create scatter: ToAll replicates on every rank; ToZero gathers only on rank 0. - * VecScatterBegin/End are CUDA-aware: PETSc handles GPU->CPU transfer internally, - * avoiding the deprecated VecGetValues path which stalls the GPU pipeline.*/ - if(broadcast){ - VecScatterCreateToAll(vector,&ctx,&vector_seq); - } - else{ - VecScatterCreateToZero(vector,&ctx,&vector_seq); - } + doubletype* gathered_vector=NULL; + +#ifdef _HAVE_PETSC_CUDA_ + /*CUDA build: use VecScatter which is GPU-aware. PETSc handles device->host + * transfer internally. Cannot use VecGetValues on CUDA vectors (deprecated + * and stalls the GPU pipeline). + * Safe here because +cuda and +ad are mutually exclusive (Spack conflict), + * so vectype is always plain Vec, never ADVecImpl. */ + { + const PetscScalar* vec_array=NULL; + vectype vector_seq=NULL; + VecScatter ctx=NULL; - VecScatterBegin(ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD); - VecScatterEnd( ctx,vector,vector_seq,INSERT_VALUES,SCATTER_FORWARD); - - /*Copy data from sequential vector (always CPU-resident after scatter)*/ - int n; - VecGetSize(vector_seq,&n); - if(n>0){ - gathered_vector=xNew(n); - VecGetArrayRead(vector_seq,&vec_array); - for(int i=0;i0){ + gathered_vector=xNew(n); + VecGetArrayRead(vector_seq,&vec_array); + for(int i=0;i(vector_size); + } + VecGetOwnershipRange(vector,&lower_row,&upper_row); + upper_row--; + range=upper_row-lower_row+1; + + if(range){ + idxn=xNew(range); + for(i=0;i(range); + VecGetValues(vector,range,idxn,local_vector); + } + + for(i=1;i(),0,1,comm); + } + if(my_rank==0){ + ISSM_MPI_Recv(buffer,3,ISSM_MPI_INT,i,1,comm,&status); + if(buffer[2]) ISSM_MPI_Recv(gathered_vector+buffer[1],buffer[2],TypeToMPIType(),i,1,comm,&status); + } + } + + if(my_rank==0 && range){ + xMemCpy(&gathered_vector[lower_row],local_vector,range); + } + + if(broadcast){ + ISSM_MPI_Bcast(gathered_vector,vector_size,TypeToMPIType(),0,comm); + } + + xDelete(idxn); + xDelete(local_vector); + } +#endif + + *pgathered_vector=gathered_vector; return 1; } From df382b430fce5eaa7eec1a1179bb35b0437011f8 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 13:30:12 +1000 Subject: [PATCH 05/12] Fix NewMat.cpp: restore missing function body after strncmp edit The strncmp commit accidentally dropped the preallocation call, closing braces and return statement, leaving only the fold marker. Restore them. --- src/c/toolkits/petsc/patches/NewMat.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/c/toolkits/petsc/patches/NewMat.cpp b/src/c/toolkits/petsc/patches/NewMat.cpp index e053ea64f..4f667bf6e 100644 --- a/src/c/toolkits/petsc/patches/NewMat.cpp +++ b/src/c/toolkits/petsc/patches/NewMat.cpp @@ -115,4 +115,9 @@ PMat NewMat(int M,int N,int connectivity,int numberofdofspernode,ISSM_MPI_Comm c MatGetType(outmatrix,&type); if((strncmp(type,"mpiaij",6)==0) || (strcmp(type,"mpidense")==0)){ + MatMPIAIJSetPreallocation(outmatrix,d_nz,NULL,o_nz,NULL); + } + + return outmatrix; +} /*}}}*/ From 5193072e1d02a45e4aaea8bf57941ef5225a26b9 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 14:02:05 +1000 Subject: [PATCH 06/12] Add gcc-cuda CI manifest for GPU/CUDA build variant --- .../build-ci/manifests/gcc-cuda.spack.yaml.j2 | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/build-ci/manifests/gcc-cuda.spack.yaml.j2 diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 new file mode 100644 index 000000000..2780d94b9 --- /dev/null +++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 @@ -0,0 +1,19 @@ +spack: + specs: + - issm @git.{{ ref }} +wrappers ~ad +cuda + packages: + # Workaround due to https://github.com/spack/spack-packages/pull/3644 + re2c: + require: + - '@3.1' + + gcc: + require: + - '@{{ gcc_compiler_version }}' + all: + require: + - '%access_gcc' + - 'target={{ target }}' + concretizer: + unify: false + view: false From 5c2ec4852d9f2ea56dc0030bc12b8f21960d4179 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 14:50:40 +1000 Subject: [PATCH 07/12] update --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2dc0bf9f5..e6b3a6177 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,5 +31,6 @@ jobs: spack-manifest-path: ${{ matrix.file }} allow-ssh-into-spack-install: false # If true, PR author must ssh into instance to complete job spack-manifest-data-path: .github/build-ci/data/standard.json + access-spack-packages-ref: justinh2002/cuda_issm_gpu # Default args (including explicit spack/spack-packages/spack-config versions) # are specified in https://github.com/ACCESS-NRI/build-ci/tree/v3/.github/workflows#inputs From 2010bbb28529509295c3f6564c671e81fb938532 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 15:01:24 +1000 Subject: [PATCH 08/12] test From 62fb8652cd665edca7179cd35e0c283c7af5b812 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 2 Jun 2026 15:05:20 +1000 Subject: [PATCH 09/12] test From 5d670dda72c6b7fe490867494b0a51e8e4cf5abb Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Wed, 3 Jun 2026 10:56:48 +1000 Subject: [PATCH 10/12] add cuda_arch specification --- .github/build-ci/manifests/gcc-cuda.spack.yaml.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 index 2780d94b9..4eb669b6d 100644 --- a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 +++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 @@ -1,6 +1,6 @@ spack: specs: - - issm @git.{{ ref }} +wrappers ~ad +cuda + - issm @git.{{ ref }} +wrappers ~ad +cuda cuda_arch=80 packages: # Workaround due to https://github.com/spack/spack-packages/pull/3644 re2c: From 4a89911b693fc8dc0c365370cf4332d724880d04 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 16 Jun 2026 15:36:31 +1000 Subject: [PATCH 11/12] add remaining gpu option changes --- .github/build-ci/data/standard.json | 2 +- .../build-ci/manifests/gcc-cuda.spack.yaml.j2 | 2 +- .../petsc/install-3.22-gadi-gpu.sh | 20 +- src/c/Makefile.am | 1 + src/c/shared/Enum/EnumDefinitions.h | 1 + src/c/shared/Enum/EnumToStringx.cpp | 1 + src/c/shared/Enum/StringToEnumx.cpp | 1 + .../solutionsequences/AndersonAccelerator.cpp | 202 ++++++++++++++++++ src/c/solutionsequences/AndersonAccelerator.h | 56 +++++ .../solutionsequence_nonlinear.cpp | 22 +- src/m/classes/gpuoptions.py | 50 ++++- 11 files changed, 345 insertions(+), 13 deletions(-) mode change 100644 => 100755 externalpackages/petsc/install-3.22-gadi-gpu.sh create mode 100644 src/c/solutionsequences/AndersonAccelerator.cpp create mode 100644 src/c/solutionsequences/AndersonAccelerator.h diff --git a/.github/build-ci/data/standard.json b/.github/build-ci/data/standard.json index 2dabd49cd..0112f15e3 100644 --- a/.github/build-ci/data/standard.json +++ b/.github/build-ci/data/standard.json @@ -1,4 +1,4 @@ { "gcc_compiler_version": "13.2.0", - "target": "x86_64" + "target": " x86_64" } diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 index 4eb669b6d..2780d94b9 100644 --- a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 +++ b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 @@ -1,6 +1,6 @@ spack: specs: - - issm @git.{{ ref }} +wrappers ~ad +cuda cuda_arch=80 + - issm @git.{{ ref }} +wrappers ~ad +cuda packages: # Workaround due to https://github.com/spack/spack-packages/pull/3644 re2c: diff --git a/externalpackages/petsc/install-3.22-gadi-gpu.sh b/externalpackages/petsc/install-3.22-gadi-gpu.sh old mode 100644 new mode 100755 index 8bbf28b68..6198d0460 --- a/externalpackages/petsc/install-3.22-gadi-gpu.sh +++ b/externalpackages/petsc/install-3.22-gadi-gpu.sh @@ -24,6 +24,21 @@ if [ ! -d "${CUDA_DIR}" ]; then exit 1 fi +# Build PETSc against the SAME MPI that ISSM uses (system OpenMPI), rather than +# letting PETSc download its own MPICH. A mismatched MPI causes issm.exe to be +# linked against two MPI runtimes at once, which breaks any np>1 (multi-rank) +# run: each rank initialises as its own standalone MPI_COMM_WORLD rank-0. +# The OpenMPI compiler wrappers must be on PATH (module load openmpi/4.1.3). +for w in mpicc mpicxx mpif90; do + if ! command -v ${w} &>/dev/null; then + echo "ERROR: ${w} not found. Load OpenMPI first: module load openmpi/4.1.3" + exit 1 + fi +done +if ! mpicc --showme:version 2>/dev/null | grep -qi "open mpi"; then + echo "WARNING: mpicc does not appear to be OpenMPI; check your modules." +fi + # Environment if [ -z ${LDFLAGS+x} ]; then LDFLAGS="" @@ -51,14 +66,17 @@ cd ${PETSC_DIR} --prefix="${PREFIX}" \ --PETSC_DIR="${PETSC_DIR}" \ --LDFLAGS="${LDFLAGS}" \ + --CFLAGS="-g -O2" --CXXFLAGS="-g -O2" --FFLAGS="-g -O2" \ --with-debugging=0 \ --with-valgrind=0 \ --with-x=0 \ --with-ssl=0 \ --with-pic=1 \ + --with-cc=mpicc \ + --with-cxx=mpicxx \ + --with-fc=mpif90 \ --download-fblaslapack=1 \ --download-metis=1 \ - --download-mpich=1 \ --download-mumps=1 \ --download-parmetis=1 \ --download-scalapack=1 \ diff --git a/src/c/Makefile.am b/src/c/Makefile.am index cb7be17ae..623b61905 100644 --- a/src/c/Makefile.am +++ b/src/c/Makefile.am @@ -266,6 +266,7 @@ issm_sources += \ ./solutionsequences/solutionsequence_la_theta.cpp \ ./solutionsequences/solutionsequence_linear.cpp \ ./solutionsequences/solutionsequence_nonlinear.cpp \ + ./solutionsequences/AndersonAccelerator.cpp \ ./solutionsequences/solutionsequence_newton.cpp \ ./solutionsequences/solutionsequence_fct.cpp \ ./solutionsequences/solutionsequence_schurcg.cpp \ diff --git a/src/c/shared/Enum/EnumDefinitions.h b/src/c/shared/Enum/EnumDefinitions.h index e65b65dcd..4c3731309 100644 --- a/src/c/shared/Enum/EnumDefinitions.h +++ b/src/c/shared/Enum/EnumDefinitions.h @@ -695,6 +695,7 @@ enum definitions{ StepEnum, StepsEnum, StressbalanceAbstolEnum, + StressbalanceAndersonDepthEnum, StressbalanceFSreconditioningEnum, StressbalanceIsHydrologyLayerEnum, StressbalanceIsnewtonEnum, diff --git a/src/c/shared/Enum/EnumToStringx.cpp b/src/c/shared/Enum/EnumToStringx.cpp index 2e76cfab7..d38b909b0 100644 --- a/src/c/shared/Enum/EnumToStringx.cpp +++ b/src/c/shared/Enum/EnumToStringx.cpp @@ -703,6 +703,7 @@ const char* EnumToStringx(int en){ case StepEnum : return "Step"; case StepsEnum : return "Steps"; case StressbalanceAbstolEnum : return "StressbalanceAbstol"; + case StressbalanceAndersonDepthEnum : return "StressbalanceAndersonDepth"; case StressbalanceFSreconditioningEnum : return "StressbalanceFSreconditioning"; case StressbalanceIsHydrologyLayerEnum : return "StressbalanceIsHydrologyLayer"; case StressbalanceIsnewtonEnum : return "StressbalanceIsnewton"; diff --git a/src/c/shared/Enum/StringToEnumx.cpp b/src/c/shared/Enum/StringToEnumx.cpp index 92e93b392..5e2fc9466 100644 --- a/src/c/shared/Enum/StringToEnumx.cpp +++ b/src/c/shared/Enum/StringToEnumx.cpp @@ -718,6 +718,7 @@ int StringToEnumx(const char* name,bool notfounderror){ else if (strcmp(name,"Step")==0) return StepEnum; else if (strcmp(name,"Steps")==0) return StepsEnum; else if (strcmp(name,"StressbalanceAbstol")==0) return StressbalanceAbstolEnum; + else if (strcmp(name,"StressbalanceAndersonDepth")==0) return StressbalanceAndersonDepthEnum; else if (strcmp(name,"StressbalanceFSreconditioning")==0) return StressbalanceFSreconditioningEnum; else if (strcmp(name,"StressbalanceIsHydrologyLayer")==0) return StressbalanceIsHydrologyLayerEnum; else if (strcmp(name,"StressbalanceIsnewton")==0) return StressbalanceIsnewtonEnum; diff --git a/src/c/solutionsequences/AndersonAccelerator.cpp b/src/c/solutionsequences/AndersonAccelerator.cpp new file mode 100644 index 000000000..e379ee51c --- /dev/null +++ b/src/c/solutionsequences/AndersonAccelerator.cpp @@ -0,0 +1,202 @@ +/*!\file: AndersonAccelerator.cpp + * \brief: Anderson acceleration for Picard fixed-point iterations. + * + * See AndersonAccelerator.h for theory and usage. + */ + +#include "AndersonAccelerator.h" + +/* ────────────────────────────────────────────────────────────────────────── + * Construction / destruction + * ────────────────────────────────────────────────────────────────────────── */ + +AndersonAccelerator::AndersonAccelerator(int m) : depth(m), iter(0) { + /*reserve to avoid repeated allocation for typical depths <= 10*/ + if(m > 0){ + f_hist.reserve(m+1); + G_hist.reserve(m+1); + } +} + +AndersonAccelerator::~AndersonAccelerator(){ + for(int i=0; i<(int)f_hist.size(); i++) delete f_hist[i]; + for(int i=0; i<(int)G_hist.size(); i++) delete G_hist[i]; +} + +/* ────────────────────────────────────────────────────────────────────────── + * SmallSolve + * + * Gaussian elimination with partial pivoting for a dense mk x mk system. + * On entry: A[i*mk+j], b[i]. + * On exit: theta[i] holds the solution. + * Returns false and leaves theta zeroed if A is numerically singular. + * ────────────────────────────────────────────────────────────────────────── */ +bool AndersonAccelerator::SmallSolve(IssmDouble* A, IssmDouble* b, + IssmDouble* theta, int mk){ + + const IssmDouble tol = 1.0e-14; + + /*copy b into theta; we will overwrite it as we go*/ + for(int i=0; i maxval){ maxval = val; pivot = row; } + } + + if(maxval < tol){ + /*singular or near-singular — fall back to Picard (theta = 0)*/ + for(int i=0; i=0; row--){ + for(int j=row+1; j** puf, + Vector* old_uf){ + + if(depth == 0) return; /* disabled: leave *puf untouched */ + + Vector* G_k = *puf; /* Picard update G(u_k) from linear solve */ + + /* ── 1. Compute Picard residual f_k = G_k - u_k ───────────────────── */ + Vector* f_k = G_k->Duplicate(); + G_k->Copy(f_k); + f_k->AXPY(old_uf, -1.0); /* f_k = G_k - old_uf */ + + /* ── 2. Store G_k and f_k in history before any modification ─────────── */ + Vector* G_store = G_k->Duplicate(); + G_k->Copy(G_store); + + f_hist.push_back(f_k); + G_hist.push_back(G_store); + + /* trim history to at most depth+1 entries (depth+1 gives depth differences)*/ + while((int)f_hist.size() > depth+1){ + delete f_hist.front(); f_hist.erase(f_hist.begin()); + delete G_hist.front(); G_hist.erase(G_hist.begin()); + } + + /* ── 3. Determine effective window size ──────────────────────────────── */ + int mk = (int)f_hist.size() - 1; /* number of difference columns */ + + if(mk == 0){ + /* first iteration or depth==1 first step: keep plain Picard update */ + iter++; + return; + } + + /* ── 4. Build difference vectors dF[j] and dG[j] ───────────────────── */ + /* dF[j] = f_hist[j+1] - f_hist[j] (j = 0 ... mk-1) + * dG[j] = G_hist[j+1] - G_hist[j] */ + std::vector*> dF(mk, NULL); + std::vector*> dG(mk, NULL); + + for(int j=0; jDuplicate(); + f_hist[j+1]->Copy(dF[j]); + dF[j]->AXPY(f_hist[j], -1.0); /* dF[j] = f_hist[j+1] - f_hist[j] */ + + dG[j] = G_hist[j+1]->Duplicate(); + G_hist[j+1]->Copy(dG[j]); + dG[j]->AXPY(G_hist[j], -1.0); /* dG[j] = G_hist[j+1] - G_hist[j] */ + } + + /* ── 5. Form normal equations A theta = b where + * A[i][j] = dF[i] . dF[j] + * b[i] = dF[i] . f_k */ + IssmDouble* A = new IssmDouble[mk*mk]; + IssmDouble* b_rhs = new IssmDouble[mk]; + IssmDouble* theta = new IssmDouble[mk](); + + Vector* f_k_cur = f_hist.back(); /* = f_k we just pushed */ + + for(int i=0; iDot(f_k_cur); + for(int j=0; j<=i; j++){ + A[i*mk+j] = A[j*mk+i] = dF[i]->Dot(dF[j]); + } + } + + /* ── 6. Solve the mk x mk system ────────────────────────────────────── */ + bool ok = SmallSolve(A, b_rhs, theta, mk); + + /* ── 7. Anderson update: G_k -= sum_j theta[j] * dG[j] ───────────── + * If SmallSolve failed (singular) theta is zeroed so G_k is unchanged, + * effectively falling back to Picard for this iteration. */ + if(ok){ + for(int j=0; jAXPY(dG[j], -theta[j]); /* G_k -= theta[j] * dG[j] */ + } + if(VerboseConvergence()) + _printf0_(" Anderson(" << depth << "): mk=" << mk << " applied\n"); + } + else{ + if(VerboseConvergence()) + _printf0_(" Anderson(" << depth << "): mk=" << mk + << " singular, falling back to Picard\n"); + } + + /* ── 8. Clean up ─────────────────────────────────────────────────────── */ + for(int j=0; j u_new = K(u)^{-1} f(u), + * standard Picard sets u_{k+1} = G(u_k). + * Anderson(m) instead solves a small m x m least-squares problem each + * iteration to find the best linear combination of the last m Picard updates, + * replacing the plain update at negligible extra cost. + * + * Usage: + * AndersonAccelerator aa(m); // m=0 -> pure Picard (no-op) + * for(;;){ + * ... linear solve -> uf ... + * aa.Apply(&uf, old_uf); // uf replaced with accelerated update + * ... UpdateInputFromSolution(uf) ... + * } + */ + +#ifndef _ANDERSONACCELERATOR_H_ +#define _ANDERSONACCELERATOR_H_ + +#include +#include "../toolkits/toolkits.h" +#include "../shared/shared.h" + +class AndersonAccelerator { + + public: + + int depth; /* window size m; 0 = disabled (pure Picard) */ + int iter; /* iteration counter (reset on construction) */ + + /* history of Picard residuals f_k = G(u_k) - u_k */ + std::vector*> f_hist; + /* history of raw Picard updates G(u_k) */ + std::vector*> G_hist; + + AndersonAccelerator(int m); + ~AndersonAccelerator(); + + /*! Replace *puf (= G(u_k) from linear solve) with the Anderson-accelerated + * update, given old_uf = u_k (solution vector from the previous iteration). + * On return *puf holds u_{k+1}. */ + void Apply(Vector** puf, Vector* old_uf); + + private: + + /*! Solve the small mk x mk system A theta = b in-place via + * Gaussian elimination with partial pivoting. A and b are + * overwritten. Returns false if the system is singular. */ + bool SmallSolve(IssmDouble* A, IssmDouble* b, IssmDouble* theta, int mk); +}; + +#endif /* _ANDERSONACCELERATOR_H_ */ diff --git a/src/c/solutionsequences/solutionsequence_nonlinear.cpp b/src/c/solutionsequences/solutionsequence_nonlinear.cpp index da407f1ae..0d1459519 100644 --- a/src/c/solutionsequences/solutionsequence_nonlinear.cpp +++ b/src/c/solutionsequences/solutionsequence_nonlinear.cpp @@ -1,8 +1,13 @@ /*!\file: solutionsequence_nonlinear.cpp - * \brief: core of a non-linear solution, using fixed-point method - */ + * \brief: core of a non-linear solution, using fixed-point method + * with optional Anderson acceleration (depth m >= 1). + * + * Anderson depth is read from parameter StressbalanceAndersonDepthEnum + * (default 0 = pure Picard). Recommended starting value: 5. + */ #include "./solutionsequences.h" +#include "./AndersonAccelerator.h" #include "../toolkits/toolkits.h" #include "../classes/classes.h" #include "../shared/shared.h" @@ -27,6 +32,7 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){ int min_mechanical_constraints; int max_nonlinear_iterations; int configuration_type; + int anderson_depth; IssmDouble eps_res,eps_rel,eps_abs; /*Recover parameters: */ @@ -36,6 +42,10 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){ femmodel->parameters->FindParam(&eps_rel,StressbalanceReltolEnum); femmodel->parameters->FindParam(&eps_abs,StressbalanceAbstolEnum); femmodel->parameters->FindParam(&configuration_type,ConfigurationTypeEnum); + /*Anderson depth: 0 = pure Picard (default), >=1 enables acceleration*/ + anderson_depth = 0; + if(femmodel->parameters->Exist(StressbalanceAndersonDepthEnum)) + femmodel->parameters->FindParam(&anderson_depth,StressbalanceAndersonDepthEnum); femmodel->UpdateConstraintsx(); /*Were loads requested as output? : */ @@ -47,6 +57,9 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){ int count=0; bool converged=false; + /*Anderson accelerator (no-op when anderson_depth==0)*/ + AndersonAccelerator anderson(anderson_depth); + /*Start non-linear iteration using input velocity: */ GetSolutionFromInputsx(&ug,femmodel); Reducevectorgtofx(&uf, ug, femmodel->nodes,femmodel->parameters); @@ -76,6 +89,11 @@ void solutionsequence_nonlinear(FemModel* femmodel,bool conserve_loads){ Solverx(&uf, Kff, pf, old_uf, df, femmodel->parameters); femmodel->profiler->Stop(SOLVER); + /*Anderson acceleration: replaces uf with the depth-m accelerated update. + * old_uf = u^(k), uf = G(u^(k)) from linear solve. + * No-op when anderson_depth==0 (pure Picard). */ + anderson.Apply(&uf, old_uf); + Mergesolutionfromftogx(&ug, uf,ys,femmodel->nodes,femmodel->parameters);delete ys; convergence(&converged,Kff,pf,uf,old_uf,eps_res,eps_rel,eps_abs); diff --git a/src/m/classes/gpuoptions.py b/src/m/classes/gpuoptions.py index 8d636ed75..4575eedd7 100644 --- a/src/m/classes/gpuoptions.py +++ b/src/m/classes/gpuoptions.py @@ -10,13 +10,16 @@ def gpuoptions(*args): Requires a CUDA-enabled PETSc build (see externalpackages/petsc/install-3.22-gadi-gpu.sh). - The preconditioner defaults to block-Jacobi with ILU sub-solves, which - PETSc can run on GPU via CUDA. For larger problems consider switching to - 'gamg' (algebraic multigrid) which also has CUDA support. + Defaults to GMRES preconditioned with smoothed-aggregation algebraic + multigrid (GAMG), whose convergence rate is nearly independent of mesh size + -- essential for the iterative GPU solve to scale, since the simpler + block-Jacobi/ILU preconditioner degrades badly under mesh refinement. The + matrix block size and the host-side coarse-grid products below are required + for GAMG to converge and to fit in GPU memory on large meshes. Usage: options = gpuoptions() - options = gpuoptions('ksp_type', 'cg', 'pc_type', 'gamg') + options = gpuoptions('ksp_type', 'cg', 'pc_type', 'bjacobi') """ # Retrieve any overrides passed by the caller @@ -27,9 +30,40 @@ def gpuoptions(*args): gpu['vec_type'] = options.getfieldvalue('vec_type', 'cuda') gpu['mat_type'] = options.getfieldvalue('mat_type', 'aijcusparse') gpu['ksp_type'] = options.getfieldvalue('ksp_type', 'gmres') - gpu['pc_type'] = options.getfieldvalue('pc_type', 'bjacobi') - gpu['sub_pc_type'] = options.getfieldvalue('sub_pc_type', 'ilu') - gpu['ksp_rtol'] = options.getfieldvalue('ksp_rtol', 1e-10) - gpu['ksp_max_it'] = options.getfieldvalue('ksp_max_it', 500) + gpu['pc_type'] = options.getfieldvalue('pc_type', 'gamg') + # SSA stress-balance has 2 coupled DOFs per node (vx, vy). Telling PETSc the + # block size lets GAMG aggregate by node rather than by scalar DOF, which is + # essential for the multigrid hierarchy to converge on large vector-elliptic + # systems. Read by MatSetFromOptions when the stiffness matrix is created. + gpu['mat_block_size'] = options.getfieldvalue('mat_block_size', 2) + # GAMG builds each coarse level with a sparse matrix-matrix product (PtAP). + # CUSPARSE's SpGEMM needs huge scratch buffers and runs the GPU out of memory + # on large meshes (cudaErrorMemoryAllocation in MatProductSymbolic). Compute + # these products on the host (96 GB) instead; the resulting coarse operators + # are small and the Krylov iteration itself stays on the GPU. + # + # GAMG calls MatPtAP() directly (api_user=true), so PETSc reads the + # api-specific option name '-matptap_backend_cpu' -- NOT the generic + # '-mat_product_algorithm_backend_cpu', which is only read on the MatProduct + # interface path. PtAP decomposes into inner A*B products, so cover those too. + gpu['matptap_backend_cpu'] = options.getfieldvalue('matptap_backend_cpu', 'true') + gpu['matmatmult_backend_cpu'] = options.getfieldvalue('matmatmult_backend_cpu', 'true') + gpu['mat_product_algorithm_backend_cpu'] = options.getfieldvalue('mat_product_algorithm_backend_cpu', 'true') + # Stop coarsening when grid has <= 2000 equations (direct solve on coarse grid) + gpu['pc_gamg_coarse_eq_limit'] = options.getfieldvalue('pc_gamg_coarse_eq_limit', 2000) + # Aggressive dropping of weak connections improves coarse grid quality + gpu['pc_gamg_threshold'] = options.getfieldvalue('pc_gamg_threshold', 0.08) + # ksp_rtol is the relative reduction of the *preconditioned* residual at which + # GMRES stops. Empirically a 1e-10 reduction leaves the *true* residual + # ||KU-F||/||F|| at ~1.5e-6 on the 1.6M-DOF case -- just above ISSM's 1e-6 + # acceptance gate (md.settings.solver_residue_threshold). 1e-12 drives the + # true residual to ~1.5e-8, clearing the gate with margin. (Confirmed not an + # iteration/restart limit: result was bit-identical across max_it and restart + # changes, i.e. GMRES converges to rtol well before exhausting iterations.) + gpu['ksp_rtol'] = options.getfieldvalue('ksp_rtol', 1e-12) + gpu['ksp_max_it'] = options.getfieldvalue('ksp_max_it', 5000) + # Longer restart than the GMRES(30) default, so the extra iterations needed to + # reach the tighter rtol on large systems don't stagnate. + gpu['ksp_gmres_restart'] = options.getfieldvalue('ksp_gmres_restart', 100) return gpu From 0ae95149cca67e09e75684b32a1665536aa3e754 Mon Sep 17 00:00:00 2001 From: Justin Kin Jun Hew Date: Tue, 16 Jun 2026 16:23:38 +1000 Subject: [PATCH 12/12] fix target --- .github/build-ci/data/standard.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/build-ci/data/standard.json b/.github/build-ci/data/standard.json index 0112f15e3..2dabd49cd 100644 --- a/.github/build-ci/data/standard.json +++ b/.github/build-ci/data/standard.json @@ -1,4 +1,4 @@ { "gcc_compiler_version": "13.2.0", - "target": " x86_64" + "target": "x86_64" }