ACCESS-NRI · justinh2002 · May 29, 2026 · May 29, 2026 · Jun 1, 2026 · Jun 2, 2026
diff --git a/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2 b/.github/build-ci/manifests/gcc-cuda.spack.yaml.j2
@@ -0,0 +1,19 @@
+spack:
+  specs:
+  - issm @git.{{ ref }} +wrappers ~ad +cuda
+  packages:
+    # Workaround due to https://github.com/spack/spack-packages/pull/3644
+    re2c:
+      require:
+      - '@3.1'
+
+    gcc:
+      require:
+      - '@{{ gcc_compiler_version }}'
+    all:
+      require:
+      - '%access_gcc'
+      - 'target={{ target }}'
+  concretizer:
+    unify: false
+  view: false
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -31,5 +31,6 @@ jobs:
       spack-manifest-path: ${{ matrix.file }}
       allow-ssh-into-spack-install: false  # If true, PR author must ssh into instance to complete job
       spack-manifest-data-path: .github/build-ci/data/standard.json
+      access-spack-packages-ref: justinh2002/cuda_issm_gpu
       # Default args (including explicit spack/spack-packages/spack-config versions)
       # are specified in https://github.com/ACCESS-NRI/build-ci/tree/v3/.github/workflows#inputs
diff --git a/externalpackages/petsc/install-3.22-gadi-gpu.sh b/externalpackages/petsc/install-3.22-gadi-gpu.sh
@@ -24,6 +24,21 @@ if [ ! -d "${CUDA_DIR}" ]; then
     exit 1
 fi
 
+# Build PETSc against the SAME MPI that ISSM uses (system OpenMPI), rather than
+# letting PETSc download its own MPICH.  A mismatched MPI causes issm.exe to be
+# linked against two MPI runtimes at once, which breaks any np>1 (multi-rank)
+# run: each rank initialises as its own standalone MPI_COMM_WORLD rank-0.
+# The OpenMPI compiler wrappers must be on PATH (module load openmpi/4.1.3).
+for w in mpicc mpicxx mpif90; do
+    if ! command -v ${w} &>/dev/null; then
+        echo "ERROR: ${w} not found. Load OpenMPI first: module load openmpi/4.1.3"
+        exit 1
+    fi
+done
+if ! mpicc --showme:version 2>/dev/null | grep -qi "open mpi"; then
+    echo "WARNING: mpicc does not appear to be OpenMPI; check your modules."
+fi
+
 # Environment
 if [ -z ${LDFLAGS+x} ]; then
     LDFLAGS=""
@@ -51,14 +66,17 @@ cd ${PETSC_DIR}
     --prefix="${PREFIX}" \
     --PETSC_DIR="${PETSC_DIR}" \
     --LDFLAGS="${LDFLAGS}" \
+    --CFLAGS="-g -O2" --CXXFLAGS="-g -O2" --FFLAGS="-g -O2" \
     --with-debugging=0 \
     --with-valgrind=0 \
     --with-x=0 \
     --with-ssl=0 \
     --with-pic=1 \
+    --with-cc=mpicc \
+    --with-cxx=mpicxx \
+    --with-fc=mpif90 \
     --download-fblaslapack=1 \
     --download-metis=1 \
-    --download-mpich=1 \
     --download-mumps=1 \
     --download-parmetis=1 \
     --download-scalapack=1 \

diff --git a/m4/issm_options.m4 b/m4/issm_options.m4
@@ -1394,6 +1394,20 @@ AC_DEFUN([ISSM_OPTIONS],[
 		AC_DEFINE([_HAVE_PETSC_], [1], [with PETSc in ISSM src])
 		AC_SUBST([PETSCINCL])
 		AC_SUBST([PETSCLIB])
+
+		dnl Detect CUDA support in the configured PETSc install {{{
+		AC_MSG_CHECKING(whether PETSc was built with CUDA support)
+		PETSC_HAS_CUDA=no
+		if test -f "${PETSC_ROOT}/include/petscconf.h"; then
+			if grep -q "PETSC_HAVE_CUDA" "${PETSC_ROOT}/include/petscconf.h"; then
+				PETSC_HAS_CUDA=yes
+			fi
+		fi
+		AC_MSG_RESULT([${PETSC_HAS_CUDA}])
+		if test "x${PETSC_HAS_CUDA}" == "xyes"; then
+			AC_DEFINE([_HAVE_PETSC_CUDA_], [1], [PETSc built with CUDA support])
+		fi
+		dnl }}}
 	fi
 	dnl }}}
 	dnl MPI{{{

diff --git a/src/c/Makefile.am b/src/c/Makefile.am
@@ -266,6 +266,7 @@ issm_sources += \
 	./solutionsequences/solutionsequence_la_theta.cpp \
 	./solutionsequences/solutionsequence_linear.cpp \
 	./solutionsequences/solutionsequence_nonlinear.cpp \
+	./solutionsequences/AndersonAccelerator.cpp \
 	./solutionsequences/solutionsequence_newton.cpp \
 	./solutionsequences/solutionsequence_fct.cpp \
 	./solutionsequences/solutionsequence_schurcg.cpp \

diff --git a/src/c/shared/Enum/EnumDefinitions.h b/src/c/shared/Enum/EnumDefinitions.h
@@ -695,6 +695,7 @@ enum definitions{
 	StepEnum,
 	StepsEnum,
 	StressbalanceAbstolEnum,
+	StressbalanceAndersonDepthEnum,
 	StressbalanceFSreconditioningEnum,
 	StressbalanceIsHydrologyLayerEnum,
 	StressbalanceIsnewtonEnum,

diff --git a/src/c/shared/Enum/EnumToStringx.cpp b/src/c/shared/Enum/EnumToStringx.cpp
@@ -703,6 +703,7 @@ const char* EnumToStringx(int en){
 		case StepEnum : return "Step";
 		case StepsEnum : return "Steps";
 		case StressbalanceAbstolEnum : return "StressbalanceAbstol";
+		case StressbalanceAndersonDepthEnum : return "StressbalanceAndersonDepth";
 		case StressbalanceFSreconditioningEnum : return "StressbalanceFSreconditioning";
 		case StressbalanceIsHydrologyLayerEnum : return "StressbalanceIsHydrologyLayer";
 		case StressbalanceIsnewtonEnum : return "StressbalanceIsnewton";

diff --git a/src/c/shared/Enum/StringToEnumx.cpp b/src/c/shared/Enum/StringToEnumx.cpp
@@ -718,6 +718,7 @@ int  StringToEnumx(const char* name,bool notfounderror){
 	      else if (strcmp(name,"Step")==0) return StepEnum;
 	      else if (strcmp(name,"Steps")==0) return StepsEnum;
 	      else if (strcmp(name,"StressbalanceAbstol")==0) return StressbalanceAbstolEnum;
+	      else if (strcmp(name,"StressbalanceAndersonDepth")==0) return StressbalanceAndersonDepthEnum;
 	      else if (strcmp(name,"StressbalanceFSreconditioning")==0) return StressbalanceFSreconditioningEnum;
 	      else if (strcmp(name,"StressbalanceIsHydrologyLayer")==0) return StressbalanceIsHydrologyLayerEnum;
 	      else if (strcmp(name,"StressbalanceIsnewton")==0) return StressbalanceIsnewtonEnum;

diff --git a/src/c/solutionsequences/AndersonAccelerator.cpp b/src/c/solutionsequences/AndersonAccelerator.cpp
@@ -0,0 +1,202 @@
+/*!\file:  AndersonAccelerator.cpp
+ * \brief: Anderson acceleration for Picard fixed-point iterations.
+ *
+ * See AndersonAccelerator.h for theory and usage.
+ */
+
+#include "AndersonAccelerator.h"
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * Construction / destruction
+ * ────────────────────────────────────────────────────────────────────────── */
+
+AndersonAccelerator::AndersonAccelerator(int m) : depth(m), iter(0) {
+    /*reserve to avoid repeated allocation for typical depths <= 10*/
+    if(m > 0){
+        f_hist.reserve(m+1);
+        G_hist.reserve(m+1);
+    }
+}
+
+AndersonAccelerator::~AndersonAccelerator(){
+    for(int i=0; i<(int)f_hist.size(); i++) delete f_hist[i];
+    for(int i=0; i<(int)G_hist.size(); i++) delete G_hist[i];
+}
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * SmallSolve
+ *
+ * Gaussian elimination with partial pivoting for a dense mk x mk system.
+ * On entry:  A[i*mk+j], b[i].
+ * On exit:   theta[i] holds the solution.
+ * Returns false and leaves theta zeroed if A is numerically singular.
+ * ────────────────────────────────────────────────────────────────────────── */
+bool AndersonAccelerator::SmallSolve(IssmDouble* A, IssmDouble* b,
+                                     IssmDouble* theta, int mk){
+
+    const IssmDouble tol = 1.0e-14;
+
+    /*copy b into theta; we will overwrite it as we go*/
+    for(int i=0; i<mk; i++) theta[i] = b[i];
+
+    /*forward elimination*/
+    for(int col=0; col<mk; col++){
+
+        /*partial pivot: find row with largest absolute value in this column*/
+        int pivot = col;
+        IssmDouble maxval = fabs(A[col*mk+col]);
+        for(int row=col+1; row<mk; row++){
+            IssmDouble val = fabs(A[row*mk+col]);
+            if(val > maxval){ maxval = val; pivot = row; }
+        }
+
+        if(maxval < tol){
+            /*singular or near-singular — fall back to Picard (theta = 0)*/
+            for(int i=0; i<mk; i++) theta[i] = 0.0;
+            return false;
+        }
+
+        /*swap rows col and pivot in A and theta*/
+        if(pivot != col){
+            for(int j=0; j<mk; j++){
+                IssmDouble tmp = A[col*mk+j];
+                A[col*mk+j]    = A[pivot*mk+j];
+                A[pivot*mk+j]  = tmp;
+            }
+            IssmDouble tmp  = theta[col];
+            theta[col]      = theta[pivot];
+            theta[pivot]    = tmp;
+        }
+
+        /*eliminate column below the pivot*/
+        IssmDouble inv_diag = 1.0 / A[col*mk+col];
+        for(int row=col+1; row<mk; row++){
+            IssmDouble factor = A[row*mk+col] * inv_diag;
+            for(int j=col; j<mk; j++)
+                A[row*mk+j] -= factor * A[col*mk+j];
+            theta[row] -= factor * theta[col];
+        }
+    }
+
+    /*back substitution*/
+    for(int row=mk-1; row>=0; row--){
+        for(int j=row+1; j<mk; j++)
+            theta[row] -= A[row*mk+j] * theta[j];
+        theta[row] /= A[row*mk+row];
+    }
+
+    return true;
+}
+
+/* ──────────────────────────────────────────────────────────────────────────
+ * Apply
+ *
+ * Core of Anderson acceleration.  Called once per nonlinear iteration
+ * after the linear solve has produced a new Picard update in *puf.
+ *
+ * Algorithm (Walker & Ni 2011, "Type 1"):
+ *
+ *   f_k  = G(u_k) - u_k                    Picard residual
+ *   mk   = min(iter, depth)                 effective window
+ *
+ *   if mk == 0:  u_{k+1} = G(u_k)          plain Picard
+ *   else:
+ *     dF[:,j] = f_hist[j+1] - f_hist[j]    mk difference columns
+ *     dG[:,j] = G_hist[j+1] - G_hist[j]
+ *     theta   = argmin_t || f_k - dF t ||  (mk x mk normal equations)
+ *     u_{k+1} = G(u_k) - dG theta          accelerated update
+ * ────────────────────────────────────────────────────────────────────────── */
+void AndersonAccelerator::Apply(Vector<IssmDouble>** puf,
+                                Vector<IssmDouble>* old_uf){
+
+    if(depth == 0) return;   /* disabled: leave *puf untouched */
+
+    Vector<IssmDouble>* G_k = *puf;   /* Picard update G(u_k) from linear solve */
+
+    /* ── 1. Compute Picard residual  f_k = G_k - u_k  ───────────────────── */
+    Vector<IssmDouble>* f_k = G_k->Duplicate();
+    G_k->Copy(f_k);
+    f_k->AXPY(old_uf, -1.0);   /* f_k = G_k - old_uf */
+
+    /* ── 2. Store G_k and f_k in history before any modification ─────────── */
+    Vector<IssmDouble>* G_store = G_k->Duplicate();
+    G_k->Copy(G_store);
+
+    f_hist.push_back(f_k);
+    G_hist.push_back(G_store);
+
+    /* trim history to at most depth+1 entries (depth+1 gives depth differences)*/
+    while((int)f_hist.size() > depth+1){
+        delete f_hist.front();  f_hist.erase(f_hist.begin());
+        delete G_hist.front();  G_hist.erase(G_hist.begin());
+    }
+
+    /* ── 3. Determine effective window size ──────────────────────────────── */
+    int mk = (int)f_hist.size() - 1;   /* number of difference columns */
+
+    if(mk == 0){
+        /* first iteration or depth==1 first step: keep plain Picard update */
+        iter++;
+        return;
+    }
+
+    /* ── 4. Build difference vectors dF[j] and dG[j]  ───────────────────── */
+    /*  dF[j] = f_hist[j+1] - f_hist[j]   (j = 0 ... mk-1)
+     *  dG[j] = G_hist[j+1] - G_hist[j]                                      */
+    std::vector<Vector<IssmDouble>*> dF(mk, NULL);
+    std::vector<Vector<IssmDouble>*> dG(mk, NULL);
+
+    for(int j=0; j<mk; j++){
+        dF[j] = f_hist[j+1]->Duplicate();
+        f_hist[j+1]->Copy(dF[j]);
+        dF[j]->AXPY(f_hist[j], -1.0);      /* dF[j] = f_hist[j+1] - f_hist[j] */
+
+        dG[j] = G_hist[j+1]->Duplicate();
+        G_hist[j+1]->Copy(dG[j]);
+        dG[j]->AXPY(G_hist[j], -1.0);      /* dG[j] = G_hist[j+1] - G_hist[j] */
+    }
+
+    /* ── 5. Form normal equations  A theta = b  where
+     *       A[i][j] = dF[i] . dF[j]
+     *       b[i]    = dF[i] . f_k                                            */
+    IssmDouble* A     = new IssmDouble[mk*mk];
+    IssmDouble* b_rhs = new IssmDouble[mk];
+    IssmDouble* theta = new IssmDouble[mk]();
+
+    Vector<IssmDouble>* f_k_cur = f_hist.back();   /* = f_k we just pushed */
+
+    for(int i=0; i<mk; i++){
+        b_rhs[i] = dF[i]->Dot(f_k_cur);
+        for(int j=0; j<=i; j++){
+            A[i*mk+j] = A[j*mk+i] = dF[i]->Dot(dF[j]);
+        }
+    }
+
+    /* ── 6. Solve the mk x mk system ────────────────────────────────────── */
+    bool ok = SmallSolve(A, b_rhs, theta, mk);
+
+    /* ── 7. Anderson update:  G_k -= sum_j theta[j] * dG[j]  ─────────────
+     *   If SmallSolve failed (singular) theta is zeroed so G_k is unchanged,
+     *   effectively falling back to Picard for this iteration.               */
+    if(ok){
+        for(int j=0; j<mk; j++){
+            G_k->AXPY(dG[j], -theta[j]);   /* G_k -= theta[j] * dG[j] */
+        }
+        if(VerboseConvergence())
+            _printf0_("   Anderson(" << depth << "): mk=" << mk << " applied\n");
+    }
+    else{
+        if(VerboseConvergence())
+            _printf0_("   Anderson(" << depth << "): mk=" << mk
+                      << " singular, falling back to Picard\n");
+    }
+
+    /* ── 8. Clean up ─────────────────────────────────────────────────────── */
+    for(int j=0; j<mk; j++){ delete dF[j]; delete dG[j]; }
+    delete[] A;
+    delete[] b_rhs;
+    delete[] theta;
+
+    /* *puf still points to G_k which has been modified in-place */
+    iter++;
+}
diff --git a/src/c/solutionsequences/AndersonAccelerator.h b/src/c/solutionsequences/AndersonAccelerator.h
@@ -0,0 +1,56 @@
+/*!\file:  AndersonAccelerator.h
+ * \brief: Anderson acceleration (depth-m mixing) for Picard fixed-point iterations.
+ *
+ * Theory: Walker & Ni (2011), SIAM J. Numer. Anal. 49(4), 1715-1735.
+ *
+ * Given the Picard map  G : u -> u_new = K(u)^{-1} f(u),
+ * standard Picard sets  u_{k+1} = G(u_k).
+ * Anderson(m) instead solves a small m x m least-squares problem each
+ * iteration to find the best linear combination of the last m Picard updates,
+ * replacing the plain update at negligible extra cost.
+ *
+ * Usage:
+ *   AndersonAccelerator aa(m);          // m=0 -> pure Picard (no-op)
+ *   for(;;){
+ *     ... linear solve -> uf ...
+ *     aa.Apply(&uf, old_uf);            // uf replaced with accelerated update
+ *     ... UpdateInputFromSolution(uf) ...
+ *   }
+ */
+
+#ifndef _ANDERSONACCELERATOR_H_
+#define _ANDERSONACCELERATOR_H_
+
+#include <vector>
+#include "../toolkits/toolkits.h"
+#include "../shared/shared.h"
+
+class AndersonAccelerator {
+
+  public:
+
+    int  depth;  /* window size m; 0 = disabled (pure Picard) */
+    int  iter;   /* iteration counter (reset on construction)  */
+
+    /* history of Picard residuals  f_k = G(u_k) - u_k  */
+    std::vector<Vector<IssmDouble>*> f_hist;
+    /* history of raw Picard updates  G(u_k)  */
+    std::vector<Vector<IssmDouble>*> G_hist;
+
+    AndersonAccelerator(int m);
+    ~AndersonAccelerator();
+
+    /*! Replace *puf (= G(u_k) from linear solve) with the Anderson-accelerated
+     *  update, given old_uf = u_k (solution vector from the previous iteration).
+     *  On return *puf holds u_{k+1}. */
+    void Apply(Vector<IssmDouble>** puf, Vector<IssmDouble>* old_uf);
+
+  private:
+
+    /*! Solve the small mk x mk system  A theta = b  in-place via
+     *  Gaussian elimination with partial pivoting.  A and b are
+     *  overwritten.  Returns false if the system is singular. */
+    bool SmallSolve(IssmDouble* A, IssmDouble* b, IssmDouble* theta, int mk);
+};
+
+#endif  /* _ANDERSONACCELERATOR_H_ */