From bcf919d9847eec7c0a94048e89cf8786067d3525 Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Tue, 12 May 2026 07:42:11 -0400
Subject: [PATCH 1/3] replace context macro usage with inline function calls

---
 src/gpu_kernel.cpp            | 10 ++++-----
 src/green/gpu/gpu_kernel.h    | 14 ++++++-------
 src/green/gpu/gw_gpu_kernel.h |  2 +-
 src/gw_gpu_kernel.cpp         | 30 +++++++++++++--------------
 src/hf_gpu_kernel.cpp         | 38 +++++++++++++++++------------------
 test/cu_solver_test.cpp       |  6 +++---
 6 files changed, 50 insertions(+), 50 deletions(-)
diff --git a/src/gpu_kernel.cpp b/src/gpu_kernel.cpp
index c6e6db4..acd4367 100644
--- a/src/gpu_kernel.cpp
+++ b/src/gpu_kernel.cpp
@@ -24,17 +24,17 @@
 namespace green::gpu {
 
   void gpu_kernel::setup_MPI_structure() {
-    _devCount_total = (utils::context.node_rank < _devCount_per_node) ? 1 : 0;
-    MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context.global);
-    if (!utils::context.global_rank && _verbose > 1)
+    _devCount_total = (utils::context().node_rank < _devCount_per_node) ? 1 : 0;
+    MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context().global);
+    if (!utils::context().global_rank && _verbose > 1)
       std::cout << "Your host has " << _devCount_per_node << " devices/node and we'll use " << _devCount_total
                 << " devices in total." << std::endl;
-    if (_devCount_total > _ink && !utils::context.global_rank && _verbose > 0) {
+    if (_devCount_total > _ink && !utils::context().global_rank && _verbose > 0) {
       std::cerr << "***Warining***: The maximum number of GPUs to parallel would be " << _ink << " for cuGW and " << _ink
                 << " for cuHF. Extra resources would simply be idle." << std::endl;
     }
 
-    utils::setup_devices_communicator(utils::context.global, utils::context.global_rank, utils::context.node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank,
+    utils::setup_devices_communicator(utils::context().global, utils::context().global_rank, utils::context().node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank,
                                _devices_size);
   }
 
diff --git a/src/green/gpu/gpu_kernel.h b/src/green/gpu/gpu_kernel.h
index 0e55a82..d845377 100644
--- a/src/green/gpu/gpu_kernel.h
+++ b/src/green/gpu/gpu_kernel.h
@@ -43,7 +43,7 @@ namespace green::gpu {
         _naosq(nao * nao), _nao3(nao * nao * nao), _NQnaosq(NQ * nao * nao), _nk_batch(0), _devices_comm(MPI_COMM_NULL),
         _devices_rank(0), _devices_size(0), _shared_win(MPI_WIN_NULL), _devCount_total(0), _devCount_per_node(0),
         _low_device_memory(p["cuda_low_gpu_memory"]), _verbose(p["verbose"]), _Vk1k2_Qij(nullptr) {
-      check_for_cuda(utils::context.global, utils::context.global_rank, _devCount_per_node, _verbose);
+      check_for_cuda(utils::context().global, utils::context().global_rank, _devCount_per_node, _verbose);
       if (p["cuda_low_cpu_memory"].as<bool>()) {
         _coul_int_reading_type = chunks;
       } else {
@@ -72,9 +72,9 @@ namespace green::gpu {
         allocate_shared_Coulomb(&_Vk1k2_Qij);
         statistics.end();
       } else {
-        if (!utils::context.global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl;
+        if (!utils::context().global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl;
       }
-      MPI_Barrier(utils::context.global);
+      MPI_Barrier(utils::context().global);
     }
 
     /**
@@ -93,7 +93,7 @@ namespace green::gpu {
       if (_coul_int_reading_type == as_a_whole) {
         statistics.start("read whole integral");
         MPI_Win_fence(0, _shared_win);
-        coul_int->read_entire(_Vk1k2_Qij, utils::context.node_rank, utils::context.node_size);
+        coul_int->read_entire(_Vk1k2_Qij, utils::context().node_rank, utils::context().node_size);
         MPI_Win_fence(0, _shared_win);
         statistics.end();
       }
@@ -106,15 +106,15 @@ namespace green::gpu {
     void allocate_shared_Coulomb(std::complex<prec>** Vk1k2_Qij) {
       size_t   number_elements    = _bz_utils.k_symmetry().num_kpair_stored() * _NQ * _naosq;
       MPI_Aint shared_buffer_size = number_elements * sizeof(std::complex<prec>);
-      if (!utils::context.global_rank && _verbose > 0) {
+      if (!utils::context().global_rank && _verbose > 0) {
         std::cout << std::setprecision(4);
         std::cout << "Reading the entire Coulomb integrals at once. Estimated memory requirement per node = "
                   << (double)shared_buffer_size / 1024 / 1024 / 1024 << " GB." << std::endl;
         std::cout << std::setprecision(15);
       }
       // Collective operations among node_comm
-      utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context.node_comm,
-                                     utils::context.node_rank);
+      utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context().node_comm,
+                                     utils::context().node_rank);
     }
 
   protected:
diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h
index adcc2ce..b69afd4 100644
--- a/src/green/gpu/gw_gpu_kernel.h
+++ b/src/green/gpu/gw_gpu_kernel.h
@@ -182,7 +182,7 @@ namespace green::gpu {
      */
     x2c_gw_gpu_kernel(const params::params& p, size_t nao, size_t nso, size_t ns, size_t NQ, const grids::transformer_t& ft,
                   const bz_utils_t& bz_utils, LinearSolverType cuda_lin_solver, int verbose = 1) : gw_gpu_kernel(p, nao, nso, ns, NQ, ft, bz_utils, cuda_lin_solver, verbose) {
-      if (!_low_device_memory && !utils::context.global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"<<std::endl;
+      if (!_low_device_memory && !utils::context().global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"<<std::endl;
       _low_device_memory = true;
       if (verbose > 0) {
         complexity_estimation();
diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index ffb0c65..328766b 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -123,7 +123,7 @@ namespace green::gpu {
       _flop_count = flop_count_firstmatmul + flop_count_transforms + flop_count_fourier
                   + flop_count_solver + flop_count_secondmatmul;
 
-      if (!utils::context.global_rank && _verbose > 1) {
+      if (!utils::context().global_rank && _verbose > 1) {
         std::cout << "############ Total GW Operations per Iteration ############" << std::endl;
         std::cout << "Total:         " << _flop_count << std::endl;
         std::cout << "First matmul:  " << flop_count_firstmatmul << std::endl;
@@ -149,7 +149,7 @@ namespace green::gpu {
       double flop_count_secondmatmul=_ink*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao));
       _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul;
 
-      if (!utils::context.global_rank && _verbose > 1) {
+      if (!utils::context().global_rank && _verbose > 1) {
         std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl;
         std::cout << "Total:         " << _flop_count << std::endl;
         std::cout << "First matmul:  " << flop_count_firstmatmul << std::endl;
@@ -166,11 +166,11 @@ namespace green::gpu {
       statistics.start("total");
       statistics.start("Initialization: CPU");
       sigma_tau.fence();
-      if (!utils::context.node_rank) sigma_tau.object().set_zero();
+      if (!utils::context().node_rank) sigma_tau.object().set_zero();
       sigma_tau.fence();
       setup_MPI_structure();
       _coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils);
-      MPI_Barrier(utils::context.global);
+      MPI_Barrier(utils::context().global);
       set_shared_Coulomb();
       statistics.end();
       update_integrals(_coul_int, statistics);
@@ -178,20 +178,20 @@ namespace green::gpu {
       if (_devices_comm != MPI_COMM_NULL) {
         gw_innerloop(g, sigma_tau);
       }
-      MPI_Barrier(utils::context.global);
+      MPI_Barrier(utils::context().global);
       sigma_tau.fence();
       // Print effective FLOPs achieved in the calculation
       flops_achieved();
-      if (!utils::context.node_rank) {
+      if (!utils::context().node_rank) {
         if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce");
-        utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm);
+        utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context().internode_comm);
         sigma_tau.object() /= (_nk);
         if (_devices_comm != MPI_COMM_NULL) statistics.end();
       }
       sigma_tau.fence();
-      MPI_Barrier(utils::context.global);
+      MPI_Barrier(utils::context().global);
       statistics.end();
-      statistics.print(utils::context.global);
+      statistics.print(utils::context().global);
       print_effective_flops();
       // Reset all timing stats for next iteration
       statistics.reset();
@@ -199,7 +199,7 @@ namespace green::gpu {
       clean_MPI_structure();
       clean_shared_Coulomb();
       delete _coul_int;
-      MPI_Barrier(utils::context.global);
+      MPI_Barrier(utils::context().global);
       MPI_Type_free(&dt_matrix);
       MPI_Op_free(&matrix_sum_op);
     }
@@ -234,7 +234,7 @@ namespace green::gpu {
         MPI_Reduce(&min_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm);
         MPI_Reduce(&avg_eff_flops, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm);
       }
-      if (!utils::context.global_rank && _verbose > 1) {
+      if (!utils::context().global_rank && _verbose > 1) {
         auto old_precision = std::cout.precision();
         std::cout << std::setprecision(6);
         std::cout << "===================   GPU Performance   ====================" << std::endl;
@@ -300,8 +300,8 @@ namespace green::gpu {
       // k-space AO transforms are only needed for scalar (non-relativistic) calculations.
       cu_symmetry_data sym_data = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/true, /*build_q_p0=*/true);
       cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, _ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data, g.object(),
-                            _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context.global_rank,
-                            utils::context.node_rank, _devCount_per_node);
+                            _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context().global_rank,
+                            utils::context().node_rank, _devCount_per_node);
       statistics.end();
       gw_reader0_callback<prec> r0 = [&](int k_ibz, tensor<std::complex<prec>,4>& Gk_smtij) {
         copy_Gk(g.object(), Gk_smtij, k_ibz, true);
@@ -425,7 +425,7 @@ namespace green::gpu {
         throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
       if (_nqkpt == 0)
         throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch");
-      if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) {
+      if (_nqkpt == 1 && _ink != 1 && !utils::context().global_rank) {
         if (_nt_batch > 1)
           std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl;
         else
@@ -472,7 +472,7 @@ namespace green::gpu {
       cu_symmetry_data sym_data_x2c = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/false, /*build_q_p0=*/true);
       cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, psuedo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c,
                             g.object(), true, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver,
-                            utils::context.global_rank, utils::context.node_rank, _devCount_per_node);
+                            utils::context().global_rank, utils::context().node_rank, _devCount_per_node);
       statistics.end();
       // r0: called per star member (k_full) in cu_routines.cu.
       // copy_Gk_2c looks up k_ibz internally and applies X2C TR (spin-flip + conj) on the CPU.
diff --git a/src/hf_gpu_kernel.cpp b/src/hf_gpu_kernel.cpp
index 51f99cb..c9be09e 100644
--- a/src/hf_gpu_kernel.cpp
+++ b/src/hf_gpu_kernel.cpp
@@ -70,7 +70,7 @@ namespace green::gpu {
     new_Fock.set_zero();
     setup_MPI_structure();
     _coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils);
-    MPI_Barrier(utils::context.global);
+    MPI_Barrier(utils::context().global);
     set_shared_Coulomb();
     statistics.end();
     update_integrals(_coul_int, statistics);
@@ -90,15 +90,15 @@ namespace green::gpu {
     statistics.end();
 
     statistics.start("Fock reduce");
-    utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context.global);
+    utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context().global);
     statistics.end();
     statistics.end();
-    statistics.print(utils::context.global);
+    statistics.print(utils::context().global);
 
     clean_MPI_structure();
     clean_shared_Coulomb();
     delete _coul_int;
-    MPI_Barrier(utils::context.global);
+    MPI_Barrier(utils::context().global);
     return new_Fock;
   }
 
@@ -109,7 +109,7 @@ namespace green::gpu {
     // Also determines _nk_batch
     HF_check_devices_free_space();
     // Each process gets one cuda runner hf_utils
-    cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context.global_rank, utils::context.node_rank,
+    cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context().global_rank, utils::context().node_rank,
                         _devCount_per_node);
 
     statistics.end();
@@ -141,8 +141,8 @@ namespace green::gpu {
     // TODO or NOTE: It looks like we are building the Hartree term on single CPU, with no MPI whatsoever
     // I see - we build the Hartree bubble on all the cpu procs through full sum, and only then use MPI for _ink * _ns
     // to update the Fock. This can be fixed later.
-    if (utils::context.global_rank < _ink * _ns) {
-      int hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size;
+    if (utils::context().global_rank < _ink * _ns) {
+      int hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size;
 
       // Direct diagram
       MatrixXcd  X1(_nao, _nao);
@@ -166,7 +166,7 @@ namespace green::gpu {
       }
       upper_Coul /= double(_nk);
 
-      for (int ii = utils::context.global_rank; ii < _ink * _ns; ii += hf_nprocs) {
+      for (int ii = utils::context().global_rank; ii < _ink * _ns; ii += hf_nprocs) {
         int is   = ii / _ink;
         int ik   = ii % _ink;
         int k_ir = _bz_utils.k_symmetry().full_point(ik);
@@ -185,10 +185,10 @@ namespace green::gpu {
   }
 
   void scalar_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) {
-    if (utils::context.global_rank < _ink * _ns) {
+    if (utils::context().global_rank < _ink * _ns) {
       double prefactor = (_ns == 2) ? 1.0 : 0.5;
-      size_t hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size;
-      for (size_t ii = utils::context.global_rank; ii < _ns * _ink; ii += hf_nprocs) {
+      size_t hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size;
+      for (size_t ii = utils::context().global_rank; ii < _ns * _ink; ii += hf_nprocs) {
         size_t      is = ii / _ink;
         size_t      ik = ii % _ink;
         CMMatrixXcd dmm(dm.data() + is * _ink * _nao * _nao + ik * _nao * _nao, _nao, _nao);
@@ -216,7 +216,7 @@ namespace green::gpu {
                                  _ink * _ns * matmul_cost(1, _naosq, _nk);
     _hf_total_flops = flop_count_direct + flop_count_exchange;
 
-    if (!utils::context.global_rank && _verbose > 1) {
+    if (!utils::context().global_rank && _verbose > 1) {
       std::cout << "############ Total HF Operations per Iteration ############" << std::endl;
       std::cout << "Total:         " << _hf_total_flops << std::endl;
       std::cout << "Matmul (Direct diagram):  " << flop_count_direct << std::endl;
@@ -235,7 +235,7 @@ namespace green::gpu {
     // Each NxN AO block of the 2-component exchange potential is evalulated individually
     // using the non-relativistic functions with pseudo spin = 3 (i.e. aa, bb, ab blocks)
     int pseudo_ns = 3;
-    cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context.global_rank, utils::context.node_rank, _devCount_per_node);
+    cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context().global_rank, utils::context().node_rank, _devCount_per_node);
     statistics.end();
     MPI_Barrier(_devices_comm);
 
@@ -250,8 +250,8 @@ namespace green::gpu {
   }
 
   void x2c_hf_gpu_kernel::compute_direct_selfenergy(ztensor<4> &new_Fock, const ztensor<4> &dm) {
-    if (utils::context.global_rank < _ink) {
-      int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size;
+    if (utils::context().global_rank < _ink) {
+      int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size;
 
       ztensor<3> v(_NQ, _nao, _nao);
       MMatrixXcd vm(v.data(), _NQ, _nao * _nao);
@@ -282,7 +282,7 @@ namespace green::gpu {
 
       MatrixXcd Fm(1, _nao * _nao);
       MMatrixXcd Fmm(Fm.data(), _nao, _nao);
-      for (int ik = utils::context.global_rank; ik < _ink; ik += direct_nprocs) {
+      for (int ik = utils::context().global_rank; ik < _ink; ik += direct_nprocs) {
         int k_ir = _bz_utils.k_symmetry().full_point(ik);
 
         if (_coul_int_reading_type == as_a_whole) {
@@ -301,8 +301,8 @@ namespace green::gpu {
   }
 
   void x2c_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) {
-    if (utils::context.global_rank < _ink * _ns) {
-      int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size;
+    if (utils::context().global_rank < _ink * _ns) {
+      int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size;
       ztensor<3> dm_spblks[3] { {_ink, _nao, _nao}, {_ink, _nao, _nao}, {_ink, _nao, _nao} };
       for (int ik = 0; ik < _ink; ++ik) {
         CMMatrixXcd dmm(dm.data() + ik*_nso*_nso, _nso, _nso);
@@ -314,7 +314,7 @@ namespace green::gpu {
         matrix(dm_spblks[2](ik)) = dmm.block(0, _nao, _nao, _nao);
       }
       MatrixXcd buffer(_nao, _nao);
-      for (size_t iks = utils::context.global_rank; iks < 3*_ink; iks += direct_nprocs) {
+      for (size_t iks = utils::context().global_rank; iks < 3*_ink; iks += direct_nprocs) {
         size_t ik = iks / 3;
         size_t is = iks % 3;
         MMatrixXcd Fm_nso(new_Fock.data() + ik*_nso*_nso, _nso, _nso);
diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp
index 9bedbf1..f047f13 100644
--- a/test/cu_solver_test.cpp
+++ b/test/cu_solver_test.cpp
@@ -81,7 +81,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st
   {
     green::h5pp::archive ar(test_file, "r");
     G_shared.fence();
-    if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object();
+    if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object();
     G_shared.fence();
     ar["result/Sigma1"] >> Sigma1_test;
     ar.close();
@@ -147,10 +147,10 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str
   {
     green::h5pp::archive ar(test_file, "r");
     G_shared.fence();
-    if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object();
+    if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object();
     G_shared.fence();
     S_shared_tst.fence();
-    if (!green::utils::context.node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object();
+    if (!green::utils::context().node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object();
     S_shared_tst.fence();
     ar.close();
   }

From ab0862e12de944bf02157e3e377fcfba7bdec4e2 Mon Sep 17 00:00:00 2001
From: Emanuel Gull <egull@users.noreply.github.com>
Date: Tue, 12 May 2026 14:52:44 +0200
Subject: [PATCH 2/3] Potential fix for pull request finding: reword X2C GW low-device-memory log message

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/green/gpu/gw_gpu_kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h
index b69afd4..0edd8e8 100644
--- a/src/green/gpu/gw_gpu_kernel.h
+++ b/src/green/gpu/gw_gpu_kernel.h
@@ -182,7 +182,7 @@ namespace green::gpu {
      */
     x2c_gw_gpu_kernel(const params::params& p, size_t nao, size_t nso, size_t ns, size_t NQ, const grids::transformer_t& ft,
                   const bz_utils_t& bz_utils, LinearSolverType cuda_lin_solver, int verbose = 1) : gw_gpu_kernel(p, nao, nso, ns, NQ, ft, bz_utils, cuda_lin_solver, verbose) {
-      if (!_low_device_memory && !utils::context().global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"<<std::endl;
+      if (!_low_device_memory && !utils::context().global_rank && _verbose > 2) std::cout<<"Forcing X2C GW to use low device memory implementation"<<std::endl;
       _low_device_memory = true;
       if (verbose > 0) {
         complexity_estimation();

From 8f336180f6075883a3e688475509f49339f58b71 Mon Sep 17 00:00:00 2001
From: Gaurav Harsha <gauravharsha05@gmail.com>
Date: Tue, 12 May 2026 09:35:51 -0400
Subject: [PATCH 3/3] fix spelling: psuedo_ns -> pseudo_ns

---
 src/gw_gpu_kernel.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp
index 328766b..6f31bd1 100644
--- a/src/gw_gpu_kernel.cpp
+++ b/src/gw_gpu_kernel.cpp
@@ -467,10 +467,10 @@ namespace green::gpu {
       // Reuse the non-relativistic functions with pseudo spin = 4, the aa, bb, ab, ba blocks.
       // Since the size of the Green's function and self-energy is 4 times largeer,
       // low_device_memory mode is always used.
-      int psuedo_ns = 4;
+      int pseudo_ns = 4;
       // X2C: no k-space AO transforms needed; transform_k_ao_device_2c uses only TR flags.
       cu_symmetry_data sym_data_x2c = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/false, /*build_q_p0=*/true);
-      cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, psuedo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c,
+      cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, pseudo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c,
                             g.object(), true, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver,
                             utils::context().global_rank, utils::context().node_rank, _devCount_per_node);
       statistics.end();
@@ -516,7 +516,7 @@ namespace green::gpu {
 
       ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso);
       statistics.start("Solve cuGW");
-      cugw.accumulate_gw_selfenergy_on_device(_nts, psuedo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij,
+      cugw.accumulate_gw_selfenergy_on_device(_nts, pseudo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij,
                                               Sigma_tskij_host_local, _devices_rank, _devices_size, true, _verbose, r0, r1, r2);
       statistics.end();
       // Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso)