From bcf919d9847eec7c0a94048e89cf8786067d3525 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 12 May 2026 07:42:11 -0400 Subject: [PATCH 1/3] replace context macro usage with inline function calls --- src/gpu_kernel.cpp | 10 ++++----- src/green/gpu/gpu_kernel.h | 14 ++++++------- src/green/gpu/gw_gpu_kernel.h | 2 +- src/gw_gpu_kernel.cpp | 30 +++++++++++++-------------- src/hf_gpu_kernel.cpp | 38 +++++++++++++++++------------------ test/cu_solver_test.cpp | 6 +++--- 6 files changed, 50 insertions(+), 50 deletions(-) diff --git a/src/gpu_kernel.cpp b/src/gpu_kernel.cpp index c6e6db4..acd4367 100644 --- a/src/gpu_kernel.cpp +++ b/src/gpu_kernel.cpp @@ -24,17 +24,17 @@ namespace green::gpu { void gpu_kernel::setup_MPI_structure() { - _devCount_total = (utils::context.node_rank < _devCount_per_node) ? 1 : 0; - MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context.global); - if (!utils::context.global_rank && _verbose > 1) + _devCount_total = (utils::context().node_rank < _devCount_per_node) ? 1 : 0; + MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context().global); + if (!utils::context().global_rank && _verbose > 1) std::cout << "Your host has " << _devCount_per_node << " devices/node and we'll use " << _devCount_total << " devices in total." << std::endl; - if (_devCount_total > _ink && !utils::context.global_rank && _verbose > 0) { + if (_devCount_total > _ink && !utils::context().global_rank && _verbose > 0) { std::cerr << "***Warining***: The maximum number of GPUs to parallel would be " << _ink << " for cuGW and " << _ink << " for cuHF. Extra resources would simply be idle." << std::endl; } - utils::setup_devices_communicator(utils::context.global, utils::context.global_rank, utils::context.node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank, + utils::setup_devices_communicator(utils::context().global, utils::context().global_rank, utils::context().node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank, _devices_size); } diff --git a/src/green/gpu/gpu_kernel.h b/src/green/gpu/gpu_kernel.h index 0e55a82..d845377 100644 --- a/src/green/gpu/gpu_kernel.h +++ b/src/green/gpu/gpu_kernel.h @@ -43,7 +43,7 @@ namespace green::gpu { _naosq(nao * nao), _nao3(nao * nao * nao), _NQnaosq(NQ * nao * nao), _nk_batch(0), _devices_comm(MPI_COMM_NULL), _devices_rank(0), _devices_size(0), _shared_win(MPI_WIN_NULL), _devCount_total(0), _devCount_per_node(0), _low_device_memory(p["cuda_low_gpu_memory"]), _verbose(p["verbose"]), _Vk1k2_Qij(nullptr) { - check_for_cuda(utils::context.global, utils::context.global_rank, _devCount_per_node, _verbose); + check_for_cuda(utils::context().global, utils::context().global_rank, _devCount_per_node, _verbose); if (p["cuda_low_cpu_memory"].as()) { _coul_int_reading_type = chunks; } else { @@ -72,9 +72,9 @@ namespace green::gpu { allocate_shared_Coulomb(&_Vk1k2_Qij); statistics.end(); } else { - if (!utils::context.global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl; + if (!utils::context().global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl; } - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); } /** @@ -93,7 +93,7 @@ namespace green::gpu { if (_coul_int_reading_type == as_a_whole) { statistics.start("read whole integral"); MPI_Win_fence(0, _shared_win); - coul_int->read_entire(_Vk1k2_Qij, utils::context.node_rank, utils::context.node_size); + coul_int->read_entire(_Vk1k2_Qij, utils::context().node_rank, utils::context().node_size); MPI_Win_fence(0, _shared_win); statistics.end(); } @@ -106,15 +106,15 @@ namespace green::gpu { void allocate_shared_Coulomb(std::complex** Vk1k2_Qij) { size_t number_elements = _bz_utils.k_symmetry().num_kpair_stored() * _NQ * _naosq; MPI_Aint shared_buffer_size = number_elements * sizeof(std::complex); - if (!utils::context.global_rank && _verbose > 0) { + if (!utils::context().global_rank && _verbose > 0) { std::cout << std::setprecision(4); std::cout << "Reading the entire Coulomb integrals at once. Estimated memory requirement per node = " << (double)shared_buffer_size / 1024 / 1024 / 1024 << " GB." << std::endl; std::cout << std::setprecision(15); } // Collective operations among node_comm - utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context.node_comm, - utils::context.node_rank); + utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context().node_comm, + utils::context().node_rank); } protected: diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index adcc2ce..b69afd4 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -182,7 +182,7 @@ namespace green::gpu { */ x2c_gw_gpu_kernel(const params::params& p, size_t nao, size_t nso, size_t ns, size_t NQ, const grids::transformer_t& ft, const bz_utils_t& bz_utils, LinearSolverType cuda_lin_solver, int verbose = 1) : gw_gpu_kernel(p, nao, nso, ns, NQ, ft, bz_utils, cuda_lin_solver, verbose) { - if (!_low_device_memory && !utils::context.global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"< 2) std::cout<<"X2C GW force using low device memory implementation"< 0) { complexity_estimation(); diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index ffb0c65..328766b 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -123,7 +123,7 @@ namespace green::gpu { _flop_count = flop_count_firstmatmul + flop_count_transforms + flop_count_fourier + flop_count_solver + flop_count_secondmatmul; - if (!utils::context.global_rank && _verbose > 1) { + if (!utils::context().global_rank && _verbose > 1) { std::cout << "############ Total GW Operations per Iteration ############" << std::endl; std::cout << "Total: " << _flop_count << std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; @@ -149,7 +149,7 @@ namespace green::gpu { double flop_count_secondmatmul=_ink*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao)); _flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul; - if (!utils::context.global_rank && _verbose > 1) { + if (!utils::context().global_rank && _verbose > 1) { std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl; std::cout << "Total: " << _flop_count << std::endl; std::cout << "First matmul: " << flop_count_firstmatmul << std::endl; @@ -166,11 +166,11 @@ namespace green::gpu { statistics.start("total"); statistics.start("Initialization: CPU"); sigma_tau.fence(); - if (!utils::context.node_rank) sigma_tau.object().set_zero(); + if (!utils::context().node_rank) sigma_tau.object().set_zero(); sigma_tau.fence(); setup_MPI_structure(); _coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils); - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); set_shared_Coulomb(); statistics.end(); update_integrals(_coul_int, statistics); @@ -178,20 +178,20 @@ namespace green::gpu { if (_devices_comm != MPI_COMM_NULL) { gw_innerloop(g, sigma_tau); } - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); sigma_tau.fence(); // Print effective FLOPs achieved in the calculation flops_achieved(); - if (!utils::context.node_rank) { + if (!utils::context().node_rank) { if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce"); - utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm); + utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context().internode_comm); sigma_tau.object() /= (_nk); if (_devices_comm != MPI_COMM_NULL) statistics.end(); } sigma_tau.fence(); - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); statistics.end(); - statistics.print(utils::context.global); + statistics.print(utils::context().global); print_effective_flops(); // Reset all timing stats for next iteration statistics.reset(); @@ -199,7 +199,7 @@ namespace green::gpu { clean_MPI_structure(); clean_shared_Coulomb(); delete _coul_int; - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); MPI_Type_free(&dt_matrix); MPI_Op_free(&matrix_sum_op); } @@ -234,7 +234,7 @@ namespace green::gpu { MPI_Reduce(&min_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm); MPI_Reduce(&avg_eff_flops, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm); } - if (!utils::context.global_rank && _verbose > 1) { + if (!utils::context().global_rank && _verbose > 1) { auto old_precision = std::cout.precision(); std::cout << std::setprecision(6); std::cout << "=================== GPU Performance ====================" << std::endl; @@ -300,8 +300,8 @@ namespace green::gpu { // k-space AO transforms are only needed for scalar (non-relativistic) calculations. cu_symmetry_data sym_data = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/true, /*build_q_p0=*/true); cugw_utils cugw(_nts, _nt_batch, _nw_b, _ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data, g.object(), - _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context.global_rank, - utils::context.node_rank, _devCount_per_node); + _low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context().global_rank, + utils::context().node_rank, _devCount_per_node); statistics.end(); gw_reader0_callback r0 = [&](int k_ibz, tensor,4>& Gk_smtij) { copy_Gk(g.object(), Gk_smtij, k_ibz, true); @@ -425,7 +425,7 @@ namespace green::gpu { throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU."); if (_nqkpt == 0) throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch"); - if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) { + if (_nqkpt == 1 && _ink != 1 && !utils::context().global_rank) { if (_nt_batch > 1) std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl; else @@ -472,7 +472,7 @@ namespace green::gpu { cu_symmetry_data sym_data_x2c = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/false, /*build_q_p0=*/true); cugw_utils cugw(_nts, _nt_batch, _nw_b, psuedo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c, g.object(), true, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, - utils::context.global_rank, utils::context.node_rank, _devCount_per_node); + utils::context().global_rank, utils::context().node_rank, _devCount_per_node); statistics.end(); // r0: called per star member (k_full) in cu_routines.cu. // copy_Gk_2c looks up k_ibz internally and applies X2C TR (spin-flip + conj) on the CPU. diff --git a/src/hf_gpu_kernel.cpp b/src/hf_gpu_kernel.cpp index 51f99cb..c9be09e 100644 --- a/src/hf_gpu_kernel.cpp +++ b/src/hf_gpu_kernel.cpp @@ -70,7 +70,7 @@ namespace green::gpu { new_Fock.set_zero(); setup_MPI_structure(); _coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils); - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); set_shared_Coulomb(); statistics.end(); update_integrals(_coul_int, statistics); @@ -90,15 +90,15 @@ namespace green::gpu { statistics.end(); statistics.start("Fock reduce"); - utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context.global); + utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context().global); statistics.end(); statistics.end(); - statistics.print(utils::context.global); + statistics.print(utils::context().global); clean_MPI_structure(); clean_shared_Coulomb(); delete _coul_int; - MPI_Barrier(utils::context.global); + MPI_Barrier(utils::context().global); return new_Fock; } @@ -109,7 +109,7 @@ namespace green::gpu { // Also determines _nk_batch HF_check_devices_free_space(); // Each process gets one cuda runner hf_utils - cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context.global_rank, utils::context.node_rank, + cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context().global_rank, utils::context().node_rank, _devCount_per_node); statistics.end(); @@ -141,8 +141,8 @@ namespace green::gpu { // TODO or NOTE: It looks like we are building the Hartree term on single CPU, with no MPI whatsoever // I see - we build the Hartree bubble on all the cpu procs through full sum, and only then use MPI for _ink * _ns // to update the Fock. This can be fixed later. - if (utils::context.global_rank < _ink * _ns) { - int hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size; + if (utils::context().global_rank < _ink * _ns) { + int hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size; // Direct diagram MatrixXcd X1(_nao, _nao); @@ -166,7 +166,7 @@ namespace green::gpu { } upper_Coul /= double(_nk); - for (int ii = utils::context.global_rank; ii < _ink * _ns; ii += hf_nprocs) { + for (int ii = utils::context().global_rank; ii < _ink * _ns; ii += hf_nprocs) { int is = ii / _ink; int ik = ii % _ink; int k_ir = _bz_utils.k_symmetry().full_point(ik); @@ -185,10 +185,10 @@ namespace green::gpu { } void scalar_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) { - if (utils::context.global_rank < _ink * _ns) { + if (utils::context().global_rank < _ink * _ns) { double prefactor = (_ns == 2) ? 1.0 : 0.5; - size_t hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size; - for (size_t ii = utils::context.global_rank; ii < _ns * _ink; ii += hf_nprocs) { + size_t hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size; + for (size_t ii = utils::context().global_rank; ii < _ns * _ink; ii += hf_nprocs) { size_t is = ii / _ink; size_t ik = ii % _ink; CMMatrixXcd dmm(dm.data() + is * _ink * _nao * _nao + ik * _nao * _nao, _nao, _nao); @@ -216,7 +216,7 @@ namespace green::gpu { _ink * _ns * matmul_cost(1, _naosq, _nk); _hf_total_flops = flop_count_direct + flop_count_exchange; - if (!utils::context.global_rank && _verbose > 1) { + if (!utils::context().global_rank && _verbose > 1) { std::cout << "############ Total HF Operations per Iteration ############" << std::endl; std::cout << "Total: " << _hf_total_flops << std::endl; std::cout << "Matmul (Direct diagram): " << flop_count_direct << std::endl; @@ -235,7 +235,7 @@ namespace green::gpu { // Each NxN AO block of the 2-component exchange potential is evalulated individually // using the non-relativistic functions with pseudo spin = 3 (i.e. aa, bb, ab blocks) int pseudo_ns = 3; - cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context.global_rank, utils::context.node_rank, _devCount_per_node); + cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context().global_rank, utils::context().node_rank, _devCount_per_node); statistics.end(); MPI_Barrier(_devices_comm); @@ -250,8 +250,8 @@ namespace green::gpu { } void x2c_hf_gpu_kernel::compute_direct_selfenergy(ztensor<4> &new_Fock, const ztensor<4> &dm) { - if (utils::context.global_rank < _ink) { - int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size; + if (utils::context().global_rank < _ink) { + int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size; ztensor<3> v(_NQ, _nao, _nao); MMatrixXcd vm(v.data(), _NQ, _nao * _nao); @@ -282,7 +282,7 @@ namespace green::gpu { MatrixXcd Fm(1, _nao * _nao); MMatrixXcd Fmm(Fm.data(), _nao, _nao); - for (int ik = utils::context.global_rank; ik < _ink; ik += direct_nprocs) { + for (int ik = utils::context().global_rank; ik < _ink; ik += direct_nprocs) { int k_ir = _bz_utils.k_symmetry().full_point(ik); if (_coul_int_reading_type == as_a_whole) { @@ -301,8 +301,8 @@ namespace green::gpu { } void x2c_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) { - if (utils::context.global_rank < _ink * _ns) { - int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size; + if (utils::context().global_rank < _ink * _ns) { + int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size; ztensor<3> dm_spblks[3] { {_ink, _nao, _nao}, {_ink, _nao, _nao}, {_ink, _nao, _nao} }; for (int ik = 0; ik < _ink; ++ik) { CMMatrixXcd dmm(dm.data() + ik*_nso*_nso, _nso, _nso); @@ -314,7 +314,7 @@ namespace green::gpu { matrix(dm_spblks[2](ik)) = dmm.block(0, _nao, _nao, _nao); } MatrixXcd buffer(_nao, _nao); - for (size_t iks = utils::context.global_rank; iks < 3*_ink; iks += direct_nprocs) { + for (size_t iks = utils::context().global_rank; iks < 3*_ink; iks += direct_nprocs) { size_t ik = iks / 3; size_t is = iks % 3; MMatrixXcd Fm_nso(new_Fock.data() + ik*_nso*_nso, _nso, _nso); diff --git a/test/cu_solver_test.cpp b/test/cu_solver_test.cpp index 9bedbf1..f047f13 100644 --- a/test/cu_solver_test.cpp +++ b/test/cu_solver_test.cpp @@ -81,7 +81,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st { green::h5pp::archive ar(test_file, "r"); G_shared.fence(); - if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object(); + if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object(); G_shared.fence(); ar["result/Sigma1"] >> Sigma1_test; ar.close(); @@ -147,10 +147,10 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str { green::h5pp::archive ar(test_file, "r"); G_shared.fence(); - if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object(); + if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object(); G_shared.fence(); S_shared_tst.fence(); - if (!green::utils::context.node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object(); + if (!green::utils::context().node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object(); S_shared_tst.fence(); ar.close(); } From ab0862e12de944bf02157e3e377fcfba7bdec4e2 Mon Sep 17 00:00:00 2001 From: Emanuel Gull Date: Tue, 12 May 2026 14:52:44 +0200 Subject: [PATCH 2/3] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/green/gpu/gw_gpu_kernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index b69afd4..0edd8e8 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -182,7 +182,7 @@ namespace green::gpu { */ x2c_gw_gpu_kernel(const params::params& p, size_t nao, size_t nso, size_t ns, size_t NQ, const grids::transformer_t& ft, const bz_utils_t& bz_utils, LinearSolverType cuda_lin_solver, int verbose = 1) : gw_gpu_kernel(p, nao, nso, ns, NQ, ft, bz_utils, cuda_lin_solver, verbose) { - if (!_low_device_memory && !utils::context().global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"< 2) std::cout<<"Forcing X2C GW to use low device memory implementation"< 0) { complexity_estimation(); From 8f336180f6075883a3e688475509f49339f58b71 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 12 May 2026 09:35:51 -0400 Subject: [PATCH 3/3] fix spelling: psuedo_ns -> pseudo_ns --- src/gw_gpu_kernel.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 328766b..6f31bd1 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -467,10 +467,10 @@ namespace green::gpu { // Reuse the non-relativistic functions with pseudo spin = 4, the aa, bb, ab, ba blocks. // Since the size of the Green's function and self-energy is 4 times largeer, // low_device_memory mode is always used. - int psuedo_ns = 4; + int pseudo_ns = 4; // X2C: no k-space AO transforms needed; transform_k_ao_device_2c uses only TR flags. cu_symmetry_data sym_data_x2c = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/false, /*build_q_p0=*/true); - cugw_utils cugw(_nts, _nt_batch, _nw_b, psuedo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c, + cugw_utils cugw(_nts, _nt_batch, _nw_b, pseudo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c, g.object(), true, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context().global_rank, utils::context().node_rank, _devCount_per_node); statistics.end(); @@ -516,7 +516,7 @@ namespace green::gpu { ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso); statistics.start("Solve cuGW"); - cugw.accumulate_gw_selfenergy_on_device(_nts, psuedo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij, + cugw.accumulate_gw_selfenergy_on_device(_nts, pseudo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, true, _verbose, r0, r1, r2); statistics.end(); // Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso)