Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/gpu_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@
namespace green::gpu {

void gpu_kernel::setup_MPI_structure() {
_devCount_total = (utils::context.node_rank < _devCount_per_node) ? 1 : 0;
MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context.global);
if (!utils::context.global_rank && _verbose > 1)
_devCount_total = (utils::context().node_rank < _devCount_per_node) ? 1 : 0;
MPI_Allreduce(MPI_IN_PLACE, &_devCount_total, 1, MPI_INT, MPI_SUM, utils::context().global);
if (!utils::context().global_rank && _verbose > 1)
std::cout << "Your host has " << _devCount_per_node << " devices/node and we'll use " << _devCount_total
<< " devices in total." << std::endl;
if (_devCount_total > _ink && !utils::context.global_rank && _verbose > 0) {
if (_devCount_total > _ink && !utils::context().global_rank && _verbose > 0) {
std::cerr << "***Warining***: The maximum number of GPUs to parallel would be " << _ink << " for cuGW and " << _ink
Comment thread
gauravharsha marked this conversation as resolved.
<< " for cuHF. Extra resources would simply be idle." << std::endl;
}

utils::setup_devices_communicator(utils::context.global, utils::context.global_rank, utils::context.node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank,
utils::setup_devices_communicator(utils::context().global, utils::context().global_rank, utils::context().node_rank, _devCount_per_node, _devCount_total, _devices_comm, _devices_rank,
_devices_size);
}

Expand Down
14 changes: 7 additions & 7 deletions src/green/gpu/gpu_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ namespace green::gpu {
_naosq(nao * nao), _nao3(nao * nao * nao), _NQnaosq(NQ * nao * nao), _nk_batch(0), _devices_comm(MPI_COMM_NULL),
_devices_rank(0), _devices_size(0), _shared_win(MPI_WIN_NULL), _devCount_total(0), _devCount_per_node(0),
_low_device_memory(p["cuda_low_gpu_memory"]), _verbose(p["verbose"]), _Vk1k2_Qij(nullptr) {
check_for_cuda(utils::context.global, utils::context.global_rank, _devCount_per_node, _verbose);
check_for_cuda(utils::context().global, utils::context().global_rank, _devCount_per_node, _verbose);
if (p["cuda_low_cpu_memory"].as<bool>()) {
_coul_int_reading_type = chunks;
} else {
Expand Down Expand Up @@ -72,9 +72,9 @@ namespace green::gpu {
allocate_shared_Coulomb(&_Vk1k2_Qij);
statistics.end();
} else {
if (!utils::context.global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl;
if (!utils::context().global_rank && _verbose > 0) std::cout << "Will read Coulomb integrals from chunks." << std::endl;
}
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
}

/**
Expand All @@ -93,7 +93,7 @@ namespace green::gpu {
if (_coul_int_reading_type == as_a_whole) {
statistics.start("read whole integral");
MPI_Win_fence(0, _shared_win);
coul_int->read_entire(_Vk1k2_Qij, utils::context.node_rank, utils::context.node_size);
coul_int->read_entire(_Vk1k2_Qij, utils::context().node_rank, utils::context().node_size);
MPI_Win_fence(0, _shared_win);
statistics.end();
}
Expand All @@ -106,15 +106,15 @@ namespace green::gpu {
void allocate_shared_Coulomb(std::complex<prec>** Vk1k2_Qij) {
size_t number_elements = _bz_utils.k_symmetry().num_kpair_stored() * _NQ * _naosq;
MPI_Aint shared_buffer_size = number_elements * sizeof(std::complex<prec>);
if (!utils::context.global_rank && _verbose > 0) {
if (!utils::context().global_rank && _verbose > 0) {
std::cout << std::setprecision(4);
std::cout << "Reading the entire Coulomb integrals at once. Estimated memory requirement per node = "
<< (double)shared_buffer_size / 1024 / 1024 / 1024 << " GB." << std::endl;
std::cout << std::setprecision(15);
}
// Collective operations among node_comm
utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context.node_comm,
utils::context.node_rank);
utils::setup_mpi_shared_memory(Vk1k2_Qij, shared_buffer_size, _shared_win, utils::context().node_comm,
utils::context().node_rank);
}

protected:
Expand Down
2 changes: 1 addition & 1 deletion src/green/gpu/gw_gpu_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ namespace green::gpu {
*/
x2c_gw_gpu_kernel(const params::params& p, size_t nao, size_t nso, size_t ns, size_t NQ, const grids::transformer_t& ft,
const bz_utils_t& bz_utils, LinearSolverType cuda_lin_solver, int verbose = 1) : gw_gpu_kernel(p, nao, nso, ns, NQ, ft, bz_utils, cuda_lin_solver, verbose) {
if (!_low_device_memory && !utils::context.global_rank && _verbose > 2) std::cout<<"X2C GW force using low device memory implementation"<<std::endl;
if (!_low_device_memory && !utils::context().global_rank && _verbose > 2) std::cout<<"Forcing X2C GW to use low device memory implementation"<<std::endl;
_low_device_memory = true;
if (verbose > 0) {
complexity_estimation();
Expand Down
36 changes: 18 additions & 18 deletions src/gw_gpu_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ namespace green::gpu {
_flop_count = flop_count_firstmatmul + flop_count_transforms + flop_count_fourier
+ flop_count_solver + flop_count_secondmatmul;

if (!utils::context.global_rank && _verbose > 1) {
if (!utils::context().global_rank && _verbose > 1) {
std::cout << "############ Total GW Operations per Iteration ############" << std::endl;
std::cout << "Total: " << _flop_count << std::endl;
std::cout << "First matmul: " << flop_count_firstmatmul << std::endl;
Expand All @@ -149,7 +149,7 @@ namespace green::gpu {
double flop_count_secondmatmul=_ink*_nk*4*_nts*(matmul_cost(_nao*_NQ, _nao, _nao)+matmul_cost(_NQ, _naosq, _NQ)+matmul_cost(_nao, _nao, _NQ*_nao));
_flop_count= flop_count_firstmatmul+flop_count_fourier+flop_count_solver+flop_count_secondmatmul;

if (!utils::context.global_rank && _verbose > 1) {
if (!utils::context().global_rank && _verbose > 1) {
std::cout << "############ Total Two-Component GW Operations per Iteration ############" << std::endl;
std::cout << "Total: " << _flop_count << std::endl;
std::cout << "First matmul: " << flop_count_firstmatmul << std::endl;
Expand All @@ -166,40 +166,40 @@ namespace green::gpu {
statistics.start("total");
statistics.start("Initialization: CPU");
sigma_tau.fence();
if (!utils::context.node_rank) sigma_tau.object().set_zero();
if (!utils::context().node_rank) sigma_tau.object().set_zero();
sigma_tau.fence();
setup_MPI_structure();
_coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils);
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
set_shared_Coulomb();
statistics.end();
update_integrals(_coul_int, statistics);
// Only those processes assigned with a device will be involved in GW self-energy calculation
if (_devices_comm != MPI_COMM_NULL) {
gw_innerloop(g, sigma_tau);
}
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
sigma_tau.fence();
// Print effective FLOPs achieved in the calculation
flops_achieved();
if (!utils::context.node_rank) {
if (!utils::context().node_rank) {
if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce");
utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm);
utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context().internode_comm);
sigma_tau.object() /= (_nk);
if (_devices_comm != MPI_COMM_NULL) statistics.end();
}
sigma_tau.fence();
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
statistics.end();
statistics.print(utils::context.global);
statistics.print(utils::context().global);
print_effective_flops();
// Reset all timing stats for next iteration
statistics.reset();

clean_MPI_structure();
clean_shared_Coulomb();
delete _coul_int;
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
MPI_Type_free(&dt_matrix);
MPI_Op_free(&matrix_sum_op);
}
Expand Down Expand Up @@ -234,7 +234,7 @@ namespace green::gpu {
MPI_Reduce(&min_eff_flops, &min_eff_flops, 1, MPI_DOUBLE, MPI_MIN, 0, _devices_comm);
MPI_Reduce(&avg_eff_flops, &avg_eff_flops, 1, MPI_DOUBLE, MPI_SUM, 0, _devices_comm);
}
if (!utils::context.global_rank && _verbose > 1) {
if (!utils::context().global_rank && _verbose > 1) {
auto old_precision = std::cout.precision();
std::cout << std::setprecision(6);
std::cout << "=================== GPU Performance ====================" << std::endl;
Expand Down Expand Up @@ -300,8 +300,8 @@ namespace green::gpu {
// k-space AO transforms are only needed for scalar (non-relativistic) calculations.
cu_symmetry_data sym_data = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/true, /*build_q_p0=*/true);
cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, _ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data, g.object(),
_low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context.global_rank,
utils::context.node_rank, _devCount_per_node);
_low_device_memory, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver, utils::context().global_rank,
utils::context().node_rank, _devCount_per_node);
statistics.end();
gw_reader0_callback<prec> r0 = [&](int k_ibz, tensor<std::complex<prec>,4>& Gk_smtij) {
copy_Gk(g.object(), Gk_smtij, k_ibz, true);
Expand Down Expand Up @@ -425,7 +425,7 @@ namespace green::gpu {
throw std::runtime_error("Not enough memory to create qkpt even with nt_batch = 1. Cannot run application on GPU.");
if (_nqkpt == 0)
throw std::runtime_error("Not enough memory to create qkpt. Please reduce nt_batch");
if (_nqkpt == 1 && _ink != 1 && !utils::context.global_rank) {
if (_nqkpt == 1 && _ink != 1 && !utils::context().global_rank) {
if (_nt_batch > 1)
std::cerr << "WARNING: Only one qkpt created! Performance will be sub-optimal. Reduce nt_batch" << std::endl;
else
Expand Down Expand Up @@ -467,12 +467,12 @@ namespace green::gpu {
// Reuse the non-relativistic functions with pseudo spin = 4, the aa, bb, ab, ba blocks.
// Since the size of the Green's function and self-energy is 4 times largeer,
// low_device_memory mode is always used.
int psuedo_ns = 4;
int pseudo_ns = 4;
// X2C: no k-space AO transforms needed; transform_k_ao_device_2c uses only TR flags.
cu_symmetry_data sym_data_x2c = make_cu_symmetry_data(_bz_utils, _nao, _NQ, /*build_k_ao=*/false, /*build_q_p0=*/true);
cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, psuedo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c,
cugw_utils<prec> cugw(_nts, _nt_batch, _nw_b, pseudo_ns, _nk, _ink, _nq, _inq, _nqkpt, _NQ, _nao, sym_data_x2c,
g.object(), true, _ft.Ttn_FB(), _ft.Tnt_BF(), _cuda_lin_solver,
utils::context.global_rank, utils::context.node_rank, _devCount_per_node);
utils::context().global_rank, utils::context().node_rank, _devCount_per_node);
Comment thread
gauravharsha marked this conversation as resolved.
statistics.end();
// r0: called per star member (k_full) in cu_routines.cu.
// copy_Gk_2c looks up k_ibz internally and applies X2C TR (spin-flip + conj) on the CPU.
Expand Down Expand Up @@ -516,7 +516,7 @@ namespace green::gpu {

ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso);
statistics.start("Solve cuGW");
cugw.accumulate_gw_selfenergy_on_device(_nts, psuedo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij,
cugw.accumulate_gw_selfenergy_on_device(_nts, pseudo_ns, _nk, _ink, _nq, _inq, _nao, _Vk1k2_Qij,
Sigma_tskij_host_local, _devices_rank, _devices_size, true, _verbose, r0, r1, r2);
statistics.end();
// Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso)
Expand Down
38 changes: 19 additions & 19 deletions src/hf_gpu_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ namespace green::gpu {
new_Fock.set_zero();
setup_MPI_structure();
_coul_int = new df_integral_t(_path, _nao, _nk, _NQ, _bz_utils);
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
set_shared_Coulomb();
statistics.end();
update_integrals(_coul_int, statistics);
Expand All @@ -90,15 +90,15 @@ namespace green::gpu {
statistics.end();

statistics.start("Fock reduce");
utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context.global);
utils::allreduce(MPI_IN_PLACE, new_Fock.data(), new_Fock.size(), MPI_C_DOUBLE_COMPLEX, MPI_SUM, utils::context().global);
statistics.end();
statistics.end();
statistics.print(utils::context.global);
statistics.print(utils::context().global);

clean_MPI_structure();
clean_shared_Coulomb();
delete _coul_int;
MPI_Barrier(utils::context.global);
MPI_Barrier(utils::context().global);
return new_Fock;
}

Expand All @@ -109,7 +109,7 @@ namespace green::gpu {
// Also determines _nk_batch
HF_check_devices_free_space();
// Each process gets one cuda runner hf_utils
cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context.global_rank, utils::context.node_rank,
cuhf_utils hf_utils(_nk, _ink, _ns, _nao, _NQ, _nk_batch, dm_fbz, utils::context().global_rank, utils::context().node_rank,
_devCount_per_node);

statistics.end();
Expand Down Expand Up @@ -141,8 +141,8 @@ namespace green::gpu {
// TODO or NOTE: It looks like we are building the Hartree term on single CPU, with no MPI whatsoever
// I see - we build the Hartree bubble on all the cpu procs through full sum, and only then use MPI for _ink * _ns
// to update the Fock. This can be fixed later.
if (utils::context.global_rank < _ink * _ns) {
int hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size;
if (utils::context().global_rank < _ink * _ns) {
int hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size;

// Direct diagram
MatrixXcd X1(_nao, _nao);
Expand All @@ -166,7 +166,7 @@ namespace green::gpu {
}
upper_Coul /= double(_nk);

for (int ii = utils::context.global_rank; ii < _ink * _ns; ii += hf_nprocs) {
for (int ii = utils::context().global_rank; ii < _ink * _ns; ii += hf_nprocs) {
int is = ii / _ink;
int ik = ii % _ink;
int k_ir = _bz_utils.k_symmetry().full_point(ik);
Expand All @@ -185,10 +185,10 @@ namespace green::gpu {
}

void scalar_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) {
if (utils::context.global_rank < _ink * _ns) {
if (utils::context().global_rank < _ink * _ns) {
double prefactor = (_ns == 2) ? 1.0 : 0.5;
size_t hf_nprocs = (utils::context.global_size > _ink * _ns) ? _ink * _ns : utils::context.global_size;
for (size_t ii = utils::context.global_rank; ii < _ns * _ink; ii += hf_nprocs) {
size_t hf_nprocs = (utils::context().global_size > _ink * _ns) ? _ink * _ns : utils::context().global_size;
for (size_t ii = utils::context().global_rank; ii < _ns * _ink; ii += hf_nprocs) {
size_t is = ii / _ink;
size_t ik = ii % _ink;
CMMatrixXcd dmm(dm.data() + is * _ink * _nao * _nao + ik * _nao * _nao, _nao, _nao);
Expand Down Expand Up @@ -216,7 +216,7 @@ namespace green::gpu {
_ink * _ns * matmul_cost(1, _naosq, _nk);
_hf_total_flops = flop_count_direct + flop_count_exchange;

if (!utils::context.global_rank && _verbose > 1) {
if (!utils::context().global_rank && _verbose > 1) {
std::cout << "############ Total HF Operations per Iteration ############" << std::endl;
std::cout << "Total: " << _hf_total_flops << std::endl;
std::cout << "Matmul (Direct diagram): " << flop_count_direct << std::endl;
Expand All @@ -235,7 +235,7 @@ namespace green::gpu {
// Each NxN AO block of the 2-component exchange potential is evalulated individually
// using the non-relativistic functions with pseudo spin = 3 (i.e. aa, bb, ab blocks)
int pseudo_ns = 3;
cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context.global_rank, utils::context.node_rank, _devCount_per_node);
cuhf_utils hf_utils(_nk, _ink, pseudo_ns, _nao, _NQ, _nk_batch, dm_fbz_3kij, utils::context().global_rank, utils::context().node_rank, _devCount_per_node);
statistics.end();
MPI_Barrier(_devices_comm);

Expand All @@ -250,8 +250,8 @@ namespace green::gpu {
}

void x2c_hf_gpu_kernel::compute_direct_selfenergy(ztensor<4> &new_Fock, const ztensor<4> &dm) {
if (utils::context.global_rank < _ink) {
int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size;
if (utils::context().global_rank < _ink) {
int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size;

ztensor<3> v(_NQ, _nao, _nao);
MMatrixXcd vm(v.data(), _NQ, _nao * _nao);
Expand Down Expand Up @@ -282,7 +282,7 @@ namespace green::gpu {

MatrixXcd Fm(1, _nao * _nao);
MMatrixXcd Fmm(Fm.data(), _nao, _nao);
for (int ik = utils::context.global_rank; ik < _ink; ik += direct_nprocs) {
for (int ik = utils::context().global_rank; ik < _ink; ik += direct_nprocs) {
int k_ir = _bz_utils.k_symmetry().full_point(ik);

if (_coul_int_reading_type == as_a_whole) {
Expand All @@ -301,8 +301,8 @@ namespace green::gpu {
}

void x2c_hf_gpu_kernel::add_Ewald(ztensor<4>& new_Fock, const ztensor<4>& dm, const ztensor<4>& S, double madelung) {
if (utils::context.global_rank < _ink * _ns) {
int direct_nprocs = (utils::context.global_size > _ink)? _ink : utils::context.global_size;
if (utils::context().global_rank < _ink * _ns) {
int direct_nprocs = (utils::context().global_size > _ink)? _ink : utils::context().global_size;
ztensor<3> dm_spblks[3] { {_ink, _nao, _nao}, {_ink, _nao, _nao}, {_ink, _nao, _nao} };
for (int ik = 0; ik < _ink; ++ik) {
CMMatrixXcd dmm(dm.data() + ik*_nso*_nso, _nso, _nso);
Expand All @@ -314,7 +314,7 @@ namespace green::gpu {
matrix(dm_spblks[2](ik)) = dmm.block(0, _nao, _nao, _nao);
}
MatrixXcd buffer(_nao, _nao);
for (size_t iks = utils::context.global_rank; iks < 3*_ink; iks += direct_nprocs) {
for (size_t iks = utils::context().global_rank; iks < 3*_ink; iks += direct_nprocs) {
size_t ik = iks / 3;
size_t is = iks % 3;
MMatrixXcd Fm_nso(new_Fock.data() + ik*_nso*_nso, _nso, _nso);
Expand Down
6 changes: 3 additions & 3 deletions test/cu_solver_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void solve_hf(const std::string& input, const std::string& int_hf, const std::st
{
green::h5pp::archive ar(test_file, "r");
G_shared.fence();
if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object();
if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object();
G_shared.fence();
ar["result/Sigma1"] >> Sigma1_test;
ar.close();
Expand Down Expand Up @@ -147,10 +147,10 @@ void solve_gw(const std::string& input, const std::string& int_f, const std::str
{
green::h5pp::archive ar(test_file, "r");
G_shared.fence();
if (!green::utils::context.node_rank) ar["G_tau"] >> G_shared.object();
if (!green::utils::context().node_rank) ar["G_tau"] >> G_shared.object();
G_shared.fence();
S_shared_tst.fence();
if (!green::utils::context.node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object();
if (!green::utils::context().node_rank) ar["result/Sigma_tau"] >> S_shared_tst.object();
S_shared_tst.fence();
ar.close();
}
Expand Down
Loading