diff --git a/.github/workflows/compile_on_aws.yml b/.github/workflows/compile_on_aws.yml
deleted file mode 100644
index bdde16e7..00000000
--- a/.github/workflows/compile_on_aws.yml
+++ /dev/null
@@ -1,152 +0,0 @@
-# SPDX-FileCopyrightText: 2024 OGL authors
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-name: Compile on AWS
-run-name: Compile on AWS
-
-on:
-  pull_request:
-    types: synchronize
-jobs:
-  start-runner:
-    if: contains(github.event.pull_request.labels.*.name, 'full_ci')
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: read
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308634587211:role/Github-OIDC-Role-29bocUD8VBZr
-          aws-region: us-east-1
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: HendriceH/ec2-github-runner@v1.10 # Starts 60GB Root + 30 GB Share volume
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-03af087024bfdbbee # Deep learning AMI
-          ec2-instance-type: g4dn.xlarge
-          iam-role-name: Role4Github
-          subnet-id: subnet-b5d2adbb
-          security-group-id: sg-559f8967
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "ucfd-project", "Value": "BMBF_2022_EXASIM"},
-              {"Key": "ucfd-client", "Value": "UCFD-RD"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-          pre-runner-script: |
-            #!/bin/bash
-            sudo yum update -y && \
-            sudo yum install docker git libicu ninja-build libasan10 -y
-            sudo amazon-linux-extras install epel -y
-            sudo yum install Lmod -y
-            sudo systemctl enable docker
-            sudo curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash
-            sudo mkfs -t xfs /dev/sda1
-            sudo mkdir -p /share
-            sudo mount /dev/sda1 /share
-            aws s3 cp s3://ucfd-share/pcluster/3.x/alinux2/x86_64/postinstall_github .
-            chmod +x postinstall_github
-            sudo ./postinstall_github > ~/install.log
-            ln -s /share/software/cmake/3.27.8/share/cmake-3.27 /usr/share/cmake-3.27
-            mkdir -p /share/ec2-user
-            export USER=ec2-user
-  do-the-job:
-    name: Do the job on the runner
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    steps:
-      - name: Checkout OGL
-        uses: actions/checkout@v2
-        with:
-          ref: dev
-      - name: Test env
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          env
-      - name: Install OBR
-        shell: bash -o pipefail -i {0}
-        run: |
-          module load conda
-          conda activate
-          python3 -m pip install --upgrade pip
-          pip install setuptools --upgrade
-          git clone https://github.com/exasim-project/OBR
-          cd OBR
-          pip install .
-          obr --version
-      - name: Config
-        if: always()
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          cmake --list-preset
-          cmake -DOGL_CUDA_ARCHITECTURES=52 --preset ninja-cuda-release
-      - name: Build and install
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          cmake --build --preset ninja-cuda-release --target install
-      - name: Run integration tests
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load conda
-          conda activate
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          export GINKGO_EXECUTOR=cuda
-          obr init -g --config test/integration.yaml
-          obr run -o runParallelSolver
-          obr status
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - do-the-job # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: read
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308634587211:role/Github-OIDC-Role-29bocUD8VBZr
-          aws-region: us-east-1
-      - name: Stop EC2 runner
-        uses: HendriceH/ec2-github-runner@v1.10
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/static_checks.yaml b/.github/workflows/static_checks.yaml
index f5f51fdc..49e11674 100644
--- a/.github/workflows/static_checks.yaml
+++ b/.github/workflows/static_checks.yaml
@@ -67,7 +67,7 @@ jobs:
       run: |
         git config --global --add safe.directory /__w/OGL/OGL
         # Create list of all source files belonging to this repository
-        git ls-files | grep -E "\.(C)" > pattern
+        git ls-files | grep -E "\.(cpp)" > pattern
         # Create list of .cpp files that are in this repository and part of the
         # compilation database
         # also filters out " at the begin and end of the filename
diff --git a/include/OGL/CommunicationPattern.hpp b/include/OGL/CommunicationPattern.hpp
index 7fcf542e..3e7134c3 100644
--- a/include/OGL/CommunicationPattern.hpp
+++ b/include/OGL/CommunicationPattern.hpp
@@ -19,6 +19,18 @@ struct AllToAllPattern {
     std::vector<label> recv_offsets;
 };
 
+/* @brief computes the AllToAllPattern for the repartitioned communicator from
+ * the global allToAll pattern by discarding all zero communication before and
+ * after the repartitioner scope.
+ *
+ * @param exec_handler The executor handler
+ * @param allToAll The original allToAll pattern
+ * @param start_rank the original comm_world rank
+ */
+AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
+                                        const AllToAllPattern allToAll,
+                                        label start_rank);
+
 /* @brief This function computes the send and recv counts vectors and the send
  * and recv offsets vectors for scattering from an owner to all ranks, including
  * owner itself
diff --git a/include/OGL/DevicePersistent/ExecutorHandler.hpp b/include/OGL/DevicePersistent/ExecutorHandler.hpp
index 3c9b7da5..df56b32f 100644
--- a/include/OGL/DevicePersistent/ExecutorHandler.hpp
+++ b/include/OGL/DevicePersistent/ExecutorHandler.hpp
@@ -71,6 +71,14 @@ struct DeviceIdHandler {
         return device_global_id % num_devices_per_node;
     }
 
+    /* @brief returns the owner rank on the global comm world communicator
+     */
+    label global_owner() const
+    {
+        label rank = Pstream::myProcNo();
+        return rank - (rank % ranks_per_gpu);
+    }
+
     /* @brief check if rank is an owning rank
      */
     bool is_owner() const
@@ -270,7 +278,7 @@ class ExecutorHandler
         // gko comm
         label group = device_id_handler_.compute_group();
         MPI_Comm gko_comm;
-        label host_rank = 0;
+        label host_rank = Pstream::myProcNo();
         MPI_Comm_split(MPI_COMM_WORLD, group, host_rank, &gko_comm);
 
         device_comm_ = std::make_shared(
@@ -278,7 +286,8 @@
         // repart comm
         MPI_Comm repart_comm;
-        label device_id = device_id_handler_.compute_device_id(4);
+        label global_rank = Pstream::myProcNo();
+        label device_id = global_rank / device_id_handler_.ranks_per_gpu;
         MPI_Comm_split(MPI_COMM_WORLD, device_id, host_rank, &repart_comm);
 
         repart_comm_ = std::make_shared(
@@ -300,6 +309,15 @@
      * */
     bool get_non_orig_device_comm() const { return non_orig_device_comm_; }
 
+    label get_ranks_per_gpu() const { return device_id_handler_.ranks_per_gpu; }
+
+    void set_ranks_per_gpu(label ranks_per_gpu)
+    {
+        device_id_handler_.ranks_per_gpu = ranks_per_gpu;
+    }
+
+    label get_owner_rank() const { return device_id_handler_.global_owner(); }
+
     const std::shared_ptr get_device_exec() const
     {
         return this->get_persistent_object();
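For context, a minimal sketch of how the repart communicator above is assumed to be formed: consecutive world ranks are grouped per GPU via MPI_Comm_split, and keying the split with the world rank keeps the owning rank of each group at repart rank 0. The ranks_per_gpu value and all names below are illustrative, not part of this patch.

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int world_rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        const int ranks_per_gpu = 4;                 // illustrative value
        int device_id = world_rank / ranks_per_gpu;  // color: one group per GPU
        int owner = world_rank - (world_rank % ranks_per_gpu);

        MPI_Comm repart_comm;
        // Using the world rank as key preserves the original order, so the
        // owning rank of each group becomes rank 0 of repart_comm.
        MPI_Comm_split(MPI_COMM_WORLD, device_id, world_rank, &repart_comm);

        int repart_rank;
        MPI_Comm_rank(repart_comm, &repart_rank);
        std::printf("world %d -> group %d, owner %d, repart rank %d\n",
                    world_rank, device_id, owner, repart_rank);

        MPI_Comm_free(&repart_comm);
        MPI_Finalize();
        return 0;
    }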
diff --git a/include/OGL/DevicePersistent/Vector.hpp b/include/OGL/DevicePersistent/Vector.hpp
index 4d75f21b..c8610f59 100644
--- a/include/OGL/DevicePersistent/Vector.hpp
+++ b/include/OGL/DevicePersistent/Vector.hpp
@@ -67,7 +67,7 @@ struct VectorInitFunctor {
         //// TODO store
         auto comm_pattern = compute_gather_to_owner_counts(
             exec_, repartitioner->get_ranks_per_gpu(), host_size);
-        bool host_buffer = !exec_.get_non_orig_device_comm();
+        bool host_buffer = exec_.get_gko_force_host_buffer();
 
         communicate_values(ref_exec, exec, comm, comm_pattern,
                            host_view.get_const_data(),
@@ -177,7 +177,8 @@
         auto rank = exec_.get_host_rank();
         auto ref_exec = exec_.get_ref_exec();
         auto comm = exec_.get_host_comm();
-        bool host_buffer = !exec_.get_non_orig_device_comm();
+        auto repart_comm = exec_.get_repart_comm();
+        bool host_buffer = exec_.get_gko_force_host_buffer();
 
         auto repartitioner = dist_matrix_->get_repartitioner();
         auto host_size = repartitioner->get_orig_size();
@@ -186,9 +187,36 @@
         auto comm_pattern = compute_scatter_from_owner_counts(
             exec_, repartitioner->get_ranks_per_gpu(), host_size);
 
-        communicate_values(exec, ref_exec, comm, comm_pattern,
-                           get_vector()->get_local_values(),
-                           const_cast(memory_), host_buffer);
+        label owner_rank = exec_.get_owner_rank();
+        auto repartAllToAll =
+            compute_repart_allToall(exec_, comm_pattern, owner_rank);
+
+        // if (owner_rank != Pstream::myProcNo()){
+        //     label recv_count = repartAllToAll.recv_counts[0];
+        //     repartAllToAll.recv_counts[Pstream::myProcNo()] = recv_count;
+        //     repartAllToAll.recv_counts[0] = 0;
+        // }
+
+        // NOTE instead of all_to_all_v based communication MPI_Iscatterv
+        // seems to be preferable
+        // communicate_values(exec, ref_exec, comm, comm_pattern,
+        //                    get_vector()->get_local_values(),
+        //                    const_cast(memory_), host_buffer);
+
+        label send_size = comm_pattern.send_offsets.back();
+        auto send_view = gko::array::const_view(
+            exec, send_size, get_vector()->get_local_values());
+        auto tmp = gko::array(exec, send_size);
+
+        tmp = send_view;
+        tmp.set_executor(ref_exec);
+
+        MPI_Request copy_back_req;
+        MPI_Iscatterv(tmp.get_data(), repartAllToAll.send_counts.data(),
+                      repartAllToAll.send_offsets.data(), MPI_DOUBLE,
+                      const_cast(memory_), repartAllToAll.recv_counts[0],
+                      MPI_DOUBLE, 0, repart_comm->get(), &copy_back_req);
+        MPI_Wait(&copy_back_req, MPI_STATUS_IGNORE);
     }
 
     /** Writes the content of the distributed vector to disk
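The copy-back above replaces the all_to_all_v path with a scatter from the owning rank. A minimal, self-contained sketch of that pattern; buffer names and sizes here are hypothetical and not taken from the patch.

    #include <mpi.h>
    #include <vector>

    // Rank 0 of the repart communicator scatters contiguous slices of its
    // gathered solution back to every rank; the other ranks only receive.
    void scatter_back(MPI_Comm repart_comm, const std::vector<int> &send_counts,
                      const std::vector<int> &send_offsets,
                      const double *owner_buffer, double *local_buffer,
                      int local_count)
    {
        MPI_Request req;
        MPI_Iscatterv(owner_buffer, send_counts.data(), send_offsets.data(),
                      MPI_DOUBLE, local_buffer, local_count, MPI_DOUBLE,
                      /*root=*/0, repart_comm, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }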
diff --git a/include/OGL/StoppingCriterion.hpp b/include/OGL/StoppingCriterion.hpp
index 0f70bd86..944acd60 100644
--- a/include/OGL/StoppingCriterion.hpp
+++ b/include/OGL/StoppingCriterion.hpp
@@ -147,6 +147,8 @@ class StoppingCriterion {
 
     const label frequency_;
 
+    const word frequencyMode_;
+
     const scalar relaxationFactor_;
 
     const bool adapt_minIter_;
@@ -172,6 +174,8 @@ class StoppingCriterion {
           norm_eval_limit_(
               controlDict.lookupOrDefault("normEvalLimit", label(100))),
           frequency_(controlDict.lookupOrDefault("evalFrequency", label(1))),
+          frequencyMode_(controlDict.lookupOrDefault(
+              "evalFrequencyMode", word("relative"))),  // optimizer, fixed
           relaxationFactor_(
               controlDict.lookupOrDefault("relaxationFactor", scalar(0.6))),
           adapt_minIter_(
@@ -197,21 +201,30 @@
         bool export_res, label prev_solve_iters, scalar prev_rel_cost) const
     {
+        word frequencyMode = "optimizer";
         label minIter = minIter_;
         label frequency = frequency_;
+        // in case of export_res all residuals need to be computed
         if (!export_res) {
             if (prev_solve_iters > 0 && adapt_minIter_ && prev_rel_cost > 0) {
                 minIter = prev_solve_iters * relaxationFactor_;
-                auto alpha =
-                    sqrt(1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
-                         prev_rel_cost);
-                frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
+                if (frequencyMode == "optimizer") {
+                    auto alpha = sqrt(
+                        1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
+                        prev_rel_cost);
+                    frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
+                }
+                if (frequencyMode == "relative") {
+                    frequency = label(prev_solve_iters * 0.075) + 1;
+                }
             }
         }
 
         word msg = "Creating stopping criterion with minIter " +
                    std::to_string(minIter) + " frequency " +
-                   std::to_string(frequency);
+                   std::to_string(frequency) + " prev_solve_iters " +
+                   std::to_string(prev_solve_iters) + " adapt_minIter_ " +
+                   std::to_string(adapt_minIter_) + " prev_rel_cost ";
 
         MLOG_0(verbose, msg)
diff --git a/include/OGL/lduLduBase.hpp b/include/OGL/lduLduBase.hpp
index e584adbe..3e997f77 100644
--- a/include/OGL/lduLduBase.hpp
+++ b/include/OGL/lduLduBase.hpp
@@ -216,6 +216,7 @@ class lduLduBase : public OGL_Info,
                    solverPerformance &solverPerf) const
     {
         bool fused = solver_controls_.lookupOrDefault("fuse", true);
+        exec_handler_.init_device_comm();
 
         auto repartitioner = std::make_shared(
             host_matrix_wrapper_->get_local_nrows(), ranks_per_gpu_, verbose_,
@@ -334,8 +335,8 @@ class lduLduBase : public OGL_Info,
             std::to_string(time_per_dof) + std::string(" [ns]") +
             std::string("\n\tTime per iteration and DOF: ") +
             std::to_string(time_per_iter_and_dof) + std::string(" [ns]") +
-            std::string("\n\tRetrieve results bandwidth ") +
-            std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
+            std::string("\n\tRetrieve results bandwidth ");  // +
+        std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
 
         MLOG_0(verbose_, msg)
 
         return solverPerf;
diff --git a/src/CommunicationPattern.cpp b/src/CommunicationPattern.cpp
index 192a7f6c..46502464 100644
--- a/src/CommunicationPattern.cpp
+++ b/src/CommunicationPattern.cpp
@@ -171,13 +171,40 @@ void communicate_values(const ExecutorHandler &exec_handler,
     //     std::cout << __FILE__ << ":" << __LINE__ << " received "
     //               << std::to_string(recv_size) << " elements of "
     //               << std::to_string(recv_size * 8 / 1e9)
-    //               << "[Gb] on device rank: " << std::to_string(rank) << " time "
+    //               << "[Gb] on device rank: " << std::to_string(rank) << "
+    //               time "
     //               << std::to_string(delta_t) << "[ms] bandwidth: "
     //               << std::to_string(recv_size * 8 / delta_t / 1e6)
     //               << "[Gb/s] \n";
     // }
 }
 
+AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
+                                        const AllToAllPattern allToAllIn,
+                                        label start_rank)
+{
+    auto host_comm = exec_handler.get_host_comm();
+    auto repart_comm = exec_handler.get_repart_comm();
+    auto ranks = repart_comm->size();
+
+    std::vector<label> send_counts(ranks, 0);
+    std::vector<label> send_offsets(ranks + 1, 0);
+    std::vector<label> recv_counts(ranks, 0);
+    std::vector<label> recv_offsets(ranks + 1, 0);
+
+    // label start_rank = host_comm->rank();
+    for (auto i = 0; i < ranks; i++) {
+        send_counts[i] = allToAllIn.send_counts[start_rank + i];
+        send_offsets[i] = allToAllIn.send_offsets[start_rank + i];
+        recv_counts[i] = allToAllIn.recv_counts[start_rank + i];
+        recv_offsets[i] = allToAllIn.recv_offsets[start_rank + i];
+    }
+
+    send_offsets.back() = scalar(allToAllIn.send_offsets.back());
+    recv_offsets.back() = scalar(allToAllIn.recv_offsets.back());
+    return {send_counts, send_offsets, recv_counts, recv_offsets};
+}
+
 void communicate_values(
     std::shared_ptr src_exec,
     std::shared_ptr target_exec,
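A small worked example of what compute_repart_allToall is doing, assuming 4 world ranks and 2 ranks per GPU (numbers invented for illustration): the owner of the second group (start_rank = 2) keeps only the window [2, 3] of the global counts and discards the leading zero entries.

    #include <cassert>
    #include <vector>

    int main()
    {
        // One entry per world rank in the global pattern.
        std::vector<int> global_recv_counts = {0, 0, 5, 7};
        int start_rank = 2;   // owner rank of the second group
        int repart_size = 2;  // size of the repart communicator

        std::vector<int> repart_recv_counts(repart_size, 0);
        for (int i = 0; i < repart_size; i++) {
            repart_recv_counts[i] = global_recv_counts[start_rank + i];
        }

        // Rank 0 of the repart communicator receives 5 values from itself
        // and 7 from its partner; the zero entries before the group are gone.
        assert(repart_recv_counts[0] == 5 && repart_recv_counts[1] == 7);
        return 0;
    }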
diff --git a/src/MatrixWrapper/Distributed.cpp b/src/MatrixWrapper/Distributed.cpp
index 72eaae43..6897cf7a 100644
--- a/src/MatrixWrapper/Distributed.cpp
+++ b/src/MatrixWrapper/Distributed.cpp
@@ -121,13 +121,15 @@ void generate_alltoall_update_data(
     std::vector &update_data)
 {
     label linop_offset_store{0};
-    for (size_t i = 0; i < 3; i++) {
+    // NOTE in case of symmetric matrix 0 (upper) is same as 1 (lower)
+    // thus we can start at 1
+    label start = 0;
+    for (size_t i = start; i < 3; i++) {
         label interface_size = in->get_rows()[i].size();
         label linop_idx = (fuse) ? 0 : in->get_id()[i];
         label linop_offset = (fuse) ? linop_offset_store : 0;
 
         auto comm_pattern = compute_gather_to_owner_counts(
             exec_handler, ranks_per_owner, interface_size);
-
         size_t recv_size = comm_pattern.recv_offsets.back();
 
         // NOTE Probably don't need to store linops[linop-idx] because we can
@@ -384,7 +386,7 @@ void update_impl(
     std::map linops, label verbose)
 {
     auto comm = exec_handler.get_host_comm();
-    // auto repart_comm = exec_handler.get_repart_comm();
+    auto repart_comm = exec_handler.get_repart_comm();
     auto ref_exec = exec_handler.get_ref_exec();
     auto rank = exec_handler.get_host_rank();
     auto device_exec = exec_handler.get_device_exec();
@@ -392,13 +394,63 @@ void update_impl(
     word fieldname = host_A->get_field_name();
 
     // perform all-to-all updates first
-    auto all_to_all_update = [comm, ref_exec, device_exec,
-                              all_to_all_update_data, host_A,
-                              force_host_buffer]() {
+    auto all_to_all_update = [repart_comm, ref_exec, device_exec,
+                              all_to_all_update_data, host_A, force_host_buffer,
+                              exec_handler, rank]() {
+        // NOTE if symmetric (get it from host_A) we can skip id=0 and wait till
+        // id=1 has been copied to use device copy
+        //
         for (auto [id, comm_pattern, data_ptr] : all_to_all_update_data) {
+            // auto start = std::chrono::steady_clock::now();
+            auto repartAllToAll =
+                compute_repart_allToall(exec_handler, comm_pattern, rank);
+            // auto end = std::chrono::steady_clock::now();
+            // auto delta_t =
+            //     std::chrono::duration_cast(end -
+            //     start).count()/1000.0;
+            // std::cout << __FILE__ << ":" << "delta t " << delta_t << " [ms]\n";
+
             auto [length, send_data_ptr] = host_A->get_interface_data(id);
-            communicate_values(ref_exec, device_exec, comm, comm_pattern,
-                               send_data_ptr, data_ptr, force_host_buffer);
+            // communicate_values(ref_exec, device_exec, repart_comm,
+            //                    repartAllToAll,
+            //                    send_data_ptr, data_ptr, force_host_buffer);
+            // std::cout << __FILE__ <<
+            //     " Pstream::rank " << Pstream::myProcNo() <<
+            //     " repart_rank() " << repart_comm->rank() <<
+            //     " send_offsets.back() " <<
+            //     " id " << id <<
+            //     repartAllToAll.send_offsets.back() << " recv_counts: " <<
+            //     repartAllToAll.recv_counts << " recv_offsets: " <<
+            //     repartAllToAll.recv_offsets <<
+            //     std::endl;
+
+            if (id == 0 && host_A->get_symmetric()) {
+            } else {
+                MPI_Request request;
+                MPI_Igatherv(send_data_ptr, repartAllToAll.send_offsets.back(),
+                             MPI_DOUBLE, data_ptr,
+                             repartAllToAll.recv_counts.data(),
+                             repartAllToAll.recv_offsets.data(), MPI_DOUBLE, 0,
+                             repart_comm->get(), &request);
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+
+            // Perform symmetric inter device copy
+            if (id == 1 && repart_comm->rank() == 0 &&
+                host_A->get_symmetric()) {
+                auto [zid, zcomm_pattern, zdata_ptr] =
+                    all_to_all_update_data[0];
+                // copy recv size data from data_ptr to zdata_ptr
+                //
+                label recv_buffer_size = repartAllToAll.recv_offsets.back();
+                auto l_view = gko::array::view(
+                    device_exec, recv_buffer_size, data_ptr);
+
+                auto u_view = gko::array::view(
+                    device_exec, recv_buffer_size, zdata_ptr);
+
+                u_view = l_view;
+            }
         }
     };
@@ -499,7 +551,6 @@ std::shared_ptr create_impl(
     label rank = exec_handler.get_host_rank();
     auto exec = exec_handler.get_ref_exec();
     auto host_comm = *exec_handler.get_host_comm().get();
-    exec_handler.init_device_comm();
     auto device_comm = *exec_handler.get_device_comm().get();
 
     bool owner = repartitioner->is_owner(exec_handler);
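The symmetric branch above avoids a second gather by reusing the already-received lower interface data for the upper interface through a device-side view assignment. A sketch of that view-to-view copy with Ginkgo arrays; the function and pointer names are hypothetical.

    #include <ginkgo/ginkgo.hpp>

    // Assign one equally sized device view to another; Ginkgo copies the
    // values on the executor owning the views, so the data never leaves the
    // device and no extra MPI message is required.
    void copy_lower_to_upper(std::shared_ptr<const gko::Executor> exec,
                             double *lower, double *upper, gko::size_type n)
    {
        auto lower_view = gko::array<double>::view(exec, n, lower);
        auto upper_view = gko::array<double>::view(exec, n, upper);
        upper_view = lower_view;
    }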
diff --git a/src/Repartitioner.cpp b/src/Repartitioner.cpp
index 33269aee..fe26ecc4 100644
--- a/src/Repartitioner.cpp
+++ b/src/Repartitioner.cpp
@@ -27,7 +27,6 @@ Repartitioner::repartition_sparsity(
 {
     LOG_1(verbose_, "start repartition sparsity pattern")
 
-    auto exec = exec_handler.get_ref_exec();
     auto comm = *exec_handler.get_host_comm().get();
     label rank = exec_handler.get_host_rank();
@@ -67,7 +66,6 @@ Repartitioner::repartition_sparsity(
         return gather_closure(comm_pattern, tmp, offset);
     };
 
-
     /* Helper function, create and return gathered sparsity pattern based on
      * in_sparsity
      * */
diff --git a/src/StoppingCriterion.cpp b/src/StoppingCriterion.cpp
index 38f9c13d..07c7c71b 100644
--- a/src/StoppingCriterion.cpp
+++ b/src/StoppingCriterion.cpp
@@ -45,9 +45,14 @@ StoppingCriterion::OpenFOAMDistStoppingCriterion::compute_normfactor_dist(
         dist_vec::create(device_exec, comm, global_size, local_size));
     Axref->fill(0.0);
 
+    auto start_axref = std::chrono::steady_clock::now();
     compute_Axref_dist(global_size[0], local_size[0], device_exec, gkomatrix, x,
                        Axref);
-
+    auto end_axref = std::chrono::steady_clock::now();
+    auto delta_t_axref = std::chrono::duration_cast(
+                             end_axref - start_axref)
+                             .count() /
+                         1.0;
     auto unity = gko::initialize>(1, {1.0}, device_exec);
diff --git a/test/unit/MatrixWrapper/Distributed.cpp b/test/unit/MatrixWrapper/Distributed.cpp
index 496a6c0c..31b055f8 100644
--- a/test/unit/MatrixWrapper/Distributed.cpp
+++ b/test/unit/MatrixWrapper/Distributed.cpp
@@ -56,6 +56,7 @@ class Environment : public testing::Environment {
                                       Foam::IOobject::MUST_READ),
                       false);
 
+        // FIXME this needs the device_id_handler
         exec = std::make_shared(runTime_->thisDb(), dict, "dummy", true);
@@ -172,6 +173,7 @@ TEST_P(DistMatL2D, canCreateDistributedMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
 
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
@@ -182,6 +184,7 @@ TEST_P(DistMatL2D, canCreateDistributedMatrix)
     gko::dim<2> global_vec_dim{repartitioner->get_orig_partition()->get_size(),
                                1};
     gko::dim<2> local_vec_dim{repartitioner->get_repart_dim()[0], 1};
+    exec.init_device_comm();
 
     auto distributed = create_distributed(exec, repartitioner, hostMatrix,
                                           matrix_format, fused, 0);
@@ -200,6 +203,8 @@ TEST_P(DistMatL2D, hasCorrectLocalMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto repartitioner = std::make_shared(
@@ -254,6 +259,8 @@ TEST_P(DistMatL2D, hasCorrectNonLocalMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto name = ((Environment *)global_env)->name_;
@@ -292,6 +299,8 @@ TEST_P(DistMatL2D, hasCorrectNonLocalMatrix)
 TEST_P(DistMatL2D, canApplyCorrectly)
 {
     auto [ranks_per_gpu, format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto name = ((Environment *)global_env)->name_;
@@ -314,12 +323,15 @@ TEST_P(DistMatL2D, canApplyCorrectly)
     x->fill(0);
 
     // Act
-    distributed->apply(b, x);
-    auto res_x = std::vector(
-        x->get_local_vector()->get_const_values(),
-        x->get_local_vector()->get_const_values() + local_vec_dim[0]);
-
-    ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
+    bool active = repartitioner->get_repart_size() != 0;
+    if (active) {
+        distributed->apply(b, x);
+        auto res_x = std::vector(
+            x->get_local_vector()->get_const_values(),
+            x->get_local_vector()->get_const_values() + local_vec_dim[0]);
+
+        ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
+    }
 }
 
 int main(int argc, char *argv[])
diff --git a/test/unit/Repartitioner1D.cpp b/test/unit/Repartitioner1D.cpp
index bec8af74..4305f025 100644
--- a/test/unit/Repartitioner1D.cpp
+++ b/test/unit/Repartitioner1D.cpp
@@ -136,6 +136,7 @@ TEST_P(RepartitionerFixture1D, can_repartition_sparsity_pattern)
     // Arrange
     auto ranks_per_gpu = GetParam();
     auto repartitioner = Repartitioner(local_size, ranks_per_gpu, 0, exec);
+    exec.init_device_comm();
 
     // std::vector