diff --git a/.github/workflows/compile_on_aws.yml b/.github/workflows/compile_on_aws.yml
deleted file mode 100644
index bdde16e7..00000000
--- a/.github/workflows/compile_on_aws.yml
+++ /dev/null
@@ -1,152 +0,0 @@
-# SPDX-FileCopyrightText: 2024 OGL authors
-#
-# SPDX-License-Identifier: GPL-3.0-or-later
-
-name: Compile on AWS
-run-name: Compile on AWS
-
-on:
-  pull_request:
-    types: synchronize
-jobs:
-  start-runner:
-    if: contains(github.event.pull_request.labels.*.name, 'full_ci')
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: read
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308634587211:role/Github-OIDC-Role-29bocUD8VBZr
-          aws-region: us-east-1
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: HendriceH/ec2-github-runner@v1.10 # Starts 60GB Root + 30 GB Share volume
-        with:
-          mode: start
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          ec2-image-id: ami-03af087024bfdbbee # Deep learning AMI
-          ec2-instance-type: g4dn.xlarge
-          iam-role-name: Role4Github
-          subnet-id: subnet-b5d2adbb
-          security-group-id: sg-559f8967
-          aws-resource-tags: > # optional, requires additional permissions
-            [
-              {"Key": "ucfd-project", "Value": "BMBF_2022_EXASIM"},
-              {"Key": "ucfd-client", "Value": "UCFD-RD"},
-              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"}
-            ]
-          pre-runner-script: |
-            #!/bin/bash
-            sudo yum update -y && \
-            sudo yum install docker git libicu ninja-build libasan10 -y
-            sudo amazon-linux-extras install epel -y
-            sudo yum install Lmod -y
-            sudo systemctl enable docker
-            sudo curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash
-            sudo mkfs -t xfs /dev/sda1
-            sudo mkdir -p /share
-            sudo mount /dev/sda1 /share
-            aws s3 cp s3://ucfd-share/pcluster/3.x/alinux2/x86_64/postinstall_github .
-            chmod +x postinstall_github
-            sudo ./postinstall_github > ~/install.log
-            ln -s /share/software/cmake/3.27.8/share/cmake-3.27 /usr/share/cmake-3.27
-            mkdir -p /share/ec2-user
-            export USER=ec2-user
-  do-the-job:
-    name: Do the job on the runner
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
-    steps:
-      - name: Checkout OGL
-        uses: actions/checkout@v2
-        with:
-          ref: dev
-      - name: Test env
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          env
-      - name: Install OBR
-        shell: bash -o pipefail -i {0}
-        run: |
-          module load conda
-          conda activate
-          python3 -m pip install --upgrade pip
-          pip install setuptools --upgrade
-          git clone https://github.com/exasim-project/OBR
-          cd OBR
-          pip install .
-          obr --version
-      - name: Config
-        if: always()
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          cmake --list-preset
-          cmake -DOGL_CUDA_ARCHITECTURES=52 --preset ninja-cuda-release
-      - name: Build and install
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          module load cmake
-          cmake --build --preset ninja-cuda-release --target install
-      - name: Run integration tests
-        shell: bash -o pipefail -i {0}
-        run: |
-          export HOME=/share/ec2-user
-          module load conda
-          conda activate
-          module load gnu/10
-          module load libfabric-aws
-          module load openmpi
-          module load OpenFOAM
-          export GINKGO_EXECUTOR=cuda
-          obr init -g --config test/integration.yaml
-          obr run -o runParallelSolver
-          obr status
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - do-the-job # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    permissions:
-      id-token: write
-      contents: read
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308634587211:role/Github-OIDC-Role-29bocUD8VBZr
-          aws-region: us-east-1
-      - name: Stop EC2 runner
-        uses: HendriceH/ec2-github-runner@v1.10
-        with:
-          mode: stop
-          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/.github/workflows/static_checks.yaml b/.github/workflows/static_checks.yaml
index f5f51fdc..49e11674 100644
--- a/.github/workflows/static_checks.yaml
+++ b/.github/workflows/static_checks.yaml
@@ -67,7 +67,7 @@ jobs:
       run: |
         git config --global --add safe.directory /__w/OGL/OGL
         # Create list of all source files belonging to this repository
-        git ls-files | grep -E "\.(C)" > pattern
+        git ls-files | grep -E "\.(cpp)" > pattern
         # Create list of .cpp files that are in this repository and part of the
         # compilation database
         # also filters out " at the begin and end of the filename
diff --git a/include/OGL/CommunicationPattern.hpp b/include/OGL/CommunicationPattern.hpp
index 7fcf542e..3e7134c3 100644
--- a/include/OGL/CommunicationPattern.hpp
+++ b/include/OGL/CommunicationPattern.hpp
@@ -19,6 +19,18 @@ struct AllToAllPattern {
     std::vector<label> recv_offsets;
 };
 
+/* @brief computes the AllToAllPattern for the repartitioned communicator from
+ * the global allToAll pattern by discarding all zero communication before and
+ * after the repartitioner scope.
+ *
+ * @param exec_handler The executor handler
+ * @param allToAll The original allToAll pattern
+ * @param start_rank the original comm_world rank
+ */
+AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
+                                        const AllToAllPattern allToAll,
+                                        label start_rank);
+
 /* @brief This function computes the send and recv counts vectors and the send
  * and recv offsets vectors for scattering from an owner to all ranks, including
  * owner itself
diff --git a/include/OGL/DevicePersistent/ExecutorHandler.hpp b/include/OGL/DevicePersistent/ExecutorHandler.hpp
index 3c9b7da5..df56b32f 100644
--- a/include/OGL/DevicePersistent/ExecutorHandler.hpp
+++ b/include/OGL/DevicePersistent/ExecutorHandler.hpp
@@ -71,6 +71,14 @@ struct DeviceIdHandler {
         return device_global_id % num_devices_per_node;
     }
 
+    /* @brief returns the owner rank on the global comm world communicator
+     */
+    label global_owner() const
+    {
+        label rank = Pstream::myProcNo();
+        return rank - (rank % ranks_per_gpu);
+    }
+
     /* @brief check if rank is an owning rank
      */
     bool is_owner() const
@@ -270,7 +278,7 @@ class ExecutorHandler
         // gko comm
         label group = device_id_handler_.compute_group();
         MPI_Comm gko_comm;
-        label host_rank = 0;
+        label host_rank = Pstream::myProcNo();
         MPI_Comm_split(MPI_COMM_WORLD, group, host_rank, &gko_comm);
 
         device_comm_ = std::make_shared(
@@ -278,7 +286,8 @@
         // repart comm
         MPI_Comm repart_comm;
-        label device_id = device_id_handler_.compute_device_id(4);
+        label global_rank = Pstream::myProcNo();
+        label device_id = global_rank / device_id_handler_.ranks_per_gpu;
         MPI_Comm_split(MPI_COMM_WORLD, device_id, host_rank, &repart_comm);
 
         repart_comm_ = std::make_shared(
@@ -300,6 +309,15 @@
      * */
     bool get_non_orig_device_comm() const { return non_orig_device_comm_; }
 
+    label get_ranks_per_gpu() const { return device_id_handler_.ranks_per_gpu; }
+
+    void set_ranks_per_gpu(label ranks_per_gpu)
+    {
+        device_id_handler_.ranks_per_gpu = ranks_per_gpu;
+    }
+
+    label get_owner_rank() const { return device_id_handler_.global_owner(); }
+
     const std::shared_ptr get_device_exec() const
     {
         return this->get_persistent_object();
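For context, a minimal sketch of how the repart communicator above is assumed to be formed: consecutive world ranks are grouped per GPU via MPI_Comm_split, and keying the split with the world rank keeps the owning rank of each group at repart rank 0. The ranks_per_gpu value and all names below are illustrative, not part of this patch.

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char **argv)
    {
        MPI_Init(&argc, &argv);
        int world_rank;
        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

        const int ranks_per_gpu = 4;                 // illustrative value
        int device_id = world_rank / ranks_per_gpu;  // color: one group per GPU
        int owner = world_rank - (world_rank % ranks_per_gpu);

        MPI_Comm repart_comm;
        // Using the world rank as key preserves the original order, so the
        // owning rank of each group becomes rank 0 of repart_comm.
        MPI_Comm_split(MPI_COMM_WORLD, device_id, world_rank, &repart_comm);

        int repart_rank;
        MPI_Comm_rank(repart_comm, &repart_rank);
        std::printf("world %d -> group %d, owner %d, repart rank %d\n",
                    world_rank, device_id, owner, repart_rank);

        MPI_Comm_free(&repart_comm);
        MPI_Finalize();
        return 0;
    }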
diff --git a/include/OGL/DevicePersistent/Vector.hpp b/include/OGL/DevicePersistent/Vector.hpp
index 4d75f21b..c8610f59 100644
--- a/include/OGL/DevicePersistent/Vector.hpp
+++ b/include/OGL/DevicePersistent/Vector.hpp
@@ -67,7 +67,7 @@ struct VectorInitFunctor {
         //// TODO store
         auto comm_pattern = compute_gather_to_owner_counts(
             exec_, repartitioner->get_ranks_per_gpu(), host_size);
-        bool host_buffer = !exec_.get_non_orig_device_comm();
+        bool host_buffer = exec_.get_gko_force_host_buffer();
 
         communicate_values(ref_exec, exec, comm, comm_pattern,
                            host_view.get_const_data(),
@@ -177,7 +177,8 @@
         auto rank = exec_.get_host_rank();
         auto ref_exec = exec_.get_ref_exec();
         auto comm = exec_.get_host_comm();
-        bool host_buffer = !exec_.get_non_orig_device_comm();
+        auto repart_comm = exec_.get_repart_comm();
+        bool host_buffer = exec_.get_gko_force_host_buffer();
 
         auto repartitioner = dist_matrix_->get_repartitioner();
         auto host_size = repartitioner->get_orig_size();
@@ -186,9 +187,36 @@
         auto comm_pattern = compute_scatter_from_owner_counts(
             exec_, repartitioner->get_ranks_per_gpu(), host_size);
 
-        communicate_values(exec, ref_exec, comm, comm_pattern,
-                           get_vector()->get_local_values(),
-                           const_cast(memory_), host_buffer);
+        label owner_rank = exec_.get_owner_rank();
+        auto repartAllToAll =
+            compute_repart_allToall(exec_, comm_pattern, owner_rank);
+
+        // if (owner_rank != Pstream::myProcNo()){
+        //     label recv_count = repartAllToAll.recv_counts[0];
+        //     repartAllToAll.recv_counts[Pstream::myProcNo()] = recv_count;
+        //     repartAllToAll.recv_counts[0] = 0;
+        // }
+
+        // NOTE instead of all_to_all_v based communication MPI_Iscatterv
+        // seems to be preferable
+        // communicate_values(exec, ref_exec, comm, comm_pattern,
+        //                    get_vector()->get_local_values(),
+        //                    const_cast(memory_), host_buffer);
+
+        label send_size = comm_pattern.send_offsets.back();
+        auto send_view = gko::array::const_view(
+            exec, send_size, get_vector()->get_local_values());
+        auto tmp = gko::array(exec, send_size);
+
+        tmp = send_view;
+        tmp.set_executor(ref_exec);
+
+        MPI_Request copy_back_req;
+        MPI_Iscatterv(tmp.get_data(), repartAllToAll.send_counts.data(),
+                      repartAllToAll.send_offsets.data(), MPI_DOUBLE,
+                      const_cast(memory_), repartAllToAll.recv_counts[0],
+                      MPI_DOUBLE, 0, repart_comm->get(), &copy_back_req);
+        MPI_Wait(&copy_back_req, MPI_STATUS_IGNORE);
     }
 
     /** Writes the content of the distributed vector to disk
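The copy-back above replaces the all_to_all_v path with a scatter from the owning rank. A minimal, self-contained sketch of that pattern; buffer names and sizes here are hypothetical and not taken from the patch.

    #include <mpi.h>
    #include <vector>

    // Rank 0 of the repart communicator scatters contiguous slices of its
    // gathered solution back to every rank; the other ranks only receive.
    void scatter_back(MPI_Comm repart_comm, const std::vector<int> &send_counts,
                      const std::vector<int> &send_offsets,
                      const double *owner_buffer, double *local_buffer,
                      int local_count)
    {
        MPI_Request req;
        MPI_Iscatterv(owner_buffer, send_counts.data(), send_offsets.data(),
                      MPI_DOUBLE, local_buffer, local_count, MPI_DOUBLE,
                      /*root=*/0, repart_comm, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
    }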
diff --git a/include/OGL/StoppingCriterion.hpp b/include/OGL/StoppingCriterion.hpp
index 0f70bd86..944acd60 100644
--- a/include/OGL/StoppingCriterion.hpp
+++ b/include/OGL/StoppingCriterion.hpp
@@ -147,6 +147,8 @@ class StoppingCriterion {
 
     const label frequency_;
 
+    const word frequencyMode_;
+
     const scalar relaxationFactor_;
 
     const bool adapt_minIter_;
@@ -172,6 +174,8 @@ class StoppingCriterion {
           norm_eval_limit_(
               controlDict.lookupOrDefault("normEvalLimit", label(100))),
           frequency_(controlDict.lookupOrDefault("evalFrequency", label(1))),
+          frequencyMode_(controlDict.lookupOrDefault(
+              "evalFrequencyMode", word("relative"))),  // optimizer, fixed
           relaxationFactor_(
               controlDict.lookupOrDefault("relaxationFactor", scalar(0.6))),
           adapt_minIter_(
@@ -197,21 +201,30 @@
         bool export_res, label prev_solve_iters, scalar prev_rel_cost) const
     {
+        word frequencyMode = "optimizer";
         label minIter = minIter_;
         label frequency = frequency_;
+        // in case of export_res all residuals need to be computed
         if (!export_res) {
             if (prev_solve_iters > 0 && adapt_minIter_ && prev_rel_cost > 0) {
                 minIter = prev_solve_iters * relaxationFactor_;
-                auto alpha =
-                    sqrt(1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
-                         prev_rel_cost);
-                frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
+                if (frequencyMode == "optimizer") {
+                    auto alpha = sqrt(
+                        1.0 / (prev_solve_iters * (1.0 - relaxationFactor_)) *
+                        prev_rel_cost);
+                    frequency = min(norm_eval_limit_, max(1, label(1 / alpha)));
+                }
+                if (frequencyMode == "relative") {
+                    frequency = label(prev_solve_iters * 0.075) + 1;
+                }
             }
         }
 
         word msg = "Creating stopping criterion with minIter " +
                    std::to_string(minIter) + " frequency " +
-                   std::to_string(frequency);
+                   std::to_string(frequency) + " prev_solve_iters " +
+                   std::to_string(prev_solve_iters) + " adapt_minIter_ " +
+                   std::to_string(adapt_minIter_) + " prev_rel_cost ";
 
         MLOG_0(verbose, msg)
diff --git a/include/OGL/lduLduBase.hpp b/include/OGL/lduLduBase.hpp
index e584adbe..3e997f77 100644
--- a/include/OGL/lduLduBase.hpp
+++ b/include/OGL/lduLduBase.hpp
@@ -216,6 +216,7 @@ class lduLduBase : public OGL_Info,
                    solverPerformance &solverPerf) const
     {
         bool fused = solver_controls_.lookupOrDefault("fuse", true);
+        exec_handler_.init_device_comm();
 
         auto repartitioner = std::make_shared(
             host_matrix_wrapper_->get_local_nrows(), ranks_per_gpu_, verbose_,
@@ -334,8 +335,8 @@ class lduLduBase : public OGL_Info,
             std::to_string(time_per_dof) + std::string(" [ns]") +
             std::string("\n\tTime per iteration and DOF: ") +
             std::to_string(time_per_iter_and_dof) + std::string(" [ns]") +
-            std::string("\n\tRetrieve results bandwidth ") +
-            std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
+            std::string("\n\tRetrieve results bandwidth ");  // +
+        std::to_string(bandwidth_copy_back) + std::string(" [GByte/s]");
 
         MLOG_0(verbose_, msg)
 
         return solverPerf;
diff --git a/src/CommunicationPattern.cpp b/src/CommunicationPattern.cpp
index 192a7f6c..46502464 100644
--- a/src/CommunicationPattern.cpp
+++ b/src/CommunicationPattern.cpp
@@ -171,13 +171,40 @@ void communicate_values(const ExecutorHandler &exec_handler,
     //     std::cout << __FILE__ << ":" << __LINE__ << " received "
     //               << std::to_string(recv_size) << " elements of "
     //               << std::to_string(recv_size * 8 / 1e9)
-    //               << "[Gb] on device rank: " << std::to_string(rank) << " time "
+    //               << "[Gb] on device rank: " << std::to_string(rank) << "
+    //               time "
     //               << std::to_string(delta_t) << "[ms] bandwidth: "
     //               << std::to_string(recv_size * 8 / delta_t / 1e6)
     //               << "[Gb/s] \n";
     // }
 }
 
+AllToAllPattern compute_repart_allToall(const ExecutorHandler &exec_handler,
+                                        const AllToAllPattern allToAllIn,
+                                        label start_rank)
+{
+    auto host_comm = exec_handler.get_host_comm();
+    auto repart_comm = exec_handler.get_repart_comm();
+    auto ranks = repart_comm->size();
+
+    std::vector<label> send_counts(ranks, 0);
+    std::vector<label> send_offsets(ranks + 1, 0);
+    std::vector<label> recv_counts(ranks, 0);
+    std::vector<label> recv_offsets(ranks + 1, 0);
+
+    // label start_rank = host_comm->rank();
+    for (auto i = 0; i < ranks; i++) {
+        send_counts[i] = allToAllIn.send_counts[start_rank + i];
+        send_offsets[i] = allToAllIn.send_offsets[start_rank + i];
+        recv_counts[i] = allToAllIn.recv_counts[start_rank + i];
+        recv_offsets[i] = allToAllIn.recv_offsets[start_rank + i];
+    }
+
+    send_offsets.back() = scalar(allToAllIn.send_offsets.back());
+    recv_offsets.back() = scalar(allToAllIn.recv_offsets.back());
+    return {send_counts, send_offsets, recv_counts, recv_offsets};
+}
+
 void communicate_values(
     std::shared_ptr src_exec,
     std::shared_ptr target_exec,
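A small worked example of what compute_repart_allToall is doing, assuming 4 world ranks and 2 ranks per GPU (numbers invented for illustration): the owner of the second group (start_rank = 2) keeps only the window [2, 3] of the global counts and discards the leading zero entries.

    #include <cassert>
    #include <vector>

    int main()
    {
        // One entry per world rank in the global pattern.
        std::vector<int> global_recv_counts = {0, 0, 5, 7};
        int start_rank = 2;   // owner rank of the second group
        int repart_size = 2;  // size of the repart communicator

        std::vector<int> repart_recv_counts(repart_size, 0);
        for (int i = 0; i < repart_size; i++) {
            repart_recv_counts[i] = global_recv_counts[start_rank + i];
        }

        // Rank 0 of the repart communicator receives 5 values from itself
        // and 7 from its partner; the zero entries before the group are gone.
        assert(repart_recv_counts[0] == 5 && repart_recv_counts[1] == 7);
        return 0;
    }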
diff --git a/src/MatrixWrapper/Distributed.cpp b/src/MatrixWrapper/Distributed.cpp
index 72eaae43..6897cf7a 100644
--- a/src/MatrixWrapper/Distributed.cpp
+++ b/src/MatrixWrapper/Distributed.cpp
@@ -121,13 +121,15 @@ void generate_alltoall_update_data(
     std::vector &update_data)
 {
     label linop_offset_store{0};
-    for (size_t i = 0; i < 3; i++) {
+    // NOTE in case of symmetric matrix 0 (upper) is same as 1 (lower)
+    // thus we can start at 1
+    label start = 0;
+    for (size_t i = start; i < 3; i++) {
         label interface_size = in->get_rows()[i].size();
         label linop_idx = (fuse) ? 0 : in->get_id()[i];
         label linop_offset = (fuse) ? linop_offset_store : 0;
 
         auto comm_pattern = compute_gather_to_owner_counts(
             exec_handler, ranks_per_owner, interface_size);
-
         size_t recv_size = comm_pattern.recv_offsets.back();
 
         // NOTE Probably don't need to store linops[linop-idx] because we can
@@ -384,7 +386,7 @@ void update_impl(
     std::map linops, label verbose)
 {
     auto comm = exec_handler.get_host_comm();
-    // auto repart_comm = exec_handler.get_repart_comm();
+    auto repart_comm = exec_handler.get_repart_comm();
     auto ref_exec = exec_handler.get_ref_exec();
     auto rank = exec_handler.get_host_rank();
     auto device_exec = exec_handler.get_device_exec();
@@ -392,13 +394,63 @@ void update_impl(
     word fieldname = host_A->get_field_name();
 
     // perform all-to-all updates first
-    auto all_to_all_update = [comm, ref_exec, device_exec,
-                              all_to_all_update_data, host_A,
-                              force_host_buffer]() {
+    auto all_to_all_update = [repart_comm, ref_exec, device_exec,
+                              all_to_all_update_data, host_A, force_host_buffer,
+                              exec_handler, rank]() {
+        // NOTE if symmetric (get it from host_A) we can skip id=0 and wait till
+        // id=1 has been copied to use device copy
+        //
         for (auto [id, comm_pattern, data_ptr] : all_to_all_update_data) {
+            // auto start = std::chrono::steady_clock::now();
+            auto repartAllToAll =
+                compute_repart_allToall(exec_handler, comm_pattern, rank);
+            // auto end = std::chrono::steady_clock::now();
+            // auto delta_t =
+            //     std::chrono::duration_cast(end -
+            //     start).count()/1000.0;
+            // std::cout << __FILE__ << ":" << "delta t " << delta_t << " [ms]\n";
+
             auto [length, send_data_ptr] = host_A->get_interface_data(id);
-            communicate_values(ref_exec, device_exec, comm, comm_pattern,
-                               send_data_ptr, data_ptr, force_host_buffer);
+            // communicate_values(ref_exec, device_exec, repart_comm,
+            //                    repartAllToAll,
+            //                    send_data_ptr, data_ptr, force_host_buffer);
+            // std::cout << __FILE__ <<
+            //     " Pstream::rank " << Pstream::myProcNo() <<
+            //     " repart_rank() " << repart_comm->rank() <<
+            //     " send_offsets.back() " <<
+            //     " id " << id <<
+            //     repartAllToAll.send_offsets.back() << " recv_counts: " <<
+            //     repartAllToAll.recv_counts << " recv_offsets: " <<
+            //     repartAllToAll.recv_offsets <<
+            //     std::endl;
+
+            if (id == 0 && host_A->get_symmetric()) {
+            } else {
+                MPI_Request request;
+                MPI_Igatherv(send_data_ptr, repartAllToAll.send_offsets.back(),
+                             MPI_DOUBLE, data_ptr,
+                             repartAllToAll.recv_counts.data(),
+                             repartAllToAll.recv_offsets.data(), MPI_DOUBLE, 0,
+                             repart_comm->get(), &request);
+                MPI_Wait(&request, MPI_STATUS_IGNORE);
+            }
+
+            // Perform symmetric inter device copy
+            if (id == 1 && repart_comm->rank() == 0 &&
+                host_A->get_symmetric()) {
+                auto [zid, zcomm_pattern, zdata_ptr] =
+                    all_to_all_update_data[0];
+                // copy recv size data from data_ptr to zdata_ptr
+                //
+                label recv_buffer_size = repartAllToAll.recv_offsets.back();
+                auto l_view = gko::array::view(
+                    device_exec, recv_buffer_size, data_ptr);
+
+                auto u_view = gko::array::view(
+                    device_exec, recv_buffer_size, zdata_ptr);
+
+                u_view = l_view;
+            }
         }
     };
@@ -499,7 +551,6 @@ std::shared_ptr create_impl(
     label rank = exec_handler.get_host_rank();
     auto exec = exec_handler.get_ref_exec();
     auto host_comm = *exec_handler.get_host_comm().get();
-    exec_handler.init_device_comm();
     auto device_comm = *exec_handler.get_device_comm().get();
 
     bool owner = repartitioner->is_owner(exec_handler);
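The symmetric branch above avoids a second gather by reusing the already-received lower interface data for the upper interface through a device-side view assignment. A sketch of that view-to-view copy with Ginkgo arrays; the function and pointer names are hypothetical.

    #include <ginkgo/ginkgo.hpp>

    // Assign one equally sized device view to another; Ginkgo copies the
    // values on the executor owning the views, so the data never leaves the
    // device and no extra MPI message is required.
    void copy_lower_to_upper(std::shared_ptr<const gko::Executor> exec,
                             double *lower, double *upper, gko::size_type n)
    {
        auto lower_view = gko::array<double>::view(exec, n, lower);
        auto upper_view = gko::array<double>::view(exec, n, upper);
        upper_view = lower_view;
    }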
diff --git a/src/Repartitioner.cpp b/src/Repartitioner.cpp
index 33269aee..fe26ecc4 100644
--- a/src/Repartitioner.cpp
+++ b/src/Repartitioner.cpp
@@ -27,7 +27,6 @@ Repartitioner::repartition_sparsity(
 {
     LOG_1(verbose_, "start repartition sparsity pattern")
 
-    auto exec = exec_handler.get_ref_exec();
     auto comm = *exec_handler.get_host_comm().get();
     label rank = exec_handler.get_host_rank();
@@ -67,7 +66,6 @@ Repartitioner::repartition_sparsity(
         return gather_closure(comm_pattern, tmp, offset);
     };
 
-
     /* Helper function, create and return gathered sparsity pattern based on
      * in_sparsity
      * */
diff --git a/src/StoppingCriterion.cpp b/src/StoppingCriterion.cpp
index 38f9c13d..07c7c71b 100644
--- a/src/StoppingCriterion.cpp
+++ b/src/StoppingCriterion.cpp
@@ -45,9 +45,14 @@ StoppingCriterion::OpenFOAMDistStoppingCriterion::compute_normfactor_dist(
         dist_vec::create(device_exec, comm, global_size, local_size));
     Axref->fill(0.0);
 
+    auto start_axref = std::chrono::steady_clock::now();
     compute_Axref_dist(global_size[0], local_size[0], device_exec, gkomatrix, x,
                        Axref);
-
+    auto end_axref = std::chrono::steady_clock::now();
+    auto delta_t_axref = std::chrono::duration_cast(
+                             end_axref - start_axref)
+                             .count() /
+                         1.0;
     auto unity = gko::initialize>(1, {1.0}, device_exec);
diff --git a/test/unit/MatrixWrapper/Distributed.cpp b/test/unit/MatrixWrapper/Distributed.cpp
index 496a6c0c..31b055f8 100644
--- a/test/unit/MatrixWrapper/Distributed.cpp
+++ b/test/unit/MatrixWrapper/Distributed.cpp
@@ -56,6 +56,7 @@ class Environment : public testing::Environment {
                                       Foam::IOobject::MUST_READ),
                       false);
 
+        // FIXME this needs the device_id_handler
         exec = std::make_shared(runTime_->thisDb(), dict, "dummy", true);
@@ -172,6 +173,7 @@ TEST_P(DistMatL2D, canCreateDistributedMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
 
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
@@ -182,6 +184,7 @@ TEST_P(DistMatL2D, canCreateDistributedMatrix)
     gko::dim<2> global_vec_dim{repartitioner->get_orig_partition()->get_size(),
                                1};
     gko::dim<2> local_vec_dim{repartitioner->get_repart_dim()[0], 1};
+    exec.init_device_comm();
 
     auto distributed = create_distributed(exec, repartitioner, hostMatrix,
                                           matrix_format, fused, 0);
@@ -200,6 +203,8 @@ TEST_P(DistMatL2D, hasCorrectLocalMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto repartitioner = std::make_shared(
@@ -254,6 +259,8 @@ TEST_P(DistMatL2D, hasCorrectNonLocalMatrix)
 {
     /* The test mesh is 6x6 grid decomposed into 4 3x3 subdomains */
     auto [ranks_per_gpu, matrix_format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto name = ((Environment *)global_env)->name_;
@@ -292,6 +299,8 @@ TEST_P(DistMatL2D, hasCorrectNonLocalMatrix)
 TEST_P(DistMatL2D, canApplyCorrectly)
 {
     auto [ranks_per_gpu, format, fused] = GetParam();
+    exec.set_ranks_per_gpu(ranks_per_gpu);
+    exec.init_device_comm();
     auto mesh = ((Environment *)global_env)->mesh;
     auto hostMatrix = ((Environment *)global_env)->hostMatrix;
     auto name = ((Environment *)global_env)->name_;
@@ -314,12 +323,15 @@ TEST_P(DistMatL2D, canApplyCorrectly)
     x->fill(0);
 
     // Act
-    distributed->apply(b, x);
-    auto res_x = std::vector(
-        x->get_local_vector()->get_const_values(),
-        x->get_local_vector()->get_const_values() + local_vec_dim[0]);
-
-    ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
+    bool active = repartitioner->get_repart_size() != 0;
+    if (active) {
+        distributed->apply(b, x);
+        auto res_x = std::vector(
+            x->get_local_vector()->get_const_values(),
+            x->get_local_vector()->get_const_values() + local_vec_dim[0]);
+
+        ASSERT_EQ(res_x, exp_x[name][fused][ranks_per_gpu][rank]);
+    }
 }
 
 int main(int argc, char *argv[])
diff --git a/test/unit/Repartitioner1D.cpp b/test/unit/Repartitioner1D.cpp
index bec8af74..4305f025 100644
--- a/test/unit/Repartitioner1D.cpp
+++ b/test/unit/Repartitioner1D.cpp
@@ -136,6 +136,7 @@ TEST_P(RepartitionerFixture1D, can_repartition_sparsity_pattern)
     // Arrange
     auto ranks_per_gpu = GetParam();
     auto repartitioner = Repartitioner(local_size, ranks_per_gpu, 0, exec);
+    exec.init_device_comm();
 
     // std::vector