From 279a14638c486eff792a26aded64edd01086bdc4 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 18:40:58 -0400
Subject: [PATCH 1/6] Avoid superfluous matrix re-init

---
 quest/src/api/operations.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index 54807c16..850a992f 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -518,7 +518,7 @@ void applyMultiControlledS(Qureg qureg, int* controls, int numControls, int targ
 
 void applyMultiStateControlledS(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    DiagMatr1 matr = getDiagMatr1({1, 1_i});
+    static const DiagMatr1 matr = getDiagMatr1({1, 1_i});
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -569,7 +569,7 @@ void applyMultiControlledT(Qureg qureg, int* controls, int numControls, int targ
 
 void applyMultiStateControlledT(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    DiagMatr1 matr = getDiagMatr1({1, 1/std::sqrt(2) + 1_i/std::sqrt(2)});
+    static const DiagMatr1 matr = getDiagMatr1({1, (1 + 1_i)/std::sqrt(2)});
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -620,11 +620,11 @@ void applyMultiControlledHadamard(Qureg qureg, int* controls, int numControls, i
 
 void applyMultiStateControlledHadamard(Qureg qureg, int* controls, int* states, int numControls, int target) {
 
-    qcomp a = 1/std::sqrt(2);
-    CompMatr1 matr = getCompMatr1({
-        {a, a}, 
-        {a,-a}});
-
+    static const qcomp a = 1 / std::sqrt(2);
+    static const CompMatr1 matr = getCompMatr1({
+        {a,  a}, 
+        {a, -a}
+    });
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matr, __func__);
 }
 
@@ -749,7 +749,7 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, int* controls, int* states,
 
     validate_mixedAmpsFitInNode(qureg, 2, __func__); // to throw SqrtSwap error, not generic CompMatr2 error
 
-    CompMatr2 matr = getCompMatr2({
+    static const CompMatr2 matr = getCompMatr2({
         {1, 0, 0, 0},
         {0, .5+.5_i, .5-.5_i, 0},
         {0, .5-.5_i, .5+.5_i, 0},
@@ -869,7 +869,7 @@ void applyMultiStateControlledPauliX(Qureg qureg, int* controls, int* states, in
     /// since it avoids all superfluous flops; check worthwhile for multi-qubit
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    CompMatr1 matrix = util_getPauliX();
+    static const CompMatr1 matrix = util_getPauliX();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -879,7 +879,7 @@ void applyMultiStateControlledPauliY(Qureg qureg, int* controls, int* states, in
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    CompMatr1 matrix = util_getPauliY();
+    static const CompMatr1 matrix = util_getPauliY();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -889,7 +889,7 @@ void applyMultiStateControlledPauliZ(Qureg qureg, int* controls, int* states, in
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
     // harmlessly re-validates, including hardcoded matrix unitarity
-    DiagMatr1 matrix = util_getPauliZ();
+    static const DiagMatr1 matrix = util_getPauliZ();
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
@@ -1424,7 +1424,8 @@ void applyMultiQubitPhaseShift(Qureg qureg, int* targets, int numTargets, qreal
     validate_targets(qureg, targets, numTargets, __func__);
 
     // treat as a (numTargets-1)-controlled 1-target diagonal matrix
-    DiagMatr1 matr = getDiagMatr1({1, std::exp(1_i * angle)});
+    // not static: the matrix depends on 'angle', and a mutable static would be shared (and raced on) by concurrent callers
+    DiagMatr1 matr = getDiagMatr1({1, std::exp(1_i * angle)});
 
     // harmlessly re-validates
     applyMultiStateControlledDiagMatr1(qureg, &targets[1], nullptr, numTargets-1, targets[0], matr);
@@ -1467,7 +1468,7 @@ void applyMultiQubitPhaseFlip(Qureg qureg, int* targets, int numTargets) {
     validate_targets(qureg, targets, numTargets, __func__);
 
     // treat as a (numTargets-1)-controlled 1-target Pauli Z
-    DiagMatr1 matr = getDiagMatr1({1, -1});
+    static const DiagMatr1 matr = getDiagMatr1({1, -1});
 
     // harmlessly re-validates
     applyMultiStateControlledDiagMatr1(qureg, &targets[1], nullptr, numTargets-1, targets[0], matr);

From 3d22661506e26412f3256f499564a0cf5ea7c7ba Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 18:53:14 -0400
Subject: [PATCH 2/6] Change accelerator function vectors to array

---
 quest/src/core/accelerator.cpp | 47 ++++++++++++++++------------------
 quest/src/core/accelerator.hpp |  4 +--
 2 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 74a7bb9e..d8d4c411 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -29,11 +29,12 @@
 #include "quest/src/cpu/cpu_subroutines.hpp"
 #include "quest/src/gpu/gpu_subroutines.hpp"
 
+#include <array>
 #include <vector>
 #include <algorithm>
 
-using std::vector;
 using std::min;
+using std::array;
 
 
 
@@ -76,31 +77,29 @@ using std::min;
 
 
 #define GET_FUNC_OPTIMISED_FOR_NUM_QUREGS(f, numquregs) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
+    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
     [std::min((int) numquregs, MAX_OPTIMISED_NUM_QUREGS + 1)]
 
 #define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS(f, numctrls) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
+    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
     [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)]
 
 #define GET_FUNC_OPTIMISED_FOR_NUM_TARGS(f, numtargs) \
-    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
+    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
     [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
 
 #define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs) \
-    (vector <ARR(f)> { \
-        ARR(f) {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
-        ARR(f) {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
-        ARR(f) {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
-        ARR(f) {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
-        ARR(f) {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
-        ARR(f) {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
-        ARR(f) {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}) \
+    (array { \
+        array {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
+        array {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
+        array {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
+        array {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
+        array {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
+        array {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
+        array {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}) \
     [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
     [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
 
-#define ARR(f) vector<decltype(&f<0,0>)>
-
 
 #define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS(funcsuffix, qureg, numquregs) \
     ((qureg.isGpuAccelerated)? \
@@ -135,19 +134,17 @@ using std::min;
 /// instances at the cost of increased code complexity/asymmetry. Consider!
 
 #define GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
-    (vector <POWER_CONJ_ARR(f)> { \
-        POWER_CONJ_ARR(f) {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<2,0,c,h>,  &f<2,1,c,h>,  &f<2,2,c,h>,  &f<2,3,c,h>,  &f<2,4,c,h>,  &f<2,5,c,h>,  &f<2,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<3,0,c,h>,  &f<3,1,c,h>,  &f<3,2,c,h>,  &f<3,3,c,h>,  &f<3,4,c,h>,  &f<3,5,c,h>,  &f<3,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<4,0,c,h>,  &f<4,1,c,h>,  &f<4,2,c,h>,  &f<4,3,c,h>,  &f<4,4,c,h>,  &f<4,5,c,h>,  &f<4,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<5,0,c,h>,  &f<5,1,c,h>,  &f<5,2,c,h>,  &f<5,3,c,h>,  &f<5,4,c,h>,  &f<5,5,c,h>,  &f<5,-1,c,h>}, \
-        POWER_CONJ_ARR(f) {&f<-1,0,c,h>, &f<-1,1,c,h>, &f<-1,2,c,h>, &f<-1,3,c,h>, &f<-1,4,c,h>, &f<-1,5,c,h>, &f<-1,-1,c,h>}}) \
+    (array { \
+        array {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
+        array {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
+        array {&f<2,0,c,h>,  &f<2,1,c,h>,  &f<2,2,c,h>,  &f<2,3,c,h>,  &f<2,4,c,h>,  &f<2,5,c,h>,  &f<2,-1,c,h>}, \
+        array {&f<3,0,c,h>,  &f<3,1,c,h>,  &f<3,2,c,h>,  &f<3,3,c,h>,  &f<3,4,c,h>,  &f<3,5,c,h>,  &f<3,-1,c,h>}, \
+        array {&f<4,0,c,h>,  &f<4,1,c,h>,  &f<4,2,c,h>,  &f<4,3,c,h>,  &f<4,4,c,h>,  &f<4,5,c,h>,  &f<4,-1,c,h>}, \
+        array {&f<5,0,c,h>,  &f<5,1,c,h>,  &f<5,2,c,h>,  &f<5,3,c,h>,  &f<5,4,c,h>,  &f<5,5,c,h>,  &f<5,-1,c,h>}, \
+        array {&f<-1,0,c,h>, &f<-1,1,c,h>, &f<-1,2,c,h>, &f<-1,3,c,h>, &f<-1,4,c,h>, &f<-1,5,c,h>, &f<-1,-1,c,h>}}) \
     [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
     [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
 
-#define POWER_CONJ_ARR(f) vector<decltype(&f<0,0,false,false>)>
-
 #define GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
     ((qureg.isGpuAccelerated)? \
         ((conj)? \
@@ -549,7 +546,7 @@ void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, SmallList c
  */
 
 
-void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<qcomp> coeffs, std::vector<Qureg> inQuregs) {
 
     // consult outQureg's deployment since others are prior validated to match
     auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS( statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index bd89bbb9..91004a92 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -28,8 +28,6 @@
 
 #include <vector>
 
-using std::vector;
-
 
 /*
  * TEMPLATE INSTANTIATION MACROS
@@ -234,7 +232,7 @@ void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, SmallList ct
  * QUREG COMBINATION
  */
 
-void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<qcomp> coeffs, std::vector<Qureg> inQuregs);
 
 void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in);
 void accel_densmatr_mixQureg_subB(qreal outProb, Qureg out, qreal inProb, Qureg in);

From 8e59c2e3243d8c8fc1325e48e396e6a1201bde8e Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 20:58:27 -0400
Subject: [PATCH 3/6] Make accelerator function arrays static

Also replace the parameter-specific macros (e.g. the numCtrls vs numTargs variants) with generic ones keyed on a 'param'
---
 quest/src/core/accelerator.cpp | 173 ++++++++++++++-------------------
 quest/src/core/accelerator.hpp |   4 +-
 2 files changed, 76 insertions(+), 101 deletions(-)

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index d8d4c411..cfaaaf58 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -47,19 +47,16 @@ using std::array;
  * number of controls or targets exceeds that which have optimised compilations, 
  * we fall back to using a generic implementation, indicated by <-1>. In essence,
  * these macros simply call func<ctrls.size()> albeit without illegally passing
- * a runtime variable as a template parameter. Note an awkward use of decltype()
- * is to workaround a GCC <12 bug with implicitly-typed vector initialisations.
- * 
- * BEWARE that these macros are single-line expressions, so they can be used in
- * braceless if/else or ternary operators - but stay vigilant!
+ * a runtime variable as a template parameter.
  */
 
 
-#define GET_FUNC_OPTIMISED_FOR_BOOL(funcname, value) \
+
+#define GET_FUNC_OPTIMISED_FOR_BOOL( funcname, value ) \
     ((value)? funcname<true> : funcname<false>)
 
 
-#define GET_FUNC_OPTIMISED_FOR_TWO_BOOLS(funcname, b1, b2) \
+#define GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( funcname, b1, b2 ) \
     ((b1)? \
         ((b2)? funcname<true, true> : funcname<true, false>) : \
         ((b2)? funcname<false,true> : funcname<false,false>))
@@ -71,59 +68,74 @@ using std::array;
         ((value)? cpu_##funcsuffix<true, fixed1,fixed2,fixed3> : cpu_##funcsuffix<false, fixed1,fixed2,fixed3> ))
 
 
-#if (MAX_OPTIMISED_NUM_CTRLS != 5) || (MAX_OPTIMISED_NUM_TARGS != 5)
+#if (MAX_OPTIMISED_PARAM != 5)
     #error "The number of optimised, templated QuEST functions was inconsistent between accelerator's source and header."
 #endif
 
+#define GET_TEMPLATE_PARAM( param ) /* caps param at MAX_OPTIMISED_PARAM+1, the index of the generic <-1> entry */ \
+    (std::min((int) (param), MAX_OPTIMISED_PARAM + 1))
+
 
-#define GET_FUNC_OPTIMISED_FOR_NUM_QUREGS(f, numquregs) \
-    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numquregs, MAX_OPTIMISED_NUM_QUREGS + 1)]
+#define GET_ONE_PARAM_TEMPLATED_FUNC_ARRAY( f ) \
+    array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}
 
-#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS(f, numctrls) \
-    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)]
+#define GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar, funcname, param ) /* declares 'outvar'; expands to two statements, so must not be used as a braceless if/else body */ \
+    static constexpr auto funcname##_funcArray = GET_ONE_PARAM_TEMPLATED_FUNC_ARRAY( funcname ); /* name avoids the reserved _Uppercase form */ \
+    const auto outvar = (funcname##_funcArray)[GET_TEMPLATE_PARAM( param )];
 
-#define GET_FUNC_OPTIMISED_FOR_NUM_TARGS(f, numtargs) \
-    (array {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar, funcsuffix, qureg, param ) /* declares 'outvar' plus two helper locals whose names derive from it */ \
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar##_gpuFunc, gpu_##funcsuffix, param ) \
+    GET_FUNC_OPTIMISED_FOR_ONE_PARAM( outvar##_cpuFunc, cpu_##funcsuffix, param ) \
+    const auto outvar = ((qureg).isGpuAccelerated)? outvar##_gpuFunc : outvar##_cpuFunc;
 
-#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs) \
-    (array { \
+    
+#define GET_TWO_PARAM_TEMPLATED_FUNC_MATRIX( f ) \
+    array { \
         array {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
         array {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
         array {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
         array {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
         array {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
         array {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
-        array {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
+        array {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}
+
+#define GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar, funcname, param1, param2 ) /* declares 'outvar'; expands to two statements, so must not be used as a braceless if/else body */ \
+    static constexpr auto funcname##_funcMatrix = GET_TWO_PARAM_TEMPLATED_FUNC_MATRIX( funcname ); /* name avoids the reserved _Uppercase form */ \
+    const auto outvar = (funcname##_funcMatrix)[GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )];
 
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar, funcsuffix, qureg, param1, param2 ) /* declares 'outvar' plus two helper locals whose names derive from it */ \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar##_gpuFunc, gpu_##funcsuffix, param1, param2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS( outvar##_cpuFunc, cpu_##funcsuffix, param1, param2 ) \
+    const auto outvar = ((qureg).isGpuAccelerated)? outvar##_gpuFunc : outvar##_cpuFunc;
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS(funcsuffix, qureg, numquregs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( gpu_##funcsuffix, numquregs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( cpu_##funcsuffix, numquregs ))
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS(funcsuffix, qureg, numctrls) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( gpu_##funcsuffix, numctrls ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( cpu_##funcsuffix, numctrls ))
+#define GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, b1, b2 ) \
+    array { \
+        array {&f<0,0,b1,b2>,  &f<0,1,b1,b2>,  &f<0,2,b1,b2>,  &f<0,3,b1,b2>,  &f<0,4,b1,b2>,  &f<0,5,b1,b2>,  &f<0,-1,b1,b2>}, \
+        array {&f<1,0,b1,b2>,  &f<1,1,b1,b2>,  &f<1,2,b1,b2>,  &f<1,3,b1,b2>,  &f<1,4,b1,b2>,  &f<1,5,b1,b2>,  &f<1,-1,b1,b2>}, \
+        array {&f<2,0,b1,b2>,  &f<2,1,b1,b2>,  &f<2,2,b1,b2>,  &f<2,3,b1,b2>,  &f<2,4,b1,b2>,  &f<2,5,b1,b2>,  &f<2,-1,b1,b2>}, \
+        array {&f<3,0,b1,b2>,  &f<3,1,b1,b2>,  &f<3,2,b1,b2>,  &f<3,3,b1,b2>,  &f<3,4,b1,b2>,  &f<3,5,b1,b2>,  &f<3,-1,b1,b2>}, \
+        array {&f<4,0,b1,b2>,  &f<4,1,b1,b2>,  &f<4,2,b1,b2>,  &f<4,3,b1,b2>,  &f<4,4,b1,b2>,  &f<4,5,b1,b2>,  &f<4,-1,b1,b2>}, \
+        array {&f<5,0,b1,b2>,  &f<5,1,b1,b2>,  &f<5,2,b1,b2>,  &f<5,3,b1,b2>,  &f<5,4,b1,b2>,  &f<5,5,b1,b2>,  &f<5,-1,b1,b2>}, \
+        array {&f<-1,0,b1,b2>, &f<-1,1,b1,b2>, &f<-1,2,b1,b2>, &f<-1,3,b1,b2>, &f<-1,4,b1,b2>, &f<-1,5,b1,b2>, &f<-1,-1,b1,b2>}}
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS(funcsuffix, qureg, numtargs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_##funcsuffix, numtargs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_##funcsuffix, numtargs ))
+#define GET_TWO_PARAM_TWO_BOOL_TEMPLATED_FUNC_MATRIX( f ) \
+    array { \
+        array{ GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 0, 0 ), GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 0, 1 ) }, \
+        array{ GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 1, 0 ), GET_TWO_PARAM_TWO_BOOL_SUB_MATRIX( f, 1, 1 ) }}
 
-#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs) \
-    ((qureg.isGpuAccelerated)? \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs ) : \
-        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs ))
+#define GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcname, param1, param2, bool1, bool2 ) /* declares 'outvar'; expands to two statements */ \
+    static constexpr auto funcname##_funcMatrix = GET_TWO_PARAM_TWO_BOOL_TEMPLATED_FUNC_MATRIX( funcname ); /* laid out as [bool1][bool2][param1][param2] */ \
+    const auto outvar = (funcname##_funcMatrix)[(bool1)? 1 : 0][(bool2)? 1 : 0][GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )];
+
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcsuffix, qureg, param1, param2, bool1, bool2 ) /* declares 'outvar' plus two helper locals whose names derive from it */ \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar##_gpuFunc, gpu_##funcsuffix, param1, param2, bool1, bool2 ) \
+    GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar##_cpuFunc, cpu_##funcsuffix, param1, param2, bool1, bool2 ) \
+    const auto outvar = ((qureg).isGpuAccelerated)? outvar##_gpuFunc : outvar##_cpuFunc;
 
 
 /// @todo
-/// GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS as defined below
+/// GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS as defined above
 /// is used by anyCtrlAnyTargDiagMatr and anyCtrlAnyTargDenseMatr; the 
 /// latter only ever receives numTargs>=3 (due to accelerator redirecting 
 /// fewer targets to faster bespoke functions which e.g. avoid global GPU
@@ -133,38 +145,6 @@ using std::array;
 /// can ergo non-negligibly speed up compilation by avoiding these redundant 
 /// instances at the cost of increased code complexity/asymmetry. Consider!
 
-#define GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
-    (array { \
-        array {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
-        array {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
-        array {&f<2,0,c,h>,  &f<2,1,c,h>,  &f<2,2,c,h>,  &f<2,3,c,h>,  &f<2,4,c,h>,  &f<2,5,c,h>,  &f<2,-1,c,h>}, \
-        array {&f<3,0,c,h>,  &f<3,1,c,h>,  &f<3,2,c,h>,  &f<3,3,c,h>,  &f<3,4,c,h>,  &f<3,5,c,h>,  &f<3,-1,c,h>}, \
-        array {&f<4,0,c,h>,  &f<4,1,c,h>,  &f<4,2,c,h>,  &f<4,3,c,h>,  &f<4,4,c,h>,  &f<4,5,c,h>,  &f<4,-1,c,h>}, \
-        array {&f<5,0,c,h>,  &f<5,1,c,h>,  &f<5,2,c,h>,  &f<5,3,c,h>,  &f<5,4,c,h>,  &f<5,5,c,h>,  &f<5,-1,c,h>}, \
-        array {&f<-1,0,c,h>, &f<-1,1,c,h>, &f<-1,2,c,h>, &f<-1,3,c,h>, &f<-1,4,c,h>, &f<-1,5,c,h>, &f<-1,-1,c,h>}}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
-
-#define GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
-    ((qureg.isGpuAccelerated)? \
-        ((conj)? \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) : \
-        ((conj)? \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
-            ((haspower)? \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) )
-
-/// @todo
-/// The above macro spaghetti is diabolical - update using C++ metaprogamming!
-
 
 
 /*
@@ -251,7 +231,7 @@ qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, SmallList qubits, SmallLis
         error_noCtrlsGivenToBufferPacker();
 
     // note qubits may incidentally be ctrls or targs; it doesn't matter
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_packAmpsIntoBuffer, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_packAmpsIntoBuffer, qureg, qubits.size() );
     
     // return the number of packed amps, for caller convenience
     return func(qureg, qubits, qubitStates);
@@ -274,17 +254,17 @@ qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int
 
 void accel_statevec_anyCtrlSwap_subA(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ1, int targ2) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subA, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subA, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2);
 }
 void accel_statevec_anyCtrlSwap_subB(Qureg qureg, SmallList ctrls, SmallList ctrlStates) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subB, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates);
 }
 void accel_statevec_anyCtrlSwap_subC(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ, int targState) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subC, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlSwap_subC, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, targState);
 }
 
@@ -297,26 +277,26 @@ void accel_statevec_anyCtrlSwap_subC(Qureg qureg, SmallList ctrls, SmallList ctr
 
 void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ, CompMatr1 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subA, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDenseMatr_subA, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, matr);
 }
 void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, SmallList ctrls, SmallList ctrlStates, qcomp fac0, qcomp fac1) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDenseMatr_subB, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, fac0, fac1);
 }
 
 
 void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ1, int targ2, CompMatr2 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDenseMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlTwoTargDenseMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2, matr);
 }
 
 
 void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, SmallList ctrlStates, SmallList targs, CompMatr matr, bool conj, bool transp) {
 
-    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj, transp );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( func, statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj, transp );
     func(qureg, ctrls, ctrlStates, targs, matr);
 }
 
@@ -329,14 +309,14 @@ void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, SmallList ctrls, Sm
 
 void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ, DiagMatr1 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDiagMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlOneTargDiagMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ, matr);
 }
 
 
 void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, SmallList ctrls, SmallList ctrlStates, int targ1, int targ2, DiagMatr2 matr) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDiagMatr_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_anyCtrlTwoTargDiagMatr_sub, qureg, ctrls.size() );
     func(qureg, ctrls, ctrlStates, targ1, targ2, matr);
 }
 
@@ -345,7 +325,7 @@ void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, SmallList ctrls, Sma
 
     bool hasPower = exponent != qcomp(1, 0);
 
-    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( func, statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
     func(qureg, ctrls, ctrlStates, targs, matr, exponent);
 }
 
@@ -523,19 +503,19 @@ void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, SmallList ct
     // only X and Y constitute target qubits (Z merely induces a phase)
     int numTargs = x.size() + y.size();
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevector_anyCtrlPauliTensorOrGadget_subA, qureg, ctrls.size(), numTargs );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS( func, statevector_anyCtrlPauliTensorOrGadget_subA, qureg, ctrls.size(), numTargs );
     func(qureg, ctrls, states, x, y, z, f0, f1);
 }
 void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, SmallList ctrls, SmallList states, SmallList x, SmallList y, SmallList z, qcomp f0, qcomp f1, qindex mask) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlPauliTensorOrGadget_subB, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevector_anyCtrlPauliTensorOrGadget_subB, qureg, ctrls.size() );
     func(qureg, ctrls, states, x, y, z, f0, f1, mask);
 }
 
 
 void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, SmallList ctrls, SmallList states, SmallList targs, qcomp f0, qcomp f1) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlAnyTargZOrPhaseGadget_sub, qureg, ctrls.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevector_anyCtrlAnyTargZOrPhaseGadget_sub, qureg, ctrls.size() );
     func(qureg, ctrls, states, targs, f0, f1);
 }
 
@@ -549,7 +529,7 @@ void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, SmallList c
 void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, std::vector<qcomp> coeffs, std::vector<Qureg> inQuregs) {
 
     // consult outQureg's deployment since others are prior validated to match
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS( statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
     func(outQureg, coeffs, inQuregs);
 }
 
@@ -846,12 +826,9 @@ void accel_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, SmallList targs, SmallList pairTargs) {
     assert_partialTraceQuregsAreIdenticallyDeployed(inQureg, outQureg);
 
-    auto cpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_densmatr_partialTrace_sub, targs.size() );
-    auto gpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_densmatr_partialTrace_sub, targs.size() );
-
-    // inQureg == outQureg except for dimension, so use common backend
-    auto useFunc = (inQureg.isGpuAccelerated)? gpuFunc : cpuFunc;
-    useFunc(inQureg, outQureg, targs, pairTargs);
+    // inQureg == outQureg (except for dimension), so use common backend, informed by inQureg
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_partialTrace_sub, inQureg, targs.size() );
+    func(inQureg, outQureg, targs, pairTargs);
 }
 
 
@@ -877,24 +854,24 @@ qreal accel_densmatr_calcTotalProb_sub(Qureg qureg) {
 
 qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, SmallList qubits, SmallList outcomes) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
     return func(qureg, qubits, outcomes);
 }
 qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, SmallList qubits, SmallList outcomes) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );
     return func(qureg, qubits, outcomes);
 }
 
 
 void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, SmallList qubits) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
     func(outProbs, qureg, qubits);
 }
 void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, SmallList qubits) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );
     func(outProbs, qureg, qubits);
 }
 
@@ -1110,12 +1087,12 @@ qcomp accel_densmatr_calcExpecFullStateDiagMatr_sub(Qureg qureg, FullStateDiagMa
 
 void accel_statevec_multiQubitProjector_sub(Qureg qureg, SmallList qubits, SmallList outcomes, qreal prob) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_multiQubitProjector_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, statevec_multiQubitProjector_sub, qureg, qubits.size() );
     func(qureg, qubits, outcomes, prob);
 }
 void accel_densmatr_multiQubitProjector_sub(Qureg qureg, SmallList qubits, SmallList outcomes, qreal prob) {
 
-    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_multiQubitProjector_sub, qureg, qubits.size() );
+    GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_ONE_PARAM( func, densmatr_multiQubitProjector_sub, qureg, qubits.size() );
     func(qureg, qubits, outcomes, prob);
 }
 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 91004a92..507fed0c 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -42,9 +42,7 @@
  */
 
 // must match the macros below, and those in accelerator.cpp
-#define MAX_OPTIMISED_NUM_CTRLS 5
-#define MAX_OPTIMISED_NUM_TARGS 5
-#define MAX_OPTIMISED_NUM_QUREGS 5
+#define MAX_OPTIMISED_PARAM 5
 
 
 #define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS(returntype, funcname, args) \

From 8f7fcada3fe499c8d5c6edf59a31f6022f53fff7 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 21:07:47 -0400
Subject: [PATCH 4/6] Patch template ordering

---
 quest/src/core/accelerator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index cfaaaf58..53a82296 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -126,7 +126,7 @@ using std::array;
 
 #define GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcname, param1, param2, bool1, bool2 ) \
     static constexpr auto (_MATRIX_##funcname) = GET_TWO_PARAM_TWO_BOOL_TEMPLATED_FUNC_MATRIX( funcname ); \
-    const auto outvar = (_MATRIX_##funcname)[GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )][bool1][bool2];
+    const auto outvar = (_MATRIX_##funcname)[bool1][bool2][GET_TEMPLATE_PARAM( param1 )][GET_TEMPLATE_PARAM( param2 )];
 
 #define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( outvar, funcsuffix, qureg, param1, param2, bool1, bool2 ) \
     GET_FUNC_OPTIMISED_FOR_TWO_PARAMS_TWO_BOOLS( _GPU_FUNC, gpu_##funcsuffix, param1, param2, bool1, bool2 ) \

From 4ba1b34faac1f25f90be9a6118ac721e6e703372 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 23:34:47 -0400
Subject: [PATCH 5/6] Early-exit all validation when disabled

Previously, even when global_isValidationEnabled=0, the validate functions would continue to their assertThat() inner functions, which would themselves consult global_isValidationEnabled and discontinue. This meant everything within the validation function body would be executed, even when validation was actually disabled. For most functions, this was deemed a minor expense, outweighed by the boilerplate of early exit everywhere. Only several relatively expensive validation functions explicitly checked global_isValidationEnabled upfront.

Alas, the relative cost of validation vs the backend of an API function changes with varying Qureg size. For small quregs (fewer than 10 qubits), some of the validation deemed trivial (at-scale) becomes significant. It is hard to judge this by eye. This commit makes ALL validate functions check global_isValidationEnabled upfront and exit early when validation is disabled. This consistency means more duplication (wah), but removes small-qureg performance pitfalls.
---
 quest/src/core/validation.cpp | 650 +++++++++++++++++++++++++++++++---
 1 file changed, 598 insertions(+), 52 deletions(-)

diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 3b6fc18a..cf474096 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -1400,12 +1400,18 @@ bool doQuregsHaveIdenticalMemoryLayouts(Qureg a, Qureg b) {
 
 void validate_envNeverInit(bool isQuESTInit, bool isQuESTFinal, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!isQuESTInit, report::QUEST_ENV_ALREADY_INIT, caller);
     assertThat(!isQuESTFinal, report::QUEST_ENV_ALREADY_FINAL, caller);
 }
 
 void validate_newEnvDeploymentMode(int isDistrib, int isGpuAccel, int isMultithread, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // deployment flags must be boolean or auto
     tokenSubs vars = {{"${AUTO_DEPLOYMENT_FLAG}", modeflag::USE_AUTO}};
     assertThat(isDistrib     == 0 || isDistrib     == 1 || isDistrib     == modeflag::USE_AUTO, report::INVALID_OPTION_FOR_ENV_IS_DISTRIB,     vars, caller);
@@ -1435,6 +1441,9 @@ void validate_newEnvDeploymentMode(int isDistrib, int isGpuAccel, int isMultithr
 
 void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // note that we do NOT finalize MPI before erroring below, because that would necessitate
     // every node (launched by mpirun) serially print the error message, causing spam.
     // Instead, we permit the evil of every MPI process calling exit() and MPI aborting when
@@ -1448,12 +1457,18 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
 void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool sharedGpus = gpu_areAnyNodesBoundToSameGpu();
     assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
 }
 
 void validate_gpuIsCuQuantumCompatible(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int minCC = 70;
     int ourCC = gpu_getComputeCapability();
     tokenSubs vars = {
@@ -1474,6 +1489,9 @@ void validate_gpuIsCuQuantumCompatible(const char* caller) {
 
 void validate_envIsInit(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(isQuESTEnvInit(), report::QUEST_ENV_NOT_INIT, caller);
 }
 
@@ -1485,6 +1503,9 @@ void validate_envIsInit(const char* caller) {
 
 void validate_randomSeeds(unsigned* seeds, int numSeeds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only the root node's seeds are consulted, so we permit all non-root
     // nodes to have invalid parameters. All nodes however must know/agree
     // when the root node's seeds are invalid, to synchronise validation
@@ -1498,32 +1519,50 @@ void validate_randomSeeds(unsigned* seeds, int numSeeds, const char* caller) {
 
 void validate_newEpsilonValue(qreal eps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(eps >= 0, report::INVALID_NEW_EPSILON, {{"${NEW_EPS}", eps}}, caller);
 }
 
 void validate_newMaxNumReportedScalars(qindex numRows, qindex numCols, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numRows >= 0, report::INVALID_NUM_REPORTED_SCALARS, {{"${NUM_ITEMS}", numRows}}, caller);
     assertThat(numCols >= 0, report::INVALID_NUM_REPORTED_SCALARS, {{"${NUM_ITEMS}", numCols}}, caller);
 }
 
 void validate_newMaxNumReportedSigFigs(int numSigFigs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numSigFigs >= 1, report::INVALID_NUM_REPORTED_SIG_FIGS, {{"${NUM_SIG_FIGS}", numSigFigs}}, caller);
 }
 
 void validate_newNumReportedNewlines(int numNewlines, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numNewlines >= 0, report::INVALID_NUM_REPORTED_NEWLINES, {{"${NUM_NEWLINES}", numNewlines}}, caller);
 }
 
 void validate_numReportedNewlinesAboveZero(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(printer_getNumTrailingNewlines() > 0, report::INSUFFICIENT_NUM_REPORTED_NEWLINES, caller);
 }
 
 void validate_numPauliChars(const char* paulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // check position of terminal char, else default to numChars=5 (illegal)
     int numChars = 0;
     for (int i=0; i<5 && paulis[i] != '\0'; i++)
@@ -1534,6 +1573,9 @@ void validate_numPauliChars(const char* paulis, const char* caller) {
 
 void validate_reportedPauliStrStyleFlag(int flag, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(flag==0 || flag==1, report::INVALID_REPORTED_PAULI_STR_STYLE_FLAG, {{"${FLAG}",flag}}, caller);
 }
 
@@ -1707,8 +1749,6 @@ void assertQuregFitsInGpuMem(int numQubits, int isDensMatr, int isDistrib, int i
 
 void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int isGpuAccel, int isMultithread, QuESTEnv env, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -1724,6 +1764,9 @@ void validate_newQuregParams(int numQubits, int isDensMatr, int isDistrib, int i
 
 void validate_newQuregAllocs(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // this validation is called AFTER the caller has checked for failed
     // allocs and (in that scenario) freed every pointer, but does not 
     // overwrite any pointers to nullptr, so the failed alloc is known.
@@ -1752,6 +1795,9 @@ void validate_newQuregAllocs(Qureg qureg, const char* caller) {
 
 void validate_quregFields(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // attempt to detect the Qureg was not initialised with createQureg by the 
     // struct fields being randomised, and ergo being dimensionally incompatible
     bool valid = true;
@@ -1781,11 +1827,17 @@ void validate_quregFields(Qureg qureg, const char* caller) {
 
 void validate_quregIsStateVector(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!qureg.isDensityMatrix, report::QUREG_NOT_STATE_VECTOR, caller);
 }
 
 void validate_quregIsDensityMatrix(Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(qureg.isDensityMatrix, report::QUREG_NOT_DENSITY_MATRIX, caller);
 }
 
@@ -2025,6 +2077,10 @@ void assertNewMatrixParamsAreValid(int numQubits, int useDistrib, int useGpu, in
 }
 
 void validate_newCompMatrParams(int numQubits, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // CompMatr can never be distributed nor multithreaded
@@ -2040,6 +2096,10 @@ void validate_newCompMatrParams(int numQubits, const char* caller) {
     assertNewMatrixParamsAreValid(numQubits, useDistrib, useGpu, useMultithread, isDenseType, caller);
 }
 void validate_newDiagMatrParams(int numQubits, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // DiagMatr can never be distributed nor multithreaded
@@ -2055,6 +2115,10 @@ void validate_newDiagMatrParams(int numQubits, const char* caller) {
     assertNewMatrixParamsAreValid(numQubits, useDistrib, useGpu, useMultithread, isDenseType, caller);
 }
 void validate_newFullStateDiagMatrParams(int numQubits, int useDistrib, int useGpu, int useMultithread, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_envIsInit(caller);
 
     // FullStateDiagMatr stores only the diagonals
@@ -2127,6 +2191,9 @@ void assertNewMatrixAllocsSucceeded(T matr, size_t numBytes, const char* caller)
 
 void validate_newMatrixAllocs(CompMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = true;
     int numNodes = 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2134,6 +2201,9 @@ void validate_newMatrixAllocs(CompMatr matr, const char* caller) {
 }
 void validate_newMatrixAllocs(DiagMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = false;
     int numNodes = 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2141,6 +2211,9 @@ void validate_newMatrixAllocs(DiagMatr matr, const char* caller) {
 }
 void validate_newMatrixAllocs(FullStateDiagMatr matr, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isDenseMatrix = false;
     int numNodes = (matr.isDistributed)? comm_getNumNodes() : 1;
     size_t numBytes = mem_getLocalMatrixMemoryRequired(matr.numQubits, isDenseMatrix, numNodes);
@@ -2155,6 +2228,9 @@ void validate_newMatrixAllocs(FullStateDiagMatr matr, const char* caller) {
 
 void validate_matrixNumNewElems(int numQubits, vector<vector<qcomp>> elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // CompMatr accept 2D elems   
     qindex dim = powerOf2(numQubits);
     tokenSubs vars = {
@@ -2176,6 +2252,9 @@ void validate_matrixNumNewElems(int numQubits, vector<vector<qcomp>> elems, cons
 }
 void validate_matrixNumNewElems(int numQubits, vector<qcomp> elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // DiagMatr accept 1D elems
     qindex dim = powerOf2(numQubits);
     tokenSubs vars = {
@@ -2188,11 +2267,17 @@ void validate_matrixNumNewElems(int numQubits, vector<qcomp> elems, const char*
 
 void validate_matrixNewElemsPtrNotNull(qcomp* elems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(mem_isAllocated(elems), report::DIAG_MATR_NEW_ELEMS_NULL_PTR, caller);
 }
 
 void validate_matrixNewElemsPtrNotNull(qcomp** elems, qindex numRows, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // messages are suitable for all dense matrices, including SuperOp
 
     assertThat(mem_isOuterAllocated(elems), report::DENSE_MATR_NEW_ELEMS_OUTER_NULL_PTR, caller);
@@ -2203,6 +2288,9 @@ void validate_matrixNewElemsPtrNotNull(qcomp** elems, qindex numRows, const char
 
 void validate_fullStateDiagMatrNewElems(FullStateDiagMatr matr, qindex startInd, qindex numElems, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         startInd >= 0 && startInd < matr.numElems, 
         report::FULL_STATE_DIAG_MATR_NEW_ELEMS_INVALID_START_INDEX, 
@@ -2234,6 +2322,9 @@ void validate_fullStateDiagMatrNewElems(FullStateDiagMatr matr, qindex startInd,
 
 void validate_matrixNumQubitsMatchesParam(int numMatrQubits, int numSetterQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_SETTER_QUBITS}", numSetterQubits},
         {"${NUM_MATRIX_QUBITS}", numMatrQubits}};
@@ -2243,6 +2334,9 @@ void validate_matrixNumQubitsMatchesParam(int numMatrQubits, int numSetterQubits
 
 void validate_declaredNumElemsMatchesVectorLength(qindex numElems, qindex vecLength, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_ELEMS}", numElems},
         {"${VEC_LENGTH}", vecLength}};
@@ -2252,6 +2346,9 @@ void validate_declaredNumElemsMatchesVectorLength(qindex numElems, qindex vecLen
 
 void validate_multiVarFuncQubits(int numMatrQubits, int* numQubitsPerVar, int numVars, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numVars > 0, report::MULTI_VAR_FUNC_INVALID_NUM_VARS, {{"${NUM_VARS}", numVars}}, caller);
 
     for (int v=0; v<numVars; v++)
@@ -2267,11 +2364,17 @@ void validate_multiVarFuncQubits(int numMatrQubits, int* numQubitsPerVar, int nu
 
 void validate_funcVarSignedFlag(int areSigned, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(areSigned == 0 || areSigned == 1, report::MULTI_VAR_FUNC_INVALID_ARE_SIGNED_FLAG, {{"${ARE_SIGNED}", areSigned}}, caller);
 }
 
 void validate_matrixRowsAllSameSize(vector<vector<qcomp>> matrix, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     if (matrix.empty())
         return;
 
@@ -2396,13 +2499,55 @@ void assertMatrixFieldsAreValid(T matr, int expectedNumQb, string badFieldMsg, c
     // no risk that they're wrong (because they're const so users cannot modify them) unless 
     // the struct was unitialised, which we have already validated against
 }
-void validate_matrixFields(CompMatr1 m, const char* caller) { assertMatrixFieldsAreValid(m, 1,           report::INVALID_COMP_MATR_1_FIELDS, caller); }
-void validate_matrixFields(CompMatr2 m, const char* caller) { assertMatrixFieldsAreValid(m, 2,           report::INVALID_COMP_MATR_2_FIELDS, caller); }
-void validate_matrixFields(CompMatr  m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_COMP_MATR_FIELDS,   caller); }
-void validate_matrixFields(DiagMatr1 m, const char* caller) { assertMatrixFieldsAreValid(m, 1,           report::INVALID_DIAG_MATR_1_FIELDS, caller); }
-void validate_matrixFields(DiagMatr2 m, const char* caller) { assertMatrixFieldsAreValid(m, 2,           report::INVALID_DIAG_MATR_2_FIELDS, caller); }
-void validate_matrixFields(DiagMatr  m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_DIAG_MATR_FIELDS,   caller); }
-void validate_matrixFields(FullStateDiagMatr m, const char* caller) { assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_FULL_STATE_DIAG_MATR_FIELDS, caller); }
+void validate_matrixFields(CompMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 1, report::INVALID_COMP_MATR_1_FIELDS, caller);
+}
+void validate_matrixFields(CompMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 2, report::INVALID_COMP_MATR_2_FIELDS, caller);
+}
+void validate_matrixFields(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_COMP_MATR_FIELDS,   caller);
+}
+void validate_matrixFields(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 1, report::INVALID_DIAG_MATR_1_FIELDS, caller);
+}
+void validate_matrixFields(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, 2, report::INVALID_DIAG_MATR_2_FIELDS, caller);
+}
+void validate_matrixFields(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_DIAG_MATR_FIELDS,   caller);
+}
+void validate_matrixFields(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixFieldsAreValid(m, m.numQubits, report::INVALID_FULL_STATE_DIAG_MATR_FIELDS, caller);
+}
 
 // type T can be CompMatr, DiagMatr or FullStateDiagMatr
 template <class T>
@@ -2417,9 +2562,27 @@ void assertMatrixIsSynced(T matr, string errMsg, const char* caller) {
     // NOT GPU-accelerated and ergo the GPU memory is not consulted. It's best to build the habit in the user!
     assertThat(*(matr.wasGpuSynced) == 1, errMsg, caller);
 }
-void validate_matrixIsSynced(CompMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::COMP_MATR_NOT_SYNCED_TO_GPU, caller);}
-void validate_matrixIsSynced(DiagMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::DIAG_MATR_NOT_SYNCED_TO_GPU, caller); }
-void validate_matrixIsSynced(FullStateDiagMatr matr, const char* caller) { assertMatrixIsSynced(matr, report::FULL_STATE_DIAG_MATR_NOT_SYNCED_TO_GPU, caller); }
+void validate_matrixIsSynced(CompMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::COMP_MATR_NOT_SYNCED_TO_GPU, caller);
+}
+void validate_matrixIsSynced(DiagMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::DIAG_MATR_NOT_SYNCED_TO_GPU, caller);
+}
+void validate_matrixIsSynced(FullStateDiagMatr matr, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsSynced(matr, report::FULL_STATE_DIAG_MATR_NOT_SYNCED_TO_GPU, caller);
+}
 
 // type T can be CompMatr1, CompMatr2, CompMatr, DiagMatr1, DiagMatr2, DiagMatr, FullStateDiagMatr
 template <class T> 
@@ -2439,13 +2602,55 @@ void assertMatrixIsUnitary(T matr, const char* caller) {
     // may overwrite matr.isApproxUnitary of heap matrices, otherwise ignores epsilon
     assertThat(util_isUnitary(matr, global_validationEpsilon), report::MATRIX_NOT_UNITARY, caller);
 }
-void validate_matrixIsUnitary(CompMatr1 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(CompMatr2 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(CompMatr  m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr1 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr2 m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(DiagMatr  m, const char* caller) { assertMatrixIsUnitary(m, caller); }
-void validate_matrixIsUnitary(FullStateDiagMatr m, const char* caller) { assertMatrixIsUnitary(m, caller); }
+void validate_matrixIsUnitary(CompMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(CompMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
+void validate_matrixIsUnitary(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsUnitary(m, caller);
+}
 
 void validate_unitaryExponentIsReal(qcomp exponent, const char* caller) {
 
@@ -2479,13 +2684,55 @@ void assertMatrixIsHermitian(T matr, const char* caller) {
     // may overwrite matr.isApproxHermitian of heap matrices, otherwise ignores epsilon
     assertThat(util_isHermitian(matr, global_validationEpsilon), report::MATRIX_NOT_HERMITIAN, caller);
 }
-void validate_matrixIsHermitian(CompMatr1 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(CompMatr2 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(CompMatr  m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr1 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr2 m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(DiagMatr  m, const char* caller) { assertMatrixIsHermitian(m, caller); }
-void validate_matrixIsHermitian(FullStateDiagMatr m, const char* caller) { assertMatrixIsHermitian(m, caller); }
+void validate_matrixIsHermitian(CompMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(CompMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(CompMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr1 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr2 m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(DiagMatr  m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
+void validate_matrixIsHermitian(FullStateDiagMatr m, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixIsHermitian(m, caller);
+}
 
 // type T can be DiagMatr, FullStateDiagMatr
 template <class T> 
@@ -2512,8 +2759,20 @@ void assertMatrExpIsNonDiverging(T matr, qcomp exponent, const char* caller) {
     if (std::real(exponent) < 0)
         assertThat(util_isApproxNonZero(matr, global_validationEpsilon), report::DIAG_MATR_APPROX_ZERO_WHILE_EXPONENT_REAL_AND_NEGATIVE, caller);
 }
-void validate_matrixExpIsNonDiverging(DiagMatr          m, qcomp p, const char* caller) { assertMatrExpIsNonDiverging(m, p, caller); }
-void validate_matrixExpIsNonDiverging(FullStateDiagMatr m, qcomp p, const char* caller) { assertMatrExpIsNonDiverging(m, p, caller); }
+void validate_matrixExpIsNonDiverging(DiagMatr m, qcomp p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsNonDiverging(m, p, caller);
+}
+void validate_matrixExpIsNonDiverging(FullStateDiagMatr m, qcomp p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsNonDiverging(m, p, caller);
+}
 
 // type T can be DiagMatr, FullStateDiagMatr
 template <class T> 
@@ -2551,8 +2810,20 @@ void assertMatrExpIsHermitian(T matr, qreal exponent, const char* caller) {
     // result tends to 1 so does not vanish or blow up unexpectedly. All fine!
 }
 
-void validate_matrixExpIsHermitian(DiagMatr          m, qreal p, const char* caller) { assertMatrExpIsHermitian(m, p, caller); }
-void validate_matrixExpIsHermitian(FullStateDiagMatr m, qreal p, const char* caller) { assertMatrExpIsHermitian(m, p, caller); }
+void validate_matrixExpIsHermitian(DiagMatr m, qreal p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsHermitian(m, p, caller);
+}
+void validate_matrixExpIsHermitian(FullStateDiagMatr m, qreal p, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrExpIsHermitian(m, p, caller);
+}
 
 template <class T>
 void assertMatrixDimMatchesTargs(T matr, int numTargs, const char* caller) {
@@ -2573,15 +2844,54 @@ void assertMatrixDimMatchesTargs(T matr, int numTargs, const char* caller) {
     assertThat(numMatrQubits == numTargs, report::MATRIX_SIZE_MISMATCHES_NUM_TARGETS, vars, caller);
 }
 
-void validate_matrixDimMatchesTargets(CompMatr1 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(CompMatr2 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(CompMatr  matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr1 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr2 matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
-void validate_matrixDimMatchesTargets(DiagMatr  matr, int numTargs, const char* caller) { assertMatrixDimMatchesTargs(matr, numTargs, caller); }
+void validate_matrixDimMatchesTargets(CompMatr1 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(CompMatr2 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(CompMatr  matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr1 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr2 matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
+void validate_matrixDimMatchesTargets(DiagMatr  matr, int numTargs, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
+    assertMatrixDimMatchesTargs(matr, numTargs, caller);
+}
 
 void validate_matrixAndQuregAreCompatible(FullStateDiagMatr matr, Qureg qureg, bool expecOnly, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we do not need to define this function for the other matrix types,
     // since their validation will happen through validation of the
     // user-given list of target qubits. But we do need to define it for
@@ -2707,8 +3017,6 @@ void assertSuperOpFitsInGpuMem(int numQubits, int isEnvGpuAccel, bool isInKrausM
 
 void validate_newSuperOpParams(int numQubits, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2761,13 +3069,15 @@ void assertNewSuperOpAllocs(SuperOp op, bool isInKrausMap, const char* caller) {
 
 void validate_newSuperOpAllocs(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isInKrausMap = false;
     assertNewSuperOpAllocs(op, isInKrausMap, caller);
 }
 
 void validate_newInlineSuperOpDimMatchesVectors(int numDeclaredQubits, vector<vector<qcomp>> matrix, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2796,7 +3106,6 @@ void validate_newInlineSuperOpDimMatchesVectors(int numDeclaredQubits, vector<ve
 
 void validate_superOpNewMatrixDims(SuperOp op, vector<vector<qcomp>> matrix, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2818,6 +3127,9 @@ void validate_superOpNewMatrixDims(SuperOp op, vector<vector<qcomp>> matrix, con
 
 void validate_superOpFieldsMatchPassedParams(SuperOp op, int numQb, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_PASSED_QUBITS}", numQb},
         {"${NUM_OP_QUBITS}",     op.numQubits}};
@@ -2864,12 +3176,18 @@ void assertSuperOpFieldsAreValid(SuperOp op, bool isInKrausMap, const char* call
 
 void validate_superOpFields(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isInKrausMap = false;
     assertSuperOpFieldsAreValid(op, isInKrausMap, caller);
 }
 
 void validate_superOpIsSynced(SuperOp op, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we don't need to perform any sync check in CPU-only mode
     if (!mem_isAllocated(util_getGpuMemPtr(op)))
         return;
@@ -2880,6 +3198,9 @@ void validate_superOpIsSynced(SuperOp op, const char* caller) {
 
 void validate_superOpDimMatchesTargs(SuperOp op, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
-    tokenSubs vars = {{"${OP_QUBITS}", op.numQubits}, {"{NUM_TARGS}", numTargets}};
+    tokenSubs vars = {{"${OP_QUBITS}", op.numQubits}, {"${NUM_TARGS}", numTargets}}; // both keys use the "${NAME}" placeholder form
     assertThat(op.numQubits == numTargets, report::SUPER_OP_SIZE_MISMATCHES_NUM_TARGETS, vars, caller);
 }
@@ -2915,8 +3236,6 @@ void assertKrausMapValidNumMatrices(int numQubits, int numMatrices, const char*
 
 void validate_newKrausMapParams(int numQubits, int numMatrices, const char* caller) {
 
-    // some of the below validation involves getting distributed node consensus, which
-    // can be an expensive synchronisation, which we avoid if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -2944,6 +3263,9 @@ void validate_newKrausMapParams(int numQubits, int numMatrices, const char* call
 
 void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // unlike other post-creation allocation validation, this function
     // expects that when allocation failed and the heap fields have already
     // been cleared, that any nested field (like map.matrices) has had the
@@ -2954,11 +3276,6 @@ void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
     // (and is nullptr), so we must check it last so as not to false report 
     // it as the cause of the failure!
 
-    // we expensively get node consensus about malloc failure, in case of heterogeneous hardware/loads,
-    // but we avoid this if validation is anyway disabled
-    if (!global_isValidationEnabled)
-        return;
-
-    // prior validation gaurantees this will not overflow
+    // prior validation guarantees this will not overflow
     qindex matrListMem = map.numMatrices * mem_getLocalMatrixMemoryRequired(map.numQubits, true, 1);
     tokenSubs vars = {
@@ -2981,7 +3298,6 @@ void validate_newKrausMapAllocs(KrausMap map, const char* caller) {
 
 void validate_newInlineKrausMapDimMatchesVectors(int numQubits, int numOperators, vector<vector<vector<qcomp>>> matrices, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
 
@@ -3012,10 +3328,9 @@ void validate_newInlineKrausMapDimMatchesVectors(int numQubits, int numOperators
 
 void validate_krausMapNewMatrixDims(KrausMap map, vector<vector<vector<qcomp>>> matrices, const char* caller) {
 
-    // avoid potentially expensive matrix enumeration if validation is anyway disabled
     if (!global_isValidationEnabled)
         return;
-    
+
     assertThat(map.numMatrices == (int) matrices.size(), report::KRAUS_MAP_INCOMPATIBLE_NUM_NEW_MATRICES,
         {{"${NUM_GIVEN}", matrices.size()}, {"${NUM_EXPECTED}", map.numMatrices}}, caller);
 
@@ -3035,6 +3350,9 @@ void validate_krausMapNewMatrixDims(KrausMap map, vector<vector<vector<qcomp>>>
 
 void validate_krausMapFieldsMatchPassedParams(KrausMap map, int numQb, int numOps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_MAP_QUBITS}",    map.numQubits},
         {"${NUM_MAP_OPS}",       map.numMatrices},
@@ -3053,6 +3371,9 @@ void validate_krausMapFieldsMatchPassedParams(KrausMap map, int numQb, int numOp
 
 void validate_krausMapFields(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUBITS}",   map.numQubits},
         {"${NUM_MATRICES}", map.numMatrices},
@@ -3087,6 +3408,9 @@ void validate_krausMapFields(KrausMap map, const char* caller) {
 
 void validate_krausMapIsSynced(KrausMap map, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we don't need to perform any sync check in CPU-only mode
     if (!mem_isAllocated(util_getGpuMemPtr(map.superop)))
         return;
@@ -3096,6 +3420,10 @@ void validate_krausMapIsSynced(KrausMap map, const char* caller) {
 }
 
 void validate_krausMapIsCPTP(KrausMap map, const char* caller) {
+
+    if (!global_isValidationEnabled)
+        return;
+
     validate_krausMapFields(map, caller);
     validate_krausMapIsSynced(map, caller);
 
@@ -3109,6 +3437,9 @@ void validate_krausMapIsCPTP(KrausMap map, const char* caller) {
 
 void validate_krausMapMatchesTargets(KrausMap map, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${KRAUS_QUBITS}", map.numQubits}, {"${TARG_QUBITS}", numTargets}};
     assertThat(map.numQubits == numTargets, report::KRAUS_MAP_SIZE_MISMATCHES_TARGETS, vars, caller);
 }
@@ -3179,6 +3510,9 @@ void assertValidNewPauliIndices(int* indices, int numInds, int maxIndExcl, const
 
 void validate_newPauliStrNumPaulis(int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${NUM_PAULIS}", numPaulis}};
     assertThat(numPaulis > 0, report::NEW_PAULI_STR_NON_POSITIVE_NUM_PAULIS, vars, caller);
 
@@ -3188,6 +3522,9 @@ void validate_newPauliStrNumPaulis(int numPaulis, int maxNumPaulis, const char*
 
 void validate_newPauliStrParams(const char* paulis, int* indices, int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_newPauliStrNumPaulis(numPaulis, maxNumPaulis, caller);
     assertCorrectNumPauliCharsBeforeTerminationChar(paulis, numPaulis, caller);
     assertRecognisedNewPaulis(paulis, numPaulis, caller);
@@ -3195,6 +3532,9 @@ void validate_newPauliStrParams(const char* paulis, int* indices, int numPaulis,
 }
 void validate_newPauliStrParams(int* paulis, int* indices, int numPaulis, int maxNumPaulis, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_newPauliStrNumPaulis(numPaulis, maxNumPaulis, caller);
     assertValidNewPauliCodes(paulis, numPaulis, caller);
     assertValidNewPauliIndices(indices, numPaulis, maxNumPaulis, caller);
@@ -3202,6 +3542,9 @@ void validate_newPauliStrParams(int* paulis, int* indices, int numPaulis, int ma
 
 void validate_newPauliStrNumChars(int numPaulis, int numIndices, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
-    // this is a C++-only validation, because only std::string gaurantees we can know
+    // this is a C++-only validation, because only std::string guarantees we can know
     // the passed string length (C char arrays might not contain termination char)
     tokenSubs vars = {{"${NUM_PAULIS}", numPaulis}, {"${NUM_INDS}", numIndices}};
@@ -3216,6 +3559,9 @@ void validate_newPauliStrNumChars(int numPaulis, int numIndices, const char* cal
 
 void validate_pauliStrTargets(Qureg qureg, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // avoid producing a list of targets which requires enumerating all bits
     int maxTarg = paulis_getIndOfLefmostNonIdentityPauli(str);
 
@@ -3225,6 +3571,9 @@ void validate_pauliStrTargets(Qureg qureg, PauliStr str, const char* caller) {
 
 void validate_controlsAndPauliStrTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate targets and controls in isolation
     validate_pauliStrTargets(qureg, str, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3237,6 +3586,9 @@ void validate_controlsAndPauliStrTargets(Qureg qureg, int* ctrls, int numCtrls,
 
 void validate_controlAndPauliStrTargets(Qureg qureg, int ctrl, PauliStr str, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndPauliStrTargets(qureg, &ctrl, 1, str, caller);
 }
 
@@ -3248,6 +3600,9 @@ void validate_controlAndPauliStrTargets(Qureg qureg, int ctrl, PauliStr str, con
 
 void validate_newPauliStrSumParams(qindex numTerms, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numTerms > 0, report::NEW_PAULI_STR_SUM_NON_POSITIVE_NUM_STRINGS, {{"${NUM_TERMS}", numTerms}}, caller);
 
     // assert that the total memory required does not overflow
@@ -3278,12 +3633,18 @@ void validate_newPauliStrSumParams(qindex numTerms, const char* caller) {
 
 void validate_newPauliStrSumMatchingListLens(qindex numStrs, qindex numCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {{"${NUM_STRS}", numStrs}, {"${NUM_COEFFS}", numCoeffs}};
     assertThat(numStrs == numCoeffs, report::NEW_PAULI_STR_SUM_DIFFERENT_NUM_STRINGS_AND_COEFFS, vars, caller);
 }
 
 void validate_newPauliStrSumAllocs(PauliStrSum sum, qindex numBytesStrings, qindex numBytesCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // this validation is called AFTER the caller has checked for failed
     // allocs and (in that scenario) freed every pointer, but does not 
     // overwrite any pointers to nullptr, so the failed alloc is known.
@@ -3311,6 +3672,9 @@ void validate_newPauliStrSumAllocs(PauliStrSum sum, qindex numBytesStrings, qind
 
 void validate_parsedPauliStrSumLineIsInterpretable(bool isInterpretable, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
 
     tokenSubs vars = {{"${LINE_NUMBER}", lineIndex + 1}}; // line numbers begin at 1
@@ -3319,6 +3683,9 @@ void validate_parsedPauliStrSumLineIsInterpretable(bool isInterpretable, string
 
 void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int numLinePaulis, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
 
     tokenSubs vars = {
@@ -3330,6 +3697,9 @@ void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int num
 
 void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string line, qindex lineIndex, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
 
     tokenSubs vars = {{"${LINE_NUMBER}", lineIndex + 1}}; // lines begin at 1
@@ -3338,6 +3708,9 @@ void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string l
 
 void validate_parsedStringIsNotEmpty(bool stringIsNotEmpty, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(stringIsNotEmpty, report::PARSED_STRING_IS_EMPTY, caller);
 }
 
@@ -3351,6 +3724,9 @@ bool areQubitsDisjoint(qindex qubitsMaskA, int* qubitsB, int numQubitsB);
 
 void validate_pauliStrSumFields(PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(sum.numTerms > 0, report::INVALID_PAULI_STR_SUM_FIELDS, {{"${NUM_TERMS}", sum.numTerms}}, caller);
 
     assertThat(mem_isAllocated(sum.coeffs),  report::INVALID_PAULI_STR_HEAP_PTR, caller);
@@ -3378,6 +3754,9 @@ void validate_pauliStrSumIsHermitian(PauliStrSum sum, const char* caller) {
 
 void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(sum);
     int minNumQb = maxInd + 1;
 
@@ -3391,6 +3770,9 @@ void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* calle
 
 void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate targets and controls in isolation
     validate_pauliStrSumTargets(sum, qureg, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3402,11 +3784,17 @@ void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrl
 
 void validate_controlAndPauliStrSumTargets(Qureg qureg, int ctrl, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndPauliStrSumTargets(qureg, &ctrl, 1, sum, caller);
 }
 
 void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!paulis_containsXOrY(sum), report::PAULI_STR_SUM_NOT_ALL_I_Z, caller);
 
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(sum);
@@ -3428,6 +3816,9 @@ void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum,
 
 void validate_basisStateIndex(Qureg qureg, qindex ind, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxIndExcl = powerOf2(qureg.numQubits);
 
     tokenSubs vars = {
@@ -3440,6 +3831,9 @@ void validate_basisStateIndex(Qureg qureg, qindex ind, const char* caller) {
 
 void validate_basisStateRowCol(Qureg qureg, qindex row, qindex col, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxIndExcl = powerOf2(qureg.numQubits);
 
     tokenSubs vars = {
@@ -3454,6 +3848,9 @@ void validate_basisStateRowCol(Qureg qureg, qindex row, qindex col, const char*
 
 void validate_basisStateIndices(Qureg qureg, qindex startInd, qindex numInds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         startInd >= 0 && startInd < qureg.numAmps, 
         report::INVALID_STARTING_BASIS_STATE_INDEX, 
@@ -3482,6 +3879,9 @@ void validate_basisStateIndices(Qureg qureg, qindex startInd, qindex numInds, co
 
 void validate_basisStateRowCols(Qureg qureg, qindex startRow, qindex startCol, qindex numRows, qindex numCols, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     qindex maxRowOrColExcl = powerOf2(qureg.numQubits);
 
     assertThat(
@@ -3519,6 +3919,9 @@ void validate_basisStateRowCols(Qureg qureg, qindex startRow, qindex startCol, q
 
 void validate_localAmpIndices(Qureg qureg, qindex localStartInd, qindex numInds, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // note that localStartInd and numInds can validly DIFFER between nodes,
     // so we use assertAllNodesAgreeThat() in lieu of assertThat()
 
@@ -3616,11 +4019,17 @@ void assertValidQubits(
 
 void validate_target(Qureg qureg, int target, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertValidQubit(qureg, target, report::INVALID_TARGET_QUBIT, caller);
 }
 
 void validate_targets(Qureg qureg, int* targets, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // must always have at least 1 target
     bool numCanBeZero = false;
 
@@ -3631,12 +4040,18 @@ void validate_targets(Qureg qureg, int* targets, int numTargets, const char* cal
 }
 void validate_twoTargets(Qureg qureg, int target1, int target2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {target1, target2};
     validate_targets(qureg, targs, 2, caller);
 }
 
 void validate_controls(Qureg qureg, int* ctrls, int numCtrls, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // it is fine to have zero controls
     bool numCanBeZero = true;
 
@@ -3648,6 +4063,9 @@ void validate_controls(Qureg qureg, int* ctrls, int numCtrls, const char* caller
 
 void validate_controlsAndTargets(Qureg qureg, int* ctrls, int numCtrls, int* targs, int numTargs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // validate controls and targets in isolation
     validate_targets(qureg, targs, numTargs, caller);
     validate_controls(qureg, ctrls, numCtrls, caller);
@@ -3657,29 +4075,47 @@ void validate_controlsAndTargets(Qureg qureg, int* ctrls, int numCtrls, int* tar
 }
 void validate_controlAndTarget(Qureg qureg, int ctrl, int targ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, &ctrl, 1, &targ, 1, caller);
 }
 void validate_controlAndTargets(Qureg qureg, int ctrl, int* targs, int numTargs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, &ctrl, 1, targs, numTargs, caller);
 }
 void validate_controlsAndTarget(Qureg qureg, int* ctrls, int numCtrls, int targ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_controlsAndTargets(qureg, ctrls, numCtrls, &targ, 1, caller);
 }
 void validate_controlAndTwoTargets(Qureg qureg, int ctrl, int targ1, int targ2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {targ1, targ2};
     validate_controlsAndTargets(qureg, &ctrl, 1, targs, 2, caller);
 }
 void validate_controlsAndTwoTargets(Qureg qureg, int* ctrls, int numCtrls, int targ1, int targ2, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int targs[] = {targ1, targ2};
     validate_controlsAndTargets(qureg, ctrls, numCtrls, targs, 2, caller);
 }
 
 void validate_controlStates(int* states, int numCtrls, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // states is permittedly unallocated (nullptr) even when numCtrls != 0
     if (!mem_isAllocated(states))
         return;
@@ -3690,6 +4126,9 @@ void validate_controlStates(int* states, int numCtrls, const char* caller) {
 
 void validate_controlsMatchStates(int numCtrls, int numStates, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only invocable by the C++ interface
     tokenSubs vars = {
         {"${NUM_CTRLS}",  numCtrls},
@@ -3706,11 +4145,17 @@ void validate_controlsMatchStates(int numCtrls, int numStates, const char* calle
 
 void validate_measurementOutcomeIsValid(int outcome, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(outcome == 0 || outcome == 1, report::ONE_QUBIT_MEASUREMENT_OUTCOME_INVALID, {{"${OUTCOME}", outcome}}, caller);
 }
 
 void validate_measurementOutcomesAreValid(int* outcomes, int numOutcomes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // no need to validate numOutcomes; it is already validated by caller (e.g. through numTargets)
 
     for (int i=0; i<numOutcomes; i++)
@@ -3742,6 +4187,9 @@ void validate_measurementOutcomesProbNotZero(int* outcomes, int numQubits, qreal
 
 void validate_measurementOutcomesFitInGpuMem(Qureg qureg, int numQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only GPU backend needs temp memory
     if (!qureg.isGpuAccelerated)
         return;
@@ -3774,6 +4222,9 @@ void validate_measurementProbsAreNormalised(vector<qreal> probs, const char* cal
 
 void validate_measurementOutcomesMatchTargets(int numQubits, int numOutcomes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // invoked only by the C++ user interface
     tokenSubs vars = {
         {"${NUM_QUBITS}",    numQubits},
@@ -3800,6 +4251,9 @@ void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* c
 
 void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // only relevant to distributed quregs
     if (!qureg.isDistributed)
         return;
@@ -3831,6 +4285,9 @@ void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller
 
 void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     bool isEven = (order % 2) == 0;
     assertThat(order > 0 && (isEven || order==1), report::INVALID_TROTTER_ORDER, {{"${ORDER}", order}}, caller);
     assertThat(reps > 0, report::INVALID_TROTTER_REPS, {{"${REPS}", reps}}, caller);
@@ -3844,6 +4301,9 @@ void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller
 
 void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
 
     // @todo
@@ -3860,6 +4320,9 @@ void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, con
 
 void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // possibly repeated from jump op validation, for safety
     assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
 
@@ -3874,6 +4337,9 @@ void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* calle
 
 void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numSuperTerms != 0, report::NUM_LINDBLAD_SUPER_PROPAGATOR_TERMS_OVERFLOWED, caller);
 
     // attempt to fetch RAM, and simply return if we fail; if we unknowingly
@@ -3888,7 +4354,6 @@ void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char*
     // check whether the superpropagator fits in memory
     bool fits = mem_canPauliStrSumFitInMemory(numSuperTerms, memPerNode);
     assertThat(fits, report::NEW_LINDBLAD_SUPER_PROPAGATOR_CANNOT_FIT_INTO_CPU_MEM, {{"${NUM_TERMS}", numSuperTerms}, {"${NUM_BYTES}", memPerNode}}, caller);
-
 }
 
 
@@ -3899,6 +4364,9 @@ void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char*
 
 void validate_probability(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     /// @todo 
@@ -3910,6 +4378,9 @@ void validate_probability(qreal prob, const char* caller) {
 
 void validate_probabilities(qreal* probs, int numProbs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // we assume that numProbs>0 was prior validated
 
     /// @todo like above, should we permit -eps <= prob <= 1+eps?
@@ -3931,6 +4402,9 @@ void validate_probabilities(qreal* probs, int numProbs, const char* caller) {
 
 void validate_oneQubitDepashingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3941,6 +4415,9 @@ void validate_oneQubitDepashingProb(qreal prob, const char* caller) {
 
 void validate_twoQubitDepashingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3951,6 +4428,9 @@ void validate_twoQubitDepashingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitDepolarisingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3961,6 +4441,9 @@ void validate_oneQubitDepolarisingProb(qreal prob, const char* caller) {
 
 void validate_twoQubitDepolarisingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     validate_probability(prob, caller);
@@ -3971,6 +4454,9 @@ void validate_twoQubitDepolarisingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitDampingProb(qreal prob, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo report 'prob' once validation reporting can handle floats
 
     // permit one-qubit amplitude damping of any valid probability, 
@@ -3980,6 +4466,9 @@ void validate_oneQubitDampingProb(qreal prob, const char* caller) {
 
 void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     validate_probability(pX, caller);
     validate_probability(pY, caller);
     validate_probability(pZ, caller);
@@ -4005,6 +4494,9 @@ void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char
 
 void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(
         doQuregsHaveIdenticalMemoryLayouts(qureg, workspace),
         report::QUREG_IS_INCOMPATIBLE_WITH_WORKSPACE, caller);
@@ -4015,11 +4507,17 @@ void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* call
 
 void validate_numQuregsInSum(int numQuregs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numQuregs > 0, report::NON_POSITIVE_NUM_QUREGS_IN_SUM, {{"${NUM_QUREGS}", numQuregs}}, caller);
 }
 
 void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     for (int i=0; i<numIn; i++)
         validate_quregFields(in[i], caller);
 
@@ -4032,6 +4530,9 @@ void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* cal
 
 void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // mixing in multiple quregs (done here) is much stricter than when 
     // only one pair is being mixed in, which is handled below
 
@@ -4050,6 +4551,9 @@ void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* call
 
 void validate_quregPairCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // mixing must be mathematically possible; dims are compatible, but quregIn can be a statevector
     assertThat(quregOut.isDensityMatrix, report::MIXED_QUREG_NOT_DENSITY_MATRIX, caller);
     assertThat(
@@ -4068,6 +4572,9 @@ void validate_quregPairCanBeMixed(Qureg quregOut, Qureg quregIn, const char* cal
 
 void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUREGS}", numQuregs},
         {"${NUM_COEFFS}", numCoeffs}
@@ -4077,6 +4584,9 @@ void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const c
 
 void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     tokenSubs vars = {
         {"${NUM_QUREGS}", numQuregs},
         {"${NUM_PROBS}",  numProbs}
@@ -4108,6 +4618,9 @@ void validateStateVecCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const
 
 void validate_quregCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(!pure.isDensityMatrix, report::INIT_PURE_STATE_IS_DENSMATR, caller);
 
     // quregs must have the same number of qubits, regardless of dimension
@@ -4124,6 +4637,9 @@ void validate_quregCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const ch
 
 void validate_quregsCanBeCloned(Qureg quregA, Qureg quregB, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // quregs must have identical sizes... 
     assertThat(
         quregA.numQubits == quregB.numQubits, report::CLONED_QUREGS_DIFFER_IN_NUM_QUBITS, 
@@ -4146,7 +4662,10 @@ void validate_quregsCanBeCloned(Qureg quregA, Qureg quregB, const char* caller)
 
 void validate_quregsCanBeProducted(Qureg quregA, Qureg quregB, const char* caller) {
 
-   // number of qubits must always match
+    if (!global_isValidationEnabled)
+        return;
+
+    // number of qubits must always match
     assertThat(
         quregA.numQubits == quregB.numQubits, 
         report::PRODUCTED_QUREGS_HAVE_DIFFERENT_NUM_QUBITS,
@@ -4177,6 +4696,9 @@ void validate_quregsCanBeProducted(Qureg quregA, Qureg quregB, const char* calle
 
 void validate_throwErrorBecauseCalcFidOfDensMatrNotYetImplemented(const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(false, report::CALC_FIDELITY_OF_DENSITY_MATRICES_NOT_YET_SUPPORTED, caller);
 }
 
@@ -4236,6 +4758,9 @@ void validate_quregRenormProbIsNotZero(qreal prob, const char* caller) {
 
 void validate_numInitRandomPureStates(qindex numPureStates,  const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(numPureStates >= 1, report::INVALID_NUM_INIT_PURE_STATES, {{"${NUM_STATES}", numPureStates}}, caller);
 }
 
@@ -4302,6 +4827,9 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons
 
 void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // 0 < numTraceQubits <= numQubits is assured by validate_targets(), but
-    // numTraceQubits == numQubtis is permitted there though forbidden here
+    // numTraceQubits == numQubits is permitted there though forbidden here
     assertThat(numTraceQubits < qureg.numQubits, report::NUM_TRACE_QUBITS_EQUALS_QUREG_SIZE, caller);
@@ -4328,6 +4856,9 @@ void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* cal
 
 void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     int numRemainingQubits = in.numQubits - numTraceQubits;
 
     tokenSubs vars = {
@@ -4350,6 +4881,9 @@ void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQu
 
 void validate_canReadFile(string fn, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     /// @todo embed filename into error message when tokenSubs is updated to permit strings
     assertThat(parser_canReadFile(fn), report::CANNOT_READ_FILE, caller);
 }
@@ -4362,6 +4896,9 @@ void validate_canReadFile(string fn, const char* caller) {
 
 void validate_tempListAllocSucceeded(bool succeeded, qindex numElems, qindex numBytesPerElem, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // avoid showing total bytes in case it overflows
     tokenSubs vars = {
         {"${NUM_ELEMS}", numElems},
@@ -4372,6 +4909,9 @@ void validate_tempListAllocSucceeded(bool succeeded, qindex numElems, qindex num
 
 void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(succeeded, report::TEMP_ALLOC_FAILED, {{"${NUM_BYTES}", numBytes}}, caller);
 }
 
@@ -4383,6 +4923,9 @@ void validate_tempAllocSucceeded(bool succeeded, size_t numBytes, const char* ca
 
 void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     // though caller should gaurantee varValue contains at least one character, 
     // we'll still check to avoid a segfault if this gaurantee is broken
     bool isValid = (varValue.size() == 1) && (varValue[0] == '0' || varValue[0] == '1');
@@ -4391,6 +4934,9 @@ void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
 
 void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
 
+    if (!global_isValidationEnabled)
+        return;
+
     assertThat(parser_isAnySizedReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_NOT_A_REAL, caller);
     assertThat(parser_isValidReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_EXCEEDS_QREAL_RANGE, caller);
 

From 641d2f383e1a8989f21baab0d1764995904d213f Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 17 May 2026 23:35:04 -0400
Subject: [PATCH 6/6] Remove dead macro

---
 quest/src/core/accelerator.hpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 507fed0c..b1a2273f 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -80,10 +80,6 @@
     template returntype funcname <-1,numtargs> args;
 
 
-#define INSTANTIATE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(returntype, funcname, args) \
-    private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, true,  args) \
-    private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, false, args)
-
 #define private_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, conj, args) \
     private_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 0, conj, args) \
     private_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 1, conj, args) \