QuEST/quest/src/core/accelerator.cpp at v4.0.0 · QuEST-Kit/QuEST · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/** @file
 * Internal functions for choosing which accelerator backend
 * (CPU or GPU) to dispatch to, and which preconditions the
 * qubit indices satisfy (informing which compile-time
 * optimisations to use) in order to effect local simulation
 * subroutines upon Quregs.
 *
 * These routines are called by localiser.cpp and are embarrassingly
 * parallel, so are always called before/after any necessary
 * communication has happened. The data they need must already be
 * localised into the appropriate memory (RAM or VRAM) and location
 * (qureg's amplitudes or buffer space).
 *
 * @author Tyson Jones
 */

#include "quest/include/types.h"
#include "quest/include/qureg.h"
#include "quest/include/paulis.h"
#include "quest/include/matrices.h"

#include "quest/src/core/accelerator.hpp"
#include "quest/src/core/errors.hpp"
#include "quest/src/core/memory.hpp"
#include "quest/src/core/bitwise.hpp"
#include "quest/src/cpu/cpu_config.hpp"
#include "quest/src/gpu/gpu_config.hpp"
#include "quest/src/cpu/cpu_subroutines.hpp"
#include "quest/src/gpu/gpu_subroutines.hpp"

#include <vector>
#include <algorithm>

using std::vector;
using std::min;


/*
 * MACROS
 *
 * which automate the choosing of the appropriate backend template function,
 * optimised for the given configuration of qubit indices, for example through
 * automatic unrolling of loops with bounds known at compile-time. When the
 * number of controls or targets exceeds that which have optimised compilations,
 * we fall back to using a generic implementation, indicated by <-1>. In essence,
 * these macros simply call func<ctrls.size()> albeit without illegally passing
 * a runtime variable as a template parameter. Note an awkward use of decltype()
 * is to workaround a GCC <12 bug with implicitly-typed vector initialisations.
 *
 * BEWARE that these macros are single-line expressions, so they can be used in
 * braceless if/else or ternary operators - but stay vigilant!
 */


// Evaluates to either funcname<true> or funcname<false>, chosen by the
// runtime truthiness of 'value'. 'funcname' must name a template which
// accepts a single bool template argument.
#define GET_FUNC_OPTIMISED_FOR_BOOL(funcname, value) \
    ((value)? funcname<true> : funcname<false>)


// Evaluates to one of the four funcname<bool,bool> instantiations, where
// the first template argument follows 'b1' and the second follows 'b2'.
#define GET_FUNC_OPTIMISED_FOR_TWO_BOOLS(funcname, b1, b2) \
    ((b1)? \
        ((b2)? funcname<true, true> : funcname<true, false>) : \
        ((b2)? funcname<false,true> : funcname<false,false>))


// The lookup tables in the macros below explicitly enumerate the template
// instantiations <0> through <5> followed by the generic <-1>, and index
// them using MAX_OPTIMISED_NUM_CTRLS/TARGS + 1 as the fallback position.
// The tables are therefore only correct when both constants equal 5, so we
// halt compilation if they have been changed without updating the tables.
#if (MAX_OPTIMISED_NUM_CTRLS != 5) || (MAX_OPTIMISED_NUM_TARGS != 5)
    #error "The number of optimised, templated QuEST functions was inconsistent between accelerator's source and header."
#endif


// Evaluates to a pointer to f<numctrls> when numctrls is between 0 and 5
// (inclusive), else to the generic f<-1>. This works by building a temporary
// vector of the seven instantiations and indexing it with numctrls clamped
// (via std::min) to MAX_OPTIMISED_NUM_CTRLS + 1, which is the position of <-1>.
// Note the temporary vector is constructed at every expansion site, and that
// 'numctrls' is not parenthesised in the (int) cast, so callers should pass
// a simple expression (e.g. a variable or a single call like ctrls.size()).
#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS(f, numctrls) \
    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)]

// Identical in form to the above, but clamps against MAX_OPTIMISED_NUM_TARGS;
// used when the single templated quantity is the number of targets.
#define GET_FUNC_OPTIMISED_FOR_NUM_TARGS(f, numtargs) \
    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]

// Two-dimensional analogue: evaluates to a pointer to f<numctrls,numtargs>,
// where each template argument independently falls back to -1 when the
// respective count exceeds 5. The outer vector is indexed by the (clamped)
// number of controls, and each inner vector by the (clamped) number of targets.
#define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs) \
    (vector <ARR(f)> { \
        ARR(f) {&f<0,0>,  &f<0,1>,  &f<0,2>,  &f<0,3>,  &f<0,4>,  &f<0,5>,  &f<0,-1>}, \
        ARR(f) {&f<1,0>,  &f<1,1>,  &f<1,2>,  &f<1,3>,  &f<1,4>,  &f<1,5>,  &f<1,-1>}, \
        ARR(f) {&f<2,0>,  &f<2,1>,  &f<2,2>,  &f<2,3>,  &f<2,4>,  &f<2,5>,  &f<2,-1>}, \
        ARR(f) {&f<3,0>,  &f<3,1>,  &f<3,2>,  &f<3,3>,  &f<3,4>,  &f<3,5>,  &f<3,-1>}, \
        ARR(f) {&f<4,0>,  &f<4,1>,  &f<4,2>,  &f<4,3>,  &f<4,4>,  &f<4,5>,  &f<4,-1>}, \
        ARR(f) {&f<5,0>,  &f<5,1>,  &f<5,2>,  &f<5,3>,  &f<5,4>,  &f<5,5>,  &f<5,-1>}, \
        ARR(f) {&f<-1,0>, &f<-1,1>, &f<-1,2>, &f<-1,3>, &f<-1,4>, &f<-1,5>, &f<-1,-1>}}) \
    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]

// Type of one row of the above table. It is defined after the macro which
// mentions it, which is permitted because macro bodies are only expanded
// at their point of invocation (by which time ARR is defined).
#define ARR(f) vector<decltype(&f<0,0>)>


// Chooses between the gpu_ and cpu_ prefixed versions of the templated
// function 'funcsuffix' (formed by token-pasting) according to the
// isGpuAccelerated field of 'qureg', and then selects the instantiation
// matching the number of controls. 'qureg' must be an expression upon
// which '.isGpuAccelerated' can be directly appended.
#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS(funcsuffix, qureg, numctrls) \
    ((qureg.isGpuAccelerated)? \
        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( gpu_##funcsuffix, numctrls ) : \
        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( cpu_##funcsuffix, numctrls ))

// As above, but the templated quantity is the number of targets.
#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS(funcsuffix, qureg, numtargs) \
    ((qureg.isGpuAccelerated)? \
        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_##funcsuffix, numtargs ) : \
        GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_##funcsuffix, numtargs ))

// As above, but templated upon both the number of controls and targets.
#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs) \
    ((qureg.isGpuAccelerated)? \
        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs ) : \
        GET_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs ))


/// @todo
/// GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS,
/// as defined below, is only ever used by anyCtrlAnyTargDenseMatr,
/// which only ever receives numTargs>=3 (due to accelerator redirecting
/// fewer targets to faster bespoke functions which e.g. avoid global GPU
/// cache memory access). This means its instantiation with numTargs=0,1,2
/// is useless, though contributes to 42% of the function's compilation
/// time which is large because of the 7*7*2=98 unique instantiations. We
/// can ergo non-negligibly speed up compilation by avoiding these redundant
/// instances at the cost of increased code complexity/asymmetry. Consider!

// Evaluates to a pointer to f<numctrls,numtargs,c>, where the first two
// template arguments fall back to -1 when the respective runtime count
// exceeds 5, and 'c' is forwarded verbatim as the third template argument
// (so it must be a compile-time constant, like the literal true or false).
#define GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c) \
    (vector <CONJ_ARR(f)> { \
        CONJ_ARR(f) {&f<0,0,c>,  &f<0,1,c>,  &f<0,2,c>,  &f<0,3,c>,  &f<0,4,c>,  &f<0,5,c>,  &f<0,-1,c>}, \
        CONJ_ARR(f) {&f<1,0,c>,  &f<1,1,c>,  &f<1,2,c>,  &f<1,3,c>,  &f<1,4,c>,  &f<1,5,c>,  &f<1,-1,c>}, \
        CONJ_ARR(f) {&f<2,0,c>,  &f<2,1,c>,  &f<2,2,c>,  &f<2,3,c>,  &f<2,4,c>,  &f<2,5,c>,  &f<2,-1,c>}, \
        CONJ_ARR(f) {&f<3,0,c>,  &f<3,1,c>,  &f<3,2,c>,  &f<3,3,c>,  &f<3,4,c>,  &f<3,5,c>,  &f<3,-1,c>}, \
        CONJ_ARR(f) {&f<4,0,c>,  &f<4,1,c>,  &f<4,2,c>,  &f<4,3,c>,  &f<4,4,c>,  &f<4,5,c>,  &f<4,-1,c>}, \
        CONJ_ARR(f) {&f<5,0,c>,  &f<5,1,c>,  &f<5,2,c>,  &f<5,3,c>,  &f<5,4,c>,  &f<5,5,c>,  &f<5,-1,c>}, \
        CONJ_ARR(f) {&f<-1,0,c>, &f<-1,1,c>, &f<-1,2,c>, &f<-1,3,c>, &f<-1,4,c>, &f<-1,5,c>, &f<-1,-1,c>}}) \
    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]

// Type of one row of the above table; the <0,0,false> instantiation is
// used merely to obtain the function-pointer type shared by all entries.
#define CONJ_ARR(f) vector<decltype(&f<0,0,false>)>

// Chooses the gpu_ or cpu_ prefixed function according to qureg.isGpuAccelerated,
// and converts the runtime boolean 'conj' into the compile-time third template
// argument by enumerating both of its values.
#define GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj) \
    ((qureg.isGpuAccelerated)? \
        ((conj)? \
            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true ) : \
            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false ) ) : \
        ((conj)? \
            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true ) : \
            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false ) ) )


/// @todo
/// This has gotten a bit ridiculous. Is there a way to use (likely)
/// more abominable pre-processor mischief which negates the need
/// to repeat the entire macro(s) when the number of templated
/// parameters grows?


// Evaluates to a pointer to f<numctrls,numtargs,c,h>, where the first two
// template arguments fall back to -1 when the respective runtime count
// exceeds 5, and 'c' and 'h' are forwarded verbatim as the third and fourth
// template arguments (so must be compile-time constants).
#define GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
    (vector <POWER_CONJ_ARR(f)> { \
        POWER_CONJ_ARR(f) {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<2,0,c,h>,  &f<2,1,c,h>,  &f<2,2,c,h>,  &f<2,3,c,h>,  &f<2,4,c,h>,  &f<2,5,c,h>,  &f<2,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<3,0,c,h>,  &f<3,1,c,h>,  &f<3,2,c,h>,  &f<3,3,c,h>,  &f<3,4,c,h>,  &f<3,5,c,h>,  &f<3,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<4,0,c,h>,  &f<4,1,c,h>,  &f<4,2,c,h>,  &f<4,3,c,h>,  &f<4,4,c,h>,  &f<4,5,c,h>,  &f<4,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<5,0,c,h>,  &f<5,1,c,h>,  &f<5,2,c,h>,  &f<5,3,c,h>,  &f<5,4,c,h>,  &f<5,5,c,h>,  &f<5,-1,c,h>}, \
        POWER_CONJ_ARR(f) {&f<-1,0,c,h>, &f<-1,1,c,h>, &f<-1,2,c,h>, &f<-1,3,c,h>, &f<-1,4,c,h>, &f<-1,5,c,h>, &f<-1,-1,c,h>}}) \
    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]

// Type of one row of the above table; the <0,0,false,false> instantiation
// is used merely to obtain the function-pointer type shared by all entries.
#define POWER_CONJ_ARR(f) vector<decltype(&f<0,0,false,false>)>

// Chooses the gpu_ or cpu_ prefixed function according to qureg.isGpuAccelerated,
// and converts the runtime booleans 'conj' and 'haspower' into the compile-time
// third and fourth template arguments by enumerating all four combinations.
#define GET_CPU_OR_GPU_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
    ((qureg.isGpuAccelerated)? \
        ((conj)? \
            ((haspower)? \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, true ) : \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
            ((haspower)? \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, true ) : \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) : \
        ((conj)? \
            ((haspower)? \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, true ) : \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
            ((haspower)? \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, true ) : \
                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) )


/*
 * GETTERS
 */


/**
 * Obtains a single amplitude at local index localInd by forwarding to the
 * GPU or CPU backend, as decided by qureg.isGpuAccelerated.
 */
qcomp accel_statevec_getAmp_sub(Qureg qureg, qindex localInd) {

    // a dedicated single-amplitude routine is preferred over deferring to
    // accel_statevec_getAmps_sub() so that the CPU backend can directly
    // index the array instead of performing a memcpy; the GPU counterpart
    // exists for the sake of symmetry/consistency

    if (qureg.isGpuAccelerated)
        return gpu_statevec_getAmp_sub(qureg, localInd);

    return cpu_statevec_getAmp_sub(qureg, localInd);
}


/**
 * Copies numLocalAmps amplitudes, beginning at local index localStartInd,
 * from the qureg's GPU or CPU amplitude array (whichever matches
 * qureg.isGpuAccelerated) into outAmps.
 */
void accel_statevec_getAmps_sub(qcomp* outAmps, Qureg qureg, qindex localStartInd, qindex numLocalAmps) {

    if (qureg.isGpuAccelerated) {

        // argument order is (src, dest) = (gpu, cpu)
        gpu_copyGpuToCpu(&qureg.gpuAmps[localStartInd], outAmps, numLocalAmps);
        return;
    }

    // argument order is (dest, src)
    cpu_copyArray(outAmps, &qureg.cpuAmps[localStartInd], numLocalAmps);
}


/*
 * SETTERS
 */


/**
 * Overwrites numLocalAmps amplitudes of the qureg, beginning at local index
 * localStartInd, with those in inAmps. The destination is the GPU or CPU
 * amplitude array according to qureg.isGpuAccelerated.
 */
void accel_statevec_setAmps_sub(qcomp* inAmps, Qureg qureg, qindex localStartInd, qindex numLocalAmps) {

    // the CPU path deliberately copies memory instead of updating via an
    // OpenMP loop; the latter only wins when the parallelisation granularity
    // is tuned to the memory architecture, which cannot be done reliably in
    // a platform agnostic manner (except via hwloc or similar)

    if (qureg.isGpuAccelerated) {

        // argument order is (src, dest) = (cpu, gpu)
        gpu_copyCpuToGpu(inAmps, &qureg.gpuAmps[localStartInd], numLocalAmps);
        return;
    }

    // argument order is (dest, src)
    cpu_copyArray(&qureg.cpuAmps[localStartInd], inAmps, numLocalAmps);
}


/**
 * Forwards (qureg, sum) to the GPU or CPU implementation of
 * densmatr_setAmpsToPauliStrSum_sub, as decided by qureg.isGpuAccelerated.
 */
void accel_densmatr_setAmpsToPauliStrSum_sub(Qureg qureg, PauliStrSum sum) {

    if (qureg.isGpuAccelerated)
        gpu_densmatr_setAmpsToPauliStrSum_sub(qureg, sum);
    else
        cpu_densmatr_setAmpsToPauliStrSum_sub(qureg, sum);
}


/**
 * Populates the elements of 'out' from the Pauli string sum 'in', using the
 * GPU backend when out.isGpuAccelerated and the CPU backend otherwise. After
 * a GPU population, the GPU elements are copied back into out.cpuElems.
 */
void accel_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStrSum in) {

    // without GPU acceleration, the CPU elements are populated directly
    if (!out.isGpuAccelerated) {
        cpu_fullstatediagmatr_setElemsToPauliStrSum(out, in);
        return;
    }

    // otherwise populate via the GPU...
    gpu_fullstatediagmatr_setElemsToPauliStrSum(out, in);

    // and subsequently mirror the result to the CPU so both copies agree
    gpu_copyGpuToCpu(out.gpuElems, out.cpuElems, out.numElemsPerNode);
}


/*
 * COMMUNICATION BUFFER PACKING
 */


/**
 * Dispatches to the GPU or CPU statevec_packAmpsIntoBuffer routine, using
 * the instantiation optimised for qubits.size(), and returns its result
 * (the number of packed amplitudes). Invokes
 * error_noCtrlsGivenToBufferPacker() when qubitStates is empty.
 */
qindex accel_statevec_packAmpsIntoBuffer(Qureg qureg, vector<int> qubits, vector<int> qubitStates) {

    // packing-then-swapping is impossible when no qubit states are constrained,
    // since the entire buffer would be filled and have no room left to receive
    // the other node's buffer; callers should instead send amps straight to buffer
    if (qubitStates.size() == 0)
        error_noCtrlsGivenToBufferPacker();

    // the given qubits may happen to be ctrls or targs; the distinction is irrelevant
    const auto packer = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_packAmpsIntoBuffer, qureg, qubits.size() );

    // hand the number of packed amps back to the caller, for its convenience
    qindex numPacked = packer(qureg, qubits, qubitStates);
    return numPacked;
}


/**
 * Forwards all arguments to the GPU or CPU implementation of
 * statevec_packPairSummedAmpsIntoBuffer (as decided by
 * qureg.isGpuAccelerated) and returns its result.
 */
qindex accel_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qubit2, int qubit3, int bit2) {

    if (qureg.isGpuAccelerated)
        return gpu_statevec_packPairSummedAmpsIntoBuffer(qureg, qubit1, qubit2, qubit3, bit2);

    return cpu_statevec_packPairSummedAmpsIntoBuffer(qureg, qubit1, qubit2, qubit3, bit2);
}


/*
 * SWAPS
 */


/**
 * Invokes the GPU or CPU statevec_anyCtrlSwap_subA routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto swapFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subA, qureg, numCtrls );

    swapFn(qureg, ctrls, ctrlStates, targ1, targ2);
}
/**
 * Invokes the GPU or CPU statevec_anyCtrlSwap_subB routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlSwap_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto swapFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subB, qureg, numCtrls );

    swapFn(qureg, ctrls, ctrlStates);
}
/**
 * Invokes the GPU or CPU statevec_anyCtrlSwap_subC routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlSwap_subC(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, int targState) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto swapFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlSwap_subC, qureg, numCtrls );

    swapFn(qureg, ctrls, ctrlStates, targ, targState);
}


/*
 * DENSE MATRIX
 */


/**
 * Invokes the GPU or CPU statevec_anyCtrlOneTargDenseMatr_subA routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subA, qureg, numCtrls );

    applyFn(qureg, ctrls, ctrlStates, targ, matr);
}
/**
 * Invokes the GPU or CPU statevec_anyCtrlOneTargDenseMatr_subB routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, qcomp fac0, qcomp fac1) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDenseMatr_subB, qureg, numCtrls );

    applyFn(qureg, ctrls, ctrlStates, fac0, fac1);
}


/**
 * Invokes the GPU or CPU statevec_anyCtrlTwoTargDenseMatr_sub routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDenseMatr_sub, qureg, numCtrls );

    applyFn(qureg, ctrls, ctrlStates, targ1, targ2, matr);
}


/**
 * Invokes the GPU or CPU statevec_anyCtrlAnyTargDenseMatr_sub routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls, the
 * number of targets, and the value of 'conj'.
 */
void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj) {

    // 'conj' is consumed here as a template argument, ergo is not forwarded below
    const auto numCtrls = ctrls.size();
    const auto numTargs = targs.size();
    const auto applyFn = GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDenseMatr_sub, qureg, numCtrls, numTargs, conj );

    applyFn(qureg, ctrls, ctrlStates, targs, matr);
}


/*
 * ANY-TARG DIAGONAL MATRIX
 */


/**
 * Invokes the GPU or CPU statevec_anyCtrlOneTargDiagMatr_sub routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, DiagMatr1 matr) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlOneTargDiagMatr_sub, qureg, numCtrls );

    applyFn(qureg, ctrls, ctrlStates, targ, matr);
}


/**
 * Invokes the GPU or CPU statevec_anyCtrlTwoTargDiagMatr_sub routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, DiagMatr2 matr) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevec_anyCtrlTwoTargDiagMatr_sub, qureg, numCtrls );

    applyFn(qureg, ctrls, ctrlStates, targ1, targ2, matr);
}


/**
 * Invokes the GPU or CPU statevec_anyCtrlAnyTargDiagMatr_sub routine (per
 * qureg.isGpuAccelerated), instantiated for the number of controls, the
 * number of targets, the value of 'conj', and whether 'exponent' differs
 * from one.
 */
void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj) {

    // an exponent of exactly 1 selects the instantiation templated with hasPower=false
    const bool raisesToPower = (exponent != qcomp(1, 0));

    const auto applyFn = GET_CPU_OR_GPU_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, raisesToPower );

    // the exponent is always forwarded, regardless of which instantiation was chosen
    applyFn(qureg, ctrls, ctrlStates, targs, matr, exponent);
}


/*
 * ALL-TARGS DIAGONAL MATRIX
 */


/**
 * Applies the full-state diagonal matrix 'matr' (raised to 'exponent') upon
 * the statevector 'qureg', choosing the backend from the GPU deployments of
 * both objects. The GPU backend is used only when both are GPU-accelerated;
 * every other combination runs the CPU backend, copying the qureg between
 * GPU and CPU memory when only the qureg is GPU-accelerated.
 */
void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {

    // qureg and matr have equal size and are identically distributed...
    assert_quregAndFullStateDiagMatrAreBothOrNeitherDistrib(qureg, matr);

    // though their GPU deployments are permitted to differ
    const bool quregOnGpu = qureg.isGpuAccelerated;
    const bool matrOnGpu = matr.isGpuAccelerated;

    // both backend candidates are templated upon whether exponent != 1
    const bool raisesToPower = (exponent != qcomp(1, 0));
    const auto runOnCpu = GET_FUNC_OPTIMISED_FOR_BOOL( cpu_statevec_allTargDiagMatr_sub, raisesToPower );
    const auto runOnGpu = GET_FUNC_OPTIMISED_FOR_BOOL( gpu_statevec_allTargDiagMatr_sub, raisesToPower );

    // matching GPU deployments trivially use the GPU backend
    if (quregOnGpu && matrOnGpu) {
        runOnGpu(qureg, matr, exponent);
        return;
    }

    // a CPU-only qureg always uses the CPU backend; this is trivial when matr
    // is also CPU-only, and when matr is instead GPU-accelerated, copying its
    // GPU memory to the CPU is unnecessary because the two should never have
    // diverged
    if (!quregOnGpu) {
        runOnCpu(qureg, matr, exponent);
        return;
    }

    // the remaining scenario (GPU qureg with CPU-only matr) is strange and
    // expectedly rare; why GPU-accelerate a Qureg but not the equally-sized
    // matrix? It is supported for defensive design by copying the qureg to
    // RAM, using the CPU backend, and copying the result back. In theory, the
    // existing GPU memory of the Qureg's communication buffer (if any) could
    // be leveraged, but that is an even rarer situation and is hacky. New,
    // temporary GPU memory could also be grafted onto the non-accelerated
    // object, but that allocation would be as large as the objects themselves
    // and ergo be dangerously large.
    gpu_copyGpuToCpu(qureg);
    runOnCpu(qureg, matr, exponent);
    gpu_copyCpuToGpu(qureg);
}


/**
 * Applies the (necessarily local) full-state diagonal matrix 'matr' upon the
 * density matrix 'qureg', choosing the backend from the GPU deployments of
 * both objects. A CPU-only qureg always uses the CPU backend. A GPU qureg
 * always uses the GPU backend; when matr is not itself GPU-accelerated, a
 * temporary GPU-accelerated copy is spoofed using either the qureg's GPU
 * communication buffer (if distributed) or a fresh temporary allocation.
 */
void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly) {

    // matr must be local, although qureg may be local or distributed...
    assert_fullStateDiagMatrIsLocal(matr);

    // and the GPU deployments of the two are permitted to differ
    const bool quregOnGpu = qureg.isGpuAccelerated;
    const bool matrOnGpu = matr.isGpuAccelerated;

    // both backend candidates are templated upon (exponent != 1) and multiplyOnly
    const bool raisesToPower = (exponent != qcomp(1, 0));
    const auto runOnCpu = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( cpu_densmatr_allTargDiagMatr_sub, raisesToPower, multiplyOnly );
    const auto runOnGpu = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( gpu_densmatr_allTargDiagMatr_sub, raisesToPower, multiplyOnly );

    // a CPU-only qureg necessitates CPU simulation. When matr is GPU-accelerated
    // (strange, but supported for defensive design), no memory need be copied
    // because matr's CPU copy should be unchanged
    if (!quregOnGpu) {
        runOnCpu(qureg, matr, exponent);
        return;
    }

    // matching GPU deployments trivially use the GPU backend
    if (matrOnGpu) {
        runOnGpu(qureg, matr, exponent);
        return;
    }

    // what remains is the most common scenario; qureg (quadratically larger
    // than matr) is GPU-accelerated but matr is not. GPU memory is grafted
    // onto a copy of matr which is then passed to the GPU backend. A
    // distributed qureg lends its existing GPU communication buffer, else
    // temporary memory is allocated (tolerable since it is quadratically
    // smaller than the Qureg's own memory)

    // lending qureg's GPU communication buffer is safe even when subB() (which
    // itself grafts qureg's buffer to matr) calls this function, because then
    // the GPU deployments match and this code path is never reached
    assert_quregGpuBufferIsNotGraftedToMatrix(qureg, matr);

    // the spoofed matrix is a paranoid copy of matr (itself already a copy of
    // the user's matrix) in case this function ever accepts a reference.
    // Still, beware addressing the copy's pointer fields!
    const bool borrowsBuffer = qureg.isDistributed;
    FullStateDiagMatr spoofed = matr;
    spoofed.isGpuAccelerated = 1;

    if (borrowsBuffer)
        spoofed.gpuElems = qureg.gpuCommBuffer;
    else
        spoofed.gpuElems = gpu_allocArray(spoofed.numElemsPerNode);

    // the (relatively) small allocation may fail (a borrowed buffer always passes)
    assert_applyFullStateDiagMatrTempGpuAllocSucceeded(spoofed.gpuElems);

    // harmlessly overwrite the new memory or qureg's buffer, then simulate
    gpu_copyCpuToGpu(spoofed);
    runOnGpu(qureg, spoofed, exponent);

    // release only freshly allocated memory; never qureg's communication buffer
    if (!borrowsBuffer)
        gpu_deallocArray(spoofed.gpuElems);
}


/**
 * Handles a distributed 'matr' upon a distributed 'qureg' by spoofing a
 * non-distributed matrix whose element pointers are the qureg's CPU and GPU
 * communication buffers, and passing it to accel_densmatr_allTargDiagMatr_subA().
 */
void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly) {

    assert_fullStateDiagMatrIsDistributed(matr);
    assert_acceleratorQuregIsDistributed(qureg);

    // All elements of matr already reside in qureg's communication buffer
    // (of matching CPU or GPU deployment). We ergo pretend matr has its own
    // full-size local memory (in qureg's GPU/CPU) by grafting that buffer
    // onto it, and defer to _subA(). It is crucial that _subA() never
    // touches qureg's communication buffer; it safely does not when the
    // deployments of qureg and matr agree, as they do here.

    // work upon a paranoid copy of matr (itself already a copy of the
    // user's matr), lest the parameter one day becomes a reference
    FullStateDiagMatr grafted = matr;

    // the copy adopts qureg's buffers and CPU-vs-GPU deployment...
    grafted.cpuElems = qureg.cpuCommBuffer;
    grafted.gpuElems = qureg.gpuCommBuffer;
    grafted.isGpuAccelerated = qureg.isGpuAccelerated;

    // and masquerades as being non-distributed
    grafted.isDistributed = 0;
    grafted.numElemsPerNode = grafted.numElems;

    accel_densmatr_allTargDiagMatr_subA(qureg, grafted, exponent, multiplyOnly);
}


/*
 * PAULI TENSOR AND GADGET
 */


/**
 * Invokes the GPU or CPU statevector_anyCtrlPauliTensorOrGadget_subA routine
 * (per qureg.isGpuAccelerated), instantiated for the number of controls and
 * for the combined number of X and Y qubits.
 */
void accel_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> x, vector<int> y, vector<int> z, qcomp f0, qcomp f1) {

    // Z qubits merely induce a phase, so only X and Y count as targets
    const int numXY = static_cast<int>(x.size() + y.size());

    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevector_anyCtrlPauliTensorOrGadget_subA, qureg, ctrls.size(), numXY );

    applyFn(qureg, ctrls, states, x, y, z, f0, f1);
}
/**
 * Invokes the GPU or CPU statevector_anyCtrlPauliTensorOrGadget_subB routine
 * (per qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> x, vector<int> y, vector<int> z, qcomp f0, qcomp f1, qindex mask) {

    // pick the backend function compiled for this many control qubits
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlPauliTensorOrGadget_subB, qureg, numCtrls );

    applyFn(qureg, ctrls, states, x, y, z, f0, f1, mask);
}


/**
 * Invokes the GPU or CPU statevector_anyCtrlAnyTargZOrPhaseGadget_sub routine
 * (per qureg.isGpuAccelerated), instantiated for the number of controls.
 */
void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int> ctrls, vector<int> states, vector<int> targs, qcomp f0, qcomp f1) {

    // only the number of controls (not targets) is templated here
    const auto numCtrls = ctrls.size();
    const auto applyFn = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS( statevector_anyCtrlAnyTargZOrPhaseGadget_sub, qureg, numCtrls );

    applyFn(qureg, ctrls, states, targs, f0, f1);
}


/*
 * QUREG COMBINATION
 */


/**
 * Forwards all arguments to the GPU or CPU implementation of
 * statevec_setQuregToSuperposition_sub, as decided solely by
 * outQureg.isGpuAccelerated.
 */
void accel_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {

    // only outQureg's deployment is consulted; the other quregs should
    // match it, though this is (dangerously) not asserted post-validation
    if (outQureg.isGpuAccelerated)
        gpu_statevec_setQuregToSuperposition_sub(facOut, outQureg, fac1, inQureg1, fac2, inQureg2);
    else
        cpu_statevec_setQuregToSuperposition_sub(facOut, outQureg, fac1, inQureg1, fac2, inQureg2);
}


/**
 * Mixes density matrix 'in' into density matrix 'out', choosing the backend
 * from the GPU deployments of both. The GPU backend is used only when both
 * are GPU-accelerated; all other combinations use the CPU backend, first
 * copying the GPU-accelerated qureg's memory to the CPU (and, when that
 * qureg is 'out', copying the result back to the GPU afterward).
 */
void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in) {

    // both quregs are density matrices of equal size and distribution...
    assert_mixedQuregIsDensityMatrix(out);
    assert_mixedQuregIsDensityMatrix(in);
    assert_mixedQuregsAreBothOrNeitherDistributed(out, in);

    // though their GPU acceleration may differ
    const bool outOnGpu = out.isGpuAccelerated;
    const bool inOnGpu = in.isGpuAccelerated;

    // matching GPU deployments trivially use the GPU backend
    if (outOnGpu && inOnGpu) {
        gpu_densmatr_mixQureg_subA(outProb, out, inProb, in);
        return;
    }

    // Differing deployments are a strange and expectedly rare scenario;
    // why GPU-accelerate one Qureg but not the other, equally-sized one?
    // They are supported for defensive design by copying memory to RAM
    // and using the CPU backend. In theory, the existing GPU memory of a
    // Qureg's communication buffer (if any) could be leveraged, but that
    // is an even rarer situation and is hacky. New, temporary GPU memory
    // could also be grafted onto the non-accelerated object, but that
    // allocation would be as large as the objects themselves and ergo be
    // dangerously large.

    if (outOnGpu) {

        // only 'out' is on the GPU; round-trip it through the CPU
        gpu_copyGpuToCpu(out);
        cpu_densmatr_mixQureg_subA(outProb, out, inProb, in);
        gpu_copyCpuToGpu(out);
        return;
    }

    // 'out' is CPU-only; if 'in' is on the GPU, first fetch its amplitudes
    if (inOnGpu)
        gpu_copyGpuToCpu(in);

    cpu_densmatr_mixQureg_subA(outProb, out, inProb, in);
}


/**
 * Mixes the local statevector 'in' into the local density matrix 'out',
 * choosing the backend from the GPU deployments of both. A CPU-only 'out'
 * uses the CPU backend (after copying 'in' from the GPU if necessary). A
 * GPU-accelerated 'out' uses the GPU backend, creating temporary GPU memory
 * for 'in' when it is CPU-only.
 */
void accel_densmatr_mixQureg_subB(qreal outProb, Qureg out, qreal inProb, Qureg in) {

    // 'out' is a densmatr, 'in' is a statevec, and neither is distributed...
    assert_mixedQuregIsDensityMatrix(out);
    assert_mixedQuregIsStatevector(in);
    assert_mixedQuregIsLocal(out);
    assert_mixedQuregIsLocal(in);

    // though their GPU acceleration may differ
    const bool outOnGpu = out.isGpuAccelerated;
    const bool inOnGpu = in.isGpuAccelerated;

    if (!outOnGpu) {

        // a GPU-accelerated smaller register defaults to the CPU
        if (inOnGpu)
            gpu_copyGpuToCpu(in);

        cpu_densmatr_mixQureg_subB(outProb, out, inProb, in);
        return;
    }

    // matching GPU deployments trivially use the GPU backend
    if (inOnGpu) {
        gpu_densmatr_mixQureg_subB(outProb, out, inProb, in);
        return;
    }

    // What remains (GPU-accelerated larger register, CPU-only smaller one)
    // is a very common scenario, but is irksome; lacking communication
    // buffers (the quregs are local), there is no existing GPU memory into
    // which to copy the small register. Temporary GPU memory is regrettably
    // created, though it is thankfully very small (quadratically smaller
    // than 'out').

    // cheaply clone 'in' and give the clone GPU memory. The clone is a
    // paranoid copy (since 'in' is itself already a copy of the user's
    // qureg) in case this function ever accepts a reference. Still,
    // beware addressing in's pointer fields!
    Qureg gpuClone = in;
    gpuClone.isGpuAccelerated = 1;
    gpuClone.gpuAmps = gpu_allocArray(gpuClone.numAmpsPerNode);
    assert_mixQuregTempGpuAllocSucceeded(gpuClone.gpuAmps);

    // upload in's CPU amplitudes to the clone's GPU memory, simulate, then free
    gpu_copyCpuToGpu(gpuClone);
    gpu_densmatr_mixQureg_subB(outProb, out, inProb, gpuClone);
    gpu_deallocArray(gpuClone.gpuAmps);
}


void accel_densmatr_mixQureg_subC(qreal outProb, Qureg out, qreal inProb) {

    // the statevector is expected to already reside in out's
    // (CPU or GPU) buffer, so only 'out' needs validating
    assert_mixedQuregIsDensityMatrix(out);
    assert_mixedQuregIsDistributed(out);

    // choose the backend matching out's deployment
    if (out.isGpuAccelerated)
        gpu_densmatr_mixQureg_subC(outProb, out, inProb);
    else
        cpu_densmatr_mixQureg_subC(outProb, out, inProb);
}


void accel_densmatr_mixQureg_subD(qreal outProb, Qureg out, qreal inProb, Qureg in) {

    // 'out' is a distributed density matrix and 'in' is a local statevector
    assert_mixedQuregIsDensityMatrix(out);
    assert_mixedQuregIsStatevector(in);
    assert_mixedQuregIsDistributed(out);
    assert_mixedQuregIsLocal(in);

    // their GPU deployments are permitted to differ
    bool outOnGpu = out.isGpuAccelerated;
    bool inOnGpu  = in.isGpuAccelerated;

    // remember out's genuine buffers, since one may be re-pointed below
    qcomp* origCpuBuffer = out.cpuCommBuffer;
    qcomp* origGpuBuffer = out.gpuCommBuffer; // may be nullptr

    // subC reads the statevector from out's communication buffer (CPU
    // or GPU, as per out's deployment), so make in's amps visible there
    if (outOnGpu == inOnGpu) {

        // identical deployments; avoid a copy by temporarily
        // aliasing the relevant buffer pointer to in's amplitudes
        if (outOnGpu)
            out.gpuCommBuffer = in.gpuAmps;
        else
            out.cpuCommBuffer = in.cpuAmps;

    } else {

        // differing deployments; copy in's amplitudes across the
        // CPU/GPU boundary into the buffer matching out's deployment
        qindex numAmps = in.numAmps;

        if (outOnGpu)
            gpu_copyCpuToGpu(in.cpuAmps, out.gpuCommBuffer, numAmps);
        else
            gpu_copyGpuToCpu(in.gpuAmps, out.cpuCommBuffer, numAmps);
    }

    accel_densmatr_mixQureg_subC(outProb, out, inProb);

    // reinstate the original pointers, in case either was aliased
    out.cpuCommBuffer = origCpuBuffer;
    out.gpuCommBuffer = origGpuBuffer;
}


/*
 * DEPHASING
 */


void accel_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDephasing_subA(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDephasing_subA(qureg, qubit, prob);
}
void accel_densmatr_oneQubitDephasing_subB(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDephasing_subB(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDephasing_subB(qureg, qubit, prob);
}


void accel_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDephasing_subA(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDephasing_subA(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDephasing_subB(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDephasing_subB(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDephasing_subB(qureg, qubit1, qubit2, prob);
}


/*
 * DEPOLARISING
 */


void accel_densmatr_oneQubitDepolarising_subA(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDepolarising_subA(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDepolarising_subA(qureg, qubit, prob);
}
void accel_densmatr_oneQubitDepolarising_subB(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDepolarising_subB(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDepolarising_subB(qureg, qubit, prob);
}


void accel_densmatr_twoQubitDepolarising_subA(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subA(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subA(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDepolarising_subB(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subB(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subB(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDepolarising_subC(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subC(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subC(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDepolarising_subD(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subD(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subD(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDepolarising_subE(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subE(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subE(qureg, qubit1, qubit2, prob);
}
void accel_densmatr_twoQubitDepolarising_subF(Qureg qureg, int qubit1, int qubit2, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob);
    else
        cpu_densmatr_twoQubitDepolarising_subF(qureg, qubit1, qubit2, prob);
}


/*
 * PAULI CHANNEL
 */


void accel_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int qubit, qreal pI, qreal pX, qreal pY, qreal pZ) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitPauliChannel_subA(qureg, qubit, pI, pX, pY, pZ);
    else
        cpu_densmatr_oneQubitPauliChannel_subA(qureg, qubit, pI, pX, pY, pZ);
}
void accel_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, qreal pX, qreal pY, qreal pZ) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitPauliChannel_subB(qureg, ketQubit, pI, pX, pY, pZ);
    else
        cpu_densmatr_oneQubitPauliChannel_subB(qureg, ketQubit, pI, pX, pY, pZ);
}


/*
 * AMPLITUDE DAMPING CHANNEL
 */


void accel_densmatr_oneQubitDamping_subA(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDamping_subA(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDamping_subA(qureg, qubit, prob);
}
void accel_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDamping_subB(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDamping_subB(qureg, qubit, prob);
}
void accel_densmatr_oneQubitDamping_subC(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDamping_subC(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDamping_subC(qureg, qubit, prob);
}
void accel_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {

    // forward to the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        gpu_densmatr_oneQubitDamping_subD(qureg, qubit, prob);
    else
        cpu_densmatr_oneQubitDamping_subD(qureg, qubit, prob);
}


/*
 * PARTIAL TRACE
 */


void accel_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector<int> targs, vector<int> pairTargs) {
    assert_partialTraceQuregsAreIdenticallyDeployed(inQureg, outQureg);

    // look up both backends' routines, as selected by the number of targets
    auto cpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( cpu_densmatr_partialTrace_sub, targs.size() );
    auto gpuFunc = GET_FUNC_OPTIMISED_FOR_NUM_TARGS( gpu_densmatr_partialTrace_sub, targs.size() );

    // the quregs are identically deployed (differing only in
    // dimension), so inQureg alone decides the backend for both
    if (inQureg.isGpuAccelerated)
        gpuFunc(inQureg, outQureg, targs, pairTargs);
    else
        cpuFunc(inQureg, outQureg, targs, pairTargs);
}


/*
 * PROBABILITIES
 */


qreal accel_statevec_calcTotalProb_sub(Qureg qureg) {

    // return the result of the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        return gpu_statevec_calcTotalProb_sub(qureg);

    return cpu_statevec_calcTotalProb_sub(qureg);
}
qreal accel_densmatr_calcTotalProb_sub(Qureg qureg) {

    // return the result of the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        return gpu_densmatr_calcTotalProb_sub(qureg);

    return cpu_densmatr_calcTotalProb_sub(qureg);
}


qreal accel_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {

    // the macro picks the CPU or GPU routine from qureg and the number of qubits
    auto backendFunc = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );

    qreal prob = backendFunc(qureg, qubits, outcomes);
    return prob;
}
qreal accel_densmatr_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes) {

    // the macro picks the CPU or GPU routine from qureg and the number of qubits
    auto backendFunc = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbOfMultiQubitOutcome_sub, qureg, qubits.size() );

    qreal prob = backendFunc(qureg, qubits, outcomes);
    return prob;
}


void accel_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {

    // the macro picks the CPU or GPU routine from qureg and the number
    // of qubits; that routine receives outProbs as its first argument
    auto backendFunc = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( statevec_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );

    backendFunc(outProbs, qureg, qubits);
}
void accel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qureg, vector<int> qubits) {

    // the macro picks the CPU or GPU routine from qureg and the number
    // of qubits; that routine receives outProbs as its first argument
    auto backendFunc = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_TARGS( densmatr_calcProbsOfAllMultiQubitOutcomes_sub, qureg, qubits.size() );

    backendFunc(outProbs, qureg, qubits);
}


qreal accel_densmatr_calcHilbertSchmidtDistance_sub(Qureg quregA, Qureg quregB) {

    // both quregs are guaranteed to be identically deployed,
    // so quregA alone determines the backend
    if (quregA.isGpuAccelerated)
        return gpu_densmatr_calcHilbertSchmidtDistance_sub(quregA, quregB);

    return cpu_densmatr_calcHilbertSchmidtDistance_sub(quregA, quregB);
}


/*
 * INNER PRODUCTS
 */


qcomp accel_statevec_calcInnerProduct_sub(Qureg quregA, Qureg quregB) {
    assert_innerProductedSameDimQuregsHaveSameGpuAccel(quregA, quregB);

    // Differing GPU-acceleration could in theory be permitted when one
    // (or both) of the quregs is distributed, by hijacking the GPU
    // communication buffer to receive the CPU-only Qureg's amps. That
    // scenario is however nonsensical and inefficient, so is unsupported.

    // ergo quregA alone determines the backend
    if (quregA.isGpuAccelerated)
        return gpu_statevec_calcInnerProduct_sub(quregA, quregB);

    return cpu_statevec_calcInnerProduct_sub(quregA, quregB);
}


qcomp accel_densmatr_calcFidelityWithPureState_sub(Qureg rho, Qureg psi, bool conj) {
    assert_calcFidStateVecIsLocal(psi);

    // look up both backends' routines, as selected by 'conj'
    auto cpuFunc = GET_FUNC_OPTIMISED_FOR_BOOL( cpu_densmatr_calcFidelityWithPureState_sub, conj );
    auto gpuFunc = GET_FUNC_OPTIMISED_FOR_BOOL( gpu_densmatr_calcFidelityWithPureState_sub, conj );

    // rho and psi are permitted to differ in CPU vs GPU deployment
    bool rhoOnGpu = rho.isGpuAccelerated;
    bool psiOnGpu = psi.isGpuAccelerated;

    // agreeing deployments trivially use the common backend
    if (rhoOnGpu && psiOnGpu)
        return gpuFunc(rho, psi);
    if (!rhoOnGpu && !psiOnGpu)
        return cpuFunc(rho, psi);

    // only the smaller psi is GPU-accelerated (sensible when the larger rho
    // is distributed and/or exceeds the GPU memory capacity); bring psi's
    // GPU memory over to the CPU and perform the calculation there
    if (psiOnGpu) {
        gpu_copyGpuToCpu(psi);
        return cpuFunc(rho, psi);
    }

    // otherwise only rho is GPU-accelerated, which is also possible/sensible
    // since psi is quadratically smaller. We spoof a GPU-accelerated psi,
    // backed by rho's GPU communication buffer when it exists, and by
    // temporary (not so big) GPU memory when it does not
    bool reuseRhoBuffer = rho.isDistributed;

    Qureg gpuPsi = psi;
    gpuPsi.isGpuAccelerated = 1;

    if (reuseRhoBuffer)
        gpuPsi.gpuAmps = rho.gpuCommBuffer;
    else
        gpuPsi.gpuAmps = gpu_allocArray(gpuPsi.numAmpsPerNode);

    // validate the (relatively) small allocation; always passes for the buffer
    assert_calcFidTempGpuAllocSucceeded(gpuPsi.gpuAmps);

    // harmlessly overwrite the new memory (or rho's buffer) with psi's
    // CPU amplitudes, then invoke the GPU routine
    gpu_copyCpuToGpu(gpuPsi);
    qcomp fid = gpuFunc(rho, gpuPsi);

    // release only memory allocated above; rho's buffer must NOT be freed
    if (!reuseRhoBuffer)
        gpu_deallocArray(gpuPsi.gpuAmps);

    return fid;
}


/*
 * EXPECTATION VALUES
 */


qreal accel_statevec_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {

    // return the result of the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        return gpu_statevec_calcExpecAnyTargZ_sub(qureg, targs);

    return cpu_statevec_calcExpecAnyTargZ_sub(qureg, targs);
}
qcomp accel_densmatr_calcExpecAnyTargZ_sub(Qureg qureg, vector<int> targs) {

    // return the result of the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        return gpu_densmatr_calcExpecAnyTargZ_sub(qureg, targs);

    return cpu_densmatr_calcExpecAnyTargZ_sub(qureg, targs);
}


qcomp accel_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {

    // return the result of the backend matching qureg's deployment
    if (qureg.isGpuAccelerated)
        return gpu_statevec_calcExpecPauliStr_subA(qureg, x, y, z);

    return cpu_statevec_calcExpecPauliStr_subA(qureg, x, y, z);
}
qcomp accel_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vector<int> y, vector<int> z) {

    return (qureg.isGpuAccelerated)?
        gpu_statevec_calcExpecPauliStr_subB(qureg, x, y, z):
        cpu_statevec_calcExpecPauliStr_subB(qureg, x, y, z);